166b28810SBrad Fitzpatrick /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
260d70942SAnatoly Vorobey /*
3c9607c6dSBrad Fitzpatrick * Slabs memory allocation, based on powers-of-N. Slabs are up to 1MB in size
4c9607c6dSBrad Fitzpatrick * and are divided into chunks. The chunk sizes start off at the size of the
5c9607c6dSBrad Fitzpatrick * "item" structure plus space for a small key and value. They increase by
6c9607c6dSBrad Fitzpatrick * a multiplier factor from there, up to half the maximum slab size. The last
7c9607c6dSBrad Fitzpatrick * slab size is always 1MB, since that's the maximum item size allowed by the
8c9607c6dSBrad Fitzpatrick * memcached protocol.
960d70942SAnatoly Vorobey */
1056b8339eSSteven Grimm #include "memcached.h"
1160d70942SAnatoly Vorobey #include <sys/stat.h>
1260d70942SAnatoly Vorobey #include <sys/socket.h>
1360d70942SAnatoly Vorobey #include <sys/resource.h>
1460d70942SAnatoly Vorobey #include <fcntl.h>
15c6975ef4SPaul Lindner #include <netinet/in.h>
16c6975ef4SPaul Lindner #include <errno.h>
1760d70942SAnatoly Vorobey #include <stdlib.h>
1860d70942SAnatoly Vorobey #include <stdio.h>
1960d70942SAnatoly Vorobey #include <string.h>
20e688e97dSNatanael Copa #include <signal.h>
21c08383afSBrad Fitzpatrick #include <assert.h>
22d9220d64STrond Norbye #include <pthread.h>
23294c32d7SToru Maesaka
24a836eabcSdormando //#define DEBUG_SLAB_MOVER
25c9607c6dSBrad Fitzpatrick /* powers-of-N allocation structures */
2686969ea4SBrad Fitzpatrick
/* Per-class bookkeeping: one of these exists for each chunk size the
 * allocator supports. All fields are protected by slabs_lock. */
typedef struct {
    unsigned int size;      /* sizes of items */
    unsigned int perslab;   /* how many items per slab */

    void *slots;           /* list of item ptrs */
    unsigned int sl_curr;   /* total free items in list */

    unsigned int slabs;     /* how many slabs were allocated for this class */

    void **slab_list;       /* array of slab pointers */
    unsigned int list_size; /* size of prev array */

    size_t requested; /* The number of requested bytes */
} slabclass_t;
4186969ea4SBrad Fitzpatrick
/* One descriptor per possible class; index 0 is SLAB_GLOBAL_PAGE_POOL. */
static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
static size_t mem_limit = 0;      /* total memory budget in bytes (0 = none) */
static size_t mem_malloced = 0;   /* bytes handed out by memory_allocate() */
/* If the memory limit has been hit once. Used as a hint to decide when to
 * early-wake the LRU maintenance thread */
static bool mem_limit_reached = false;
static int power_largest;         /* highest valid slab class id */

/* State for the optional single-big-malloc (prealloc) mode. */
static void *mem_base = NULL;     /* start of preallocated region, or NULL */
static void *mem_current = NULL;  /* bump pointer into mem_base */
static size_t mem_avail = 0;      /* bytes left in the preallocated region */

/**
 * Access to the slab allocator is protected by this lock
 */
static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;
59d9220d64STrond Norbye
/*
 * Forward Declarations
 */
static int do_slabs_newslab(const unsigned int id);
static void *memory_allocate(size_t size);
static void do_slabs_free(void *ptr, const size_t size, unsigned int id);

/* Preallocate as many slab pages as possible (called from slabs_init)
   on start-up, so users don't get confused out-of-memory errors when
   they do have free (in-slab) space, but no space to make new slabs.
   if maxslabs is 18 (POWER_LARGEST - POWER_SMALLEST + 1), then all
   slab types can be made. if max memory is less than 18 MB, only the
   smaller ones will be made. */
static void slabs_preallocate (const unsigned int maxslabs);
7477dde9f9SPaul Lindner
7577dde9f9SPaul Lindner /*
76c9607c6dSBrad Fitzpatrick * Figures out which slab class (chunk size) is required to store an item of
77c9607c6dSBrad Fitzpatrick * a given size.
7877dde9f9SPaul Lindner *
7977dde9f9SPaul Lindner * Given object size, return id to use when allocating/freeing memory for object
8077dde9f9SPaul Lindner * 0 means error: can't store such a large object
81c9607c6dSBrad Fitzpatrick */
8277dde9f9SPaul Lindner
slabs_clsid(const size_t size)8377dde9f9SPaul Lindner unsigned int slabs_clsid(const size_t size) {
84c9607c6dSBrad Fitzpatrick int res = POWER_SMALLEST;
8586969ea4SBrad Fitzpatrick
860567967aSdormando if (size == 0 || size > settings.item_size_max)
8760d70942SAnatoly Vorobey return 0;
88c9607c6dSBrad Fitzpatrick while (size > slabclass[res].size)
89c9607c6dSBrad Fitzpatrick if (res++ == power_largest) /* won't fit in the biggest slab */
900567967aSdormando return power_largest;
9160d70942SAnatoly Vorobey return res;
9260d70942SAnatoly Vorobey }
9386969ea4SBrad Fitzpatrick
/**
 * Determines the chunk sizes and initializes the slab class descriptors
 * accordingly.
 *
 * limit      - overall memory budget in bytes (stored in mem_limit)
 * factor     - growth multiplier between successive chunk sizes; ignored
 *              when an explicit slab_sizes table is supplied
 * prealloc   - if true, malloc the whole budget up front as one region and
 *              carve pages out of it; also preallocates one page per class
 * slab_sizes - optional zero-terminated table of explicit chunk sizes that
 *              overrides the factor-based progression
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc, const uint32_t *slab_sizes) {
    int i = POWER_SMALLEST - 1;
    /* Smallest chunk must hold the item header plus the minimum payload. */
    unsigned int size = sizeof(item) + settings.chunk_size;

    mem_limit = limit;

    if (prealloc) {
        /* Allocate everything in a big chunk with malloc */
        mem_base = malloc(mem_limit);
        if (mem_base != NULL) {
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    }

    memset(slabclass, 0, sizeof(slabclass));

    while (++i < MAX_NUMBER_OF_SLAB_CLASSES-1) {
        if (slab_sizes != NULL) {
            /* Explicit table: a zero entry terminates the list. */
            if (slab_sizes[i-1] == 0)
                break;
            size = slab_sizes[i-1];
        } else if (size >= settings.slab_chunk_size_max / factor) {
            /* Next multiplication would pass the max chunk size; stop and
             * let the final class below take slab_chunk_size_max itself. */
            break;
        }
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.slab_page_size / slabclass[i].size;
        if (slab_sizes == NULL)
            size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    /* The last class always uses the maximum chunk size. */
    power_largest = i;
    slabclass[power_largest].size = settings.slab_chunk_size_max;
    slabclass[power_largest].perslab = settings.slab_page_size / settings.slab_chunk_size_max;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    /* for the test suite: faking of how much we've already malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }

    }

    if (prealloc) {
        slabs_preallocate(power_largest);
    }
}
16186969ea4SBrad Fitzpatrick
slabs_preallocate(const unsigned int maxslabs)16277dde9f9SPaul Lindner static void slabs_preallocate (const unsigned int maxslabs) {
163c252f6e9SBrad Fitzpatrick int i;
164c252f6e9SBrad Fitzpatrick unsigned int prealloc = 0;
16586969ea4SBrad Fitzpatrick
166c252f6e9SBrad Fitzpatrick /* pre-allocate a 1MB slab in every size class so people don't get
167c252f6e9SBrad Fitzpatrick confused by non-intuitive "SERVER_ERROR out of memory"
168c252f6e9SBrad Fitzpatrick messages. this is the most common question on the mailing
169c252f6e9SBrad Fitzpatrick list. if you really don't want this, you can rebuild without
170c252f6e9SBrad Fitzpatrick these three lines. */
17186969ea4SBrad Fitzpatrick
172a2fc8e93Sdormando for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
173c252f6e9SBrad Fitzpatrick if (++prealloc > maxslabs)
174c252f6e9SBrad Fitzpatrick return;
175c979d213Sdormando if (do_slabs_newslab(i) == 0) {
176c979d213Sdormando fprintf(stderr, "Error while preallocating slab memory!\n"
177c979d213Sdormando "If using -L or other prealloc options, max memory must be "
178c979d213Sdormando "at least %d megabytes.\n", power_largest);
179c979d213Sdormando exit(1);
180c979d213Sdormando }
181c252f6e9SBrad Fitzpatrick }
18286969ea4SBrad Fitzpatrick
18360d70942SAnatoly Vorobey }
18486969ea4SBrad Fitzpatrick
grow_slab_list(const unsigned int id)18577dde9f9SPaul Lindner static int grow_slab_list (const unsigned int id) {
186d72b1a2dSBrad Fitzpatrick slabclass_t *p = &slabclass[id];
187d72b1a2dSBrad Fitzpatrick if (p->slabs == p->list_size) {
18877dde9f9SPaul Lindner size_t new_size = (p->list_size != 0) ? p->list_size * 2 : 16;
189e3ca4e30SBrad Fitzpatrick void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
190d72b1a2dSBrad Fitzpatrick if (new_list == 0) return 0;
191d72b1a2dSBrad Fitzpatrick p->list_size = new_size;
192d72b1a2dSBrad Fitzpatrick p->slab_list = new_list;
193d72b1a2dSBrad Fitzpatrick }
194d72b1a2dSBrad Fitzpatrick return 1;
195d72b1a2dSBrad Fitzpatrick }
19686969ea4SBrad Fitzpatrick
split_slab_page_into_freelist(char * ptr,const unsigned int id)197845a4fe1Sdormando static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
198845a4fe1Sdormando slabclass_t *p = &slabclass[id];
199845a4fe1Sdormando int x;
200845a4fe1Sdormando for (x = 0; x < p->perslab; x++) {
201845a4fe1Sdormando do_slabs_free(ptr, 0, id);
202845a4fe1Sdormando ptr += p->size;
203845a4fe1Sdormando }
204845a4fe1Sdormando }
205845a4fe1Sdormando
206d6e96467Sdormando /* Fast FIFO queue */
get_page_from_global_pool(void)207d6e96467Sdormando static void *get_page_from_global_pool(void) {
208d6e96467Sdormando slabclass_t *p = &slabclass[SLAB_GLOBAL_PAGE_POOL];
209d6e96467Sdormando if (p->slabs < 1) {
210d6e96467Sdormando return NULL;
211d6e96467Sdormando }
212d6e96467Sdormando char *ret = p->slab_list[p->slabs - 1];
213d6e96467Sdormando p->slabs--;
214d6e96467Sdormando return ret;
215d6e96467Sdormando }
216d6e96467Sdormando
do_slabs_newslab(const unsigned int id)21756b8339eSSteven Grimm static int do_slabs_newslab(const unsigned int id) {
21860d70942SAnatoly Vorobey slabclass_t *p = &slabclass[id];
219d6e96467Sdormando slabclass_t *g = &slabclass[SLAB_GLOBAL_PAGE_POOL];
22010698baeSdormando int len = settings.slab_reassign ? settings.item_size_max
22110698baeSdormando : p->size * p->perslab;
22260d70942SAnatoly Vorobey char *ptr;
22386969ea4SBrad Fitzpatrick
224d6e96467Sdormando if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0
225d6e96467Sdormando && g->slabs == 0)) {
226fb269897Sdormando mem_limit_reached = true;
227fb269897Sdormando MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
228fb269897Sdormando return 0;
229fb269897Sdormando }
230fb269897Sdormando
231fb269897Sdormando if ((grow_slab_list(id) == 0) ||
232d6e96467Sdormando (((ptr = get_page_from_global_pool()) == NULL) &&
233d6e96467Sdormando ((ptr = memory_allocate((size_t)len)) == 0))) {
23468957214STrond Norbye
23568957214STrond Norbye MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
23660d70942SAnatoly Vorobey return 0;
23768957214STrond Norbye }
23886969ea4SBrad Fitzpatrick
23977dde9f9SPaul Lindner memset(ptr, 0, (size_t)len);
240845a4fe1Sdormando split_slab_page_into_freelist(ptr, id);
24186969ea4SBrad Fitzpatrick
242b7b9d5f4SBrad Fitzpatrick p->slab_list[p->slabs++] = ptr;
24368957214STrond Norbye MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);
24461e08596SToru Maesaka
24560d70942SAnatoly Vorobey return 1;
24660d70942SAnatoly Vorobey }
24786969ea4SBrad Fitzpatrick
/* This calculation ends up adding sizeof(void *) to the item size. */
/* Allocate a chained (chunked) item of `size` bytes from class `id` by
 * stringing together enough free chunks off p->slots. The head chunk becomes
 * the item header; the rest stay linked behind it. Returns the head item or
 * NULL if enough chunks couldn't be gathered. Caller must hold slabs_lock. */
static void *do_slabs_alloc_chunked(const size_t size, slabclass_t *p, unsigned int id) {
    void *ret = NULL;
    item *it = NULL;
    int x;
    /* usable payload per chunk, after the item_chunk header */
    int csize = p->size - sizeof(item_chunk);
    /* number of chunks needed, rounded up */
    unsigned int chunks_req = size / csize;
    if (size % csize != 0)
        chunks_req++;
    /* grow the class until the freelist can satisfy the request */
    while (p->sl_curr < chunks_req) {
        if (do_slabs_newslab(id) == 0)
            break;
    }

    if (p->sl_curr >= chunks_req) {
        item_chunk *chunk = NULL;

        /* Configure the head item in the chain. */
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;

        /* Squirrel away the "top chunk" into h_next for now */
        it->h_next = (item *)p->slots;
        assert(it->h_next != 0);
        chunk = (item_chunk *) it->h_next;

        /* roll down the chunks, marking them as such. */
        for (x = 0; x < chunks_req-1; x++) {
            chunk->it_flags &= ~ITEM_SLABBED;
            chunk->it_flags |= ITEM_CHUNK;
            /* Chunks always have a direct reference to the head item */
            chunk->head = it;
            chunk->size = p->size - sizeof(item_chunk);
            chunk->used = 0;
            chunk = chunk->next;
        }

        /* The final "next" is now the top of the slab freelist */
        p->slots = chunk;
        if (chunk && chunk->prev) {
            /* Disconnect the final chunk from the chain */
            chunk->prev->next = 0;
            chunk->prev = 0;
        }

        /* Head carries the CHUNKED flag and an initial refcount for the
         * slab mover's freeness detection. */
        it->it_flags &= ~ITEM_SLABBED;
        it->it_flags |= ITEM_CHUNKED;
        it->refcount = 1;
        p->sl_curr -= chunks_req;
        ret = (void *)it;
    } else {
        ret = NULL;
    }

    return ret;
}
3050567967aSdormando
/*@null@*/
/* Allocate `size` bytes from slab class `id`. Small requests come straight
 * off the class freelist (growing the class when allowed by `flags`); sizes
 * larger than one chunk take the chunked path. Optionally reports the class's
 * requested-bytes counter via total_bytes. Returns NULL on failure.
 * Caller must hold slabs_lock. */
static void *do_slabs_alloc(const size_t size, unsigned int id, uint64_t *total_bytes,
        unsigned int flags) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }
    p = &slabclass[id];
    assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);
    if (total_bytes != NULL) {
        *total_bytes = p->requested;
    }

    if (size <= p->size) {
        /* fail unless we have space at the end of a recently allocated page,
           we have something on our freelist, or we could allocate a new page */
        if (p->sl_curr == 0 && flags != SLABS_ALLOC_NO_NEWPAGE) {
            do_slabs_newslab(id);
        }

        if (p->sl_curr != 0) {
            /* return off our freelist */
            it = (item *)p->slots;
            p->slots = it->next;
            if (it->next) it->next->prev = 0;
            /* Kill flag and initialize refcount here for lock safety in slab
             * mover's freeness detection. */
            it->it_flags &= ~ITEM_SLABBED;
            it->refcount = 1;
            p->sl_curr--;
            ret = (void *)it;
        } else {
            ret = NULL;
        }
    } else {
        /* Dealing with a chunked item. */
        ret = do_slabs_alloc_chunked(size, p, id);
    }

    if (ret) {
        p->requested += size;
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}
35886969ea4SBrad Fitzpatrick
do_slabs_free_chunked(item * it,const size_t size,unsigned int id,slabclass_t * p)3590567967aSdormando static void do_slabs_free_chunked(item *it, const size_t size, unsigned int id,
3600567967aSdormando slabclass_t *p) {
3610567967aSdormando item_chunk *chunk = (item_chunk *) ITEM_data(it);
3620567967aSdormando size_t realsize = size;
3630567967aSdormando while (chunk) {
3640567967aSdormando realsize += sizeof(item_chunk);
3650567967aSdormando chunk = chunk->next;
3660567967aSdormando }
3670567967aSdormando chunk = (item_chunk *) ITEM_data(it);
368*ee461d11Sdormando unsigned int chunks_found = 1;
3690567967aSdormando
3700567967aSdormando it->it_flags = ITEM_SLABBED;
3710567967aSdormando it->slabs_clsid = 0;
3720567967aSdormando it->prev = 0;
3730567967aSdormando it->next = (item *) chunk->next;
3740567967aSdormando assert(it->next);
3750567967aSdormando /* top chunk should already point back to head */
3760567967aSdormando assert(it->next && (void*)it->next->prev == (void*)chunk);
3770567967aSdormando chunk = chunk->next;
3780567967aSdormando chunk->prev = (item_chunk *)it;
3790567967aSdormando
380*ee461d11Sdormando while (chunk) {
381*ee461d11Sdormando assert(chunk->it_flags == ITEM_CHUNK);
3820567967aSdormando chunk->it_flags = ITEM_SLABBED;
383b05653f9Sdormando chunk->slabs_clsid = 0;
384*ee461d11Sdormando chunks_found++;
385*ee461d11Sdormando if (chunk->next) {
3860567967aSdormando chunk = chunk->next;
387*ee461d11Sdormando } else {
388*ee461d11Sdormando break;
389*ee461d11Sdormando }
3900567967aSdormando }
3910567967aSdormando /* must have had nothing hanging off of the final chunk */
392b05653f9Sdormando assert(chunk && chunk->next == 0);
393b05653f9Sdormando /* Tail chunk, link the freelist here. */
394b05653f9Sdormando chunk->next = p->slots;
395b05653f9Sdormando if (chunk->next) chunk->next->prev = chunk;
3960567967aSdormando
3970567967aSdormando p->slots = it;
398*ee461d11Sdormando p->sl_curr += chunks_found;
3990567967aSdormando p->requested -= size;
4000567967aSdormando
4010567967aSdormando return;
4020567967aSdormando }
4030567967aSdormando
4040567967aSdormando
do_slabs_free(void * ptr,const size_t size,unsigned int id)405d9220d64STrond Norbye static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
406c08383afSBrad Fitzpatrick slabclass_t *p;
40710698baeSdormando item *it;
40886969ea4SBrad Fitzpatrick
409c9607c6dSBrad Fitzpatrick assert(id >= POWER_SMALLEST && id <= power_largest);
410c9607c6dSBrad Fitzpatrick if (id < POWER_SMALLEST || id > power_largest)
41160d70942SAnatoly Vorobey return;
41286969ea4SBrad Fitzpatrick
41368957214STrond Norbye MEMCACHED_SLABS_FREE(size, id, ptr);
41460d70942SAnatoly Vorobey p = &slabclass[id];
41586969ea4SBrad Fitzpatrick
41610698baeSdormando it = (item *)ptr;
4170567967aSdormando if ((it->it_flags & ITEM_CHUNKED) == 0) {
418186509c2Sdormando it->it_flags = ITEM_SLABBED;
41962415f16Sdormando it->slabs_clsid = 0;
42010698baeSdormando it->prev = 0;
42110698baeSdormando it->next = p->slots;
42210698baeSdormando if (it->next) it->next->prev = it;
42310698baeSdormando p->slots = it;
42410698baeSdormando
42510698baeSdormando p->sl_curr++;
4267a82822bSTrond Norbye p->requested -= size;
4270567967aSdormando } else {
4280567967aSdormando do_slabs_free_chunked(it, size, id, p);
4290567967aSdormando }
43060d70942SAnatoly Vorobey return;
43160d70942SAnatoly Vorobey }
43286969ea4SBrad Fitzpatrick
nz_strcmp(int nzlength,const char * nz,const char * z)433bd6a8278SDustin Sallings static int nz_strcmp(int nzlength, const char *nz, const char *z) {
434bd6a8278SDustin Sallings int zlength=strlen(z);
435bd6a8278SDustin Sallings return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1;
436bd6a8278SDustin Sallings }
437bd6a8278SDustin Sallings
get_stats(const char * stat_type,int nkey,ADD_STAT add_stats,void * c)43817df5c0eSTrond Norbye bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
43917df5c0eSTrond Norbye bool ret = true;
44048f45982SToru Maesaka
44117df5c0eSTrond Norbye if (add_stats != NULL) {
4424c77f591SToru Maesaka if (!stat_type) {
44348f45982SToru Maesaka /* prepare general statistics for the engine */
444bc340cdcSTrond Norbye STATS_LOCK();
445cb01d504Sdormando APPEND_STAT("bytes", "%llu", (unsigned long long)stats_state.curr_bytes);
446cb01d504Sdormando APPEND_STAT("curr_items", "%llu", (unsigned long long)stats_state.curr_items);
4479517c656Sdormando APPEND_STAT("total_items", "%llu", (unsigned long long)stats.total_items);
448bc340cdcSTrond Norbye STATS_UNLOCK();
449d6e96467Sdormando if (settings.slab_automove > 0) {
450d6e96467Sdormando pthread_mutex_lock(&slabs_lock);
451d6e96467Sdormando APPEND_STAT("slab_global_page_pool", "%u", slabclass[SLAB_GLOBAL_PAGE_POOL].slabs);
452d6e96467Sdormando pthread_mutex_unlock(&slabs_lock);
453d6e96467Sdormando }
45467d7cfc1Sdormando item_stats_totals(add_stats, c);
455bd6a8278SDustin Sallings } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
45617df5c0eSTrond Norbye item_stats(add_stats, c);
457bd6a8278SDustin Sallings } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
45817df5c0eSTrond Norbye slabs_stats(add_stats, c);
459bd6a8278SDustin Sallings } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
46017df5c0eSTrond Norbye item_stats_sizes(add_stats, c);
461ae6f4267Sdormando } else if (nz_strcmp(nkey, stat_type, "sizes_enable") == 0) {
462ae6f4267Sdormando item_stats_sizes_enable(add_stats, c);
463ae6f4267Sdormando } else if (nz_strcmp(nkey, stat_type, "sizes_disable") == 0) {
464ae6f4267Sdormando item_stats_sizes_disable(add_stats, c);
46517df5c0eSTrond Norbye } else {
46617df5c0eSTrond Norbye ret = false;
46717df5c0eSTrond Norbye }
46817df5c0eSTrond Norbye } else {
46917df5c0eSTrond Norbye ret = false;
4704c77f591SToru Maesaka }
471df8cf5e7SToru Maesaka
47217df5c0eSTrond Norbye return ret;
473f4a8e7ffSToru Maesaka }
4744c77f591SToru Maesaka
/*@null@*/
/* Emit per-class slab statistics via add_stats, followed by overall totals
 * and a terminating empty stat. Classes with zero pages are skipped.
 * Caller must hold slabs_lock. */
static void do_slabs_stats(ADD_STAT add_stats, void *c) {
    int i, total;
    /* Get the per-thread stats which contain some interesting aggregates */
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);

    total = 0;
    for(i = POWER_SMALLEST; i <= power_largest; i++) {
        slabclass_t *p = &slabclass[i];
        if (p->slabs != 0) {
            uint32_t perslab, slabs;
            slabs = p->slabs;
            perslab = p->perslab;

            char key_str[STAT_KEY_LEN];
            char val_str[STAT_VAL_LEN];
            int klen = 0, vlen = 0;

            APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
            APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
            APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
            APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
            APPEND_NUM_STAT(i, "used_chunks", "%u",
                            slabs*perslab - p->sl_curr);
            APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
            /* Stat is dead, but displaying zero instead of removing it. */
            APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
            APPEND_NUM_STAT(i, "mem_requested", "%llu",
                            (unsigned long long)p->requested);
            APPEND_NUM_STAT(i, "get_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].get_hits);
            APPEND_NUM_STAT(i, "cmd_set", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].set_cmds);
            APPEND_NUM_STAT(i, "delete_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].delete_hits);
            APPEND_NUM_STAT(i, "incr_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].incr_hits);
            APPEND_NUM_STAT(i, "decr_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].decr_hits);
            APPEND_NUM_STAT(i, "cas_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].cas_hits);
            APPEND_NUM_STAT(i, "cas_badval", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].cas_badval);
            APPEND_NUM_STAT(i, "touch_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].touch_hits);
            total++;
        }
    }

    /* add overall slab stats and append terminator */

    APPEND_STAT("active_slabs", "%d", total);
    APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
    add_stats(NULL, 0, NULL, 0, c);
}
53186969ea4SBrad Fitzpatrick
memory_allocate(size_t size)532a6b35b44STrond Norbye static void *memory_allocate(size_t size) {
533a6b35b44STrond Norbye void *ret;
534a6b35b44STrond Norbye
535a6b35b44STrond Norbye if (mem_base == NULL) {
536a6b35b44STrond Norbye /* We are not using a preallocated large memory chunk */
537a6b35b44STrond Norbye ret = malloc(size);
538a6b35b44STrond Norbye } else {
539a6b35b44STrond Norbye ret = mem_current;
540a6b35b44STrond Norbye
541a6b35b44STrond Norbye if (size > mem_avail) {
542a6b35b44STrond Norbye return NULL;
543a6b35b44STrond Norbye }
544a6b35b44STrond Norbye
545a6b35b44STrond Norbye /* mem_current pointer _must_ be aligned!!! */
546a6b35b44STrond Norbye if (size % CHUNK_ALIGN_BYTES) {
547a6b35b44STrond Norbye size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
548a6b35b44STrond Norbye }
549a6b35b44STrond Norbye
550df1b7e42STrond Norbye mem_current = ((char*)mem_current) + size;
551a6b35b44STrond Norbye if (size < mem_avail) {
552a6b35b44STrond Norbye mem_avail -= size;
553a6b35b44STrond Norbye } else {
554a6b35b44STrond Norbye mem_avail = 0;
555a6b35b44STrond Norbye }
556a6b35b44STrond Norbye }
557ec937e5eSdormando mem_malloced += size;
558a6b35b44STrond Norbye
559a6b35b44STrond Norbye return ret;
560a6b35b44STrond Norbye }
561d9220d64STrond Norbye
56231541b37Sdormando /* Must only be used if all pages are item_size_max */
memory_release()56331541b37Sdormando static void memory_release() {
56431541b37Sdormando void *p = NULL;
56531541b37Sdormando if (mem_base != NULL)
56631541b37Sdormando return;
56731541b37Sdormando
56831541b37Sdormando if (!settings.slab_reassign)
56931541b37Sdormando return;
57031541b37Sdormando
57131541b37Sdormando while (mem_malloced > mem_limit &&
57231541b37Sdormando (p = get_page_from_global_pool()) != NULL) {
57331541b37Sdormando free(p);
57431541b37Sdormando mem_malloced -= settings.item_size_max;
57531541b37Sdormando }
57631541b37Sdormando }
57731541b37Sdormando
slabs_alloc(size_t size,unsigned int id,uint64_t * total_bytes,unsigned int flags)578*ee461d11Sdormando void *slabs_alloc(size_t size, unsigned int id, uint64_t *total_bytes,
579b1debc4cSdormando unsigned int flags) {
580d9220d64STrond Norbye void *ret;
581d9220d64STrond Norbye
582d9220d64STrond Norbye pthread_mutex_lock(&slabs_lock);
583*ee461d11Sdormando ret = do_slabs_alloc(size, id, total_bytes, flags);
584d9220d64STrond Norbye pthread_mutex_unlock(&slabs_lock);
585d9220d64STrond Norbye return ret;
586d9220d64STrond Norbye }
587d9220d64STrond Norbye
slabs_free(void * ptr,size_t size,unsigned int id)588d9220d64STrond Norbye void slabs_free(void *ptr, size_t size, unsigned int id) {
589d9220d64STrond Norbye pthread_mutex_lock(&slabs_lock);
590d9220d64STrond Norbye do_slabs_free(ptr, size, id);
591d9220d64STrond Norbye pthread_mutex_unlock(&slabs_lock);
592d9220d64STrond Norbye }
593d9220d64STrond Norbye
slabs_stats(ADD_STAT add_stats,void * c)59417df5c0eSTrond Norbye void slabs_stats(ADD_STAT add_stats, void *c) {
595d9220d64STrond Norbye pthread_mutex_lock(&slabs_lock);
59617df5c0eSTrond Norbye do_slabs_stats(add_stats, c);
597d9220d64STrond Norbye pthread_mutex_unlock(&slabs_lock);
598d9220d64STrond Norbye }
599efad616dSTrond Norbye
do_slabs_adjust_mem_limit(size_t new_mem_limit)60031541b37Sdormando static bool do_slabs_adjust_mem_limit(size_t new_mem_limit) {
60131541b37Sdormando /* Cannot adjust memory limit at runtime if prealloc'ed */
60231541b37Sdormando if (mem_base != NULL)
60331541b37Sdormando return false;
60431541b37Sdormando settings.maxbytes = new_mem_limit;
60531541b37Sdormando mem_limit = new_mem_limit;
60631541b37Sdormando mem_limit_reached = false; /* Will reset on next alloc */
60731541b37Sdormando memory_release(); /* free what might already be in the global pool */
60831541b37Sdormando return true;
60931541b37Sdormando }
61031541b37Sdormando
slabs_adjust_mem_limit(size_t new_mem_limit)61131541b37Sdormando bool slabs_adjust_mem_limit(size_t new_mem_limit) {
61231541b37Sdormando bool ret;
61331541b37Sdormando pthread_mutex_lock(&slabs_lock);
61431541b37Sdormando ret = do_slabs_adjust_mem_limit(new_mem_limit);
61531541b37Sdormando pthread_mutex_unlock(&slabs_lock);
61631541b37Sdormando return ret;
61731541b37Sdormando }
61831541b37Sdormando
slabs_adjust_mem_requested(unsigned int id,size_t old,size_t ntotal)619efad616dSTrond Norbye void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
620efad616dSTrond Norbye {
621efad616dSTrond Norbye pthread_mutex_lock(&slabs_lock);
622efad616dSTrond Norbye slabclass_t *p;
623efad616dSTrond Norbye if (id < POWER_SMALLEST || id > power_largest) {
624efad616dSTrond Norbye fprintf(stderr, "Internal error! Invalid slab class\n");
625efad616dSTrond Norbye abort();
626efad616dSTrond Norbye }
627efad616dSTrond Norbye
628efad616dSTrond Norbye p = &slabclass[id];
629efad616dSTrond Norbye p->requested = p->requested - old + ntotal;
630efad616dSTrond Norbye pthread_mutex_unlock(&slabs_lock);
631efad616dSTrond Norbye }
63210698baeSdormando
slabs_available_chunks(const unsigned int id,bool * mem_flag,uint64_t * total_bytes,unsigned int * chunks_perslab)633fb269897Sdormando unsigned int slabs_available_chunks(const unsigned int id, bool *mem_flag,
634*ee461d11Sdormando uint64_t *total_bytes, unsigned int *chunks_perslab) {
635fb269897Sdormando unsigned int ret;
636fb269897Sdormando slabclass_t *p;
637fb269897Sdormando
638fb269897Sdormando pthread_mutex_lock(&slabs_lock);
639fb269897Sdormando p = &slabclass[id];
640fb269897Sdormando ret = p->sl_curr;
641e708513aSdormando if (mem_flag != NULL)
642fb269897Sdormando *mem_flag = mem_limit_reached;
643*ee461d11Sdormando if (total_bytes != NULL)
644*ee461d11Sdormando *total_bytes = p->requested;
645d6e96467Sdormando if (chunks_perslab != NULL)
646d6e96467Sdormando *chunks_perslab = p->perslab;
647fb269897Sdormando pthread_mutex_unlock(&slabs_lock);
648fb269897Sdormando return ret;
649fb269897Sdormando }
650fb269897Sdormando
/* Wakes the rebalance thread when a reassignment request is queued. */
static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
/* Run flags; cleared by stop_slab_maintenance_thread() at shutdown. */
static volatile int do_run_slab_thread = 1;
static volatile int do_run_slab_rebalance_thread = 1;

/* How many chunks slab_rebalance_move() inspects per slabs_lock hold;
 * overridable via the MEMCACHED_SLAB_BULK_CHECK environment variable. */
#define DEFAULT_SLAB_BULK_CHECK 1
int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
65710698baeSdormando
slab_rebalance_start(void)65810698baeSdormando static int slab_rebalance_start(void) {
65910698baeSdormando slabclass_t *s_cls;
66010698baeSdormando int no_go = 0;
66110698baeSdormando
66210698baeSdormando pthread_mutex_lock(&slabs_lock);
66310698baeSdormando
66410698baeSdormando if (slab_rebal.s_clsid < POWER_SMALLEST ||
66510698baeSdormando slab_rebal.s_clsid > power_largest ||
666d6e96467Sdormando slab_rebal.d_clsid < SLAB_GLOBAL_PAGE_POOL ||
6678c1c18edSdormando slab_rebal.d_clsid > power_largest ||
6688c1c18edSdormando slab_rebal.s_clsid == slab_rebal.d_clsid)
66910698baeSdormando no_go = -2;
67010698baeSdormando
67110698baeSdormando s_cls = &slabclass[slab_rebal.s_clsid];
67210698baeSdormando
673423b9fd4Sdormando if (!grow_slab_list(slab_rebal.d_clsid)) {
67410698baeSdormando no_go = -1;
67510698baeSdormando }
67610698baeSdormando
67710698baeSdormando if (s_cls->slabs < 2)
67810698baeSdormando no_go = -3;
67910698baeSdormando
68010698baeSdormando if (no_go != 0) {
68110698baeSdormando pthread_mutex_unlock(&slabs_lock);
68210698baeSdormando return no_go; /* Should use a wrapper function... */
68310698baeSdormando }
68410698baeSdormando
685d5185f9cSdormando /* Always kill the first available slab page as it is most likely to
686d5185f9cSdormando * contain the oldest items
687d5185f9cSdormando */
688d5185f9cSdormando slab_rebal.slab_start = s_cls->slab_list[0];
68910698baeSdormando slab_rebal.slab_end = (char *)slab_rebal.slab_start +
69010698baeSdormando (s_cls->size * s_cls->perslab);
69110698baeSdormando slab_rebal.slab_pos = slab_rebal.slab_start;
69210698baeSdormando slab_rebal.done = 0;
69310698baeSdormando
69410698baeSdormando /* Also tells do_item_get to search for items in this slab */
69510698baeSdormando slab_rebalance_signal = 2;
69610698baeSdormando
69710698baeSdormando if (settings.verbose > 1) {
69810698baeSdormando fprintf(stderr, "Started a slab rebalance\n");
69910698baeSdormando }
70010698baeSdormando
70110698baeSdormando pthread_mutex_unlock(&slabs_lock);
70210698baeSdormando
70310698baeSdormando STATS_LOCK();
704cb01d504Sdormando stats_state.slab_reassign_running = true;
70510698baeSdormando STATS_UNLOCK();
70610698baeSdormando
70710698baeSdormando return 0;
70810698baeSdormando }
70910698baeSdormando
710b1debc4cSdormando /* CALLED WITH slabs_lock HELD */
slab_rebalance_alloc(const size_t size,unsigned int id)711b1debc4cSdormando static void *slab_rebalance_alloc(const size_t size, unsigned int id) {
712b1debc4cSdormando slabclass_t *s_cls;
713b1debc4cSdormando s_cls = &slabclass[slab_rebal.s_clsid];
714b1debc4cSdormando int x;
715b1debc4cSdormando item *new_it = NULL;
716b1debc4cSdormando
717b1debc4cSdormando for (x = 0; x < s_cls->perslab; x++) {
718b1debc4cSdormando new_it = do_slabs_alloc(size, id, NULL, SLABS_ALLOC_NO_NEWPAGE);
719b1debc4cSdormando /* check that memory isn't within the range to clear */
720b1debc4cSdormando if (new_it == NULL) {
721b1debc4cSdormando break;
722b1debc4cSdormando }
723b1debc4cSdormando if ((void *)new_it >= slab_rebal.slab_start
724b1debc4cSdormando && (void *)new_it < slab_rebal.slab_end) {
725b1debc4cSdormando /* Pulled something we intend to free. Mark it as freed since
726b1debc4cSdormando * we've already done the work of unlinking it from the freelist.
727b1debc4cSdormando */
728b1debc4cSdormando s_cls->requested -= size;
729b1debc4cSdormando new_it->refcount = 0;
730b1debc4cSdormando new_it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
731*ee461d11Sdormando #ifdef DEBUG_SLAB_MOVER
732*ee461d11Sdormando memcpy(ITEM_key(new_it), "deadbeef", 8);
733*ee461d11Sdormando #endif
734b1debc4cSdormando new_it = NULL;
735b1debc4cSdormando slab_rebal.inline_reclaim++;
736b1debc4cSdormando } else {
737b1debc4cSdormando break;
738b1debc4cSdormando }
739b1debc4cSdormando }
740b1debc4cSdormando return new_it;
741b1debc4cSdormando }
742b1debc4cSdormando
743*ee461d11Sdormando /* CALLED WITH slabs_lock HELD */
744*ee461d11Sdormando /* detatches item/chunk from freelist. */
slab_rebalance_cut_free(slabclass_t * s_cls,item * it)745*ee461d11Sdormando static void slab_rebalance_cut_free(slabclass_t *s_cls, item *it) {
746*ee461d11Sdormando /* Ensure this was on the freelist and nothing else. */
747*ee461d11Sdormando assert(it->it_flags == ITEM_SLABBED);
748*ee461d11Sdormando if (s_cls->slots == it) {
749*ee461d11Sdormando s_cls->slots = it->next;
750*ee461d11Sdormando }
751*ee461d11Sdormando if (it->next) it->next->prev = it->prev;
752*ee461d11Sdormando if (it->prev) it->prev->next = it->next;
753*ee461d11Sdormando s_cls->sl_curr--;
754*ee461d11Sdormando }
755*ee461d11Sdormando
/* Outcome of examining one chunk in slab_rebalance_move():
 * PASS      - already cleared, nothing to do
 * FROM_SLAB - chunk was free; cut from the freelist
 * FROM_LRU  - chunk holds a live item; try to rescue or evict it
 * BUSY      - item in transit (refcounted or mid-upload); retry later
 * LOCKED    - could not take the item lock; retry later */
enum move_status {
    MOVE_PASS=0, MOVE_FROM_SLAB, MOVE_FROM_LRU, MOVE_BUSY, MOVE_LOCKED
};
759f4983b20Sdormando
76069d1c699Sdormando /* refcount == 0 is safe since nobody can incr while item_lock is held.
76110698baeSdormando * refcount != 0 is impossible since flags/etc can be modified in other
76210698baeSdormando * threads. instead, note we found a busy one and bail. logic in do_item_get
76310698baeSdormando * will prevent busy items from continuing to be busy
76462415f16Sdormando * NOTE: This is checking it_flags outside of an item lock. I believe this
76562415f16Sdormando * works since it_flags is 8 bits, and we're only ever comparing a single bit
76662415f16Sdormando * regardless. ITEM_SLABBED bit will always be correct since we're holding the
76762415f16Sdormando * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
76862415f16Sdormando * item having ITEM_SLABBED removed, and the key hasn't been added to the item
76962415f16Sdormando * yet. The memory barrier from the slabs lock should order the key write and the
77062415f16Sdormando * flags to the item?
77162415f16Sdormando * If ITEM_LINKED did exist and was just removed, but we still see it, that's
77262415f16Sdormando * still safe since it will have a valid key, which we then lock, and then
77362415f16Sdormando * recheck everything.
77462415f16Sdormando * This may not be safe on all platforms; If not, slabs_alloc() will need to
77562415f16Sdormando * seed the item key while holding slabs_lock.
77610698baeSdormando */
/* Advance the evacuation of the page being reassigned: examine up to
 * slab_bulk_check chunks starting at slab_rebal.slab_pos, rescuing live
 * items into other memory of the same class (or evicting them) and cutting
 * free chunks out of the freelist. Returns the number of chunks found busy
 * this pass (caller backs off briefly when nonzero). Sets slab_rebal.done
 * once every chunk in the page has been cleared. */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    uint32_t hv;
    void *hold_lock;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        /* Per-chunk lock state; hv/hold_lock are only set on the
         * MOVE_FROM_LRU path and consumed after the switch below. */
        hv = 0;
        hold_lock = NULL;
        item *it = slab_rebal.slab_pos;
        item_chunk *ch = NULL;
        status = MOVE_PASS;
        if (it->it_flags & ITEM_CHUNK) {
            /* This chunk is a chained part of a larger item. */
            ch = (item_chunk *) it;
            /* Instead, we use the head chunk to find the item and effectively
             * lock the entire structure. If a chunk has ITEM_CHUNK flag, its
             * head cannot be slabbed, so the normal routine is safe. */
            it = ch->head;
            assert(it->it_flags & ITEM_CHUNKED);
        }

        /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
         * the chunk for move. Only these two flags should exist.
         */
        if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
            /* ITEM_SLABBED can only be added/removed under the slabs_lock */
            if (it->it_flags & ITEM_SLABBED) {
                assert(ch == NULL);
                slab_rebalance_cut_free(s_cls, it);
                status = MOVE_FROM_SLAB;
            } else if ((it->it_flags & ITEM_LINKED) != 0) {
                /* If it doesn't have ITEM_SLABBED, the item could be in any
                 * state on its way to being freed or written to. If no
                 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
                 * and have the key written to it already.
                 */
                hv = hash(ITEM_key(it), it->nkey);
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    status = MOVE_LOCKED;
                } else {
                    refcount = refcount_incr(&it->refcount);
                    if (refcount == 2) { /* item is linked but not busy */
                        /* Double check ITEM_LINKED flag here, since we're
                         * past a memory barrier from the mutex. */
                        if ((it->it_flags & ITEM_LINKED) != 0) {
                            status = MOVE_FROM_LRU;
                        } else {
                            /* refcount == 1 + !ITEM_LINKED means the item is being
                             * uploaded to, or was just unlinked but hasn't been freed
                             * yet. Let it bleed off on its own and try again later */
                            status = MOVE_BUSY;
                        }
                    } else {
                        if (settings.verbose > 2) {
                            fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                        }
                        status = MOVE_BUSY;
                    }
                    /* Item lock must be held while modifying refcount */
                    if (status == MOVE_BUSY) {
                        refcount_decr(&it->refcount);
                        item_trylock_unlock(hold_lock);
                    }
                }
            } else {
                /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
                 * busy and wait for item to complete its upload. */
                status = MOVE_BUSY;
            }
        }

        int save_item = 0;
        item *new_it = NULL;
        size_t ntotal = 0;
        switch (status) {
            case MOVE_FROM_LRU:
                /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
                 * We only need to hold the slabs_lock while initially looking
                 * at an item, and at this point we have an exclusive refcount
                 * (2) + the item is locked. Drop slabs lock, drop item to
                 * refcount 1 (just our own, then fall through and wipe it
                 */
                /* Check if expired or flushed */
                ntotal = ITEM_ntotal(it);
                /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
                if (ch == NULL && (it->it_flags & ITEM_CHUNKED)) {
                    /* Chunked should be identical to non-chunked, except we need
                     * to swap out ntotal for the head-chunk-total. */
                    ntotal = s_cls->size;
                }
                if ((it->exptime != 0 && it->exptime < current_time)
                    || item_is_flushed(it)) {
                    /* Expired, don't save. */
                    save_item = 0;
                } else if (ch == NULL &&
                        (new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
                    /* Not a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else if (ch != NULL &&
                        (new_it = slab_rebalance_alloc(s_cls->size, slab_rebal.s_clsid)) == NULL) {
                    /* Is a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else {
                    /* Was whatever it was, and we have memory for it. */
                    save_item = 1;
                }
                pthread_mutex_unlock(&slabs_lock);
                unsigned int requested_adjust = 0;
                if (save_item) {
                    if (ch == NULL) {
                        assert((new_it->it_flags & ITEM_CHUNKED) == 0);
                        /* if free memory, memcpy. clear prev/next/h_bucket */
                        memcpy(new_it, it, ntotal);
                        new_it->prev = 0;
                        new_it->next = 0;
                        new_it->h_next = 0;
                        /* These are definitely required. else fails assert */
                        new_it->it_flags &= ~ITEM_LINKED;
                        new_it->refcount = 0;
                        do_item_replace(it, new_it, hv);
                        /* Need to walk the chunks and repoint head */
                        if (new_it->it_flags & ITEM_CHUNKED) {
                            item_chunk *fch = (item_chunk *) ITEM_data(new_it);
                            fch->next->prev = fch;
                            while (fch) {
                                fch->head = new_it;
                                fch = fch->next;
                            }
                        }
                        it->refcount = 0;
                        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                        slab_rebal.rescues++;
                        requested_adjust = ntotal;
                    } else {
                        item_chunk *nch = (item_chunk *) new_it;
                        /* Chunks always have head chunk (the main it) */
                        ch->prev->next = nch;
                        if (ch->next)
                            ch->next->prev = nch;
                        memcpy(nch, ch, ch->used + sizeof(item_chunk));
                        ch->refcount = 0;
                        ch->it_flags = ITEM_SLABBED|ITEM_FETCHED;
                        slab_rebal.chunk_rescues++;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key((item *)ch), "deadbeef", 8);
#endif
                        refcount_decr(&it->refcount);
                        requested_adjust = s_cls->size;
                    }
                } else {
                    /* restore ntotal in case we tried saving a head chunk. */
                    ntotal = ITEM_ntotal(it);
                    do_item_unlink(it, hv);
                    slabs_free(it, ntotal, slab_rebal.s_clsid);
                    /* Swing around again later to remove it from the freelist. */
                    slab_rebal.busy_items++;
                    was_busy++;
                }
                item_trylock_unlock(hold_lock);
                pthread_mutex_lock(&slabs_lock);
                /* Always remove the ntotal, as we added it in during
                 * do_slabs_alloc() when copying the item.
                 */
                s_cls->requested -= requested_adjust;
                break;
            case MOVE_FROM_SLAB:
                it->refcount = 0;
                it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                break;
            case MOVE_BUSY:
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            STATS_LOCK();
            stats.slab_reassign_busy_items += slab_rebal.busy_items;
            STATS_UNLOCK();
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);

    return was_busy;
}
99410698baeSdormando
slab_rebalance_finish(void)99510698baeSdormando static void slab_rebalance_finish(void) {
99610698baeSdormando slabclass_t *s_cls;
99710698baeSdormando slabclass_t *d_cls;
998d5185f9cSdormando int x;
9996ee8daefSdormando uint32_t rescues;
10008fa54f7eSdormando uint32_t evictions_nomem;
1001b1debc4cSdormando uint32_t inline_reclaim;
1002*ee461d11Sdormando uint32_t chunk_rescues;
100310698baeSdormando
100410698baeSdormando pthread_mutex_lock(&slabs_lock);
100510698baeSdormando
100610698baeSdormando s_cls = &slabclass[slab_rebal.s_clsid];
100710698baeSdormando d_cls = &slabclass[slab_rebal.d_clsid];
100810698baeSdormando
1009a836eabcSdormando #ifdef DEBUG_SLAB_MOVER
1010a836eabcSdormando /* If the algorithm is broken, live items can sneak in. */
1011a836eabcSdormando slab_rebal.slab_pos = slab_rebal.slab_start;
1012a836eabcSdormando while (1) {
1013a836eabcSdormando item *it = slab_rebal.slab_pos;
1014186509c2Sdormando assert(it->it_flags == (ITEM_SLABBED|ITEM_FETCHED));
1015a836eabcSdormando assert(memcmp(ITEM_key(it), "deadbeef", 8) == 0);
1016186509c2Sdormando it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
1017a836eabcSdormando slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
1018a836eabcSdormando if (slab_rebal.slab_pos >= slab_rebal.slab_end)
1019a836eabcSdormando break;
1020a836eabcSdormando }
1021a836eabcSdormando #endif
1022a836eabcSdormando
1023d5185f9cSdormando /* At this point the stolen slab is completely clear.
1024d5185f9cSdormando * We always kill the "first"/"oldest" slab page in the slab_list, so
1025d5185f9cSdormando * shuffle the page list backwards and decrement.
1026d5185f9cSdormando */
1027fa51ad84Sdormando s_cls->slabs--;
1028d5185f9cSdormando for (x = 0; x < s_cls->slabs; x++) {
1029d5185f9cSdormando s_cls->slab_list[x] = s_cls->slab_list[x+1];
1030d5185f9cSdormando }
103110698baeSdormando
103210698baeSdormando d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
1033d6e96467Sdormando /* Don't need to split the page into chunks if we're just storing it */
1034d6e96467Sdormando if (slab_rebal.d_clsid > SLAB_GLOBAL_PAGE_POOL) {
1035a836eabcSdormando memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);
1036845a4fe1Sdormando split_slab_page_into_freelist(slab_rebal.slab_start,
1037845a4fe1Sdormando slab_rebal.d_clsid);
103831541b37Sdormando } else if (slab_rebal.d_clsid == SLAB_GLOBAL_PAGE_POOL) {
103931541b37Sdormando /* mem_malloc'ed might be higher than mem_limit. */
104031541b37Sdormando memory_release();
1041d6e96467Sdormando }
104210698baeSdormando
104310698baeSdormando slab_rebal.done = 0;
104410698baeSdormando slab_rebal.s_clsid = 0;
104510698baeSdormando slab_rebal.d_clsid = 0;
104610698baeSdormando slab_rebal.slab_start = NULL;
104710698baeSdormando slab_rebal.slab_end = NULL;
104810698baeSdormando slab_rebal.slab_pos = NULL;
10498fa54f7eSdormando evictions_nomem = slab_rebal.evictions_nomem;
1050b1debc4cSdormando inline_reclaim = slab_rebal.inline_reclaim;
10516ee8daefSdormando rescues = slab_rebal.rescues;
1052*ee461d11Sdormando chunk_rescues = slab_rebal.chunk_rescues;
10538fa54f7eSdormando slab_rebal.evictions_nomem = 0;
1054b1debc4cSdormando slab_rebal.inline_reclaim = 0;
10556ee8daefSdormando slab_rebal.rescues = 0;
105610698baeSdormando
105710698baeSdormando slab_rebalance_signal = 0;
105810698baeSdormando
105910698baeSdormando pthread_mutex_unlock(&slabs_lock);
106010698baeSdormando
106110698baeSdormando STATS_LOCK();
106210698baeSdormando stats.slabs_moved++;
10636ee8daefSdormando stats.slab_reassign_rescues += rescues;
10648fa54f7eSdormando stats.slab_reassign_evictions_nomem += evictions_nomem;
1065b1debc4cSdormando stats.slab_reassign_inline_reclaim += inline_reclaim;
1066*ee461d11Sdormando stats.slab_reassign_chunk_rescues += chunk_rescues;
1067cb01d504Sdormando stats_state.slab_reassign_running = false;
106810698baeSdormando STATS_UNLOCK();
106910698baeSdormando
107010698baeSdormando if (settings.verbose > 1) {
107110698baeSdormando fprintf(stderr, "finished a slab move\n");
107210698baeSdormando }
107310698baeSdormando }
107410698baeSdormando
/* Slab mover thread.
 * Sits waiting for a condition to jump off and shovel some memory about
 */
static void *slab_rebalance_thread(void *arg) {
    int was_busy = 0;
    /* So we first pass into cond_wait with the mutex held */
    mutex_lock(&slabs_rebalance_lock);

    while (do_run_slab_rebalance_thread) {
        if (slab_rebalance_signal == 1) {
            /* signal == 1: a reassignment was just requested; set it up.
             * On failure, drop the signal so we go back to waiting. */
            if (slab_rebalance_start() < 0) {
                /* Handle errors with more specifity as required. */
                slab_rebalance_signal = 0;
            }

            was_busy = 0;
        } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
            /* signal == 2: a move is in progress; push it along one bulk. */
            was_busy = slab_rebalance_move();
        }

        if (slab_rebal.done) {
            slab_rebalance_finish();
        } else if (was_busy) {
            /* Stuck waiting for some items to unlock, so slow down a bit
             * to give them a chance to free up */
            usleep(50);
        }

        if (slab_rebalance_signal == 0) {
            /* always hold this lock while we're running */
            pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
        }
    }
    return NULL;
}
111010698baeSdormando
1111c2e0023dSdormando /* Iterate at most once through the slab classes and pick a "random" source.
1112c2e0023dSdormando * I like this better than calling rand() since rand() is slow enough that we
1113c2e0023dSdormando * can just check all of the classes once instead.
1114c2e0023dSdormando */
slabs_reassign_pick_any(int dst)1115c2e0023dSdormando static int slabs_reassign_pick_any(int dst) {
1116c2e0023dSdormando static int cur = POWER_SMALLEST - 1;
1117c2e0023dSdormando int tries = power_largest - POWER_SMALLEST + 1;
1118c2e0023dSdormando for (; tries > 0; tries--) {
1119c2e0023dSdormando cur++;
1120c2e0023dSdormando if (cur > power_largest)
1121c2e0023dSdormando cur = POWER_SMALLEST;
1122c2e0023dSdormando if (cur == dst)
1123c2e0023dSdormando continue;
1124c2e0023dSdormando if (slabclass[cur].slabs > 1) {
1125c2e0023dSdormando return cur;
1126c2e0023dSdormando }
1127c2e0023dSdormando }
1128c2e0023dSdormando return -1;
1129c2e0023dSdormando }
1130c2e0023dSdormando
do_slabs_reassign(int src,int dst)113110698baeSdormando static enum reassign_result_type do_slabs_reassign(int src, int dst) {
113210698baeSdormando if (slab_rebalance_signal != 0)
113310698baeSdormando return REASSIGN_RUNNING;
113410698baeSdormando
11358c1c18edSdormando if (src == dst)
11368c1c18edSdormando return REASSIGN_SRC_DST_SAME;
11378c1c18edSdormando
1138c2e0023dSdormando /* Special indicator to choose ourselves. */
1139c2e0023dSdormando if (src == -1) {
1140c2e0023dSdormando src = slabs_reassign_pick_any(dst);
1141c2e0023dSdormando /* TODO: If we end up back at -1, return a new error type */
1142c2e0023dSdormando }
1143c2e0023dSdormando
114410698baeSdormando if (src < POWER_SMALLEST || src > power_largest ||
1145d6e96467Sdormando dst < SLAB_GLOBAL_PAGE_POOL || dst > power_largest)
114610698baeSdormando return REASSIGN_BADCLASS;
114710698baeSdormando
114810698baeSdormando if (slabclass[src].slabs < 2)
114910698baeSdormando return REASSIGN_NOSPARE;
115010698baeSdormando
115110698baeSdormando slab_rebal.s_clsid = src;
115210698baeSdormando slab_rebal.d_clsid = dst;
115310698baeSdormando
115410698baeSdormando slab_rebalance_signal = 1;
115557a9856aSdormando pthread_cond_signal(&slab_rebalance_cond);
115610698baeSdormando
115710698baeSdormando return REASSIGN_OK;
115810698baeSdormando }
115910698baeSdormando
slabs_reassign(int src,int dst)116010698baeSdormando enum reassign_result_type slabs_reassign(int src, int dst) {
116110698baeSdormando enum reassign_result_type ret;
11623b2dc737Sdormando if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
11633b2dc737Sdormando return REASSIGN_RUNNING;
11643b2dc737Sdormando }
116510698baeSdormando ret = do_slabs_reassign(src, dst);
11663b2dc737Sdormando pthread_mutex_unlock(&slabs_rebalance_lock);
116710698baeSdormando return ret;
116810698baeSdormando }
116910698baeSdormando
/* If we hold this lock, rebalancer can't wake up or move */
/* Block the rebalance thread; pair with slabs_rebalancer_resume(). */
void slabs_rebalancer_pause(void) {
    pthread_mutex_lock(&slabs_rebalance_lock);
}
11741c94e12cSdormando
/* Release the lock taken by slabs_rebalancer_pause(). */
void slabs_rebalancer_resume(void) {
    pthread_mutex_unlock(&slabs_rebalance_lock);
}
11781c94e12cSdormando
117957a9856aSdormando static pthread_t rebalance_tid;
118010698baeSdormando
start_slab_maintenance_thread(void)118110698baeSdormando int start_slab_maintenance_thread(void) {
118210698baeSdormando int ret;
118310698baeSdormando slab_rebalance_signal = 0;
118410698baeSdormando slab_rebal.slab_start = NULL;
118510698baeSdormando char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
118610698baeSdormando if (env != NULL) {
118710698baeSdormando slab_bulk_check = atoi(env);
118810698baeSdormando if (slab_bulk_check == 0) {
118910698baeSdormando slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
119010698baeSdormando }
119110698baeSdormando }
119257a9856aSdormando
119357a9856aSdormando if (pthread_cond_init(&slab_rebalance_cond, NULL) != 0) {
119457a9856aSdormando fprintf(stderr, "Can't intiialize rebalance condition\n");
119557a9856aSdormando return -1;
119657a9856aSdormando }
11973b2dc737Sdormando pthread_mutex_init(&slabs_rebalance_lock, NULL);
119857a9856aSdormando
119957a9856aSdormando if ((ret = pthread_create(&rebalance_tid, NULL,
120057a9856aSdormando slab_rebalance_thread, NULL)) != 0) {
120157a9856aSdormando fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
120210698baeSdormando return -1;
120310698baeSdormando }
120410698baeSdormando return 0;
120510698baeSdormando }
120610698baeSdormando
/* The maintenance thread is on a sleep/loop cycle, so it should join after a
 * short wait */
void stop_slab_maintenance_thread(void) {
    mutex_lock(&slabs_rebalance_lock);
    do_run_slab_thread = 0;
    do_run_slab_rebalance_thread = 0;
    /* Wake the thread so it notices the cleared run flag and exits. */
    pthread_cond_signal(&slab_rebalance_cond);
    pthread_mutex_unlock(&slabs_rebalance_lock);

    /* Wait for the maintenance thread to stop */
    pthread_join(rebalance_tid, NULL);
}
1219