1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /*
3 * Slabs memory allocation, based on powers-of-N. Slabs are up to 1MB in size
4 * and are divided into chunks. The chunk sizes start off at the size of the
5 * "item" structure plus space for a small key and value. They increase by
6 * a multiplier factor from there, up to half the maximum slab size. The last
7 * slab size is always 1MB, since that's the maximum item size allowed by the
8 * memcached protocol.
9 */
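/*
 * Illustrative sketch of the resulting class layout (values are assumptions,
 * not taken from this file; they depend on settings.chunk_size, the size of
 * the item header and the growth factor passed to slabs_init). With a
 * roughly 48-byte item header, a 48-byte minimum data size and the commonly
 * used factor of 1.25, the first few classes come out near:
 *
 *   class 1:  96 bytes per chunk
 *   class 2: 120 bytes per chunk
 *   class 3: 152 bytes per chunk (rounded up to CHUNK_ALIGN_BYTES)
 *   ...
 *   last class: settings.slab_chunk_size_max
 *
 * Each class then fits settings.slab_page_size / chunk_size chunks per page.
 */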
10 #include "memcached.h"
11 #include <sys/stat.h>
12 #include <sys/socket.h>
13 #include <sys/resource.h>
14 #include <fcntl.h>
15 #include <netinet/in.h>
16 #include <errno.h>
17 #include <stdlib.h>
18 #include <stdio.h>
19 #include <string.h>
20 #include <signal.h>
21 #include <assert.h>
22 #include <pthread.h>
23
24 //#define DEBUG_SLAB_MOVER
25 /* powers-of-N allocation structures */
26
27 typedef struct {
28 unsigned int size; /* sizes of items */
29 unsigned int perslab; /* how many items per slab */
30
31 void *slots; /* list of item ptrs */
32 unsigned int sl_curr; /* total free items in list */
33
34 unsigned int slabs; /* how many slabs were allocated for this class */
35
36 void **slab_list; /* array of slab pointers */
37 unsigned int list_size; /* size of prev array */
38
39 size_t requested; /* The number of requested bytes */
40 } slabclass_t;
41
42 static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
43 static size_t mem_limit = 0;
44 static size_t mem_malloced = 0;
45 /* Whether the memory limit has been hit at least once. Used as a hint to decide
46  * when to early-wake the LRU maintenance thread. */
47 static bool mem_limit_reached = false;
48 static int power_largest;
49
50 static void *mem_base = NULL;
51 static void *mem_current = NULL;
52 static size_t mem_avail = 0;
53
54 /**
55 * Access to the slab allocator is protected by this lock
56 */
57 static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
58 static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;
59
60 /*
61 * Forward Declarations
62 */
63 static int do_slabs_newslab(const unsigned int id);
64 static void *memory_allocate(size_t size);
65 static void do_slabs_free(void *ptr, const size_t size, unsigned int id);
66
67 /* Preallocate as many slab pages as possible (called from slabs_init)
68 on start-up, so users don't get confusing out-of-memory errors when
69 they do have free (in-slab) space, but no space to make new slabs.
70 if maxslabs is 18 (POWER_LARGEST - POWER_SMALLEST + 1), then all
71 slab types can be made. if max memory is less than 18 MB, only the
72 smaller ones will be made. */
73 static void slabs_preallocate (const unsigned int maxslabs);
74
75 /*
76 * Figures out which slab class (chunk size) is required to store an item of
77 * a given size.
78 *
79 * Given object size, return id to use when allocating/freeing memory for object
80 * 0 means error: can't store such a large object
81 */
82
83 unsigned int slabs_clsid(const size_t size) {
84 int res = POWER_SMALLEST;
85
86 if (size == 0 || size > settings.item_size_max)
87 return 0;
88 while (size > slabclass[res].size)
89 if (res++ == power_largest) /* won't fit in the biggest slab */
90 return power_largest;
91 return res;
92 }
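/*
 * Usage sketch (hypothetical sizes, assuming a class layout like the one
 * sketched at the top of the file):
 *
 *   unsigned int id = slabs_clsid(100);  // smallest class with size >= 100
 *   if (id == 0) {
 *       // size was 0 or larger than settings.item_size_max: not storable
 *   }
 *
 * Requests that exceed every class size but are still <= item_size_max fall
 * out of the loop above and are reported as power_largest (chunked storage).
 */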
93
94 /**
95 * Determines the chunk sizes and initializes the slab class descriptors
96 * accordingly.
97 */
98 void slabs_init(const size_t limit, const double factor, const bool prealloc, const uint32_t *slab_sizes) {
99 int i = POWER_SMALLEST - 1;
100 unsigned int size = sizeof(item) + settings.chunk_size;
101
102 mem_limit = limit;
103
104 if (prealloc) {
105 /* Allocate everything in a big chunk with malloc */
106 mem_base = malloc(mem_limit);
107 if (mem_base != NULL) {
108 mem_current = mem_base;
109 mem_avail = mem_limit;
110 } else {
111 fprintf(stderr, "Warning: Failed to allocate requested memory in"
112 " one large chunk.\nWill allocate in smaller chunks\n");
113 }
114 }
115
116 memset(slabclass, 0, sizeof(slabclass));
117
118 while (++i < MAX_NUMBER_OF_SLAB_CLASSES-1) {
119 if (slab_sizes != NULL) {
120 if (slab_sizes[i-1] == 0)
121 break;
122 size = slab_sizes[i-1];
123 } else if (size >= settings.slab_chunk_size_max / factor) {
124 break;
125 }
126 /* Make sure items are always n-byte aligned */
127 if (size % CHUNK_ALIGN_BYTES)
128 size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
129
130 slabclass[i].size = size;
131 slabclass[i].perslab = settings.slab_page_size / slabclass[i].size;
132 if (slab_sizes == NULL)
133 size *= factor;
134 if (settings.verbose > 1) {
135 fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
136 i, slabclass[i].size, slabclass[i].perslab);
137 }
138 }
139
140 power_largest = i;
141 slabclass[power_largest].size = settings.slab_chunk_size_max;
142 slabclass[power_largest].perslab = settings.slab_page_size / settings.slab_chunk_size_max;
143 if (settings.verbose > 1) {
144 fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
145 i, slabclass[i].size, slabclass[i].perslab);
146 }
147
148 /* for the test suite: faking of how much we've already malloc'd */
149 {
150 char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
151 if (t_initial_malloc) {
152 mem_malloced = (size_t)atol(t_initial_malloc);
153 }
154
155 }
156
157 if (prealloc) {
158 slabs_preallocate(power_largest);
159 }
160 }
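/*
 * Call sketch -- how a caller might bring the allocator up; the real
 * invocation lives in memcached.c and its arguments may differ:
 *
 *   // 64MB cache, 1.25 growth factor, no preallocation, factor-derived sizes
 *   slabs_init((size_t)64 * 1024 * 1024, 1.25, false, NULL);
 *
 * Passing a non-NULL slab_sizes array (terminated by a 0 entry) bypasses the
 * factor-based progression and uses the given per-class chunk sizes instead.
 */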
161
162 static void slabs_preallocate (const unsigned int maxslabs) {
163 int i;
164 unsigned int prealloc = 0;
165
166 /* pre-allocate a 1MB slab in every size class so people don't get
167 confused by non-intuitive "SERVER_ERROR out of memory"
168 messages. this is the most common question on the mailing
169 list. if you really don't want this, you can rebuild without
170 these three lines. */
171
172 for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
173 if (++prealloc > maxslabs)
174 return;
175 if (do_slabs_newslab(i) == 0) {
176 fprintf(stderr, "Error while preallocating slab memory!\n"
177 "If using -L or other prealloc options, max memory must be "
178 "at least %d megabytes.\n", power_largest);
179 exit(1);
180 }
181 }
182
183 }
184
185 static int grow_slab_list (const unsigned int id) {
186 slabclass_t *p = &slabclass[id];
187 if (p->slabs == p->list_size) {
188 size_t new_size = (p->list_size != 0) ? p->list_size * 2 : 16;
189 void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
190 if (new_list == 0) return 0;
191 p->list_size = new_size;
192 p->slab_list = new_list;
193 }
194 return 1;
195 }
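/*
 * The slab_list array grows geometrically: 16 entries at first, then 32, 64,
 * and so on, keeping page appends amortized O(1). On realloc failure the old
 * list is left untouched and 0 is returned to the caller.
 */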
196
197 static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
198 slabclass_t *p = &slabclass[id];
199 int x;
200 for (x = 0; x < p->perslab; x++) {
201 do_slabs_free(ptr, 0, id);
202 ptr += p->size;
203 }
204 }
205
206 /* Fast LIFO stack of spare pages: pushed and popped at the tail of slab_list */
207 static void *get_page_from_global_pool(void) {
208 slabclass_t *p = &slabclass[SLAB_GLOBAL_PAGE_POOL];
209 if (p->slabs < 1) {
210 return NULL;
211 }
212 char *ret = p->slab_list[p->slabs - 1];
213 p->slabs--;
214 return ret;
215 }
216
217 static int do_slabs_newslab(const unsigned int id) {
218 slabclass_t *p = &slabclass[id];
219 slabclass_t *g = &slabclass[SLAB_GLOBAL_PAGE_POOL];
220 int len = settings.slab_reassign ? settings.item_size_max
221 : p->size * p->perslab;
222 char *ptr;
223
224 if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0
225 && g->slabs == 0)) {
226 mem_limit_reached = true;
227 MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
228 return 0;
229 }
230
231 if ((grow_slab_list(id) == 0) ||
232 (((ptr = get_page_from_global_pool()) == NULL) &&
233 ((ptr = memory_allocate((size_t)len)) == 0))) {
234
235 MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
236 return 0;
237 }
238
239 memset(ptr, 0, (size_t)len);
240 split_slab_page_into_freelist(ptr, id);
241
242 p->slab_list[p->slabs++] = ptr;
243 MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);
244
245 return 1;
246 }
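/*
 * Note: with slab_reassign enabled every page is allocated at
 * settings.item_size_max bytes (1MB by default) regardless of class, which is
 * what lets whole pages later move between classes; otherwise the page is
 * sized to exactly p->size * p->perslab for this class.
 */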
247
248 /* This calculation ends up adding sizeof(void *) to the item size. */
249 static void *do_slabs_alloc_chunked(const size_t size, slabclass_t *p, unsigned int id) {
250 void *ret = NULL;
251 item *it = NULL;
252 int x;
253 int csize = p->size - sizeof(item_chunk);
254 unsigned int chunks_req = size / csize;
255 if (size % csize != 0)
256 chunks_req++;
257 while (p->sl_curr < chunks_req) {
258 if (do_slabs_newslab(id) == 0)
259 break;
260 }
261
262 if (p->sl_curr >= chunks_req) {
263 item_chunk *chunk = NULL;
264
265 /* Configure the head item in the chain. */
266 it = (item *)p->slots;
267 p->slots = it->next;
268 if (it->next) it->next->prev = 0;
269
270 /* Squirrel away the "top chunk" into h_next for now */
271 it->h_next = (item *)p->slots;
272 assert(it->h_next != 0);
273 chunk = (item_chunk *) it->h_next;
274
275 /* roll down the chunks, marking them as such. */
276 for (x = 0; x < chunks_req-1; x++) {
277 chunk->it_flags &= ~ITEM_SLABBED;
278 chunk->it_flags |= ITEM_CHUNK;
279 /* Chunks always have a direct reference to the head item */
280 chunk->head = it;
281 chunk->size = p->size - sizeof(item_chunk);
282 chunk->used = 0;
283 chunk = chunk->next;
284 }
285
286 /* The final "next" is now the top of the slab freelist */
287 p->slots = chunk;
288 if (chunk && chunk->prev) {
289 /* Disconnect the final chunk from the chain */
290 chunk->prev->next = 0;
291 chunk->prev = 0;
292 }
293
294 it->it_flags &= ~ITEM_SLABBED;
295 it->it_flags |= ITEM_CHUNKED;
296 it->refcount = 1;
297 p->sl_curr -= chunks_req;
298 ret = (void *)it;
299 } else {
300 ret = NULL;
301 }
302
303 return ret;
304 }
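/*
 * Worked example with hypothetical numbers: if p->size is 16384 and
 * sizeof(item_chunk) is 48, then csize is 16336. A request of size 40000
 * gives 40000 / 16336 == 2 with a remainder, so chunks_req becomes 3 -- the
 * head item plus two chained chunks, all taken off the class freelist.
 */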
305
306 /*@null@*/
307 static void *do_slabs_alloc(const size_t size, unsigned int id, uint64_t *total_bytes,
308 unsigned int flags) {
309 slabclass_t *p;
310 void *ret = NULL;
311 item *it = NULL;
312
313 if (id < POWER_SMALLEST || id > power_largest) {
314 MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
315 return NULL;
316 }
317 p = &slabclass[id];
318 assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);
319 if (total_bytes != NULL) {
320 *total_bytes = p->requested;
321 }
322
323 if (size <= p->size) {
324 /* fail unless we have space at the end of a recently allocated page,
325 we have something on our freelist, or we could allocate a new page */
326 if (p->sl_curr == 0 && flags != SLABS_ALLOC_NO_NEWPAGE) {
327 do_slabs_newslab(id);
328 }
329
330 if (p->sl_curr != 0) {
331 /* return off our freelist */
332 it = (item *)p->slots;
333 p->slots = it->next;
334 if (it->next) it->next->prev = 0;
335 /* Kill flag and initialize refcount here for lock safety in slab
336 * mover's freeness detection. */
337 it->it_flags &= ~ITEM_SLABBED;
338 it->refcount = 1;
339 p->sl_curr--;
340 ret = (void *)it;
341 } else {
342 ret = NULL;
343 }
344 } else {
345 /* Dealing with a chunked item. */
346 ret = do_slabs_alloc_chunked(size, p, id);
347 }
348
349 if (ret) {
350 p->requested += size;
351 MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
352 } else {
353 MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
354 }
355
356 return ret;
357 }
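/*
 * Summary of the paths above: a request that fits in one chunk is served from
 * the class freelist (allocating a fresh page first if the freelist is empty
 * and flags allow it); anything larger than p->size goes through the chunked
 * allocator. Callers must hold slabs_lock -- the public slabs_alloc() wrapper
 * further down takes care of that.
 */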
358
359 static void do_slabs_free_chunked(item *it, const size_t size, unsigned int id,
360 slabclass_t *p) {
361 item_chunk *chunk = (item_chunk *) ITEM_data(it);
362 size_t realsize = size;
363 while (chunk) {
364 realsize += sizeof(item_chunk);
365 chunk = chunk->next;
366 }
367 chunk = (item_chunk *) ITEM_data(it);
368 unsigned int chunks_found = 1;
369
370 it->it_flags = ITEM_SLABBED;
371 it->slabs_clsid = 0;
372 it->prev = 0;
373 it->next = (item *) chunk->next;
374 assert(it->next);
375 /* top chunk should already point back to head */
376 assert(it->next && (void*)it->next->prev == (void*)chunk);
377 chunk = chunk->next;
378 chunk->prev = (item_chunk *)it;
379
380 while (chunk) {
381 assert(chunk->it_flags == ITEM_CHUNK);
382 chunk->it_flags = ITEM_SLABBED;
383 chunk->slabs_clsid = 0;
384 chunks_found++;
385 if (chunk->next) {
386 chunk = chunk->next;
387 } else {
388 break;
389 }
390 }
391 /* must have had nothing hanging off of the final chunk */
392 assert(chunk && chunk->next == 0);
393 /* Tail chunk, link the freelist here. */
394 chunk->next = p->slots;
395 if (chunk->next) chunk->next->prev = chunk;
396
397 p->slots = it;
398 p->sl_curr += chunks_found;
399 p->requested -= size;
400
401 return;
402 }
403
404
405 static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
406 slabclass_t *p;
407 item *it;
408
409 assert(id >= POWER_SMALLEST && id <= power_largest);
410 if (id < POWER_SMALLEST || id > power_largest)
411 return;
412
413 MEMCACHED_SLABS_FREE(size, id, ptr);
414 p = &slabclass[id];
415
416 it = (item *)ptr;
417 if ((it->it_flags & ITEM_CHUNKED) == 0) {
418 it->it_flags = ITEM_SLABBED;
419 it->slabs_clsid = 0;
420 it->prev = 0;
421 it->next = p->slots;
422 if (it->next) it->next->prev = it;
423 p->slots = it;
424
425 p->sl_curr++;
426 p->requested -= size;
427 } else {
428 do_slabs_free_chunked(it, size, id, p);
429 }
430 return;
431 }
432
433 static int nz_strcmp(int nzlength, const char *nz, const char *z) {
434 int zlength=strlen(z);
435 return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1;
436 }
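/*
 * nz_strcmp compares a length-delimited (not necessarily NUL-terminated)
 * token against a C string literal. For example, if the client sent
 * "stats slabs", then nz_strcmp(5, stat_type, "slabs") returns 0, while a
 * 6-character token such as "slabsx" would not match because the lengths
 * differ.
 */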
437
438 bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
439 bool ret = true;
440
441 if (add_stats != NULL) {
442 if (!stat_type) {
443 /* prepare general statistics for the engine */
444 STATS_LOCK();
445 APPEND_STAT("bytes", "%llu", (unsigned long long)stats_state.curr_bytes);
446 APPEND_STAT("curr_items", "%llu", (unsigned long long)stats_state.curr_items);
447 APPEND_STAT("total_items", "%llu", (unsigned long long)stats.total_items);
448 STATS_UNLOCK();
449 if (settings.slab_automove > 0) {
450 pthread_mutex_lock(&slabs_lock);
451 APPEND_STAT("slab_global_page_pool", "%u", slabclass[SLAB_GLOBAL_PAGE_POOL].slabs);
452 pthread_mutex_unlock(&slabs_lock);
453 }
454 item_stats_totals(add_stats, c);
455 } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
456 item_stats(add_stats, c);
457 } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
458 slabs_stats(add_stats, c);
459 } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
460 item_stats_sizes(add_stats, c);
461 } else if (nz_strcmp(nkey, stat_type, "sizes_enable") == 0) {
462 item_stats_sizes_enable(add_stats, c);
463 } else if (nz_strcmp(nkey, stat_type, "sizes_disable") == 0) {
464 item_stats_sizes_disable(add_stats, c);
465 } else {
466 ret = false;
467 }
468 } else {
469 ret = false;
470 }
471
472 return ret;
473 }
474
475 /*@null@*/
476 static void do_slabs_stats(ADD_STAT add_stats, void *c) {
477 int i, total;
478 /* Get the per-thread stats which contain some interesting aggregates */
479 struct thread_stats thread_stats;
480 threadlocal_stats_aggregate(&thread_stats);
481
482 total = 0;
483 for(i = POWER_SMALLEST; i <= power_largest; i++) {
484 slabclass_t *p = &slabclass[i];
485 if (p->slabs != 0) {
486 uint32_t perslab, slabs;
487 slabs = p->slabs;
488 perslab = p->perslab;
489
490 char key_str[STAT_KEY_LEN];
491 char val_str[STAT_VAL_LEN];
492 int klen = 0, vlen = 0;
493
494 APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
495 APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
496 APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
497 APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
498 APPEND_NUM_STAT(i, "used_chunks", "%u",
499 slabs*perslab - p->sl_curr);
500 APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
501 /* Stat is dead, but displaying zero instead of removing it. */
502 APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
503 APPEND_NUM_STAT(i, "mem_requested", "%llu",
504 (unsigned long long)p->requested);
505 APPEND_NUM_STAT(i, "get_hits", "%llu",
506 (unsigned long long)thread_stats.slab_stats[i].get_hits);
507 APPEND_NUM_STAT(i, "cmd_set", "%llu",
508 (unsigned long long)thread_stats.slab_stats[i].set_cmds);
509 APPEND_NUM_STAT(i, "delete_hits", "%llu",
510 (unsigned long long)thread_stats.slab_stats[i].delete_hits);
511 APPEND_NUM_STAT(i, "incr_hits", "%llu",
512 (unsigned long long)thread_stats.slab_stats[i].incr_hits);
513 APPEND_NUM_STAT(i, "decr_hits", "%llu",
514 (unsigned long long)thread_stats.slab_stats[i].decr_hits);
515 APPEND_NUM_STAT(i, "cas_hits", "%llu",
516 (unsigned long long)thread_stats.slab_stats[i].cas_hits);
517 APPEND_NUM_STAT(i, "cas_badval", "%llu",
518 (unsigned long long)thread_stats.slab_stats[i].cas_badval);
519 APPEND_NUM_STAT(i, "touch_hits", "%llu",
520 (unsigned long long)thread_stats.slab_stats[i].touch_hits);
521 total++;
522 }
523 }
524
525 /* add overall slab stats and append terminator */
526
527 APPEND_STAT("active_slabs", "%d", total);
528 APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
529 add_stats(NULL, 0, NULL, 0, c);
530 }
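/*
 * The per-class keys are prefixed with the class id, so a "stats slabs"
 * reply contains lines along these lines (values purely illustrative):
 *
 *   STAT 1:chunk_size 96
 *   STAT 1:chunks_per_page 10922
 *   STAT 1:used_chunks 3
 *   STAT active_slabs 1
 *   STAT total_malloced 1048576
 *   END
 */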
531
532 static void *memory_allocate(size_t size) {
533 void *ret;
534
535 if (mem_base == NULL) {
536 /* We are not using a preallocated large memory chunk */
537 ret = malloc(size);
538 } else {
539 ret = mem_current;
540
541 if (size > mem_avail) {
542 return NULL;
543 }
544
545 /* mem_current pointer _must_ be aligned!!! */
546 if (size % CHUNK_ALIGN_BYTES) {
547 size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
548 }
549
550 mem_current = ((char*)mem_current) + size;
551 if (size < mem_avail) {
552 mem_avail -= size;
553 } else {
554 mem_avail = 0;
555 }
556 }
557 mem_malloced += size;
558
559 return ret;
560 }
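/*
 * Alignment example: with CHUNK_ALIGN_BYTES == 8, a 100-byte request served
 * from the preallocated arena is rounded up to 104 bytes before mem_current
 * advances, keeping the next allocation 8-byte aligned. The rounding happens
 * only on this path; the plain malloc() path relies on malloc's own
 * alignment guarantees.
 */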
561
562 /* Must only be used if all pages are item_size_max */
563 static void memory_release() {
564 void *p = NULL;
565 if (mem_base != NULL)
566 return;
567
568 if (!settings.slab_reassign)
569 return;
570
571 while (mem_malloced > mem_limit &&
572 (p = get_page_from_global_pool()) != NULL) {
573 free(p);
574 mem_malloced -= settings.item_size_max;
575 }
576 }
577
578 void *slabs_alloc(size_t size, unsigned int id, uint64_t *total_bytes,
579 unsigned int flags) {
580 void *ret;
581
582 pthread_mutex_lock(&slabs_lock);
583 ret = do_slabs_alloc(size, id, total_bytes, flags);
584 pthread_mutex_unlock(&slabs_lock);
585 return ret;
586 }
587
588 void slabs_free(void *ptr, size_t size, unsigned int id) {
589 pthread_mutex_lock(&slabs_lock);
590 do_slabs_free(ptr, size, id);
591 pthread_mutex_unlock(&slabs_lock);
592 }
593
594 void slabs_stats(ADD_STAT add_stats, void *c) {
595 pthread_mutex_lock(&slabs_lock);
596 do_slabs_stats(add_stats, c);
597 pthread_mutex_unlock(&slabs_lock);
598 }
599
600 static bool do_slabs_adjust_mem_limit(size_t new_mem_limit) {
601 /* Cannot adjust memory limit at runtime if prealloc'ed */
602 if (mem_base != NULL)
603 return false;
604 settings.maxbytes = new_mem_limit;
605 mem_limit = new_mem_limit;
606 mem_limit_reached = false; /* Will reset on next alloc */
607 memory_release(); /* free what might already be in the global pool */
608 return true;
609 }
610
611 bool slabs_adjust_mem_limit(size_t new_mem_limit) {
612 bool ret;
613 pthread_mutex_lock(&slabs_lock);
614 ret = do_slabs_adjust_mem_limit(new_mem_limit);
615 pthread_mutex_unlock(&slabs_lock);
616 return ret;
617 }
618
619 void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
620 {
621 pthread_mutex_lock(&slabs_lock);
622 slabclass_t *p;
623 if (id < POWER_SMALLEST || id > power_largest) {
624 fprintf(stderr, "Internal error! Invalid slab class\n");
625 abort();
626 }
627
628 p = &slabclass[id];
629 p->requested = p->requested - old + ntotal;
630 pthread_mutex_unlock(&slabs_lock);
631 }
632
633 unsigned int slabs_available_chunks(const unsigned int id, bool *mem_flag,
634 uint64_t *total_bytes, unsigned int *chunks_perslab) {
635 unsigned int ret;
636 slabclass_t *p;
637
638 pthread_mutex_lock(&slabs_lock);
639 p = &slabclass[id];
640 ret = p->sl_curr;
641 if (mem_flag != NULL)
642 *mem_flag = mem_limit_reached;
643 if (total_bytes != NULL)
644 *total_bytes = p->requested;
645 if (chunks_perslab != NULL)
646 *chunks_perslab = p->perslab;
647 pthread_mutex_unlock(&slabs_lock);
648 return ret;
649 }
650
651 static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
652 static volatile int do_run_slab_thread = 1;
653 static volatile int do_run_slab_rebalance_thread = 1;
654
655 #define DEFAULT_SLAB_BULK_CHECK 1
656 int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
657
658 static int slab_rebalance_start(void) {
659 slabclass_t *s_cls;
660 int no_go = 0;
661
662 pthread_mutex_lock(&slabs_lock);
663
664 if (slab_rebal.s_clsid < POWER_SMALLEST ||
665 slab_rebal.s_clsid > power_largest ||
666 slab_rebal.d_clsid < SLAB_GLOBAL_PAGE_POOL ||
667 slab_rebal.d_clsid > power_largest ||
668 slab_rebal.s_clsid == slab_rebal.d_clsid)
669 no_go = -2;
670
671 s_cls = &slabclass[slab_rebal.s_clsid];
672
673 if (!grow_slab_list(slab_rebal.d_clsid)) {
674 no_go = -1;
675 }
676
677 if (s_cls->slabs < 2)
678 no_go = -3;
679
680 if (no_go != 0) {
681 pthread_mutex_unlock(&slabs_lock);
682 return no_go; /* Should use a wrapper function... */
683 }
684
685 /* Always kill the first available slab page as it is most likely to
686 * contain the oldest items
687 */
688 slab_rebal.slab_start = s_cls->slab_list[0];
689 slab_rebal.slab_end = (char *)slab_rebal.slab_start +
690 (s_cls->size * s_cls->perslab);
691 slab_rebal.slab_pos = slab_rebal.slab_start;
692 slab_rebal.done = 0;
693
694 /* Also tells do_item_get to search for items in this slab */
695 slab_rebalance_signal = 2;
696
697 if (settings.verbose > 1) {
698 fprintf(stderr, "Started a slab rebalance\n");
699 }
700
701 pthread_mutex_unlock(&slabs_lock);
702
703 STATS_LOCK();
704 stats_state.slab_reassign_running = true;
705 STATS_UNLOCK();
706
707 return 0;
708 }
709
710 /* CALLED WITH slabs_lock HELD */
711 static void *slab_rebalance_alloc(const size_t size, unsigned int id) {
712 slabclass_t *s_cls;
713 s_cls = &slabclass[slab_rebal.s_clsid];
714 int x;
715 item *new_it = NULL;
716
717 for (x = 0; x < s_cls->perslab; x++) {
718 new_it = do_slabs_alloc(size, id, NULL, SLABS_ALLOC_NO_NEWPAGE);
719 /* check that memory isn't within the range to clear */
720 if (new_it == NULL) {
721 break;
722 }
723 if ((void *)new_it >= slab_rebal.slab_start
724 && (void *)new_it < slab_rebal.slab_end) {
725 /* Pulled something we intend to free. Mark it as freed since
726 * we've already done the work of unlinking it from the freelist.
727 */
728 s_cls->requested -= size;
729 new_it->refcount = 0;
730 new_it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
731 #ifdef DEBUG_SLAB_MOVER
732 memcpy(ITEM_key(new_it), "deadbeef", 8);
733 #endif
734 new_it = NULL;
735 slab_rebal.inline_reclaim++;
736 } else {
737 break;
738 }
739 }
740 return new_it;
741 }
742
743 /* CALLED WITH slabs_lock HELD */
744 /* detaches item/chunk from freelist. */
745 static void slab_rebalance_cut_free(slabclass_t *s_cls, item *it) {
746 /* Ensure this was on the freelist and nothing else. */
747 assert(it->it_flags == ITEM_SLABBED);
748 if (s_cls->slots == it) {
749 s_cls->slots = it->next;
750 }
751 if (it->next) it->next->prev = it->prev;
752 if (it->prev) it->prev->next = it->next;
753 s_cls->sl_curr--;
754 }
755
756 enum move_status {
757 MOVE_PASS=0, MOVE_FROM_SLAB, MOVE_FROM_LRU, MOVE_BUSY, MOVE_LOCKED
758 };
759
760 /* refcount == 0 is safe since nobody can incr while item_lock is held.
761 * refcount != 0 is impossible since flags/etc can be modified in other
762 * threads. instead, note we found a busy one and bail. logic in do_item_get
763 * will prevent busy items from continuing to be busy
764 * NOTE: This is checking it_flags outside of an item lock. I believe this
765 * works since it_flags is 8 bits, and we're only ever comparing a single bit
766 * regardless. ITEM_SLABBED bit will always be correct since we're holding the
767 * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
768 * item having ITEM_SLABBED removed, and the key hasn't been added to the item
769 * yet. The memory barrier from the slabs lock should order the key write and the
770 * flags to the item?
771 * If ITEM_LINKED did exist and was just removed, but we still see it, that's
772 * still safe since it will have a valid key, which we then lock, and then
773 * recheck everything.
774 * This may not be safe on all platforms; If not, slabs_alloc() will need to
775 * seed the item key while holding slabs_lock.
776 */
777 static int slab_rebalance_move(void) {
778 slabclass_t *s_cls;
779 int x;
780 int was_busy = 0;
781 int refcount = 0;
782 uint32_t hv;
783 void *hold_lock;
784 enum move_status status = MOVE_PASS;
785
786 pthread_mutex_lock(&slabs_lock);
787
788 s_cls = &slabclass[slab_rebal.s_clsid];
789
790 for (x = 0; x < slab_bulk_check; x++) {
791 hv = 0;
792 hold_lock = NULL;
793 item *it = slab_rebal.slab_pos;
794 item_chunk *ch = NULL;
795 status = MOVE_PASS;
796 if (it->it_flags & ITEM_CHUNK) {
797 /* This chunk is a chained part of a larger item. */
798 ch = (item_chunk *) it;
799 /* Instead, we use the head chunk to find the item and effectively
800 * lock the entire structure. If a chunk has ITEM_CHUNK flag, its
801 * head cannot be slabbed, so the normal routine is safe. */
802 it = ch->head;
803 assert(it->it_flags & ITEM_CHUNKED);
804 }
805
806 /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
807 * the chunk for move. Only these two flags should exist.
808 */
809 if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
810 /* ITEM_SLABBED can only be added/removed under the slabs_lock */
811 if (it->it_flags & ITEM_SLABBED) {
812 assert(ch == NULL);
813 slab_rebalance_cut_free(s_cls, it);
814 status = MOVE_FROM_SLAB;
815 } else if ((it->it_flags & ITEM_LINKED) != 0) {
816 /* If it doesn't have ITEM_SLABBED, the item could be in any
817 * state on its way to being freed or written to. If no
818 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
819 * and have the key written to it already.
820 */
821 hv = hash(ITEM_key(it), it->nkey);
822 if ((hold_lock = item_trylock(hv)) == NULL) {
823 status = MOVE_LOCKED;
824 } else {
825 refcount = refcount_incr(&it->refcount);
826 if (refcount == 2) { /* item is linked but not busy */
827 /* Double check ITEM_LINKED flag here, since we're
828 * past a memory barrier from the mutex. */
829 if ((it->it_flags & ITEM_LINKED) != 0) {
830 status = MOVE_FROM_LRU;
831 } else {
832 /* refcount == 1 + !ITEM_LINKED means the item is being
833 * uploaded to, or was just unlinked but hasn't been freed
834 * yet. Let it bleed off on its own and try again later */
835 status = MOVE_BUSY;
836 }
837 } else {
838 if (settings.verbose > 2) {
839 fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
840 it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
841 }
842 status = MOVE_BUSY;
843 }
844 /* Item lock must be held while modifying refcount */
845 if (status == MOVE_BUSY) {
846 refcount_decr(&it->refcount);
847 item_trylock_unlock(hold_lock);
848 }
849 }
850 } else {
851 /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
852 * busy and wait for item to complete its upload. */
853 status = MOVE_BUSY;
854 }
855 }
856
857 int save_item = 0;
858 item *new_it = NULL;
859 size_t ntotal = 0;
860 switch (status) {
861 case MOVE_FROM_LRU:
862 /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
863 * We only need to hold the slabs_lock while initially looking
864 * at an item, and at this point we have an exclusive refcount
865 * (2) + the item is locked. Drop slabs lock, drop item to
866 * refcount 1 (just our own, then fall through and wipe it
867 */
868 /* Check if expired or flushed */
869 ntotal = ITEM_ntotal(it);
870 /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
871 if (ch == NULL && (it->it_flags & ITEM_CHUNKED)) {
872 /* Chunked should be identical to non-chunked, except we need
873 * to swap out ntotal for the head-chunk-total. */
874 ntotal = s_cls->size;
875 }
876 if ((it->exptime != 0 && it->exptime < current_time)
877 || item_is_flushed(it)) {
878 /* Expired, don't save. */
879 save_item = 0;
880 } else if (ch == NULL &&
881 (new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
882 /* Not a chunk of an item, and nomem. */
883 save_item = 0;
884 slab_rebal.evictions_nomem++;
885 } else if (ch != NULL &&
886 (new_it = slab_rebalance_alloc(s_cls->size, slab_rebal.s_clsid)) == NULL) {
887 /* Is a chunk of an item, and nomem. */
888 save_item = 0;
889 slab_rebal.evictions_nomem++;
890 } else {
891 /* Was whatever it was, and we have memory for it. */
892 save_item = 1;
893 }
894 pthread_mutex_unlock(&slabs_lock);
895 unsigned int requested_adjust = 0;
896 if (save_item) {
897 if (ch == NULL) {
898 assert((new_it->it_flags & ITEM_CHUNKED) == 0);
899 /* if free memory, memcpy. clear prev/next/h_bucket */
900 memcpy(new_it, it, ntotal);
901 new_it->prev = 0;
902 new_it->next = 0;
903 new_it->h_next = 0;
904 /* These are definitely required. else fails assert */
905 new_it->it_flags &= ~ITEM_LINKED;
906 new_it->refcount = 0;
907 do_item_replace(it, new_it, hv);
908 /* Need to walk the chunks and repoint head */
909 if (new_it->it_flags & ITEM_CHUNKED) {
910 item_chunk *fch = (item_chunk *) ITEM_data(new_it);
911 fch->next->prev = fch;
912 while (fch) {
913 fch->head = new_it;
914 fch = fch->next;
915 }
916 }
917 it->refcount = 0;
918 it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
919 #ifdef DEBUG_SLAB_MOVER
920 memcpy(ITEM_key(it), "deadbeef", 8);
921 #endif
922 slab_rebal.rescues++;
923 requested_adjust = ntotal;
924 } else {
925 item_chunk *nch = (item_chunk *) new_it;
926 /* Chunks always have head chunk (the main it) */
927 ch->prev->next = nch;
928 if (ch->next)
929 ch->next->prev = nch;
930 memcpy(nch, ch, ch->used + sizeof(item_chunk));
931 ch->refcount = 0;
932 ch->it_flags = ITEM_SLABBED|ITEM_FETCHED;
933 slab_rebal.chunk_rescues++;
934 #ifdef DEBUG_SLAB_MOVER
935 memcpy(ITEM_key((item *)ch), "deadbeef", 8);
936 #endif
937 refcount_decr(&it->refcount);
938 requested_adjust = s_cls->size;
939 }
940 } else {
941 /* restore ntotal in case we tried saving a head chunk. */
942 ntotal = ITEM_ntotal(it);
943 do_item_unlink(it, hv);
944 slabs_free(it, ntotal, slab_rebal.s_clsid);
945 /* Swing around again later to remove it from the freelist. */
946 slab_rebal.busy_items++;
947 was_busy++;
948 }
949 item_trylock_unlock(hold_lock);
950 pthread_mutex_lock(&slabs_lock);
951 /* Always remove the ntotal, as we added it in during
952 * do_slabs_alloc() when copying the item.
953 */
954 s_cls->requested -= requested_adjust;
955 break;
956 case MOVE_FROM_SLAB:
957 it->refcount = 0;
958 it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
959 #ifdef DEBUG_SLAB_MOVER
960 memcpy(ITEM_key(it), "deadbeef", 8);
961 #endif
962 break;
963 case MOVE_BUSY:
964 case MOVE_LOCKED:
965 slab_rebal.busy_items++;
966 was_busy++;
967 break;
968 case MOVE_PASS:
969 break;
970 }
971
972 slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
973 if (slab_rebal.slab_pos >= slab_rebal.slab_end)
974 break;
975 }
976
977 if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
978 /* Some items were busy, start again from the top */
979 if (slab_rebal.busy_items) {
980 slab_rebal.slab_pos = slab_rebal.slab_start;
981 STATS_LOCK();
982 stats.slab_reassign_busy_items += slab_rebal.busy_items;
983 STATS_UNLOCK();
984 slab_rebal.busy_items = 0;
985 } else {
986 slab_rebal.done++;
987 }
988 }
989
990 pthread_mutex_unlock(&slabs_lock);
991
992 return was_busy;
993 }
994
995 static void slab_rebalance_finish(void) {
996 slabclass_t *s_cls;
997 slabclass_t *d_cls;
998 int x;
999 uint32_t rescues;
1000 uint32_t evictions_nomem;
1001 uint32_t inline_reclaim;
1002 uint32_t chunk_rescues;
1003
1004 pthread_mutex_lock(&slabs_lock);
1005
1006 s_cls = &slabclass[slab_rebal.s_clsid];
1007 d_cls = &slabclass[slab_rebal.d_clsid];
1008
1009 #ifdef DEBUG_SLAB_MOVER
1010 /* If the algorithm is broken, live items can sneak in. */
1011 slab_rebal.slab_pos = slab_rebal.slab_start;
1012 while (1) {
1013 item *it = slab_rebal.slab_pos;
1014 assert(it->it_flags == (ITEM_SLABBED|ITEM_FETCHED));
1015 assert(memcmp(ITEM_key(it), "deadbeef", 8) == 0);
1016 it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
1017 slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
1018 if (slab_rebal.slab_pos >= slab_rebal.slab_end)
1019 break;
1020 }
1021 #endif
1022
1023 /* At this point the stolen slab is completely clear.
1024 * We always kill the "first"/"oldest" slab page in the slab_list, so
1025 * shuffle the page list backwards and decrement.
1026 */
1027 s_cls->slabs--;
1028 for (x = 0; x < s_cls->slabs; x++) {
1029 s_cls->slab_list[x] = s_cls->slab_list[x+1];
1030 }
1031
1032 d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
1033 /* Don't need to split the page into chunks if we're just storing it */
1034 if (slab_rebal.d_clsid > SLAB_GLOBAL_PAGE_POOL) {
1035 memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);
1036 split_slab_page_into_freelist(slab_rebal.slab_start,
1037 slab_rebal.d_clsid);
1038 } else if (slab_rebal.d_clsid == SLAB_GLOBAL_PAGE_POOL) {
1039 /* mem_malloc'ed might be higher than mem_limit. */
1040 memory_release();
1041 }
1042
1043 slab_rebal.done = 0;
1044 slab_rebal.s_clsid = 0;
1045 slab_rebal.d_clsid = 0;
1046 slab_rebal.slab_start = NULL;
1047 slab_rebal.slab_end = NULL;
1048 slab_rebal.slab_pos = NULL;
1049 evictions_nomem = slab_rebal.evictions_nomem;
1050 inline_reclaim = slab_rebal.inline_reclaim;
1051 rescues = slab_rebal.rescues;
1052 chunk_rescues = slab_rebal.chunk_rescues;
1053 slab_rebal.evictions_nomem = 0;
1054 slab_rebal.inline_reclaim = 0;
1055 slab_rebal.rescues = 0;
1056
1057 slab_rebalance_signal = 0;
1058
1059 pthread_mutex_unlock(&slabs_lock);
1060
1061 STATS_LOCK();
1062 stats.slabs_moved++;
1063 stats.slab_reassign_rescues += rescues;
1064 stats.slab_reassign_evictions_nomem += evictions_nomem;
1065 stats.slab_reassign_inline_reclaim += inline_reclaim;
1066 stats.slab_reassign_chunk_rescues += chunk_rescues;
1067 stats_state.slab_reassign_running = false;
1068 STATS_UNLOCK();
1069
1070 if (settings.verbose > 1) {
1071 fprintf(stderr, "finished a slab move\n");
1072 }
1073 }
1074
1075 /* Slab mover thread.
1076 * Sits waiting for a condition to jump off and shovel some memory about
1077 */
1078 static void *slab_rebalance_thread(void *arg) {
1079 int was_busy = 0;
1080 /* So we first pass into cond_wait with the mutex held */
1081 mutex_lock(&slabs_rebalance_lock);
1082
1083 while (do_run_slab_rebalance_thread) {
1084 if (slab_rebalance_signal == 1) {
1085 if (slab_rebalance_start() < 0) {
1086 /* Handle errors with more specificity as required. */
1087 slab_rebalance_signal = 0;
1088 }
1089
1090 was_busy = 0;
1091 } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
1092 was_busy = slab_rebalance_move();
1093 }
1094
1095 if (slab_rebal.done) {
1096 slab_rebalance_finish();
1097 } else if (was_busy) {
1098 /* Stuck waiting for some items to unlock, so slow down a bit
1099 * to give them a chance to free up */
1100 usleep(50);
1101 }
1102
1103 if (slab_rebalance_signal == 0) {
1104 /* always hold this lock while we're running */
1105 pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
1106 }
1107 }
1108 return NULL;
1109 }
1110
1111 /* Iterate at most once through the slab classes and pick a "random" source.
1112 * I like this better than calling rand() since rand() is slow enough that we
1113 * can just check all of the classes once instead.
1114 */
1115 static int slabs_reassign_pick_any(int dst) {
1116 static int cur = POWER_SMALLEST - 1;
1117 int tries = power_largest - POWER_SMALLEST + 1;
1118 for (; tries > 0; tries--) {
1119 cur++;
1120 if (cur > power_largest)
1121 cur = POWER_SMALLEST;
1122 if (cur == dst)
1123 continue;
1124 if (slabclass[cur].slabs > 1) {
1125 return cur;
1126 }
1127 }
1128 return -1;
1129 }
1130
1131 static enum reassign_result_type do_slabs_reassign(int src, int dst) {
1132 if (slab_rebalance_signal != 0)
1133 return REASSIGN_RUNNING;
1134
1135 if (src == dst)
1136 return REASSIGN_SRC_DST_SAME;
1137
1138 /* Special indicator to choose ourselves. */
1139 if (src == -1) {
1140 src = slabs_reassign_pick_any(dst);
1141 /* TODO: If we end up back at -1, return a new error type */
1142 }
1143
1144 if (src < POWER_SMALLEST || src > power_largest ||
1145 dst < SLAB_GLOBAL_PAGE_POOL || dst > power_largest)
1146 return REASSIGN_BADCLASS;
1147
1148 if (slabclass[src].slabs < 2)
1149 return REASSIGN_NOSPARE;
1150
1151 slab_rebal.s_clsid = src;
1152 slab_rebal.d_clsid = dst;
1153
1154 slab_rebalance_signal = 1;
1155 pthread_cond_signal(&slab_rebalance_cond);
1156
1157 return REASSIGN_OK;
1158 }
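/*
 * This backs the "slabs reassign <src> <dst>" protocol command. For example,
 * "slabs reassign 5 11" asks the rebalancer to move one page from class 5 to
 * class 11, and a src of -1 lets slabs_reassign_pick_any() choose a donor
 * class automatically.
 */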
1159
1160 enum reassign_result_type slabs_reassign(int src, int dst) {
1161 enum reassign_result_type ret;
1162 if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
1163 return REASSIGN_RUNNING;
1164 }
1165 ret = do_slabs_reassign(src, dst);
1166 pthread_mutex_unlock(&slabs_rebalance_lock);
1167 return ret;
1168 }
1169
1170 /* If we hold this lock, rebalancer can't wake up or move */
1171 void slabs_rebalancer_pause(void) {
1172 pthread_mutex_lock(&slabs_rebalance_lock);
1173 }
1174
1175 void slabs_rebalancer_resume(void) {
1176 pthread_mutex_unlock(&slabs_rebalance_lock);
1177 }
1178
1179 static pthread_t rebalance_tid;
1180
1181 int start_slab_maintenance_thread(void) {
1182 int ret;
1183 slab_rebalance_signal = 0;
1184 slab_rebal.slab_start = NULL;
1185 char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
1186 if (env != NULL) {
1187 slab_bulk_check = atoi(env);
1188 if (slab_bulk_check == 0) {
1189 slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
1190 }
1191 }
1192
1193 if (pthread_cond_init(&slab_rebalance_cond, NULL) != 0) {
1194 fprintf(stderr, "Can't initialize rebalance condition\n");
1195 return -1;
1196 }
1197 pthread_mutex_init(&slabs_rebalance_lock, NULL);
1198
1199 if ((ret = pthread_create(&rebalance_tid, NULL,
1200 slab_rebalance_thread, NULL)) != 0) {
1201 fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
1202 return -1;
1203 }
1204 return 0;
1205 }
1206
1207 /* The maintenance thread is on a sleep/loop cycle, so it should join after a
1208 * short wait */
1209 void stop_slab_maintenance_thread(void) {
1210 mutex_lock(&slabs_rebalance_lock);
1211 do_run_slab_thread = 0;
1212 do_run_slab_rebalance_thread = 0;
1213 pthread_cond_signal(&slab_rebalance_cond);
1214 pthread_mutex_unlock(&slabs_rebalance_lock);
1215
1216 /* Wait for the maintenance thread to stop */
1217 pthread_join(rebalance_tid, NULL);
1218 }
1219