1825db0f9SBrad Fitzpatrick /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2825db0f9SBrad Fitzpatrick /*
3825db0f9SBrad Fitzpatrick * Hash table
4825db0f9SBrad Fitzpatrick *
5825db0f9SBrad Fitzpatrick * The hash function used here is by Bob Jenkins, 1996:
6825db0f9SBrad Fitzpatrick * <http://burtleburtle.net/bob/hash/doobs.html>
7825db0f9SBrad Fitzpatrick * "By Bob Jenkins, 1996. [email protected].
8825db0f9SBrad Fitzpatrick * You may use this code any way you wish, private, educational,
9825db0f9SBrad Fitzpatrick * or commercial. It's free."
10825db0f9SBrad Fitzpatrick *
11825db0f9SBrad Fitzpatrick * The rest of the file is licensed under the BSD license. See LICENSE.
12825db0f9SBrad Fitzpatrick */
1356b8339eSSteven Grimm
1456b8339eSSteven Grimm #include "memcached.h"
15825db0f9SBrad Fitzpatrick #include <sys/stat.h>
16825db0f9SBrad Fitzpatrick #include <sys/socket.h>
17825db0f9SBrad Fitzpatrick #include <sys/resource.h>
18e688e97dSNatanael Copa #include <signal.h>
19825db0f9SBrad Fitzpatrick #include <fcntl.h>
20c6975ef4SPaul Lindner #include <netinet/in.h>
21c6975ef4SPaul Lindner #include <errno.h>
22825db0f9SBrad Fitzpatrick #include <stdlib.h>
23825db0f9SBrad Fitzpatrick #include <stdio.h>
24825db0f9SBrad Fitzpatrick #include <string.h>
2518a72ad2SBrad Fitzpatrick #include <assert.h>
267f09e20bSTrond Norbye #include <pthread.h>
277f09e20bSTrond Norbye
287f09e20bSTrond Norbye static pthread_cond_t maintenance_cond = PTHREAD_COND_INITIALIZER;
296af7aa0bSdormando static pthread_mutex_t maintenance_lock = PTHREAD_MUTEX_INITIALIZER;
305d7dc88dSdormando static pthread_mutex_t hash_items_counter_lock = PTHREAD_MUTEX_INITIALIZER;
3186969ea4SBrad Fitzpatrick
32217dcce0SSteven Grimm typedef unsigned long int ub4; /* unsigned 4-byte quantities */
33217dcce0SSteven Grimm typedef unsigned char ub1; /* unsigned 1-byte quantities */
34217dcce0SSteven Grimm
35217dcce0SSteven Grimm /* how many powers of 2's worth of buckets we use */
361c94e12cSdormando unsigned int hashpower = HASHPOWER_DEFAULT;
37217dcce0SSteven Grimm
38217dcce0SSteven Grimm #define hashsize(n) ((ub4)1<<(n))
39217dcce0SSteven Grimm #define hashmask(n) (hashsize(n)-1)
40217dcce0SSteven Grimm
41217dcce0SSteven Grimm /* Main hash table. This is where we look except during expansion. */
42217dcce0SSteven Grimm static item** primary_hashtable = 0;
43217dcce0SSteven Grimm
44217dcce0SSteven Grimm /*
45217dcce0SSteven Grimm * Previous hash table. During expansion, we look here for keys that haven't
46217dcce0SSteven Grimm * been moved over to the primary yet.
47217dcce0SSteven Grimm */
48217dcce0SSteven Grimm static item** old_hashtable = 0;
49217dcce0SSteven Grimm
50217dcce0SSteven Grimm /* Number of items in the hash table. */
5144ef96eeSPaul Lindner static unsigned int hash_items = 0;
52217dcce0SSteven Grimm
53217dcce0SSteven Grimm /* Flag: Are we in the middle of expanding now? */
5444ef96eeSPaul Lindner static bool expanding = false;
551c94e12cSdormando static bool started_expanding = false;
56217dcce0SSteven Grimm
57217dcce0SSteven Grimm /*
58217dcce0SSteven Grimm * During expansion we migrate values with bucket granularity; this is how
59217dcce0SSteven Grimm * far we've gotten so far. Ranges from 0 .. hashsize(hashpower - 1) - 1.
60217dcce0SSteven Grimm */
6144ef96eeSPaul Lindner static unsigned int expand_bucket = 0;
6286969ea4SBrad Fitzpatrick
assoc_init(const int hashtable_init)631db1de38Sdormando void assoc_init(const int hashtable_init) {
641db1de38Sdormando if (hashtable_init) {
651db1de38Sdormando hashpower = hashtable_init;
661db1de38Sdormando }
67984053cdSDustin Sallings primary_hashtable = calloc(hashsize(hashpower), sizeof(void *));
68217dcce0SSteven Grimm if (! primary_hashtable) {
69825db0f9SBrad Fitzpatrick fprintf(stderr, "Failed to init hashtable.\n");
70c8425072SPaul Lindner exit(EXIT_FAILURE);
71825db0f9SBrad Fitzpatrick }
72108e2cd6Sdormando STATS_LOCK();
73*cb01d504Sdormando stats_state.hash_power_level = hashpower;
74*cb01d504Sdormando stats_state.hash_bytes = hashsize(hashpower) * sizeof(void *);
75108e2cd6Sdormando STATS_UNLOCK();
76825db0f9SBrad Fitzpatrick }
7786969ea4SBrad Fitzpatrick
assoc_find(const char * key,const size_t nkey,const uint32_t hv)78bab9acd1Sdormando item *assoc_find(const char *key, const size_t nkey, const uint32_t hv) {
79217dcce0SSteven Grimm item *it;
8044ef96eeSPaul Lindner unsigned int oldbucket;
81217dcce0SSteven Grimm
82217dcce0SSteven Grimm if (expanding &&
83217dcce0SSteven Grimm (oldbucket = (hv & hashmask(hashpower - 1))) >= expand_bucket)
84217dcce0SSteven Grimm {
85217dcce0SSteven Grimm it = old_hashtable[oldbucket];
86217dcce0SSteven Grimm } else {
87217dcce0SSteven Grimm it = primary_hashtable[hv & hashmask(hashpower)];
88217dcce0SSteven Grimm }
8986969ea4SBrad Fitzpatrick
9068957214STrond Norbye item *ret = NULL;
9168957214STrond Norbye int depth = 0;
92f6d334e0SBrad Fitzpatrick while (it) {
9368957214STrond Norbye if ((nkey == it->nkey) && (memcmp(key, ITEM_key(it), nkey) == 0)) {
9468957214STrond Norbye ret = it;
9568957214STrond Norbye break;
96217dcce0SSteven Grimm }
97f6d334e0SBrad Fitzpatrick it = it->h_next;
9868957214STrond Norbye ++depth;
99825db0f9SBrad Fitzpatrick }
10080ec0955STrond Norbye MEMCACHED_ASSOC_FIND(key, nkey, depth);
10168957214STrond Norbye return ret;
102825db0f9SBrad Fitzpatrick }
10386969ea4SBrad Fitzpatrick
1047917af40SBrad Fitzpatrick /* returns the address of the item pointer before the key. if *item == 0,
1057917af40SBrad Fitzpatrick the item wasn't found */
10686969ea4SBrad Fitzpatrick
_hashitem_before(const char * key,const size_t nkey,const uint32_t hv)107bab9acd1Sdormando static item** _hashitem_before (const char *key, const size_t nkey, const uint32_t hv) {
108217dcce0SSteven Grimm item **pos;
10944ef96eeSPaul Lindner unsigned int oldbucket;
11086969ea4SBrad Fitzpatrick
111217dcce0SSteven Grimm if (expanding &&
112217dcce0SSteven Grimm (oldbucket = (hv & hashmask(hashpower - 1))) >= expand_bucket)
113217dcce0SSteven Grimm {
114217dcce0SSteven Grimm pos = &old_hashtable[oldbucket];
115217dcce0SSteven Grimm } else {
116217dcce0SSteven Grimm pos = &primary_hashtable[hv & hashmask(hashpower)];
117217dcce0SSteven Grimm }
118217dcce0SSteven Grimm
119217dcce0SSteven Grimm while (*pos && ((nkey != (*pos)->nkey) || memcmp(key, ITEM_key(*pos), nkey))) {
120f6d334e0SBrad Fitzpatrick pos = &(*pos)->h_next;
121825db0f9SBrad Fitzpatrick }
122825db0f9SBrad Fitzpatrick return pos;
123825db0f9SBrad Fitzpatrick }
12486969ea4SBrad Fitzpatrick
125217dcce0SSteven Grimm /* grows the hashtable to the next power of 2. */
assoc_expand(void)126217dcce0SSteven Grimm static void assoc_expand(void) {
127217dcce0SSteven Grimm old_hashtable = primary_hashtable;
128217dcce0SSteven Grimm
129217dcce0SSteven Grimm primary_hashtable = calloc(hashsize(hashpower + 1), sizeof(void *));
130217dcce0SSteven Grimm if (primary_hashtable) {
131217dcce0SSteven Grimm if (settings.verbose > 1)
132217dcce0SSteven Grimm fprintf(stderr, "Hash table expansion starting\n");
133217dcce0SSteven Grimm hashpower++;
13444ef96eeSPaul Lindner expanding = true;
135217dcce0SSteven Grimm expand_bucket = 0;
136108e2cd6Sdormando STATS_LOCK();
137*cb01d504Sdormando stats_state.hash_power_level = hashpower;
138*cb01d504Sdormando stats_state.hash_bytes += hashsize(hashpower) * sizeof(void *);
139*cb01d504Sdormando stats_state.hash_is_expanding = true;
140108e2cd6Sdormando STATS_UNLOCK();
141217dcce0SSteven Grimm } else {
142217dcce0SSteven Grimm primary_hashtable = old_hashtable;
143217dcce0SSteven Grimm /* Bad news, but we can keep running. */
144217dcce0SSteven Grimm }
145217dcce0SSteven Grimm }
146217dcce0SSteven Grimm
assoc_start_expand(void)1471c94e12cSdormando static void assoc_start_expand(void) {
1481c94e12cSdormando if (started_expanding)
1491c94e12cSdormando return;
150d2676b4aSJason CHAN
1511c94e12cSdormando started_expanding = true;
1521c94e12cSdormando pthread_cond_signal(&maintenance_cond);
1531c94e12cSdormando }
1541c94e12cSdormando
1557917af40SBrad Fitzpatrick /* Note: this isn't an assoc_update. The key must not already exist to call this */
assoc_insert(item * it,const uint32_t hv)156bab9acd1Sdormando int assoc_insert(item *it, const uint32_t hv) {
15744ef96eeSPaul Lindner unsigned int oldbucket;
158217dcce0SSteven Grimm
159bab9acd1Sdormando // assert(assoc_find(ITEM_key(it), it->nkey) == 0); /* shouldn't have duplicately named things defined */
160217dcce0SSteven Grimm
161217dcce0SSteven Grimm if (expanding &&
162217dcce0SSteven Grimm (oldbucket = (hv & hashmask(hashpower - 1))) >= expand_bucket)
163217dcce0SSteven Grimm {
164217dcce0SSteven Grimm it->h_next = old_hashtable[oldbucket];
165217dcce0SSteven Grimm old_hashtable[oldbucket] = it;
166217dcce0SSteven Grimm } else {
167217dcce0SSteven Grimm it->h_next = primary_hashtable[hv & hashmask(hashpower)];
168217dcce0SSteven Grimm primary_hashtable[hv & hashmask(hashpower)] = it;
169217dcce0SSteven Grimm }
170217dcce0SSteven Grimm
1715d7dc88dSdormando pthread_mutex_lock(&hash_items_counter_lock);
172217dcce0SSteven Grimm hash_items++;
173217dcce0SSteven Grimm if (! expanding && hash_items > (hashsize(hashpower) * 3) / 2) {
1741c94e12cSdormando assoc_start_expand();
175217dcce0SSteven Grimm }
1765d7dc88dSdormando pthread_mutex_unlock(&hash_items_counter_lock);
177217dcce0SSteven Grimm
17880ec0955STrond Norbye MEMCACHED_ASSOC_INSERT(ITEM_key(it), it->nkey, hash_items);
179825db0f9SBrad Fitzpatrick return 1;
180825db0f9SBrad Fitzpatrick }
18186969ea4SBrad Fitzpatrick
assoc_delete(const char * key,const size_t nkey,const uint32_t hv)182bab9acd1Sdormando void assoc_delete(const char *key, const size_t nkey, const uint32_t hv) {
183bab9acd1Sdormando item **before = _hashitem_before(key, nkey, hv);
184217dcce0SSteven Grimm
185825db0f9SBrad Fitzpatrick if (*before) {
18668957214STrond Norbye item *nxt;
1875d7dc88dSdormando pthread_mutex_lock(&hash_items_counter_lock);
18868957214STrond Norbye hash_items--;
1895d7dc88dSdormando pthread_mutex_unlock(&hash_items_counter_lock);
19068957214STrond Norbye /* The DTrace probe cannot be triggered as the last instruction
19168957214STrond Norbye * due to possible tail-optimization by the compiler
19268957214STrond Norbye */
19380ec0955STrond Norbye MEMCACHED_ASSOC_DELETE(key, nkey, hash_items);
19468957214STrond Norbye nxt = (*before)->h_next;
195f6d334e0SBrad Fitzpatrick (*before)->h_next = 0; /* probably pointless, but whatever. */
196f6d334e0SBrad Fitzpatrick *before = nxt;
197f6d334e0SBrad Fitzpatrick return;
198825db0f9SBrad Fitzpatrick }
1997917af40SBrad Fitzpatrick /* Note: we never actually get here. the callers don't delete things
200c08383afSBrad Fitzpatrick they can't find. */
201c08383afSBrad Fitzpatrick assert(*before != 0);
202825db0f9SBrad Fitzpatrick }
2037f09e20bSTrond Norbye
2047f09e20bSTrond Norbye
2057f09e20bSTrond Norbye static volatile int do_run_maintenance_thread = 1;
2067f09e20bSTrond Norbye
2077f09e20bSTrond Norbye #define DEFAULT_HASH_BULK_MOVE 1
2087f09e20bSTrond Norbye int hash_bulk_move = DEFAULT_HASH_BULK_MOVE;
2097f09e20bSTrond Norbye
assoc_maintenance_thread(void * arg)2107f09e20bSTrond Norbye static void *assoc_maintenance_thread(void *arg) {
2117f09e20bSTrond Norbye
2126af7aa0bSdormando mutex_lock(&maintenance_lock);
2137f09e20bSTrond Norbye while (do_run_maintenance_thread) {
214dfc5130eSDustin Sallings int ii = 0;
2157f09e20bSTrond Norbye
2166af7aa0bSdormando /* There is only one expansion thread, so no need to global lock. */
217dfc5130eSDustin Sallings for (ii = 0; ii < hash_bulk_move && expanding; ++ii) {
2187f09e20bSTrond Norbye item *it, *next;
2197f09e20bSTrond Norbye int bucket;
220d2676b4aSJason CHAN void *item_lock = NULL;
2217f09e20bSTrond Norbye
222d2676b4aSJason CHAN /* bucket = hv & hashmask(hashpower) =>the bucket of hash table
223d2676b4aSJason CHAN * is the lowest N bits of the hv, and the bucket of item_locks is
224d2676b4aSJason CHAN * also the lowest M bits of hv, and N is greater than M.
225d2676b4aSJason CHAN * So we can process expanding with only one item_lock. cool! */
226d2676b4aSJason CHAN if ((item_lock = item_trylock(expand_bucket))) {
2277f09e20bSTrond Norbye for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
2287f09e20bSTrond Norbye next = it->h_next;
22905ca809cSdormando bucket = hash(ITEM_key(it), it->nkey) & hashmask(hashpower);
2307f09e20bSTrond Norbye it->h_next = primary_hashtable[bucket];
2317f09e20bSTrond Norbye primary_hashtable[bucket] = it;
2327f09e20bSTrond Norbye }
2337f09e20bSTrond Norbye
2347f09e20bSTrond Norbye old_hashtable[expand_bucket] = NULL;
2357f09e20bSTrond Norbye
2367f09e20bSTrond Norbye expand_bucket++;
2377f09e20bSTrond Norbye if (expand_bucket == hashsize(hashpower - 1)) {
2387f09e20bSTrond Norbye expanding = false;
2397f09e20bSTrond Norbye free(old_hashtable);
240108e2cd6Sdormando STATS_LOCK();
241*cb01d504Sdormando stats_state.hash_bytes -= hashsize(hashpower - 1) * sizeof(void *);
242*cb01d504Sdormando stats_state.hash_is_expanding = false;
243108e2cd6Sdormando STATS_UNLOCK();
2447f09e20bSTrond Norbye if (settings.verbose > 1)
2457f09e20bSTrond Norbye fprintf(stderr, "Hash table expansion done\n");
2467f09e20bSTrond Norbye }
247d2676b4aSJason CHAN
248d2676b4aSJason CHAN } else {
2496af7aa0bSdormando usleep(10*1000);
2507f09e20bSTrond Norbye }
2517f09e20bSTrond Norbye
252d2676b4aSJason CHAN if (item_lock) {
253d2676b4aSJason CHAN item_trylock_unlock(item_lock);
254d2676b4aSJason CHAN item_lock = NULL;
255d2676b4aSJason CHAN }
256d2676b4aSJason CHAN }
2571c94e12cSdormando
2587f09e20bSTrond Norbye if (!expanding) {
2597f09e20bSTrond Norbye /* We are done expanding.. just wait for next invocation */
2608963836cSdormando started_expanding = false;
2616af7aa0bSdormando pthread_cond_wait(&maintenance_cond, &maintenance_lock);
2626af7aa0bSdormando /* assoc_expand() swaps out the hash table entirely, so we need
2636af7aa0bSdormando * all threads to not hold any references related to the hash
2646af7aa0bSdormando * table while this happens.
2656af7aa0bSdormando * This is instead of a more complex, possibly slower algorithm to
2666af7aa0bSdormando * allow dynamic hash table expansion without causing significant
2676af7aa0bSdormando * wait times.
2686af7aa0bSdormando */
2696af7aa0bSdormando pause_threads(PAUSE_ALL_THREADS);
2701c94e12cSdormando assoc_expand();
2716af7aa0bSdormando pause_threads(RESUME_ALL_THREADS);
2721c94e12cSdormando }
2737f09e20bSTrond Norbye }
2747f09e20bSTrond Norbye return NULL;
2757f09e20bSTrond Norbye }
2767f09e20bSTrond Norbye
2777f09e20bSTrond Norbye static pthread_t maintenance_tid;
2787f09e20bSTrond Norbye
start_assoc_maintenance_thread()2797f09e20bSTrond Norbye int start_assoc_maintenance_thread() {
2807f09e20bSTrond Norbye int ret;
2817f09e20bSTrond Norbye char *env = getenv("MEMCACHED_HASH_BULK_MOVE");
2827f09e20bSTrond Norbye if (env != NULL) {
2837f09e20bSTrond Norbye hash_bulk_move = atoi(env);
2847f09e20bSTrond Norbye if (hash_bulk_move == 0) {
2857f09e20bSTrond Norbye hash_bulk_move = DEFAULT_HASH_BULK_MOVE;
2867f09e20bSTrond Norbye }
2877f09e20bSTrond Norbye }
2886af7aa0bSdormando pthread_mutex_init(&maintenance_lock, NULL);
2897f09e20bSTrond Norbye if ((ret = pthread_create(&maintenance_tid, NULL,
2907f09e20bSTrond Norbye assoc_maintenance_thread, NULL)) != 0) {
2917f09e20bSTrond Norbye fprintf(stderr, "Can't create thread: %s\n", strerror(ret));
2927f09e20bSTrond Norbye return -1;
2937f09e20bSTrond Norbye }
2947f09e20bSTrond Norbye return 0;
2957f09e20bSTrond Norbye }
2967f09e20bSTrond Norbye
stop_assoc_maintenance_thread()2977f09e20bSTrond Norbye void stop_assoc_maintenance_thread() {
2986af7aa0bSdormando mutex_lock(&maintenance_lock);
2997f09e20bSTrond Norbye do_run_maintenance_thread = 0;
3007f09e20bSTrond Norbye pthread_cond_signal(&maintenance_cond);
3016af7aa0bSdormando mutex_unlock(&maintenance_lock);
3027f09e20bSTrond Norbye
3037f09e20bSTrond Norbye /* Wait for the maintenance thread to stop */
3047f09e20bSTrond Norbye pthread_join(maintenance_tid, NULL);
3057f09e20bSTrond Norbye }
3067f09e20bSTrond Norbye
307