//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
#pragma once

#ifndef ROCKSDB_LITE

#include <assert.h>

#include <list>
#include <memory>
#include <vector>

#ifdef OS_LINUX
#include <sys/mman.h>
#endif

#include "rocksdb/env.h"
#include "util/mutexlock.h"

namespace ROCKSDB_NAMESPACE {

// HashTable<T, Hash, Equal>
//
// Traditional implementations of a hash table with synchronization built on
// top don't perform very well in multi-core scenarios. This is an
// implementation designed for multi-core scenarios with high lock contention.
//
//                         |<-------- alpha ------------->|
//               Buckets                 Collision list
//          ---- +----+    +---+---+--- ...... ---+---+---+
//         /     |    |--->|   |   |              |   |   |
//        /      +----+    +---+---+--- ...... ---+---+---+
//       /       |    |
//   Locks/      +----+
//  +--+/        .    .
//  |  |         .    .
//  +--+         .    .
//  |  |         .    .
//  +--+         .    .
//  |  |         .    .
//  +--+         .    .
//     \         +----+
//      \        |    |
//       \       +----+
//        \      |    |
//         \---- +----+
//
// The lock contention is spread over an array of locks. This helps improve
// concurrent access. The spine is designed for a certain capacity and load
// factor. When the capacity planning is done correctly we can expect
// O(load_factor = 1) insert, access and remove time.
//
// A micro benchmark on a debug build gives about 0.5 million/sec each for
// insert, erase and lookup in parallel (a total of about 1.5 million ops/sec).
// If the blocks were 4K, the hash table could support a virtual throughput of
// 6 GB/s.
//
// T      Object type (contains both key and value)
// Hash   Function that returns a hash from type T
// Equal  Returns whether two objects are equal
//        (We need an explicit equal for pointer types)
//
template <class T, class Hash, class Equal>
class HashTable {
 public:
  explicit HashTable(const size_t capacity = 1024 * 1024,
                     const float load_factor = 2.0, const uint32_t nlocks = 256)
      : nbuckets_(
            static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)),
        nlocks_(nlocks) {
    // pre-conditions
    assert(capacity);
    assert(load_factor);
    assert(nbuckets_);
    assert(nlocks_);

    buckets_.reset(new Bucket[nbuckets_]);
#ifdef OS_LINUX
    mlock(buckets_.get(), nbuckets_ * sizeof(Bucket));
#endif

    // initialize locks
    locks_.reset(new port::RWMutex[nlocks_]);
#ifdef OS_LINUX
    mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex));
#endif

    // post-conditions
    assert(buckets_);
    assert(locks_);
  }

  virtual ~HashTable() { AssertEmptyBuckets(); }

  //
  // Insert the given record into the hash table
  //
  bool Insert(const T& t) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    WriteLock _(&locks_[lock_idx]);
    auto& bucket = buckets_[bucket_idx];
    return Insert(&bucket, t);
  }

  //
  // Lookup hash table
  //
  // Please note that after a successful lookup the read lock is returned to
  // the caller still held (via ret_lock). This is because the caller owns the
  // data, and should hold the read lock for as long as it operates on the
  // data.
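  //
  // A minimal caller-side sketch of this lock discipline (illustrative only;
  // `table`, `key` and `value` are hypothetical names, not part of this
  // header): on a hit the caller must release the returned read lock once it
  // is done with the data; on a miss the lock is already released.
  //
  //   T value;
  //   port::RWMutex* lock = nullptr;
  //   if (table.Find(key, &value, &lock)) {
  //     // ... read `value` while the read lock is held ...
  //     lock->ReadUnlock();
  //   }
  //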
  bool Find(const T& t, T* ret, port::RWMutex** ret_lock) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    port::RWMutex& lock = locks_[lock_idx];
    lock.ReadLock();

    auto& bucket = buckets_[bucket_idx];
    if (Find(&bucket, t, ret)) {
      *ret_lock = &lock;
      return true;
    }

    lock.ReadUnlock();
    return false;
  }

  //
  // Erase a given key from the hash table
  //
  bool Erase(const T& t, T* ret) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    WriteLock _(&locks_[lock_idx]);

    auto& bucket = buckets_[bucket_idx];
    return Erase(&bucket, t, ret);
  }

  // Fetch the mutex associated with a key
  // This call is used to hold the lock for given data for an extended period
  // of time.
  port::RWMutex* GetMutex(const T& t) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    return &locks_[lock_idx];
  }

  void Clear(void (*fn)(T)) {
    for (uint32_t i = 0; i < nbuckets_; ++i) {
      const uint32_t lock_idx = i % nlocks_;
      WriteLock _(&locks_[lock_idx]);
      for (auto& t : buckets_[i].list_) {
        (*fn)(t);
      }
      buckets_[i].list_.clear();
    }
  }

 protected:
  // Models a bucket of keys that hash to the same bucket number
  struct Bucket {
    std::list<T> list_;
  };

  // Substitute for std::find with a custom comparator operator()
  typename std::list<T>::iterator Find(std::list<T>* list, const T& t) {
    for (auto it = list->begin(); it != list->end(); ++it) {
      if (Equal()(*it, t)) {
        return it;
      }
    }
    return list->end();
  }

  bool Insert(Bucket* bucket, const T& t) {
    // Check if the key already exists
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      return false;
    }

    // insert into the bucket
    bucket->list_.push_back(t);
    return true;
  }

  bool Find(Bucket* bucket, const T& t, T* ret) {
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      if (ret) {
        *ret = *it;
      }
      return true;
    }
    return false;
  }

  bool Erase(Bucket* bucket, const T& t, T* ret) {
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      if (ret) {
        *ret = *it;
      }

      bucket->list_.erase(it);
      return true;
    }
    return false;
  }

  // Assert that all buckets are empty
  void AssertEmptyBuckets() {
#ifndef NDEBUG
    for (size_t i = 0; i < nbuckets_; ++i) {
      WriteLock _(&locks_[i % nlocks_]);
      assert(buckets_[i].list_.empty());
    }
#endif
  }

  const uint32_t nbuckets_;                 // No. of buckets in the spine
  std::unique_ptr<Bucket[]> buckets_;       // Spine of the hash buckets
  const uint32_t nlocks_;                   // No. of locks
  std::unique_ptr<port::RWMutex[]> locks_;  // Granular locks
};

}  // namespace ROCKSDB_NAMESPACE

#endif  // ROCKSDB_LITE
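
// A minimal instantiation sketch (illustrative only; `Entry`, `EntryHash` and
// `EntryEqual` are hypothetical caller-side types, not part of this header).
// The Hash functor maps a T to an integer hash and the Equal functor compares
// two objects of type T. Note that in debug builds the destructor asserts all
// buckets are empty, so entries should be erased or cleared before the table
// is destroyed.
//
//   struct Entry {
//     uint64_t key;
//     void* data;
//   };
//   struct EntryHash {
//     uint64_t operator()(const Entry& e) const { return e.key; }
//   };
//   struct EntryEqual {
//     bool operator()(const Entry& lhs, const Entry& rhs) const {
//       return lhs.key == rhs.key;
//     }
//   };
//
//   ROCKSDB_NAMESPACE::HashTable<Entry, EntryHash, EntryEqual> table(
//       /*capacity=*/1024 * 1024, /*load_factor=*/2.0, /*nlocks=*/256);
//   table.Insert(Entry{1, nullptr});
//   Entry erased;
//   table.Erase(Entry{1, nullptr}, &erased);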