//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
#pragma once

#ifndef ROCKSDB_LITE

#include <assert.h>
#include <list>
#include <vector>

#ifdef OS_LINUX
#include <sys/mman.h>
#endif

#include "rocksdb/env.h"
#include "util/mutexlock.h"

namespace ROCKSDB_NAMESPACE {

// HashTable<T, Hash, Equal>
//
// Traditional hash table implementations with synchronization built on top
// don't perform very well in multi-core scenarios. This is an implementation
// designed for multi-core scenarios with high lock contention.
//
//                         |<-------- alpha ------------->|
//               Buckets   Collision list
//          ---- +----+    +---+---+--- ...... ---+---+---+
//         /     |    |--->|   |   |              |   |   |
//        /      +----+    +---+---+--- ...... ---+---+---+
//       /       |    |
// Locks/        +----+
// +--+/         .    .
// |  |          .    .
// +--+          .    .
// |  |          .    .
// +--+          .    .
// |  |          .    .
// +--+          .    .
//     \         +----+
//      \        |    |
//       \       +----+
//        \      |    |
//         \---- +----+
//
// The lock contention is spread over an array of locks. This helps improve
// concurrent access. The spine is designed for a certain capacity and load
// factor. When the capacity planning is done correctly we can expect
// O(load_factor), effectively O(1), insert, access and remove time.
//
// A micro benchmark on a debug build gives a rate of about 0.5 million/sec
// each for insert, erase and lookup running in parallel (about 1.5 million
// ops/sec in total). If the blocks were of 4 KB, the hash table can support
// a virtual throughput of 6 GB/s.
//
// T      Object type (contains both key and value)
// Hash   Function that returns a hash from type T
// Equal  Returns whether two objects are equal
//        (We need an explicit equal for pointer types)
//
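// Example usage (an illustrative sketch; the Record type and the functors
// below are hypothetical and not part of this header):
//
//   struct Record {
//     uint64_t key;
//     std::string value;
//   };
//   struct RecordHash {
//     uint64_t operator()(const Record& r) const { return r.key; }
//   };
//   struct RecordEqual {
//     bool operator()(const Record& lhs, const Record& rhs) const {
//       return lhs.key == rhs.key;
//     }
//   };
//
//   HashTable<Record, RecordHash, RecordEqual> table;
//   table.Insert({/*key=*/1, /*value=*/"one"});
//   Record removed;
//   table.Erase({/*key=*/1, /*value=*/""}, &removed);  // Equal matches by key
//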
template <class T, class Hash, class Equal>
class HashTable {
 public:
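  // capacity     Expected number of elements the table is sized for
  // load_factor  Desired average number of elements per bucket
  //              (nbuckets = capacity / load_factor)
  // nlocks       Number of reader/writer locks striped across the buckets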
  explicit HashTable(const size_t capacity = 1024 * 1024,
                     const float load_factor = 2.0, const uint32_t nlocks = 256)
      : nbuckets_(
            static_cast<uint32_t>(load_factor ? capacity / load_factor : 0)),
        nlocks_(nlocks) {
    // pre-conditions
    assert(capacity);
    assert(load_factor);
    assert(nbuckets_);
    assert(nlocks_);

    buckets_.reset(new Bucket[nbuckets_]);
#ifdef OS_LINUX
    mlock(buckets_.get(), nbuckets_ * sizeof(Bucket));
#endif

    // initialize locks
    locks_.reset(new port::RWMutex[nlocks_]);
#ifdef OS_LINUX
    mlock(locks_.get(), nlocks_ * sizeof(port::RWMutex));
#endif

    // post-conditions
    assert(buckets_);
    assert(locks_);
  }

  virtual ~HashTable() { AssertEmptyBuckets(); }

  //
  // Insert given record to hash table
  //
  bool Insert(const T& t) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    WriteLock _(&locks_[lock_idx]);
    auto& bucket = buckets_[bucket_idx];
    return Insert(&bucket, t);
  }

  // Lookup the hash table
  //
  // Please note that on a successful lookup the read lock is handed back to
  // the caller through ret_lock. This is because the caller owns the data,
  // and should hold the read lock for as long as it operates on the data,
  // releasing it (ReadUnlock) when done.
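  //
  // Typical calling pattern (an illustrative sketch; `table` and `key` are
  // placeholders):
  //
  //   T result;
  //   port::RWMutex* lock = nullptr;
  //   if (table.Find(key, &result, &lock)) {
  //     // ... use result while the read lock is held ...
  //     lock->ReadUnlock();
  //   }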
  bool Find(const T& t, T* ret, port::RWMutex** ret_lock) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    port::RWMutex& lock = locks_[lock_idx];
    lock.ReadLock();

    auto& bucket = buckets_[bucket_idx];
    if (Find(&bucket, t, ret)) {
      *ret_lock = &lock;
      return true;
    }

    lock.ReadUnlock();
    return false;
  }

  //
  // Erase a given key from the hash table
  //
  bool Erase(const T& t, T* ret) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    WriteLock _(&locks_[lock_idx]);

    auto& bucket = buckets_[bucket_idx];
    return Erase(&bucket, t, ret);
  }

  // Fetch the mutex associated with a key
  // This call is used when the lock needs to be held for a given piece of
  // data over an extended period of time.
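  //
  // Typical use (an illustrative sketch; `table` and `record` are
  // placeholders):
  //
  //   port::RWMutex* mutex = table.GetMutex(record);
  //   mutex->ReadLock();
  //   // ... operate on data associated with `record` ...
  //   mutex->ReadUnlock();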
  port::RWMutex* GetMutex(const T& t) {
    const uint64_t h = Hash()(t);
    const uint32_t bucket_idx = h % nbuckets_;
    const uint32_t lock_idx = bucket_idx % nlocks_;

    return &locks_[lock_idx];
  }

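  //
  // Clear the hash table; fn is invoked on every stored element before the
  // buckets are emptied (e.g. to release any memory owned by T)
  //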
  void Clear(void (*fn)(T)) {
    for (uint32_t i = 0; i < nbuckets_; ++i) {
      const uint32_t lock_idx = i % nlocks_;
      WriteLock _(&locks_[lock_idx]);
      for (auto& t : buckets_[i].list_) {
        (*fn)(t);
      }
      buckets_[i].list_.clear();
    }
  }

 protected:
  // Models a bucket of keys that hash to the same bucket number
  struct Bucket {
    std::list<T> list_;
  };

  // Substitute for std::find with a custom comparison operator
  typename std::list<T>::iterator Find(std::list<T>* list, const T& t) {
    for (auto it = list->begin(); it != list->end(); ++it) {
      if (Equal()(*it, t)) {
        return it;
      }
    }
    return list->end();
  }

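  // Insert t into the bucket's collision list; fails if an equal key is
  // already present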
  bool Insert(Bucket* bucket, const T& t) {
    // Check if the key already exists
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      return false;
    }

    // insert to bucket
    bucket->list_.push_back(t);
    return true;
  }

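  // Look up t in the bucket; on a match, copy the stored element into *ret
  // (if ret is non-null)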
  bool Find(Bucket* bucket, const T& t, T* ret) {
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      if (ret) {
        *ret = *it;
      }
      return true;
    }
    return false;
  }

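  // Remove t from the bucket; on a match, copy the removed element into *ret
  // (if ret is non-null)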
  bool Erase(Bucket* bucket, const T& t, T* ret) {
    auto it = Find(&bucket->list_, t);
    if (it != bucket->list_.end()) {
      if (ret) {
        *ret = *it;
      }

      bucket->list_.erase(it);
      return true;
    }
    return false;
  }

  // assert that all buckets are empty
  void AssertEmptyBuckets() {
#ifndef NDEBUG
    for (size_t i = 0; i < nbuckets_; ++i) {
      WriteLock _(&locks_[i % nlocks_]);
      assert(buckets_[i].list_.empty());
    }
#endif
  }

  const uint32_t nbuckets_;                 // No. of buckets in the spine
  std::unique_ptr<Bucket[]> buckets_;       // Spine of the hash buckets
  const uint32_t nlocks_;                   // No. of locks
  std::unique_ptr<port::RWMutex[]> locks_;  // Granular locks
};

}  // namespace ROCKSDB_NAMESPACE

#endif