1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 
6 #ifndef ROCKSDB_LITE
7 #include "table/cuckoo/cuckoo_table_builder.h"
8 
9 #include <assert.h>
10 #include <algorithm>
11 #include <limits>
12 #include <string>
13 #include <vector>
14 
15 #include "db/dbformat.h"
16 #include "file/writable_file_writer.h"
17 #include "rocksdb/env.h"
18 #include "rocksdb/table.h"
19 #include "table/block_based/block_builder.h"
20 #include "table/cuckoo/cuckoo_table_factory.h"
21 #include "table/format.h"
22 #include "table/meta_blocks.h"
23 #include "util/autovector.h"
24 #include "util/random.h"
25 #include "util/string_util.h"
26 
27 namespace ROCKSDB_NAMESPACE {
// Keys under which cuckoo-table parameters are stored in the file's
// user-collected properties block (written in Finish() below).
const std::string CuckooTablePropertyNames::kEmptyKey =
      "rocksdb.cuckoo.bucket.empty.key";
const std::string CuckooTablePropertyNames::kNumHashFunc =
      "rocksdb.cuckoo.hash.num";
const std::string CuckooTablePropertyNames::kHashTableSize =
      "rocksdb.cuckoo.hash.size";
const std::string CuckooTablePropertyNames::kValueLength =
      "rocksdb.cuckoo.value.length";
const std::string CuckooTablePropertyNames::kIsLastLevel =
      "rocksdb.cuckoo.file.islastlevel";
const std::string CuckooTablePropertyNames::kCuckooBlockSize =
      "rocksdb.cuckoo.hash.cuckooblocksize";
const std::string CuckooTablePropertyNames::kIdentityAsFirstHash =
      "rocksdb.cuckoo.hash.identityfirst";
// NOTE: "module" here is a historical (mis)spelling of "modulo"; the
// property string is part of the on-disk format and must not change.
const std::string CuckooTablePropertyNames::kUseModuleHash =
      "rocksdb.cuckoo.hash.usemodule";
const std::string CuckooTablePropertyNames::kUserKeyLength =
      "rocksdb.cuckoo.hash.userkeylength";

// Obtained by running echo rocksdb.table.cuckoo | sha1sum
extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;
49 
// Constructs a builder that writes a cuckoo-format SST to `file` (not owned).
// max_hash_table_ratio: target load factor; the table is grown while
//   num_entries / hash_table_size exceeds it.
// max_num_hash_table: upper bound on the number of hash functions tried
//   before giving up with NotSupported.
// max_search_depth: BFS depth limit used by MakeSpaceForKey().
// cuckoo_block_size: number of consecutive buckets probed per hash value
//   (clamped to at least 1).
// use_module_hash: size the table by modulo hashing (table sized in
//   Finish()) instead of power-of-two growth.
// get_slice_hash: hash function of (user key, hash index, table size).
CuckooTableBuilder::CuckooTableBuilder(
    WritableFileWriter* file, double max_hash_table_ratio,
    uint32_t max_num_hash_table, uint32_t max_search_depth,
    const Comparator* user_comparator, uint32_t cuckoo_block_size,
    bool use_module_hash, bool identity_as_first_hash,
    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
    uint32_t column_family_id, const std::string& column_family_name)
    : num_hash_func_(2),  // start with 2 hash functions; grown on collisions
      file_(file),
      max_hash_table_ratio_(max_hash_table_ratio),
      max_num_hash_func_(max_num_hash_table),
      max_search_depth_(max_search_depth),
      cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
      // With modulo hashing the real size is computed in Finish();
      // otherwise start at 2 and double as entries are added.
      hash_table_size_(use_module_hash ? 0 : 2),
      is_last_level_file_(false),
      has_seen_first_key_(false),
      has_seen_first_value_(false),
      key_size_(0),
      value_size_(0),
      num_entries_(0),
      num_values_(0),
      ucomp_(user_comparator),
      use_module_hash_(use_module_hash),
      identity_as_first_hash_(identity_as_first_hash),
      get_slice_hash_(get_slice_hash),
      closed_(false) {
  // Data is in a huge block.
  properties_.num_data_blocks = 1;
  properties_.index_size = 0;
  properties_.filter_size = 0;
  properties_.column_family_id = column_family_id;
  properties_.column_family_name = column_family_name;
}
83 
Add(const Slice & key,const Slice & value)84 void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
85   if (num_entries_ >= kMaxVectorIdx - 1) {
86     status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1");
87     return;
88   }
89   ParsedInternalKey ikey;
90   if (!ParseInternalKey(key, &ikey)) {
91     status_ = Status::Corruption("Unable to parse key into inernal key.");
92     return;
93   }
94   if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) {
95     status_ = Status::NotSupported("Unsupported key type " +
96                                    ToString(ikey.type));
97     return;
98   }
99 
100   // Determine if we can ignore the sequence number and value type from
101   // internal keys by looking at sequence number from first key. We assume
102   // that if first key has a zero sequence number, then all the remaining
103   // keys will have zero seq. no.
104   if (!has_seen_first_key_) {
105     is_last_level_file_ = ikey.sequence == 0;
106     has_seen_first_key_ = true;
107     smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
108     largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
109     key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
110   }
111   if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) {
112     status_ = Status::NotSupported("all keys have to be the same size");
113     return;
114   }
115 
116   if (ikey.type == kTypeValue) {
117     if (!has_seen_first_value_) {
118       has_seen_first_value_ = true;
119       value_size_ = value.size();
120     }
121     if (value_size_ != value.size()) {
122       status_ = Status::NotSupported("all values have to be the same size");
123       return;
124     }
125 
126     if (is_last_level_file_) {
127       kvs_.append(ikey.user_key.data(), ikey.user_key.size());
128     } else {
129       kvs_.append(key.data(), key.size());
130     }
131     kvs_.append(value.data(), value.size());
132     ++num_values_;
133   } else {
134     if (is_last_level_file_) {
135       deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size());
136     } else {
137       deleted_keys_.append(key.data(), key.size());
138     }
139   }
140   ++num_entries_;
141 
142   // In order to fill the empty buckets in the hash table, we identify a
143   // key which is not used so far (unused_user_key). We determine this by
144   // maintaining smallest and largest keys inserted so far in bytewise order
145   // and use them to find a key outside this range in Finish() operation.
146   // Note that this strategy is independent of user comparator used here.
147   if (ikey.user_key.compare(smallest_user_key_) < 0) {
148     smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
149   } else if (ikey.user_key.compare(largest_user_key_) > 0) {
150     largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
151   }
152   if (!use_module_hash_) {
153     if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
154       hash_table_size_ *= 2;
155     }
156   }
157 }
158 
IsDeletedKey(uint64_t idx) const159 bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const {
160   assert(closed_);
161   return idx >= num_values_;
162 }
163 
GetKey(uint64_t idx) const164 Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
165   assert(closed_);
166   if (IsDeletedKey(idx)) {
167     return Slice(&deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)], static_cast<size_t>(key_size_));
168   }
169   return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))], static_cast<size_t>(key_size_));
170 }
171 
GetUserKey(uint64_t idx) const172 Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
173   assert(closed_);
174   return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
175 }
176 
GetValue(uint64_t idx) const177 Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
178   assert(closed_);
179   if (IsDeletedKey(idx)) {
180     static std::string empty_value(static_cast<unsigned int>(value_size_), 'a');
181     return Slice(empty_value);
182   }
183   return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)], static_cast<size_t>(value_size_));
184 }
185 
// Builds the in-memory cuckoo hash table: assigns each buffered entry to a
// bucket using up to num_hash_func_ hash functions, displacing existing
// entries via MakeSpaceForKey() when every candidate bucket is occupied,
// and adding hash functions (up to max_num_hash_func_) when displacement
// fails. Returns NotSupported on duplicate user keys or when the table
// cannot be built within max_num_hash_func_ functions.
Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
  // Extra cuckoo_block_size_ - 1 slots so a block starting at the last
  // bucket can be scanned without bounds checks.
  buckets->resize(static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1));
  uint32_t make_space_for_key_call_id = 0;
  for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
    uint64_t bucket_id = 0;
    bool bucket_found = false;
    autovector<uint64_t> hash_vals;
    Slice user_key = GetUserKey(vector_idx);
    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
        ++hash_cnt) {
      uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_,
          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
      // If there is a collision, check next cuckoo_block_size_ locations for
      // empty locations. While checking, if we reach end of the hash table,
      // stop searching and proceed for next hash function.
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
          ++block_idx, ++hash_val) {
        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == kMaxVectorIdx) {
          bucket_id = hash_val;
          bucket_found = true;
          break;
        } else {
          // Occupied: reject duplicates, remember the bucket as a BFS root
          // for a possible later displacement search.
          if (ucomp_->Compare(user_key,
                GetUserKey((*buckets)[static_cast<size_t>(hash_val)].vector_idx)) == 0) {
            return Status::NotSupported("Same key is being inserted again.");
          }
          hash_vals.push_back(hash_val);
        }
      }
    }
    // No free bucket: try to displace existing entries; if that fails,
    // add another hash function and retry with the new candidate buckets.
    while (!bucket_found && !MakeSpaceForKey(hash_vals,
          ++make_space_for_key_call_id, buckets, &bucket_id)) {
      // Rehash by increasing the number of hash functions.
      if (num_hash_func_ >= max_num_hash_func_) {
        return Status::NotSupported("Too many collisions. Unable to hash.");
      }
      // We don't really need to rehash the entire table because old hashes are
      // still valid and we only increased the number of hash functions.
      uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_,
          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
      ++num_hash_func_;
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
          ++block_idx, ++hash_val) {
        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == kMaxVectorIdx) {
          bucket_found = true;
          bucket_id = hash_val;
          break;
        } else {
          hash_vals.push_back(hash_val);
        }
      }
    }
    (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx;
  }
  return Status::OK();
}
242 
Finish()243 Status CuckooTableBuilder::Finish() {
244   assert(!closed_);
245   closed_ = true;
246   std::vector<CuckooBucket> buckets;
247   Status s;
248   std::string unused_bucket;
249   if (num_entries_ > 0) {
250     // Calculate the real hash size if module hash is enabled.
251     if (use_module_hash_) {
252       hash_table_size_ =
253         static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
254     }
255     status_ = MakeHashTable(&buckets);
256     if (!status_.ok()) {
257       return status_;
258     }
259     // Determine unused_user_key to fill empty buckets.
260     std::string unused_user_key = smallest_user_key_;
261     int curr_pos = static_cast<int>(unused_user_key.size()) - 1;
262     while (curr_pos >= 0) {
263       --unused_user_key[curr_pos];
264       if (Slice(unused_user_key).compare(smallest_user_key_) < 0) {
265         break;
266       }
267       --curr_pos;
268     }
269     if (curr_pos < 0) {
270       // Try using the largest key to identify an unused key.
271       unused_user_key = largest_user_key_;
272       curr_pos = static_cast<int>(unused_user_key.size()) - 1;
273       while (curr_pos >= 0) {
274         ++unused_user_key[curr_pos];
275         if (Slice(unused_user_key).compare(largest_user_key_) > 0) {
276           break;
277         }
278         --curr_pos;
279       }
280     }
281     if (curr_pos < 0) {
282       return Status::Corruption("Unable to find unused key");
283     }
284     if (is_last_level_file_) {
285       unused_bucket = unused_user_key;
286     } else {
287       ParsedInternalKey ikey(unused_user_key, 0, kTypeValue);
288       AppendInternalKey(&unused_bucket, ikey);
289     }
290   }
291   properties_.num_entries = num_entries_;
292   properties_.num_deletions = num_entries_ - num_values_;
293   properties_.fixed_key_len = key_size_;
294   properties_.user_collected_properties[
295         CuckooTablePropertyNames::kValueLength].assign(
296         reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));
297 
298   uint64_t bucket_size = key_size_ + value_size_;
299   unused_bucket.resize(static_cast<size_t>(bucket_size), 'a');
300   // Write the table.
301   uint32_t num_added = 0;
302   for (auto& bucket : buckets) {
303     if (bucket.vector_idx == kMaxVectorIdx) {
304       io_status_ = file_->Append(Slice(unused_bucket));
305     } else {
306       ++num_added;
307       io_status_ = file_->Append(GetKey(bucket.vector_idx));
308       if (io_status_.ok()) {
309         if (value_size_ > 0) {
310           io_status_ = file_->Append(GetValue(bucket.vector_idx));
311         }
312       }
313     }
314     if (!io_status_.ok()) {
315       status_ = io_status_;
316       return status_;
317     }
318   }
319   assert(num_added == NumEntries());
320   properties_.raw_key_size = num_added * properties_.fixed_key_len;
321   properties_.raw_value_size = num_added * value_size_;
322 
323   uint64_t offset = buckets.size() * bucket_size;
324   properties_.data_size = offset;
325   unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len));
326   properties_.user_collected_properties[
327     CuckooTablePropertyNames::kEmptyKey] = unused_bucket;
328   properties_.user_collected_properties[
329     CuckooTablePropertyNames::kNumHashFunc].assign(
330         reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_));
331 
332   properties_.user_collected_properties[
333     CuckooTablePropertyNames::kHashTableSize].assign(
334         reinterpret_cast<const char*>(&hash_table_size_),
335         sizeof(hash_table_size_));
336   properties_.user_collected_properties[
337     CuckooTablePropertyNames::kIsLastLevel].assign(
338         reinterpret_cast<const char*>(&is_last_level_file_),
339         sizeof(is_last_level_file_));
340   properties_.user_collected_properties[
341     CuckooTablePropertyNames::kCuckooBlockSize].assign(
342         reinterpret_cast<const char*>(&cuckoo_block_size_),
343         sizeof(cuckoo_block_size_));
344   properties_.user_collected_properties[
345     CuckooTablePropertyNames::kIdentityAsFirstHash].assign(
346         reinterpret_cast<const char*>(&identity_as_first_hash_),
347         sizeof(identity_as_first_hash_));
348   properties_.user_collected_properties[
349     CuckooTablePropertyNames::kUseModuleHash].assign(
350         reinterpret_cast<const char*>(&use_module_hash_),
351         sizeof(use_module_hash_));
352   uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
353   properties_.user_collected_properties[
354     CuckooTablePropertyNames::kUserKeyLength].assign(
355         reinterpret_cast<const char*>(&user_key_len),
356         sizeof(user_key_len));
357 
358   // Write meta blocks.
359   MetaIndexBuilder meta_index_builder;
360   PropertyBlockBuilder property_block_builder;
361 
362   property_block_builder.AddTableProperty(properties_);
363   property_block_builder.Add(properties_.user_collected_properties);
364   Slice property_block = property_block_builder.Finish();
365   BlockHandle property_block_handle;
366   property_block_handle.set_offset(offset);
367   property_block_handle.set_size(property_block.size());
368   io_status_ = file_->Append(property_block);
369   offset += property_block.size();
370   if (!io_status_.ok()) {
371     status_ = io_status_;
372     return status_;
373   }
374 
375   meta_index_builder.Add(kPropertiesBlock, property_block_handle);
376   Slice meta_index_block = meta_index_builder.Finish();
377 
378   BlockHandle meta_index_block_handle;
379   meta_index_block_handle.set_offset(offset);
380   meta_index_block_handle.set_size(meta_index_block.size());
381   io_status_ = file_->Append(meta_index_block);
382   if (!io_status_.ok()) {
383     status_ = io_status_;
384     return status_;
385   }
386 
387   Footer footer(kCuckooTableMagicNumber, 1);
388   footer.set_metaindex_handle(meta_index_block_handle);
389   footer.set_index_handle(BlockHandle::NullBlockHandle());
390   std::string footer_encoding;
391   footer.EncodeTo(&footer_encoding);
392   io_status_ = file_->Append(footer_encoding);
393   status_ = io_status_;
394   return status_;
395 }
396 
// Abandon the build: mark the builder closed without writing anything.
// Nothing is appended to file_ before Finish(), so there is no partial
// output to clean up; buffered keys/values are simply dropped.
void CuckooTableBuilder::Abandon() {
  assert(!closed_);
  closed_ = true;
}
401 
NumEntries() const402 uint64_t CuckooTableBuilder::NumEntries() const {
403   return num_entries_;
404 }
405 
FileSize() const406 uint64_t CuckooTableBuilder::FileSize() const {
407   if (closed_) {
408     return file_->GetFileSize();
409   } else if (num_entries_ == 0) {
410     return 0;
411   }
412 
413   if (use_module_hash_) {
414     return static_cast<uint64_t>((key_size_ + value_size_) *
415         num_entries_ / max_hash_table_ratio_);
416   } else {
417     // Account for buckets being a power of two.
418     // As elements are added, file size remains constant for a while and
419     // doubles its size. Since compaction algorithm stops adding elements
420     // only after it exceeds the file limit, we account for the extra element
421     // being added here.
422     uint64_t expected_hash_table_size = hash_table_size_;
423     if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) {
424       expected_hash_table_size *= 2;
425     }
426     return (key_size_ + value_size_) * expected_hash_table_size - 1;
427   }
428 }
429 
430 // This method is invoked when there is no place to insert the target key.
431 // It searches for a set of elements that can be moved to accommodate target
432 // key. The search is a BFS graph traversal with first level (hash_vals)
433 // being all the buckets target key could go to.
434 // Then, from each node (curr_node), we find all the buckets that curr_node
435 // could go to. They form the children of curr_node in the tree.
436 // We continue the traversal until we find an empty bucket, in which case, we
437 // move all elements along the path from first level to this empty bucket, to
438 // make space for target key which is inserted at first level (*bucket_id).
// If tree depth exceeds max depth, we return false indicating failure.
// BFS search for a displacement chain (see the comment block above).
// hash_vals: the candidate buckets of the target key (all occupied); these
//   form the first level of the search tree.
// make_space_for_key_call_id: unique id of this invocation, used to mark
//   visited buckets without clearing marks between calls.
// On success, writes the freed first-level bucket into *bucket_id and
// returns true; returns false if no empty bucket is reachable within
// max_search_depth_.
bool CuckooTableBuilder::MakeSpaceForKey(
    const autovector<uint64_t>& hash_vals,
    const uint32_t make_space_for_key_call_id,
    std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) {
  struct CuckooNode {
    uint64_t bucket_id;
    uint32_t depth;
    uint32_t parent_pos;  // index of parent node within `tree`
    CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos)
        : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {}
  };
  // This is BFS search tree that is stored simply as a vector.
  // Each node stores the index of parent node in the vector.
  std::vector<CuckooNode> tree;
  // We want to identify already visited buckets in the current method call so
  // that we don't add same buckets again for exploration in the tree.
  // We do this by maintaining a count of current method call in
  // make_space_for_key_call_id, which acts as a unique id for this invocation
  // of the method. We store this number into the nodes that we explore in
  // current method call.
  // It is unlikely for the increment operation to overflow because the maximum
  // no. of times this will be called is <= max_num_hash_func_ + num_entries_.
  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
    uint64_t bid = hash_vals[hash_cnt];
    (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id = make_space_for_key_call_id;
    tree.push_back(CuckooNode(bid, 0, 0));
  }
  bool null_found = false;
  uint32_t curr_pos = 0;
  // Standard BFS: `tree` doubles as the queue; curr_pos is the queue head.
  while (!null_found && curr_pos < tree.size()) {
    CuckooNode& curr_node = tree[curr_pos];
    uint32_t curr_depth = curr_node.depth;
    if (curr_depth >= max_search_depth_) {
      break;
    }
    CuckooBucket& curr_bucket = (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
    // Children of curr_node: every other bucket the key currently stored in
    // curr_bucket could be moved to.
    for (uint32_t hash_cnt = 0;
        hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) {
      uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx),
          hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_,
          get_slice_hash_);
      // Iterate inside Cuckoo Block.
      for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
          ++block_idx, ++child_bucket_id) {
        // Skip buckets already visited during this invocation.
        if ((*buckets)[static_cast<size_t>(child_bucket_id)].make_space_for_key_call_id ==
            make_space_for_key_call_id) {
          continue;
        }
        (*buckets)[static_cast<size_t>(child_bucket_id)].make_space_for_key_call_id =
          make_space_for_key_call_id;
        tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1,
              curr_pos));
        if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx == kMaxVectorIdx) {
          // Empty bucket found at tree.back(); stop the search.
          null_found = true;
          break;
        }
      }
    }
    ++curr_pos;
  }

  if (null_found) {
    // There is an empty node in tree.back(). Now, traverse the path from this
    // empty node to top of the tree and at every node in the path, replace
    // child with the parent. Stop when first level is reached in the tree
    // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return
    // this location in first level for target key to be inserted.
    uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1;
    while (bucket_to_replace_pos >= num_hash_func_) {
      CuckooNode& curr_node = tree[bucket_to_replace_pos];
      (*buckets)[static_cast<size_t>(curr_node.bucket_id)] =
        (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)];
      bucket_to_replace_pos = curr_node.parent_pos;
    }
    *bucket_id = tree[bucket_to_replace_pos].bucket_id;
  }
  return null_found;
}
518 
GetFileChecksum() const519 std::string CuckooTableBuilder::GetFileChecksum() const {
520   if (file_ != nullptr) {
521     return file_->GetFileChecksum();
522   } else {
523     return kUnknownFileChecksum;
524   }
525 }
526 
// Name of the checksum function in use, or the "unknown" sentinel when no
// file writer is attached.
const char* CuckooTableBuilder::GetFileChecksumFuncName() const {
  if (file_ == nullptr) {
    return kUnknownFileChecksumFuncName.c_str();
  }
  return file_->GetFileChecksumFuncName();
}
534 
535 }  // namespace ROCKSDB_NAMESPACE
536 #endif  // ROCKSDB_LITE
537