1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 
6 #pragma once
7 
8 #include <string>
9 #include <vector>
10 
11 #include "rocksdb/slice.h"
12 
13 namespace ROCKSDB_NAMESPACE {
14 // This is an experimental feature aiming to reduce the CPU utilization of
15 // point-lookup within a data-block. It is only used in data blocks, and not
16 // in meta-data blocks or per-table index blocks.
17 //
18 // It only used to support BlockBasedTable::Get().
19 //
20 // A serialized hash index is appended to the data-block. The new block data
21 // format is as follows:
22 //
23 // DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
24 //
25 // RI:       Restart Interval (the same as the default data-block format)
26 // RI_IDX:   Restart Interval index (the same as the default data-block format)
27 // HASH_IDX: The new data-block hash index feature.
28 // FOOTER:   A 32bit block footer, which is the NUM_RESTARTS with the MSB as
29 //           the flag indicating if this hash index is in use. Note that
30 //           given a data block < 32KB, the MSB is never used. So we can
31 //           borrow the MSB as the hash index flag. Therefore, this format is
32 //           compatible with the legacy data-blocks with num_restarts < 32768,
33 //           as the MSB is 0.
34 //
35 // The format of the data-block hash index is as follows:
36 //
37 // HASH_IDX: [B B B ... B NUM_BUCK]
38 //
39 // B:         bucket, an array of restart index. Each buckets is uint8_t.
40 // NUM_BUCK:  Number of buckets, which is the length of the bucket array.
41 //
42 // We reserve two special flag:
43 //    kNoEntry=255,
44 //    kCollision=254.
45 //
46 // Therefore, the max number of restarts this hash index can supoport is 253.
47 //
48 // Buckets are initialized to be kNoEntry.
49 //
50 // When storing a key in the hash index, the key is first hashed to a bucket.
51 // If there the bucket is empty (kNoEntry), the restart index is stored in
52 // the bucket. If there is already a restart index there, we will update the
53 // existing restart index to a collision marker (kCollision). If the
54 // the bucket is already marked as collision, we do not store the restart
55 // index either.
56 //
57 // During query process, a key is first hashed to a bucket. Then we examine if
58 // the buckets store nothing (kNoEntry) or the bucket had a collision
59 // (kCollision). If either of those happens, we get the restart index of
60 // the key and will directly go to the restart interval to search the key.
61 //
62 // Note that we only support blocks with #restart_interval < 254. If a block
63 // has more restart interval than that, hash index will not be create for it.
64 
65 const uint8_t kNoEntry = 255;
66 const uint8_t kCollision = 254;
67 const uint8_t kMaxRestartSupportedByHashIndex = 253;
68 
69 // Because we use uint16_t address, we only support block no more than 64KB
70 const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
71 const double kDefaultUtilRatio = 0.75;
72 
73 class DataBlockHashIndexBuilder {
74  public:
DataBlockHashIndexBuilder()75   DataBlockHashIndexBuilder()
76       : bucket_per_key_(-1 /*uninitialized marker*/),
77         estimated_num_buckets_(0),
78         valid_(false) {}
79 
Initialize(double util_ratio)80   void Initialize(double util_ratio) {
81     if (util_ratio <= 0) {
82       util_ratio = kDefaultUtilRatio;  // sanity check
83     }
84     bucket_per_key_ = 1 / util_ratio;
85     valid_ = true;
86   }
87 
Valid()88   inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
89   void Add(const Slice& key, const size_t restart_index);
90   void Finish(std::string& buffer);
91   void Reset();
EstimateSize()92   inline size_t EstimateSize() const {
93     uint16_t estimated_num_buckets =
94         static_cast<uint16_t>(estimated_num_buckets_);
95 
96     // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish.
97     estimated_num_buckets |= 1;
98 
99     return sizeof(uint16_t) +
100            static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
101   }
102 
103  private:
104   double bucket_per_key_;  // is the multiplicative inverse of util_ratio_
105   double estimated_num_buckets_;
106 
107   // Now the only usage for `valid_` is to mark false when the inserted
108   // restart_index is larger than supported. In this case HashIndex is not
109   // appended to the block content.
110   bool valid_;
111 
112   std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
113   friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
114 };
115 
116 class DataBlockHashIndex {
117  public:
DataBlockHashIndex()118   DataBlockHashIndex() : num_buckets_(0) {}
119 
120   void Initialize(const char* data, uint16_t size, uint16_t* map_offset);
121 
122   uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;
123 
Valid()124   inline bool Valid() { return num_buckets_ != 0; }
125 
126  private:
127   // To make the serialized hash index compact and to save the space overhead,
128   // here all the data fields persisted in the block are in uint16 format.
129   // We find that a uint16 is large enough to index every offset of a 64KiB
130   // block.
131   // So in other words, DataBlockHashIndex does not support block size equal
132   // or greater then 64KiB.
133   uint16_t num_buckets_;
134 };
135 
136 }  // namespace ROCKSDB_NAMESPACE
137