1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 // This source code is licensed under both the GPLv2 (found in the 3 // COPYING file in the root directory) and Apache 2.0 License 4 // (found in the LICENSE.Apache file in the root directory). 5 6 #pragma once 7 8 #include <string> 9 #include <vector> 10 11 #include "rocksdb/slice.h" 12 13 namespace ROCKSDB_NAMESPACE { 14 // This is an experimental feature aiming to reduce the CPU utilization of 15 // point-lookup within a data-block. It is only used in data blocks, and not 16 // in meta-data blocks or per-table index blocks. 17 // 18 // It only used to support BlockBasedTable::Get(). 19 // 20 // A serialized hash index is appended to the data-block. The new block data 21 // format is as follows: 22 // 23 // DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER] 24 // 25 // RI: Restart Interval (the same as the default data-block format) 26 // RI_IDX: Restart Interval index (the same as the default data-block format) 27 // HASH_IDX: The new data-block hash index feature. 28 // FOOTER: A 32bit block footer, which is the NUM_RESTARTS with the MSB as 29 // the flag indicating if this hash index is in use. Note that 30 // given a data block < 32KB, the MSB is never used. So we can 31 // borrow the MSB as the hash index flag. Therefore, this format is 32 // compatible with the legacy data-blocks with num_restarts < 32768, 33 // as the MSB is 0. 34 // 35 // The format of the data-block hash index is as follows: 36 // 37 // HASH_IDX: [B B B ... B NUM_BUCK] 38 // 39 // B: bucket, an array of restart index. Each buckets is uint8_t. 40 // NUM_BUCK: Number of buckets, which is the length of the bucket array. 41 // 42 // We reserve two special flag: 43 // kNoEntry=255, 44 // kCollision=254. 45 // 46 // Therefore, the max number of restarts this hash index can supoport is 253. 47 // 48 // Buckets are initialized to be kNoEntry. 49 // 50 // When storing a key in the hash index, the key is first hashed to a bucket. 51 // If there the bucket is empty (kNoEntry), the restart index is stored in 52 // the bucket. If there is already a restart index there, we will update the 53 // existing restart index to a collision marker (kCollision). If the 54 // the bucket is already marked as collision, we do not store the restart 55 // index either. 56 // 57 // During query process, a key is first hashed to a bucket. Then we examine if 58 // the buckets store nothing (kNoEntry) or the bucket had a collision 59 // (kCollision). If either of those happens, we get the restart index of 60 // the key and will directly go to the restart interval to search the key. 61 // 62 // Note that we only support blocks with #restart_interval < 254. If a block 63 // has more restart interval than that, hash index will not be create for it. 64 65 const uint8_t kNoEntry = 255; 66 const uint8_t kCollision = 254; 67 const uint8_t kMaxRestartSupportedByHashIndex = 253; 68 69 // Because we use uint16_t address, we only support block no more than 64KB 70 const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16; 71 const double kDefaultUtilRatio = 0.75; 72 73 class DataBlockHashIndexBuilder { 74 public: DataBlockHashIndexBuilder()75 DataBlockHashIndexBuilder() 76 : bucket_per_key_(-1 /*uninitialized marker*/), 77 estimated_num_buckets_(0), 78 valid_(false) {} 79 Initialize(double util_ratio)80 void Initialize(double util_ratio) { 81 if (util_ratio <= 0) { 82 util_ratio = kDefaultUtilRatio; // sanity check 83 } 84 bucket_per_key_ = 1 / util_ratio; 85 valid_ = true; 86 } 87 Valid()88 inline bool Valid() const { return valid_ && bucket_per_key_ > 0; } 89 void Add(const Slice& key, const size_t restart_index); 90 void Finish(std::string& buffer); 91 void Reset(); EstimateSize()92 inline size_t EstimateSize() const { 93 uint16_t estimated_num_buckets = 94 static_cast<uint16_t>(estimated_num_buckets_); 95 96 // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish. 97 estimated_num_buckets |= 1; 98 99 return sizeof(uint16_t) + 100 static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t)); 101 } 102 103 private: 104 double bucket_per_key_; // is the multiplicative inverse of util_ratio_ 105 double estimated_num_buckets_; 106 107 // Now the only usage for `valid_` is to mark false when the inserted 108 // restart_index is larger than supported. In this case HashIndex is not 109 // appended to the block content. 110 bool valid_; 111 112 std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_; 113 friend class DataBlockHashIndex_DataBlockHashTestSmall_Test; 114 }; 115 116 class DataBlockHashIndex { 117 public: DataBlockHashIndex()118 DataBlockHashIndex() : num_buckets_(0) {} 119 120 void Initialize(const char* data, uint16_t size, uint16_t* map_offset); 121 122 uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const; 123 Valid()124 inline bool Valid() { return num_buckets_ != 0; } 125 126 private: 127 // To make the serialized hash index compact and to save the space overhead, 128 // here all the data fields persisted in the block are in uint16 format. 129 // We find that a uint16 is large enough to index every offset of a 64KiB 130 // block. 131 // So in other words, DataBlockHashIndex does not support block size equal 132 // or greater then 64KiB. 133 uint16_t num_buckets_; 134 }; 135 136 } // namespace ROCKSDB_NAMESPACE 137