1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 // This source code is licensed under both the GPLv2 (found in the 3 // COPYING file in the root directory) and Apache 2.0 License 4 // (found in the LICENSE.Apache file in the root directory). 5 6 #pragma once 7 8 #ifndef ROCKSDB_LITE 9 10 #include <string> 11 #include <vector> 12 13 #include "db/dbformat.h" 14 #include "memory/arena.h" 15 #include "monitoring/histogram.h" 16 #include "options/cf_options.h" 17 #include "rocksdb/options.h" 18 19 namespace ROCKSDB_NAMESPACE { 20 21 // The file contains two classes PlainTableIndex and PlainTableIndexBuilder 22 // The two classes implement the index format of PlainTable. 23 // For descripton of PlainTable format, see comments of class 24 // PlainTableFactory 25 // 26 // 27 // PlainTableIndex contains buckets size of index_size_, each is a 28 // 32-bit integer. The lower 31 bits contain an offset value (explained below) 29 // and the first bit of the integer indicates type of the offset. 30 // 31 // +--------------+------------------------------------------------------+ 32 // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + 33 // +--------------+------------------------------------------------------+ 34 // 35 // Explanation for the "flag bit": 36 // 37 // 0 indicates that the bucket contains only one prefix (no conflict when 38 // hashing this prefix), whose first row starts from this offset of the 39 // file. 40 // 1 indicates that the bucket contains more than one prefixes, or there 41 // are too many rows for one prefix so we need a binary search for it. In 42 // this case, the offset indicates the offset of sub_index_ holding the 43 // binary search indexes of keys for those rows. Those binary search indexes 44 // are organized in this way: 45 // 46 // The first 4 bytes, indicate how many indexes (N) are stored after it. After 47 // it, there are N 32-bit integers, each points of an offset of the file, 48 // which 49 // points to starting of a row. Those offsets need to be guaranteed to be in 50 // ascending order so the keys they are pointing to are also in ascending 51 // order 52 // to make sure we can use them to do binary searches. Below is visual 53 // presentation of a bucket. 54 // 55 // <begin> 56 // number_of_records: varint32 57 // record 1 file offset: fixedint32 58 // record 2 file offset: fixedint32 59 // .... 60 // record N file offset: fixedint32 61 // <end> 62 63 // The class loads the index block from a PlainTable SST file, and executes 64 // the index lookup. 65 // The class is used by PlainTableReader class. 66 class PlainTableIndex { 67 public: 68 enum IndexSearchResult { 69 kNoPrefixForBucket = 0, 70 kDirectToFile = 1, 71 kSubindex = 2 72 }; 73 PlainTableIndex(Slice data)74 explicit PlainTableIndex(Slice data) { InitFromRawData(data); } 75 PlainTableIndex()76 PlainTableIndex() 77 : index_size_(0), 78 sub_index_size_(0), 79 num_prefixes_(0), 80 index_(nullptr), 81 sub_index_(nullptr) {} 82 83 // The function that executes the lookup the hash table. 84 // The hash key is `prefix_hash`. The function fills the hash bucket 85 // content in `bucket_value`, which is up to the caller to interpret. 86 IndexSearchResult GetOffset(uint32_t prefix_hash, 87 uint32_t* bucket_value) const; 88 89 // Initialize data from `index_data`, which points to raw data for 90 // index stored in the SST file. 91 Status InitFromRawData(Slice index_data); 92 93 // Decode the sub index for specific hash bucket. 94 // The `offset` is the value returned as `bucket_value` by GetOffset() 95 // and is only valid when the return value is `kSubindex`. 96 // The return value is the pointer to the starting address of the 97 // sub-index. `upper_bound` is filled with the value indicating how many 98 // entries the sub-index has. GetSubIndexBasePtrAndUpperBound(uint32_t offset,uint32_t * upper_bound)99 const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset, 100 uint32_t* upper_bound) const { 101 const char* index_ptr = &sub_index_[offset]; 102 return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound); 103 } 104 GetIndexSize()105 uint32_t GetIndexSize() const { return index_size_; } 106 GetSubIndexSize()107 uint32_t GetSubIndexSize() const { return sub_index_size_; } 108 GetNumPrefixes()109 uint32_t GetNumPrefixes() const { return num_prefixes_; } 110 111 static const uint64_t kMaxFileSize = (1u << 31) - 1; 112 static const uint32_t kSubIndexMask = 0x80000000; 113 static const size_t kOffsetLen = sizeof(uint32_t); 114 115 private: 116 uint32_t index_size_; 117 uint32_t sub_index_size_; 118 uint32_t num_prefixes_; 119 120 uint32_t* index_; 121 char* sub_index_; 122 }; 123 124 // PlainTableIndexBuilder is used to create plain table index. 125 // After calling Finish(), it returns Slice, which is usually 126 // used either to initialize PlainTableIndex or 127 // to save index to sst file. 128 // For more details about the index, please refer to: 129 // https://github.com/facebook/rocksdb/wiki/PlainTable-Format 130 // #wiki-in-memory-index-format 131 // The class is used by PlainTableBuilder class. 132 class PlainTableIndexBuilder { 133 public: PlainTableIndexBuilder(Arena * arena,const ImmutableCFOptions & ioptions,const SliceTransform * prefix_extractor,size_t index_sparseness,double hash_table_ratio,size_t huge_page_tlb_size)134 PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, 135 const SliceTransform* prefix_extractor, 136 size_t index_sparseness, double hash_table_ratio, 137 size_t huge_page_tlb_size) 138 : arena_(arena), 139 ioptions_(ioptions), 140 record_list_(kRecordsPerGroup), 141 is_first_record_(true), 142 due_index_(false), 143 num_prefixes_(0), 144 num_keys_per_prefix_(0), 145 prev_key_prefix_hash_(0), 146 index_sparseness_(index_sparseness), 147 index_size_(0), 148 sub_index_size_(0), 149 prefix_extractor_(prefix_extractor), 150 hash_table_ratio_(hash_table_ratio), 151 huge_page_tlb_size_(huge_page_tlb_size) {} 152 153 void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset); 154 155 Slice Finish(); 156 GetTotalSize()157 uint32_t GetTotalSize() const { 158 return VarintLength(index_size_) + VarintLength(num_prefixes_) + 159 PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_; 160 } 161 162 static const std::string kPlainTableIndexBlock; 163 164 private: 165 struct IndexRecord { 166 uint32_t hash; // hash of the prefix 167 uint32_t offset; // offset of a row 168 IndexRecord* next; 169 }; 170 171 // Helper class to track all the index records 172 class IndexRecordList { 173 public: IndexRecordList(size_t num_records_per_group)174 explicit IndexRecordList(size_t num_records_per_group) 175 : kNumRecordsPerGroup(num_records_per_group), 176 current_group_(nullptr), 177 num_records_in_current_group_(num_records_per_group) {} 178 ~IndexRecordList()179 ~IndexRecordList() { 180 for (size_t i = 0; i < groups_.size(); i++) { 181 delete[] groups_[i]; 182 } 183 } 184 185 void AddRecord(uint32_t hash, uint32_t offset); 186 GetNumRecords()187 size_t GetNumRecords() const { 188 return (groups_.size() - 1) * kNumRecordsPerGroup + 189 num_records_in_current_group_; 190 } At(size_t index)191 IndexRecord* At(size_t index) { 192 return &(groups_[index / kNumRecordsPerGroup] 193 [index % kNumRecordsPerGroup]); 194 } 195 196 private: AllocateNewGroup()197 IndexRecord* AllocateNewGroup() { 198 IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; 199 groups_.push_back(result); 200 return result; 201 } 202 203 // Each group in `groups_` contains fix-sized records (determined by 204 // kNumRecordsPerGroup). Which can help us minimize the cost if resizing 205 // occurs. 206 const size_t kNumRecordsPerGroup; 207 IndexRecord* current_group_; 208 // List of arrays allocated 209 std::vector<IndexRecord*> groups_; 210 size_t num_records_in_current_group_; 211 }; 212 213 void AllocateIndex(); 214 215 // Internal helper function to bucket index record list to hash buckets. 216 void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets, 217 std::vector<uint32_t>* entries_per_bucket); 218 219 // Internal helper class to fill the indexes and bloom filters to internal 220 // data structures. 221 Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets, 222 const std::vector<uint32_t>& entries_per_bucket); 223 224 Arena* arena_; 225 const ImmutableCFOptions ioptions_; 226 HistogramImpl keys_per_prefix_hist_; 227 IndexRecordList record_list_; 228 bool is_first_record_; 229 bool due_index_; 230 uint32_t num_prefixes_; 231 uint32_t num_keys_per_prefix_; 232 233 uint32_t prev_key_prefix_hash_; 234 size_t index_sparseness_; 235 uint32_t index_size_; 236 uint32_t sub_index_size_; 237 238 const SliceTransform* prefix_extractor_; 239 double hash_table_ratio_; 240 size_t huge_page_tlb_size_; 241 242 std::string prev_key_prefix_; 243 244 static const size_t kRecordsPerGroup = 256; 245 }; 246 247 }; // namespace ROCKSDB_NAMESPACE 248 249 #endif // ROCKSDB_LITE 250