1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 
6 #pragma once
7 
8 #ifndef ROCKSDB_LITE
9 
10 #include <string>
11 #include <vector>
12 
13 #include "db/dbformat.h"
14 #include "memory/arena.h"
15 #include "monitoring/histogram.h"
16 #include "options/cf_options.h"
17 #include "rocksdb/options.h"
18 
19 namespace ROCKSDB_NAMESPACE {
20 
21 // The file contains two classes PlainTableIndex and PlainTableIndexBuilder
22 // The two classes implement the index format of PlainTable.
23 // For descripton of PlainTable format, see comments of class
24 // PlainTableFactory
25 //
26 //
27 // PlainTableIndex contains buckets size of index_size_, each is a
28 // 32-bit integer. The lower 31 bits contain an offset value (explained below)
29 // and the first bit of the integer indicates type of the offset.
30 //
31 // +--------------+------------------------------------------------------+
32 // | Flag (1 bit) | Offset to binary search buffer or file (31 bits)     +
33 // +--------------+------------------------------------------------------+
34 //
35 // Explanation for the "flag bit":
36 //
37 // 0 indicates that the bucket contains only one prefix (no conflict when
38 //   hashing this prefix), whose first row starts from this offset of the
39 // file.
40 // 1 indicates that the bucket contains more than one prefixes, or there
41 //   are too many rows for one prefix so we need a binary search for it. In
42 //   this case, the offset indicates the offset of sub_index_ holding the
43 //   binary search indexes of keys for those rows. Those binary search indexes
44 //   are organized in this way:
45 //
46 // The first 4 bytes, indicate how many indexes (N) are stored after it. After
47 // it, there are N 32-bit integers, each points of an offset of the file,
48 // which
49 // points to starting of a row. Those offsets need to be guaranteed to be in
50 // ascending order so the keys they are pointing to are also in ascending
51 // order
52 // to make sure we can use them to do binary searches. Below is visual
53 // presentation of a bucket.
54 //
55 // <begin>
56 //   number_of_records:  varint32
57 //   record 1 file offset:  fixedint32
58 //   record 2 file offset:  fixedint32
59 //    ....
60 //   record N file offset:  fixedint32
61 // <end>
62 
63 // The class loads the index block from a PlainTable SST file, and executes
64 // the index lookup.
65 // The class is used by PlainTableReader class.
66 class PlainTableIndex {
67  public:
68   enum IndexSearchResult {
69     kNoPrefixForBucket = 0,
70     kDirectToFile = 1,
71     kSubindex = 2
72   };
73 
PlainTableIndex(Slice data)74   explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
75 
PlainTableIndex()76   PlainTableIndex()
77       : index_size_(0),
78         sub_index_size_(0),
79         num_prefixes_(0),
80         index_(nullptr),
81         sub_index_(nullptr) {}
82 
83   // The function that executes the lookup the hash table.
84   // The hash key is `prefix_hash`. The function fills the hash bucket
85   // content in `bucket_value`, which is up to the caller to interpret.
86   IndexSearchResult GetOffset(uint32_t prefix_hash,
87                               uint32_t* bucket_value) const;
88 
89   // Initialize data from `index_data`, which points to raw data for
90   // index stored in the SST file.
91   Status InitFromRawData(Slice index_data);
92 
93   // Decode the sub index for specific hash bucket.
94   // The `offset` is the value returned as `bucket_value` by GetOffset()
95   // and is only valid when the return value is `kSubindex`.
96   // The return value is the pointer to the starting address of the
97   // sub-index. `upper_bound` is filled with the value indicating how many
98   // entries the sub-index has.
GetSubIndexBasePtrAndUpperBound(uint32_t offset,uint32_t * upper_bound)99   const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
100                                               uint32_t* upper_bound) const {
101     const char* index_ptr = &sub_index_[offset];
102     return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
103   }
104 
GetIndexSize()105   uint32_t GetIndexSize() const { return index_size_; }
106 
GetSubIndexSize()107   uint32_t GetSubIndexSize() const { return sub_index_size_; }
108 
GetNumPrefixes()109   uint32_t GetNumPrefixes() const { return num_prefixes_; }
110 
111   static const uint64_t kMaxFileSize = (1u << 31) - 1;
112   static const uint32_t kSubIndexMask = 0x80000000;
113   static const size_t kOffsetLen = sizeof(uint32_t);
114 
115  private:
116   uint32_t index_size_;
117   uint32_t sub_index_size_;
118   uint32_t num_prefixes_;
119 
120   uint32_t* index_;
121   char* sub_index_;
122 };
123 
124 // PlainTableIndexBuilder is used to create plain table index.
125 // After calling Finish(), it returns Slice, which is usually
126 // used either to initialize PlainTableIndex or
127 // to save index to sst file.
128 // For more details about the index, please refer to:
129 // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
130 // #wiki-in-memory-index-format
131 // The class is used by PlainTableBuilder class.
132 class PlainTableIndexBuilder {
133  public:
PlainTableIndexBuilder(Arena * arena,const ImmutableCFOptions & ioptions,const SliceTransform * prefix_extractor,size_t index_sparseness,double hash_table_ratio,size_t huge_page_tlb_size)134   PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions,
135                          const SliceTransform* prefix_extractor,
136                          size_t index_sparseness, double hash_table_ratio,
137                          size_t huge_page_tlb_size)
138       : arena_(arena),
139         ioptions_(ioptions),
140         record_list_(kRecordsPerGroup),
141         is_first_record_(true),
142         due_index_(false),
143         num_prefixes_(0),
144         num_keys_per_prefix_(0),
145         prev_key_prefix_hash_(0),
146         index_sparseness_(index_sparseness),
147         index_size_(0),
148         sub_index_size_(0),
149         prefix_extractor_(prefix_extractor),
150         hash_table_ratio_(hash_table_ratio),
151         huge_page_tlb_size_(huge_page_tlb_size) {}
152 
153   void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
154 
155   Slice Finish();
156 
GetTotalSize()157   uint32_t GetTotalSize() const {
158     return VarintLength(index_size_) + VarintLength(num_prefixes_) +
159            PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
160   }
161 
162   static const std::string kPlainTableIndexBlock;
163 
164  private:
165   struct IndexRecord {
166     uint32_t hash;    // hash of the prefix
167     uint32_t offset;  // offset of a row
168     IndexRecord* next;
169   };
170 
171   // Helper class to track all the index records
172   class IndexRecordList {
173    public:
IndexRecordList(size_t num_records_per_group)174     explicit IndexRecordList(size_t num_records_per_group)
175         : kNumRecordsPerGroup(num_records_per_group),
176           current_group_(nullptr),
177           num_records_in_current_group_(num_records_per_group) {}
178 
~IndexRecordList()179     ~IndexRecordList() {
180       for (size_t i = 0; i < groups_.size(); i++) {
181         delete[] groups_[i];
182       }
183     }
184 
185     void AddRecord(uint32_t hash, uint32_t offset);
186 
GetNumRecords()187     size_t GetNumRecords() const {
188       return (groups_.size() - 1) * kNumRecordsPerGroup +
189              num_records_in_current_group_;
190     }
At(size_t index)191     IndexRecord* At(size_t index) {
192       return &(groups_[index / kNumRecordsPerGroup]
193                       [index % kNumRecordsPerGroup]);
194     }
195 
196    private:
AllocateNewGroup()197     IndexRecord* AllocateNewGroup() {
198       IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
199       groups_.push_back(result);
200       return result;
201     }
202 
203     // Each group in `groups_` contains fix-sized records (determined by
204     // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
205     // occurs.
206     const size_t kNumRecordsPerGroup;
207     IndexRecord* current_group_;
208     // List of arrays allocated
209     std::vector<IndexRecord*> groups_;
210     size_t num_records_in_current_group_;
211   };
212 
213   void AllocateIndex();
214 
215   // Internal helper function to bucket index record list to hash buckets.
216   void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
217                         std::vector<uint32_t>* entries_per_bucket);
218 
219   // Internal helper class to fill the indexes and bloom filters to internal
220   // data structures.
221   Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
222                     const std::vector<uint32_t>& entries_per_bucket);
223 
224   Arena* arena_;
225   const ImmutableCFOptions ioptions_;
226   HistogramImpl keys_per_prefix_hist_;
227   IndexRecordList record_list_;
228   bool is_first_record_;
229   bool due_index_;
230   uint32_t num_prefixes_;
231   uint32_t num_keys_per_prefix_;
232 
233   uint32_t prev_key_prefix_hash_;
234   size_t index_sparseness_;
235   uint32_t index_size_;
236   uint32_t sub_index_size_;
237 
238   const SliceTransform* prefix_extractor_;
239   double hash_table_ratio_;
240   size_t huge_page_tlb_size_;
241 
242   std::string prev_key_prefix_;
243 
244   static const size_t kRecordsPerGroup = 256;
245 };
246 
247 };  // namespace ROCKSDB_NAMESPACE
248 
249 #endif  // ROCKSDB_LITE
250