1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 
6 #ifndef ROCKSDB_LITE
7 
8 #include <cinttypes>
9 
10 #include "table/plain/plain_table_index.h"
11 #include "util/coding.h"
12 #include "util/hash.h"
13 
14 namespace ROCKSDB_NAMESPACE {
15 
16 namespace {
GetBucketIdFromHash(uint32_t hash,uint32_t num_buckets)17 inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
18   assert(num_buckets > 0);
19   return hash % num_buckets;
20 }
21 }
22 
InitFromRawData(Slice data)23 Status PlainTableIndex::InitFromRawData(Slice data) {
24   if (!GetVarint32(&data, &index_size_)) {
25     return Status::Corruption("Couldn't read the index size!");
26   }
27   assert(index_size_ > 0);
28   if (!GetVarint32(&data, &num_prefixes_)) {
29     return Status::Corruption("Couldn't read the index size!");
30   }
31   sub_index_size_ =
32       static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
33 
34   char* index_data_begin = const_cast<char*>(data.data());
35   index_ = reinterpret_cast<uint32_t*>(index_data_begin);
36   sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
37   return Status::OK();
38 }
39 
GetOffset(uint32_t prefix_hash,uint32_t * bucket_value) const40 PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
41     uint32_t prefix_hash, uint32_t* bucket_value) const {
42   int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
43   GetUnaligned(index_ + bucket, bucket_value);
44   if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
45     *bucket_value ^= kSubIndexMask;
46     return kSubindex;
47   }
48   if (*bucket_value >= kMaxFileSize) {
49     return kNoPrefixForBucket;
50   } else {
51     // point directly to the file
52     return kDirectToFile;
53   }
54 }
55 
AddRecord(uint32_t hash,uint32_t offset)56 void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
57                                                         uint32_t offset) {
58   if (num_records_in_current_group_ == kNumRecordsPerGroup) {
59     current_group_ = AllocateNewGroup();
60     num_records_in_current_group_ = 0;
61   }
62   auto& new_record = current_group_[num_records_in_current_group_++];
63   new_record.hash = hash;
64   new_record.offset = offset;
65   new_record.next = nullptr;
66 }
67 
AddKeyPrefix(Slice key_prefix_slice,uint32_t key_offset)68 void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
69                                           uint32_t key_offset) {
70   if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
71     ++num_prefixes_;
72     if (!is_first_record_) {
73       keys_per_prefix_hist_.Add(num_keys_per_prefix_);
74     }
75     num_keys_per_prefix_ = 0;
76     prev_key_prefix_ = key_prefix_slice.ToString();
77     prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
78     due_index_ = true;
79   }
80 
81   if (due_index_) {
82     // Add an index key for every kIndexIntervalForSamePrefixKeys keys
83     record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
84     due_index_ = false;
85   }
86 
87   num_keys_per_prefix_++;
88   if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
89     due_index_ = true;
90   }
91   is_first_record_ = false;
92 }
93 
Finish()94 Slice PlainTableIndexBuilder::Finish() {
95   AllocateIndex();
96   std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
97   std::vector<uint32_t> entries_per_bucket(index_size_, 0);
98   BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
99 
100   keys_per_prefix_hist_.Add(num_keys_per_prefix_);
101   ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s",
102                  keys_per_prefix_hist_.ToString().c_str());
103 
104   // From the temp data structure, populate indexes.
105   return FillIndexes(hash_to_offsets, entries_per_bucket);
106 }
107 
AllocateIndex()108 void PlainTableIndexBuilder::AllocateIndex() {
109   if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
110     // Fall back to pure binary search if the user fails to specify a prefix
111     // extractor.
112     index_size_ = 1;
113   } else {
114     double hash_table_size_multipier = 1.0 / hash_table_ratio_;
115     index_size_ =
116       static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
117     assert(index_size_ > 0);
118   }
119 }
120 
BucketizeIndexes(std::vector<IndexRecord * > * hash_to_offsets,std::vector<uint32_t> * entries_per_bucket)121 void PlainTableIndexBuilder::BucketizeIndexes(
122     std::vector<IndexRecord*>* hash_to_offsets,
123     std::vector<uint32_t>* entries_per_bucket) {
124   bool first = true;
125   uint32_t prev_hash = 0;
126   size_t num_records = record_list_.GetNumRecords();
127   for (size_t i = 0; i < num_records; i++) {
128     IndexRecord* index_record = record_list_.At(i);
129     uint32_t cur_hash = index_record->hash;
130     if (first || prev_hash != cur_hash) {
131       prev_hash = cur_hash;
132       first = false;
133     }
134     uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
135     IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
136     index_record->next = prev_bucket_head;
137     (*hash_to_offsets)[bucket] = index_record;
138     (*entries_per_bucket)[bucket]++;
139   }
140 
141   sub_index_size_ = 0;
142   for (auto entry_count : *entries_per_bucket) {
143     if (entry_count <= 1) {
144       continue;
145     }
146     // Only buckets with more than 1 entry will have subindex.
147     sub_index_size_ += VarintLength(entry_count);
148     // total bytes needed to store these entries' in-file offsets.
149     sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
150   }
151 }
152 
FillIndexes(const std::vector<IndexRecord * > & hash_to_offsets,const std::vector<uint32_t> & entries_per_bucket)153 Slice PlainTableIndexBuilder::FillIndexes(
154     const std::vector<IndexRecord*>& hash_to_offsets,
155     const std::vector<uint32_t>& entries_per_bucket) {
156   ROCKS_LOG_DEBUG(ioptions_.info_log,
157                   "Reserving %" PRIu32 " bytes for plain table's sub_index",
158                   sub_index_size_);
159   auto total_allocate_size = GetTotalSize();
160   char* allocated = arena_->AllocateAligned(
161       total_allocate_size, huge_page_tlb_size_, ioptions_.info_log);
162 
163   auto temp_ptr = EncodeVarint32(allocated, index_size_);
164   uint32_t* index =
165       reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
166   char* sub_index = reinterpret_cast<char*>(index + index_size_);
167 
168   uint32_t sub_index_offset = 0;
169   for (uint32_t i = 0; i < index_size_; i++) {
170     uint32_t num_keys_for_bucket = entries_per_bucket[i];
171     switch (num_keys_for_bucket) {
172       case 0:
173         // No key for bucket
174         PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
175         break;
176       case 1:
177         // point directly to the file offset
178         PutUnaligned(index + i, hash_to_offsets[i]->offset);
179         break;
180       default:
181         // point to second level indexes.
182         PutUnaligned(index + i, sub_index_offset | PlainTableIndex::kSubIndexMask);
183         char* prev_ptr = &sub_index[sub_index_offset];
184         char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
185         sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
186         char* sub_index_pos = &sub_index[sub_index_offset];
187         IndexRecord* record = hash_to_offsets[i];
188         int j;
189         for (j = num_keys_for_bucket - 1; j >= 0 && record;
190              j--, record = record->next) {
191           EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
192         }
193         assert(j == -1 && record == nullptr);
194         sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
195         assert(sub_index_offset <= sub_index_size_);
196         break;
197     }
198   }
199   assert(sub_index_offset == sub_index_size_);
200 
201   ROCKS_LOG_DEBUG(ioptions_.info_log,
202                   "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
203                   index_size_, sub_index_size_);
204   return Slice(allocated, GetTotalSize());
205 }
206 
207 const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
208     "PlainTableIndexBlock";
209 };  // namespace ROCKSDB_NAMESPACE
210 
211 #endif  // ROCKSDB_LITE
212