1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style license that can be
4 // found in the LICENSE file. See the AUTHORS file for names of contributors.
5 
6 #pragma once
7 
8 #ifndef ROCKSDB_LITE
9 #include <unordered_map>
10 #include <memory>
11 #include <vector>
12 #include <string>
13 #include <stdint.h>
14 
15 #include "db/dbformat.h"
16 #include "file/random_access_file_reader.h"
17 #include "memory/arena.h"
18 #include "rocksdb/env.h"
19 #include "rocksdb/iterator.h"
20 #include "rocksdb/slice_transform.h"
21 #include "rocksdb/table.h"
22 #include "rocksdb/table_properties.h"
23 #include "table/plain/plain_table_bloom.h"
24 #include "table/plain/plain_table_factory.h"
25 #include "table/plain/plain_table_index.h"
26 #include "table/table_reader.h"
27 
28 namespace ROCKSDB_NAMESPACE {
29 
30 class Block;
31 struct BlockContents;
32 class BlockHandle;
33 class Footer;
34 struct Options;
35 class RandomAccessFile;
36 struct ReadOptions;
37 class TableCache;
38 class TableReader;
39 class InternalKeyComparator;
40 class PlainTableKeyDecoder;
41 class GetContext;
42 
43 extern const uint32_t kPlainTableVariableLength;
44 
45 struct PlainTableReaderFileInfo {
46   bool is_mmap_mode;
47   Slice file_data;
48   uint32_t data_end_offset;
49   std::unique_ptr<RandomAccessFileReader> file;
50 
PlainTableReaderFileInfoPlainTableReaderFileInfo51   PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file,
52                            const EnvOptions& storage_options,
53                            uint32_t _data_size_offset)
54       : is_mmap_mode(storage_options.use_mmap_reads),
55         data_end_offset(_data_size_offset),
56         file(std::move(_file)) {}
57 };
58 
59 // The reader class of PlainTable. For description of PlainTable format
60 // See comments of class PlainTableFactory, where instances of
61 // PlainTableReader are created.
62 class PlainTableReader: public TableReader {
63  public:
64 // Based on following output file format shown in plain_table_factory.h
65 // When opening the output file, PlainTableReader creates a hash table
66 // from key prefixes to offset of the output file. PlainTable will decide
67 // whether it points to the data offset of the first key with the key prefix
68 // or the offset of it. If there are too many keys share this prefix, it will
69 // create a binary search-able index from the suffix to offset on disk.
70   static Status Open(const ImmutableCFOptions& ioptions,
71                      const EnvOptions& env_options,
72                      const InternalKeyComparator& internal_comparator,
73                      std::unique_ptr<RandomAccessFileReader>&& file,
74                      uint64_t file_size, std::unique_ptr<TableReader>* table,
75                      const int bloom_bits_per_key, double hash_table_ratio,
76                      size_t index_sparseness, size_t huge_page_tlb_size,
77                      bool full_scan_mode, const bool immortal_table = false,
78                      const SliceTransform* prefix_extractor = nullptr);
79 
80   // Returns new iterator over table contents
81   // compaction_readahead_size: its value will only be used if for_compaction =
82   // true
83   InternalIterator* NewIterator(const ReadOptions&,
84                                 const SliceTransform* prefix_extractor,
85                                 Arena* arena, bool skip_filters,
86                                 TableReaderCaller caller,
87                                 size_t compaction_readahead_size = 0) override;
88 
89   void Prepare(const Slice& target) override;
90 
91   Status Get(const ReadOptions& readOptions, const Slice& key,
92              GetContext* get_context, const SliceTransform* prefix_extractor,
93              bool skip_filters = false) override;
94 
95   uint64_t ApproximateOffsetOf(const Slice& key,
96                                TableReaderCaller caller) override;
97 
98   uint64_t ApproximateSize(const Slice& start, const Slice& end,
99                            TableReaderCaller caller) override;
100 
GetIndexSize()101   uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
102   void SetupForCompaction() override;
103 
GetTableProperties()104   std::shared_ptr<const TableProperties> GetTableProperties() const override {
105     return table_properties_;
106   }
107 
ApproximateMemoryUsage()108   virtual size_t ApproximateMemoryUsage() const override {
109     return arena_.MemoryAllocatedBytes();
110   }
111 
112   PlainTableReader(const ImmutableCFOptions& ioptions,
113                    std::unique_ptr<RandomAccessFileReader>&& file,
114                    const EnvOptions& env_options,
115                    const InternalKeyComparator& internal_comparator,
116                    EncodingType encoding_type, uint64_t file_size,
117                    const TableProperties* table_properties,
118                    const SliceTransform* prefix_extractor);
119   virtual ~PlainTableReader();
120 
121  protected:
122   // Check bloom filter to see whether it might contain this prefix.
123   // The hash of the prefix is given, since it can be reused for index lookup
124   // too.
125   virtual bool MatchBloom(uint32_t hash) const;
126 
127   // PopulateIndex() builds index of keys. It must be called before any query
128   // to the table.
129   //
130   // props: the table properties object that need to be stored. Ownership of
131   //        the object will be passed.
132   //
133 
134   Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
135                        double hash_table_ratio, size_t index_sparseness,
136                        size_t huge_page_tlb_size);
137 
138   Status MmapDataIfNeeded();
139 
140  private:
141   const InternalKeyComparator internal_comparator_;
142   EncodingType encoding_type_;
143   // represents plain table's current status.
144   Status status_;
145 
146   PlainTableIndex index_;
147   bool full_scan_mode_;
148 
149   // data_start_offset_ and data_end_offset_ defines the range of the
150   // sst file that stores data.
151   const uint32_t data_start_offset_ = 0;
152   const uint32_t user_key_len_;
153   const SliceTransform* prefix_extractor_;
154 
155   static const size_t kNumInternalBytes = 8;
156 
157   // Bloom filter is used to rule out non-existent key
158   bool enable_bloom_;
159   PlainTableBloomV1 bloom_;
160   PlainTableReaderFileInfo file_info_;
161   Arena arena_;
162   CacheAllocationPtr index_block_alloc_;
163   CacheAllocationPtr bloom_block_alloc_;
164 
165   const ImmutableCFOptions& ioptions_;
166   std::unique_ptr<Cleanable> dummy_cleanable_;
167   uint64_t file_size_;
168  protected: // for testing
169   std::shared_ptr<const TableProperties> table_properties_;
170  private:
171 
IsFixedLength()172   bool IsFixedLength() const {
173     return user_key_len_ != kPlainTableVariableLength;
174   }
175 
GetFixedInternalKeyLength()176   size_t GetFixedInternalKeyLength() const {
177     return user_key_len_ + kNumInternalBytes;
178   }
179 
GetPrefix(const Slice & target)180   Slice GetPrefix(const Slice& target) const {
181     assert(target.size() >= 8);  // target is internal key
182     return GetPrefixFromUserKey(GetUserKey(target));
183   }
184 
GetPrefix(const ParsedInternalKey & target)185   Slice GetPrefix(const ParsedInternalKey& target) const {
186     return GetPrefixFromUserKey(target.user_key);
187   }
188 
GetUserKey(const Slice & key)189   Slice GetUserKey(const Slice& key) const {
190     return Slice(key.data(), key.size() - 8);
191   }
192 
GetPrefixFromUserKey(const Slice & user_key)193   Slice GetPrefixFromUserKey(const Slice& user_key) const {
194     if (!IsTotalOrderMode()) {
195       return prefix_extractor_->Transform(user_key);
196     } else {
197       // Use empty slice as prefix if prefix_extractor is not set.
198       // In that case,
199       // it falls back to pure binary search and
200       // total iterator seek is supported.
201       return Slice();
202     }
203   }
204 
205   friend class TableCache;
206   friend class PlainTableIterator;
207 
208   // Internal helper function to generate an IndexRecordList object from all
209   // the rows, which contains index records as a list.
210   // If bloom_ is not null, all the keys' full-key hash will be added to the
211   // bloom filter.
212   Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
213                                  std::vector<uint32_t>* prefix_hashes);
214 
215   // Internal helper function to allocate memory for bloom filter
216   void AllocateBloom(int bloom_bits_per_key, int num_prefixes,
217                      size_t huge_page_tlb_size);
218 
219   void FillBloom(const std::vector<uint32_t>& prefix_hashes);
220 
221   // Read the key and value at `offset` to parameters for keys, the and
222   // `seekable`.
223   // On success, `offset` will be updated as the offset for the next key.
224   // `parsed_key` will be key in parsed format.
225   // if `internal_key` is not empty, it will be filled with key with slice
226   // format.
227   // if `seekable` is not null, it will return whether we can directly read
228   // data using this offset.
229   Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
230               ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
231               bool* seekable = nullptr) const;
232   // Get file offset for key target.
233   // return value prefix_matched is set to true if the offset is confirmed
234   // for a key with the same prefix as target.
235   Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target,
236                    const Slice& prefix, uint32_t prefix_hash,
237                    bool& prefix_matched, uint32_t* offset) const;
238 
IsTotalOrderMode()239   bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
240 
241   // No copying allowed
242   explicit PlainTableReader(const TableReader&) = delete;
243   void operator=(const TableReader&) = delete;
244 };
245 }  // namespace ROCKSDB_NAMESPACE
246 #endif  // ROCKSDB_LITE
247