1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 3 // Use of this source code is governed by a BSD-style license that can be 4 // found in the LICENSE file. See the AUTHORS file for names of contributors. 5 6 #pragma once 7 8 #ifndef ROCKSDB_LITE 9 #include <unordered_map> 10 #include <memory> 11 #include <vector> 12 #include <string> 13 #include <stdint.h> 14 15 #include "db/dbformat.h" 16 #include "file/random_access_file_reader.h" 17 #include "memory/arena.h" 18 #include "rocksdb/env.h" 19 #include "rocksdb/iterator.h" 20 #include "rocksdb/slice_transform.h" 21 #include "rocksdb/table.h" 22 #include "rocksdb/table_properties.h" 23 #include "table/plain/plain_table_bloom.h" 24 #include "table/plain/plain_table_factory.h" 25 #include "table/plain/plain_table_index.h" 26 #include "table/table_reader.h" 27 28 namespace ROCKSDB_NAMESPACE { 29 30 class Block; 31 struct BlockContents; 32 class BlockHandle; 33 class Footer; 34 struct Options; 35 class RandomAccessFile; 36 struct ReadOptions; 37 class TableCache; 38 class TableReader; 39 class InternalKeyComparator; 40 class PlainTableKeyDecoder; 41 class GetContext; 42 43 extern const uint32_t kPlainTableVariableLength; 44 45 struct PlainTableReaderFileInfo { 46 bool is_mmap_mode; 47 Slice file_data; 48 uint32_t data_end_offset; 49 std::unique_ptr<RandomAccessFileReader> file; 50 PlainTableReaderFileInfoPlainTableReaderFileInfo51 PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file, 52 const EnvOptions& storage_options, 53 uint32_t _data_size_offset) 54 : is_mmap_mode(storage_options.use_mmap_reads), 55 data_end_offset(_data_size_offset), 56 file(std::move(_file)) {} 57 }; 58 59 // The reader class of PlainTable. For description of PlainTable format 60 // See comments of class PlainTableFactory, where instances of 61 // PlainTableReader are created. 62 class PlainTableReader: public TableReader { 63 public: 64 // Based on following output file format shown in plain_table_factory.h 65 // When opening the output file, PlainTableReader creates a hash table 66 // from key prefixes to offset of the output file. PlainTable will decide 67 // whether it points to the data offset of the first key with the key prefix 68 // or the offset of it. If there are too many keys share this prefix, it will 69 // create a binary search-able index from the suffix to offset on disk. 70 static Status Open(const ImmutableCFOptions& ioptions, 71 const EnvOptions& env_options, 72 const InternalKeyComparator& internal_comparator, 73 std::unique_ptr<RandomAccessFileReader>&& file, 74 uint64_t file_size, std::unique_ptr<TableReader>* table, 75 const int bloom_bits_per_key, double hash_table_ratio, 76 size_t index_sparseness, size_t huge_page_tlb_size, 77 bool full_scan_mode, const bool immortal_table = false, 78 const SliceTransform* prefix_extractor = nullptr); 79 80 // Returns new iterator over table contents 81 // compaction_readahead_size: its value will only be used if for_compaction = 82 // true 83 InternalIterator* NewIterator(const ReadOptions&, 84 const SliceTransform* prefix_extractor, 85 Arena* arena, bool skip_filters, 86 TableReaderCaller caller, 87 size_t compaction_readahead_size = 0) override; 88 89 void Prepare(const Slice& target) override; 90 91 Status Get(const ReadOptions& readOptions, const Slice& key, 92 GetContext* get_context, const SliceTransform* prefix_extractor, 93 bool skip_filters = false) override; 94 95 uint64_t ApproximateOffsetOf(const Slice& key, 96 TableReaderCaller caller) override; 97 98 uint64_t ApproximateSize(const Slice& start, const Slice& end, 99 TableReaderCaller caller) override; 100 GetIndexSize()101 uint32_t GetIndexSize() const { return index_.GetIndexSize(); } 102 void SetupForCompaction() override; 103 GetTableProperties()104 std::shared_ptr<const TableProperties> GetTableProperties() const override { 105 return table_properties_; 106 } 107 ApproximateMemoryUsage()108 virtual size_t ApproximateMemoryUsage() const override { 109 return arena_.MemoryAllocatedBytes(); 110 } 111 112 PlainTableReader(const ImmutableCFOptions& ioptions, 113 std::unique_ptr<RandomAccessFileReader>&& file, 114 const EnvOptions& env_options, 115 const InternalKeyComparator& internal_comparator, 116 EncodingType encoding_type, uint64_t file_size, 117 const TableProperties* table_properties, 118 const SliceTransform* prefix_extractor); 119 virtual ~PlainTableReader(); 120 121 protected: 122 // Check bloom filter to see whether it might contain this prefix. 123 // The hash of the prefix is given, since it can be reused for index lookup 124 // too. 125 virtual bool MatchBloom(uint32_t hash) const; 126 127 // PopulateIndex() builds index of keys. It must be called before any query 128 // to the table. 129 // 130 // props: the table properties object that need to be stored. Ownership of 131 // the object will be passed. 132 // 133 134 Status PopulateIndex(TableProperties* props, int bloom_bits_per_key, 135 double hash_table_ratio, size_t index_sparseness, 136 size_t huge_page_tlb_size); 137 138 Status MmapDataIfNeeded(); 139 140 private: 141 const InternalKeyComparator internal_comparator_; 142 EncodingType encoding_type_; 143 // represents plain table's current status. 144 Status status_; 145 146 PlainTableIndex index_; 147 bool full_scan_mode_; 148 149 // data_start_offset_ and data_end_offset_ defines the range of the 150 // sst file that stores data. 151 const uint32_t data_start_offset_ = 0; 152 const uint32_t user_key_len_; 153 const SliceTransform* prefix_extractor_; 154 155 static const size_t kNumInternalBytes = 8; 156 157 // Bloom filter is used to rule out non-existent key 158 bool enable_bloom_; 159 PlainTableBloomV1 bloom_; 160 PlainTableReaderFileInfo file_info_; 161 Arena arena_; 162 CacheAllocationPtr index_block_alloc_; 163 CacheAllocationPtr bloom_block_alloc_; 164 165 const ImmutableCFOptions& ioptions_; 166 std::unique_ptr<Cleanable> dummy_cleanable_; 167 uint64_t file_size_; 168 protected: // for testing 169 std::shared_ptr<const TableProperties> table_properties_; 170 private: 171 IsFixedLength()172 bool IsFixedLength() const { 173 return user_key_len_ != kPlainTableVariableLength; 174 } 175 GetFixedInternalKeyLength()176 size_t GetFixedInternalKeyLength() const { 177 return user_key_len_ + kNumInternalBytes; 178 } 179 GetPrefix(const Slice & target)180 Slice GetPrefix(const Slice& target) const { 181 assert(target.size() >= 8); // target is internal key 182 return GetPrefixFromUserKey(GetUserKey(target)); 183 } 184 GetPrefix(const ParsedInternalKey & target)185 Slice GetPrefix(const ParsedInternalKey& target) const { 186 return GetPrefixFromUserKey(target.user_key); 187 } 188 GetUserKey(const Slice & key)189 Slice GetUserKey(const Slice& key) const { 190 return Slice(key.data(), key.size() - 8); 191 } 192 GetPrefixFromUserKey(const Slice & user_key)193 Slice GetPrefixFromUserKey(const Slice& user_key) const { 194 if (!IsTotalOrderMode()) { 195 return prefix_extractor_->Transform(user_key); 196 } else { 197 // Use empty slice as prefix if prefix_extractor is not set. 198 // In that case, 199 // it falls back to pure binary search and 200 // total iterator seek is supported. 201 return Slice(); 202 } 203 } 204 205 friend class TableCache; 206 friend class PlainTableIterator; 207 208 // Internal helper function to generate an IndexRecordList object from all 209 // the rows, which contains index records as a list. 210 // If bloom_ is not null, all the keys' full-key hash will be added to the 211 // bloom filter. 212 Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder, 213 std::vector<uint32_t>* prefix_hashes); 214 215 // Internal helper function to allocate memory for bloom filter 216 void AllocateBloom(int bloom_bits_per_key, int num_prefixes, 217 size_t huge_page_tlb_size); 218 219 void FillBloom(const std::vector<uint32_t>& prefix_hashes); 220 221 // Read the key and value at `offset` to parameters for keys, the and 222 // `seekable`. 223 // On success, `offset` will be updated as the offset for the next key. 224 // `parsed_key` will be key in parsed format. 225 // if `internal_key` is not empty, it will be filled with key with slice 226 // format. 227 // if `seekable` is not null, it will return whether we can directly read 228 // data using this offset. 229 Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset, 230 ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value, 231 bool* seekable = nullptr) const; 232 // Get file offset for key target. 233 // return value prefix_matched is set to true if the offset is confirmed 234 // for a key with the same prefix as target. 235 Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target, 236 const Slice& prefix, uint32_t prefix_hash, 237 bool& prefix_matched, uint32_t* offset) const; 238 IsTotalOrderMode()239 bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } 240 241 // No copying allowed 242 explicit PlainTableReader(const TableReader&) = delete; 243 void operator=(const TableReader&) = delete; 244 }; 245 } // namespace ROCKSDB_NAMESPACE 246 #endif // ROCKSDB_LITE 247