1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 // This source code is licensed under both the GPLv2 (found in the 3 // COPYING file in the root directory) and Apache 2.0 License 4 // (found in the LICENSE.Apache file in the root directory). 5 6 #pragma once 7 8 #ifndef ROCKSDB_LITE 9 10 #include <array> 11 #include "db/dbformat.h" 12 #include "rocksdb/slice.h" 13 #include "table/plain/plain_table_reader.h" 14 15 // The file contains three helper classes of PlainTable format, 16 // PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader. 17 // These classes issue the lowest level of operations of PlainTable. 18 // Actual data format of the key is documented in comments of class 19 // PlainTableFactory. 20 namespace ROCKSDB_NAMESPACE { 21 22 class WritableFile; 23 struct ParsedInternalKey; 24 struct PlainTableReaderFileInfo; 25 enum PlainTableEntryType : unsigned char; 26 27 // Helper class for PlainTable format to write out a key to an output file 28 // The class is used in PlainTableBuilder. 29 class PlainTableKeyEncoder { 30 public: PlainTableKeyEncoder(EncodingType encoding_type,uint32_t user_key_len,const SliceTransform * prefix_extractor,size_t index_sparseness)31 explicit PlainTableKeyEncoder(EncodingType encoding_type, 32 uint32_t user_key_len, 33 const SliceTransform* prefix_extractor, 34 size_t index_sparseness) 35 : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain), 36 fixed_user_key_len_(user_key_len), 37 prefix_extractor_(prefix_extractor), 38 index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), 39 key_count_for_prefix_(0) {} 40 // key: the key to write out, in the format of internal key. 41 // file: the output file to write out 42 // offset: offset in the file. Needs to be updated after appending bytes 43 // for the key 44 // meta_bytes_buf: buffer for extra meta bytes 45 // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated 46 // if meta_bytes_buf is updated. 47 IOStatus AppendKey(const Slice& key, WritableFileWriter* file, 48 uint64_t* offset, char* meta_bytes_buf, 49 size_t* meta_bytes_buf_size); 50 51 // Return actual encoding type to be picked GetEncodingType()52 EncodingType GetEncodingType() { return encoding_type_; } 53 54 private: 55 EncodingType encoding_type_; 56 uint32_t fixed_user_key_len_; 57 const SliceTransform* prefix_extractor_; 58 const size_t index_sparseness_; 59 size_t key_count_for_prefix_; 60 IterKey pre_prefix_; 61 }; 62 63 // The class does raw file reads for PlainTableReader. 64 // It hides whether it is a mmap-read, or a non-mmap read. 65 // The class is implemented in a way to favor the performance of mmap case. 66 // The class is used by PlainTableReader. 67 class PlainTableFileReader { 68 public: PlainTableFileReader(const PlainTableReaderFileInfo * _file_info)69 explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) 70 : file_info_(_file_info), num_buf_(0) {} 71 // In mmaped mode, the results point to mmaped area of the file, which 72 // means it is always valid before closing the file. 73 // In non-mmap mode, the results point to an internal buffer. If the caller 74 // makes another read call, the results may not be valid. So callers should 75 // make a copy when needed. 76 // In order to save read calls to files, we keep two internal buffers: 77 // the first read and the most recent read. This is efficient because it 78 // columns these two common use cases: 79 // (1) hash index only identify one location, we read the key to verify 80 // the location, and read key and value if it is the right location. 81 // (2) after hash index checking, we identify two locations (because of 82 // hash bucket conflicts), we binary search the two location to see 83 // which one is what we need and start to read from the location. 84 // These two most common use cases will be covered by the two buffers 85 // so that we don't need to re-read the same location. 86 // Currently we keep a fixed size buffer. If a read doesn't exactly fit 87 // the buffer, we replace the second buffer with the location user reads. 88 // 89 // If return false, status code is stored in status_. Read(uint32_t file_offset,uint32_t len,Slice * out)90 bool Read(uint32_t file_offset, uint32_t len, Slice* out) { 91 if (file_info_->is_mmap_mode) { 92 assert(file_offset + len <= file_info_->data_end_offset); 93 *out = Slice(file_info_->file_data.data() + file_offset, len); 94 return true; 95 } else { 96 return ReadNonMmap(file_offset, len, out); 97 } 98 } 99 100 // If return false, status code is stored in status_. 101 bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output); 102 103 // *bytes_read = 0 means eof. false means failure and status is saved 104 // in status_. Not directly returning Status to save copying status 105 // object to map previous performance of mmap mode. 106 inline bool ReadVarint32(uint32_t offset, uint32_t* output, 107 uint32_t* bytes_read); 108 109 bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output, 110 uint32_t* bytes_read); 111 status()112 Status status() const { return status_; } 113 file_info()114 const PlainTableReaderFileInfo* file_info() { return file_info_; } 115 116 private: 117 const PlainTableReaderFileInfo* file_info_; 118 119 struct Buffer { BufferBuffer120 Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {} 121 std::unique_ptr<char[]> buf; 122 uint32_t buf_start_offset; 123 uint32_t buf_len; 124 uint32_t buf_capacity; 125 }; 126 127 // Keep buffers for two recent reads. 128 std::array<std::unique_ptr<Buffer>, 2> buffers_; 129 uint32_t num_buf_; 130 Status status_; 131 132 Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len); 133 }; 134 135 // A helper class to decode keys from input buffer 136 // The class is used by PlainTableBuilder. 137 class PlainTableKeyDecoder { 138 public: PlainTableKeyDecoder(const PlainTableReaderFileInfo * file_info,EncodingType encoding_type,uint32_t user_key_len,const SliceTransform * prefix_extractor)139 explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info, 140 EncodingType encoding_type, 141 uint32_t user_key_len, 142 const SliceTransform* prefix_extractor) 143 : file_reader_(file_info), 144 encoding_type_(encoding_type), 145 prefix_len_(0), 146 fixed_user_key_len_(user_key_len), 147 prefix_extractor_(prefix_extractor), 148 in_prefix_(false) {} 149 // Find the next key. 150 // start: char array where the key starts. 151 // limit: boundary of the char array 152 // parsed_key: the output of the result key 153 // internal_key: if not null, fill with the output of the result key in 154 // un-parsed format 155 // bytes_read: how many bytes read from start. Output 156 // seekable: whether key can be read from this place. Used when building 157 // indexes. Output. 158 Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key, 159 Slice* internal_key, Slice* value, uint32_t* bytes_read, 160 bool* seekable = nullptr); 161 162 Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key, 163 Slice* internal_key, uint32_t* bytes_read, 164 bool* seekable = nullptr); 165 166 PlainTableFileReader file_reader_; 167 EncodingType encoding_type_; 168 uint32_t prefix_len_; 169 uint32_t fixed_user_key_len_; 170 Slice saved_user_key_; 171 IterKey cur_key_; 172 const SliceTransform* prefix_extractor_; 173 bool in_prefix_; 174 175 private: 176 Status NextPlainEncodingKey(uint32_t start_offset, 177 ParsedInternalKey* parsed_key, 178 Slice* internal_key, uint32_t* bytes_read, 179 bool* seekable = nullptr); 180 Status NextPrefixEncodingKey(uint32_t start_offset, 181 ParsedInternalKey* parsed_key, 182 Slice* internal_key, uint32_t* bytes_read, 183 bool* seekable = nullptr); 184 Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size, 185 ParsedInternalKey* parsed_key, uint32_t* bytes_read, 186 bool* internal_key_valid, Slice* internal_key); 187 inline Status DecodeSize(uint32_t start_offset, 188 PlainTableEntryType* entry_type, uint32_t* key_size, 189 uint32_t* bytes_read); 190 }; 191 192 } // namespace ROCKSDB_NAMESPACE 193 194 #endif // ROCKSDB_LITE 195