1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 
6 #pragma once
7 
8 #ifndef ROCKSDB_LITE
9 
10 #include <array>
11 #include "db/dbformat.h"
12 #include "rocksdb/slice.h"
13 #include "table/plain/plain_table_reader.h"
14 
15 // The file contains three helper classes of PlainTable format,
16 // PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader.
17 // These classes issue the lowest level of operations of PlainTable.
18 // Actual data format of the key is documented in comments of class
19 // PlainTableFactory.
20 namespace ROCKSDB_NAMESPACE {
21 
22 class WritableFile;
23 struct ParsedInternalKey;
24 struct PlainTableReaderFileInfo;
25 enum PlainTableEntryType : unsigned char;
26 
27 // Helper class for PlainTable format to write out a key to an output file
28 // The class is used in PlainTableBuilder.
29 class PlainTableKeyEncoder {
30  public:
PlainTableKeyEncoder(EncodingType encoding_type,uint32_t user_key_len,const SliceTransform * prefix_extractor,size_t index_sparseness)31   explicit PlainTableKeyEncoder(EncodingType encoding_type,
32                                 uint32_t user_key_len,
33                                 const SliceTransform* prefix_extractor,
34                                 size_t index_sparseness)
35       : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
36         fixed_user_key_len_(user_key_len),
37         prefix_extractor_(prefix_extractor),
38         index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
39         key_count_for_prefix_(0) {}
40   // key: the key to write out, in the format of internal key.
41   // file: the output file to write out
42   // offset: offset in the file. Needs to be updated after appending bytes
43   //         for the key
44   // meta_bytes_buf: buffer for extra meta bytes
45   // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
46   //                      if meta_bytes_buf is updated.
47   IOStatus AppendKey(const Slice& key, WritableFileWriter* file,
48                      uint64_t* offset, char* meta_bytes_buf,
49                      size_t* meta_bytes_buf_size);
50 
51   // Return actual encoding type to be picked
GetEncodingType()52   EncodingType GetEncodingType() { return encoding_type_; }
53 
54  private:
55   EncodingType encoding_type_;
56   uint32_t fixed_user_key_len_;
57   const SliceTransform* prefix_extractor_;
58   const size_t index_sparseness_;
59   size_t key_count_for_prefix_;
60   IterKey pre_prefix_;
61 };
62 
63 // The class does raw file reads for PlainTableReader.
64 // It hides whether it is a mmap-read, or a non-mmap read.
65 // The class is implemented in a way to favor the performance of mmap case.
66 // The class is used by PlainTableReader.
67 class PlainTableFileReader {
68  public:
PlainTableFileReader(const PlainTableReaderFileInfo * _file_info)69   explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
70       : file_info_(_file_info), num_buf_(0) {}
71   // In mmaped mode, the results point to mmaped area of the file, which
72   // means it is always valid before closing the file.
73   // In non-mmap mode, the results point to an internal buffer. If the caller
74   // makes another read call, the results may not be valid. So callers should
75   // make a copy when needed.
76   // In order to save read calls to files, we keep two internal buffers:
77   // the first read and the most recent read. This is efficient because it
78   // columns these two common use cases:
79   // (1) hash index only identify one location, we read the key to verify
80   //     the location, and read key and value if it is the right location.
81   // (2) after hash index checking, we identify two locations (because of
82   //     hash bucket conflicts), we binary search the two location to see
83   //     which one is what we need and start to read from the location.
84   // These two most common use cases will be covered by the two buffers
85   // so that we don't need to re-read the same location.
86   // Currently we keep a fixed size buffer. If a read doesn't exactly fit
87   // the buffer, we replace the second buffer with the location user reads.
88   //
89   // If return false, status code is stored in status_.
Read(uint32_t file_offset,uint32_t len,Slice * out)90   bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
91     if (file_info_->is_mmap_mode) {
92       assert(file_offset + len <= file_info_->data_end_offset);
93       *out = Slice(file_info_->file_data.data() + file_offset, len);
94       return true;
95     } else {
96       return ReadNonMmap(file_offset, len, out);
97     }
98   }
99 
100   // If return false, status code is stored in status_.
101   bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
102 
103   // *bytes_read = 0 means eof. false means failure and status is saved
104   // in status_. Not directly returning Status to save copying status
105   // object to map previous performance of mmap mode.
106   inline bool ReadVarint32(uint32_t offset, uint32_t* output,
107                            uint32_t* bytes_read);
108 
109   bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
110                            uint32_t* bytes_read);
111 
status()112   Status status() const { return status_; }
113 
file_info()114   const PlainTableReaderFileInfo* file_info() { return file_info_; }
115 
116  private:
117   const PlainTableReaderFileInfo* file_info_;
118 
119   struct Buffer {
BufferBuffer120     Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
121     std::unique_ptr<char[]> buf;
122     uint32_t buf_start_offset;
123     uint32_t buf_len;
124     uint32_t buf_capacity;
125   };
126 
127   // Keep buffers for two recent reads.
128   std::array<std::unique_ptr<Buffer>, 2> buffers_;
129   uint32_t num_buf_;
130   Status status_;
131 
132   Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
133 };
134 
135 // A helper class to decode keys from input buffer
// The class is used by PlainTableReader.
137 class PlainTableKeyDecoder {
138  public:
PlainTableKeyDecoder(const PlainTableReaderFileInfo * file_info,EncodingType encoding_type,uint32_t user_key_len,const SliceTransform * prefix_extractor)139   explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
140                                 EncodingType encoding_type,
141                                 uint32_t user_key_len,
142                                 const SliceTransform* prefix_extractor)
143       : file_reader_(file_info),
144         encoding_type_(encoding_type),
145         prefix_len_(0),
146         fixed_user_key_len_(user_key_len),
147         prefix_extractor_(prefix_extractor),
148         in_prefix_(false) {}
149   // Find the next key.
150   // start: char array where the key starts.
151   // limit: boundary of the char array
152   // parsed_key: the output of the result key
153   // internal_key: if not null, fill with the output of the result key in
154   //               un-parsed format
155   // bytes_read: how many bytes read from start. Output
156   // seekable: whether key can be read from this place. Used when building
157   //           indexes. Output.
158   Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
159                  Slice* internal_key, Slice* value, uint32_t* bytes_read,
160                  bool* seekable = nullptr);
161 
162   Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
163                         Slice* internal_key, uint32_t* bytes_read,
164                         bool* seekable = nullptr);
165 
166   PlainTableFileReader file_reader_;
167   EncodingType encoding_type_;
168   uint32_t prefix_len_;
169   uint32_t fixed_user_key_len_;
170   Slice saved_user_key_;
171   IterKey cur_key_;
172   const SliceTransform* prefix_extractor_;
173   bool in_prefix_;
174 
175  private:
176   Status NextPlainEncodingKey(uint32_t start_offset,
177                               ParsedInternalKey* parsed_key,
178                               Slice* internal_key, uint32_t* bytes_read,
179                               bool* seekable = nullptr);
180   Status NextPrefixEncodingKey(uint32_t start_offset,
181                                ParsedInternalKey* parsed_key,
182                                Slice* internal_key, uint32_t* bytes_read,
183                                bool* seekable = nullptr);
184   Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
185                          ParsedInternalKey* parsed_key, uint32_t* bytes_read,
186                          bool* internal_key_valid, Slice* internal_key);
187   inline Status DecodeSize(uint32_t start_offset,
188                            PlainTableEntryType* entry_type, uint32_t* key_size,
189                            uint32_t* bytes_read);
190 };
191 
192 }  // namespace ROCKSDB_NAMESPACE
193 
194 #endif  // ROCKSDB_LITE
195