1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style license that can be
4 // found in the LICENSE file. See the AUTHORS file for names of contributors.
5 
6 #pragma once
7 
8 #ifndef ROCKSDB_LITE
9 #include <memory>
10 #include <string>
11 #include <stdint.h>
12 
13 #include "options/options_helper.h"
14 #include "rocksdb/options.h"
15 #include "rocksdb/table.h"
16 
17 namespace ROCKSDB_NAMESPACE {
18 
19 struct EnvOptions;
20 
21 class Status;
22 class RandomAccessFile;
23 class WritableFile;
24 class Table;
25 class TableBuilder;
26 
27 // PlainTableFactory is the entrance function to the PlainTable format of
28 // SST files. It returns instances PlainTableBuilder as the builder
29 // class and PlainTableReader as the reader class, where the format is
30 // actually implemented.
31 //
32 // The PlainTable is designed for memory-mapped file systems, e.g. tmpfs.
33 // Data is not organized in blocks, which allows fast access. Because of
34 // following downsides
35 // 1. Data compression is not supported.
36 // 2. Data is not checksumed.
37 // it is not recommended to use this format on other type of file systems.
38 //
39 // PlainTable requires fixed length key, configured as a constructor
40 // parameter of the factory class. Output file format:
41 // +-------------+-----------------+
42 // | version     | user_key_length |
43 // +------------++------------+-----------------+  <= key1 offset
44 // |  encoded key1            | value_size  |   |
45 // +------------+-------------+-------------+   |
46 // | value1                                     |
47 // |                                            |
48 // +--------------------------+-------------+---+  <= key2 offset
49 // | encoded key2             | value_size  |   |
50 // +------------+-------------+-------------+   |
51 // | value2                                     |
52 // |                                            |
53 // |        ......                              |
54 // +-----------------+--------------------------+
55 //
56 // When the key encoding type is kPlain. Key part is encoded as:
57 // +------------+--------------------+
58 // | [key_size] |  internal key      |
59 // +------------+--------------------+
60 // for the case of user_key_len = kPlainTableVariableLength case,
61 // and simply:
62 // +----------------------+
63 // |  internal key        |
64 // +----------------------+
65 // for user_key_len != kPlainTableVariableLength case.
66 //
67 // If key encoding type is kPrefix. Keys are encoding in this format.
68 // There are three ways to encode a key:
69 // (1) Full Key
70 // +---------------+---------------+-------------------+
71 // | Full Key Flag | Full Key Size | Full Internal Key |
72 // +---------------+---------------+-------------------+
73 // which simply encodes a full key
74 //
75 // (2) A key shared the same prefix as the previous key, which is encoded as
76 //     format of (1).
77 // +-------------+-------------+-------------+-------------+------------+
78 // | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
79 // +-------------+-------------+-------------+-------------+------------+
80 // where key is the suffix part of the key, including the internal bytes.
81 // the actual key will be constructed by concatenating prefix part of the
82 // previous key, with the suffix part of the key here, with sizes given here.
83 //
84 // (3) A key shared the same prefix as the previous key, which is encoded as
85 //     the format of (2).
86 // +-----------------+-----------------+------------------------+
87 // | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
88 // +-----------------+-----------------+------------------------+
89 // The key will be constructed by concatenating previous key's prefix (which is
90 // also a prefix which the last key encoded in the format of (1)) and the
91 // key given here.
92 //
93 // For example, we for following keys (prefix and suffix are separated by
94 // spaces):
95 //   0000 0001
96 //   0000 00021
97 //   0000 0002
98 //   00011 00
99 //   0002 0001
100 // Will be encoded like this:
101 //   FK 8 00000001
102 //   PF 4 SF 5 00021
103 //   SF 4 0002
104 //   FK 7 0001100
105 //   FK 8 00020001
106 // (where FK means full key flag, PF means prefix flag and SF means suffix flag)
107 //
108 // All those "key flag + key size" shown above are in this format:
109 // The 8 bits of the first byte:
110 // +----+----+----+----+----+----+----+----+
111 // |  Type   |            Size             |
112 // +----+----+----+----+----+----+----+----+
113 // Type indicates: full key, prefix, or suffix.
114 // The last 6 bits are for size. If the size bits are not all 1, it means the
115 // size of the key. Otherwise, varint32 is read after this byte. This varint
116 // value + 0x3F (the value of all 1) will be the key size.
117 //
118 // For example, full key with length 16 will be encoded as (binary):
119 //     00 010000
120 // (00 means full key)
121 // and a prefix with 100 bytes will be encoded as:
122 //     01 111111    00100101
123 //         (63)       (37)
124 // (01 means key suffix)
125 //
126 // All the internal keys above (including kPlain and kPrefix) are encoded in
127 // this format:
128 // There are two types:
129 // (1) normal internal key format
130 // +----------- ...... -------------+----+---+---+---+---+---+---+---+
131 // |       user key                 |type|      sequence ID          |
132 // +----------- ..... --------------+----+---+---+---+---+---+---+---+
133 // (2) Special case for keys whose sequence ID is 0 and is value type
134 // +----------- ...... -------------+----+
135 // |       user key                 |0x80|
136 // +----------- ..... --------------+----+
137 // To save 7 bytes for the special case where sequence ID = 0.
138 //
139 //
140 class PlainTableFactory : public TableFactory {
141  public:
~PlainTableFactory()142   ~PlainTableFactory() {}
143   // user_key_len is the length of the user key. If it is set to be
144   // kPlainTableVariableLength, then it means variable length. Otherwise, all
145   // the keys need to have the fix length of this value. bloom_bits_per_key is
146   // number of bits used for bloom filer per key. hash_table_ratio is
147   // the desired utilization of the hash table used for prefix hashing.
148   // hash_table_ratio = number of prefixes / #buckets in the hash table
149   // hash_table_ratio = 0 means skip hash table but only replying on binary
150   // search.
151   // index_sparseness determines index interval for keys
152   // inside the same prefix. It will be the maximum number of linear search
153   // required after hash and binary search.
154   // index_sparseness = 0 means index for every key.
155   // huge_page_tlb_size determines whether to allocate hash indexes from huge
156   // page TLB and the page size if allocating from there. See comments of
157   // Arena::AllocateAligned() for details.
158   explicit PlainTableFactory(
159       const PlainTableOptions& _table_options = PlainTableOptions())
table_options_(_table_options)160       : table_options_(_table_options) {}
161 
Name()162   const char* Name() const override { return "PlainTable"; }
163   Status NewTableReader(const TableReaderOptions& table_reader_options,
164                         std::unique_ptr<RandomAccessFileReader>&& file,
165                         uint64_t file_size, std::unique_ptr<TableReader>* table,
166                         bool prefetch_index_and_filter_in_cache) const override;
167 
168   TableBuilder* NewTableBuilder(
169       const TableBuilderOptions& table_builder_options,
170       uint32_t column_family_id, WritableFileWriter* file) const override;
171 
172   std::string GetPrintableTableOptions() const override;
173 
174   const PlainTableOptions& table_options() const;
175 
176   static const char kValueTypeSeqId0 = char(~0);
177 
178   // Sanitizes the specified DB Options.
SanitizeOptions(const DBOptions &,const ColumnFamilyOptions &)179   Status SanitizeOptions(
180       const DBOptions& /*db_opts*/,
181       const ColumnFamilyOptions& /*cf_opts*/) const override {
182     return Status::OK();
183   }
184 
GetOptions()185   void* GetOptions() override { return &table_options_; }
186 
GetOptionString(std::string *,const std::string &)187   Status GetOptionString(std::string* /*opt_string*/,
188                          const std::string& /*delimiter*/) const override {
189     return Status::OK();
190   }
191 
192  private:
193   PlainTableOptions table_options_;
194 };
195 
196 static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = {
197     {"user_key_len",
198      {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T,
199       OptionVerificationType::kNormal, false, 0}},
200     {"bloom_bits_per_key",
201      {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt,
202       OptionVerificationType::kNormal, false, 0}},
203     {"hash_table_ratio",
204      {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble,
205       OptionVerificationType::kNormal, false, 0}},
206     {"index_sparseness",
207      {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT,
208       OptionVerificationType::kNormal, false, 0}},
209     {"huge_page_tlb_size",
210      {offsetof(struct PlainTableOptions, huge_page_tlb_size),
211       OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}},
212     {"encoding_type",
213      {offsetof(struct PlainTableOptions, encoding_type),
214       OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}},
215     {"full_scan_mode",
216      {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean,
217       OptionVerificationType::kNormal, false, 0}},
218     {"store_index_in_file",
219      {offsetof(struct PlainTableOptions, store_index_in_file),
220       OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}};
221 
222 }  // namespace ROCKSDB_NAMESPACE
223 #endif  // ROCKSDB_LITE
224