1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 3 // Use of this source code is governed by a BSD-style license that can be 4 // found in the LICENSE file. See the AUTHORS file for names of contributors. 5 6 #pragma once 7 8 #ifndef ROCKSDB_LITE 9 #include <memory> 10 #include <string> 11 #include <stdint.h> 12 13 #include "options/options_helper.h" 14 #include "rocksdb/options.h" 15 #include "rocksdb/table.h" 16 17 namespace ROCKSDB_NAMESPACE { 18 19 struct EnvOptions; 20 21 class Status; 22 class RandomAccessFile; 23 class WritableFile; 24 class Table; 25 class TableBuilder; 26 27 // PlainTableFactory is the entrance function to the PlainTable format of 28 // SST files. It returns instances PlainTableBuilder as the builder 29 // class and PlainTableReader as the reader class, where the format is 30 // actually implemented. 31 // 32 // The PlainTable is designed for memory-mapped file systems, e.g. tmpfs. 33 // Data is not organized in blocks, which allows fast access. Because of 34 // following downsides 35 // 1. Data compression is not supported. 36 // 2. Data is not checksumed. 37 // it is not recommended to use this format on other type of file systems. 38 // 39 // PlainTable requires fixed length key, configured as a constructor 40 // parameter of the factory class. Output file format: 41 // +-------------+-----------------+ 42 // | version | user_key_length | 43 // +------------++------------+-----------------+ <= key1 offset 44 // | encoded key1 | value_size | | 45 // +------------+-------------+-------------+ | 46 // | value1 | 47 // | | 48 // +--------------------------+-------------+---+ <= key2 offset 49 // | encoded key2 | value_size | | 50 // +------------+-------------+-------------+ | 51 // | value2 | 52 // | | 53 // | ...... | 54 // +-----------------+--------------------------+ 55 // 56 // When the key encoding type is kPlain. Key part is encoded as: 57 // +------------+--------------------+ 58 // | [key_size] | internal key | 59 // +------------+--------------------+ 60 // for the case of user_key_len = kPlainTableVariableLength case, 61 // and simply: 62 // +----------------------+ 63 // | internal key | 64 // +----------------------+ 65 // for user_key_len != kPlainTableVariableLength case. 66 // 67 // If key encoding type is kPrefix. Keys are encoding in this format. 68 // There are three ways to encode a key: 69 // (1) Full Key 70 // +---------------+---------------+-------------------+ 71 // | Full Key Flag | Full Key Size | Full Internal Key | 72 // +---------------+---------------+-------------------+ 73 // which simply encodes a full key 74 // 75 // (2) A key shared the same prefix as the previous key, which is encoded as 76 // format of (1). 77 // +-------------+-------------+-------------+-------------+------------+ 78 // | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix | 79 // +-------------+-------------+-------------+-------------+------------+ 80 // where key is the suffix part of the key, including the internal bytes. 81 // the actual key will be constructed by concatenating prefix part of the 82 // previous key, with the suffix part of the key here, with sizes given here. 83 // 84 // (3) A key shared the same prefix as the previous key, which is encoded as 85 // the format of (2). 86 // +-----------------+-----------------+------------------------+ 87 // | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key | 88 // +-----------------+-----------------+------------------------+ 89 // The key will be constructed by concatenating previous key's prefix (which is 90 // also a prefix which the last key encoded in the format of (1)) and the 91 // key given here. 92 // 93 // For example, we for following keys (prefix and suffix are separated by 94 // spaces): 95 // 0000 0001 96 // 0000 00021 97 // 0000 0002 98 // 00011 00 99 // 0002 0001 100 // Will be encoded like this: 101 // FK 8 00000001 102 // PF 4 SF 5 00021 103 // SF 4 0002 104 // FK 7 0001100 105 // FK 8 00020001 106 // (where FK means full key flag, PF means prefix flag and SF means suffix flag) 107 // 108 // All those "key flag + key size" shown above are in this format: 109 // The 8 bits of the first byte: 110 // +----+----+----+----+----+----+----+----+ 111 // | Type | Size | 112 // +----+----+----+----+----+----+----+----+ 113 // Type indicates: full key, prefix, or suffix. 114 // The last 6 bits are for size. If the size bits are not all 1, it means the 115 // size of the key. Otherwise, varint32 is read after this byte. This varint 116 // value + 0x3F (the value of all 1) will be the key size. 117 // 118 // For example, full key with length 16 will be encoded as (binary): 119 // 00 010000 120 // (00 means full key) 121 // and a prefix with 100 bytes will be encoded as: 122 // 01 111111 00100101 123 // (63) (37) 124 // (01 means key suffix) 125 // 126 // All the internal keys above (including kPlain and kPrefix) are encoded in 127 // this format: 128 // There are two types: 129 // (1) normal internal key format 130 // +----------- ...... -------------+----+---+---+---+---+---+---+---+ 131 // | user key |type| sequence ID | 132 // +----------- ..... --------------+----+---+---+---+---+---+---+---+ 133 // (2) Special case for keys whose sequence ID is 0 and is value type 134 // +----------- ...... -------------+----+ 135 // | user key |0x80| 136 // +----------- ..... --------------+----+ 137 // To save 7 bytes for the special case where sequence ID = 0. 138 // 139 // 140 class PlainTableFactory : public TableFactory { 141 public: ~PlainTableFactory()142 ~PlainTableFactory() {} 143 // user_key_len is the length of the user key. If it is set to be 144 // kPlainTableVariableLength, then it means variable length. Otherwise, all 145 // the keys need to have the fix length of this value. bloom_bits_per_key is 146 // number of bits used for bloom filer per key. hash_table_ratio is 147 // the desired utilization of the hash table used for prefix hashing. 148 // hash_table_ratio = number of prefixes / #buckets in the hash table 149 // hash_table_ratio = 0 means skip hash table but only replying on binary 150 // search. 151 // index_sparseness determines index interval for keys 152 // inside the same prefix. It will be the maximum number of linear search 153 // required after hash and binary search. 154 // index_sparseness = 0 means index for every key. 155 // huge_page_tlb_size determines whether to allocate hash indexes from huge 156 // page TLB and the page size if allocating from there. See comments of 157 // Arena::AllocateAligned() for details. 158 explicit PlainTableFactory( 159 const PlainTableOptions& _table_options = PlainTableOptions()) table_options_(_table_options)160 : table_options_(_table_options) {} 161 Name()162 const char* Name() const override { return "PlainTable"; } 163 Status NewTableReader(const TableReaderOptions& table_reader_options, 164 std::unique_ptr<RandomAccessFileReader>&& file, 165 uint64_t file_size, std::unique_ptr<TableReader>* table, 166 bool prefetch_index_and_filter_in_cache) const override; 167 168 TableBuilder* NewTableBuilder( 169 const TableBuilderOptions& table_builder_options, 170 uint32_t column_family_id, WritableFileWriter* file) const override; 171 172 std::string GetPrintableTableOptions() const override; 173 174 const PlainTableOptions& table_options() const; 175 176 static const char kValueTypeSeqId0 = char(~0); 177 178 // Sanitizes the specified DB Options. SanitizeOptions(const DBOptions &,const ColumnFamilyOptions &)179 Status SanitizeOptions( 180 const DBOptions& /*db_opts*/, 181 const ColumnFamilyOptions& /*cf_opts*/) const override { 182 return Status::OK(); 183 } 184 GetOptions()185 void* GetOptions() override { return &table_options_; } 186 GetOptionString(std::string *,const std::string &)187 Status GetOptionString(std::string* /*opt_string*/, 188 const std::string& /*delimiter*/) const override { 189 return Status::OK(); 190 } 191 192 private: 193 PlainTableOptions table_options_; 194 }; 195 196 static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = { 197 {"user_key_len", 198 {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, 199 OptionVerificationType::kNormal, false, 0}}, 200 {"bloom_bits_per_key", 201 {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, 202 OptionVerificationType::kNormal, false, 0}}, 203 {"hash_table_ratio", 204 {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, 205 OptionVerificationType::kNormal, false, 0}}, 206 {"index_sparseness", 207 {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, 208 OptionVerificationType::kNormal, false, 0}}, 209 {"huge_page_tlb_size", 210 {offsetof(struct PlainTableOptions, huge_page_tlb_size), 211 OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, 212 {"encoding_type", 213 {offsetof(struct PlainTableOptions, encoding_type), 214 OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}}, 215 {"full_scan_mode", 216 {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, 217 OptionVerificationType::kNormal, false, 0}}, 218 {"store_index_in_file", 219 {offsetof(struct PlainTableOptions, store_index_in_file), 220 OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; 221 222 } // namespace ROCKSDB_NAMESPACE 223 #endif // ROCKSDB_LITE 224