// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// This file contains the interface that must be implemented by any collection
// to be used as the backing store for a MemTable. Such a collection must
// satisfy the following properties:
//  (1) It does not store duplicate items.
//  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
//      equality.
//  (3) It can be accessed concurrently by multiple readers and can support
//      writes during reads. However, it needn't support multiple concurrent
//      writes.
//  (4) Items are never deleted.
// The liberal use of assertions is encouraged to enforce (1).
//
// The factory will be passed a MemTableAllocator object when a new MemTableRep
// is requested.
//
// Users can implement their own memtable representations. We include three
// types built in:
//  - SkipListRep: This is the default; it is backed by a skip list.
//  - HashSkipListRep: The memtable rep that is best used for keys that are
//  structured like "prefix:suffix" where iteration within a prefix is
//  common and iteration across different prefixes is rare. It is backed by
//  a hash map where each bucket is a skip list.
//  - VectorRep: This is backed by an unordered std::vector. On iteration, the
//  vector is sorted. It is intelligent about sorting; once MarkReadOnly()
//  has been called, the vector will only be sorted once. It is optimized for
//  random-write-heavy workloads.
//
// The last two implementations are designed for situations in which
// iteration over the entire collection is rare since doing so requires all the
// keys to be copied into a sorted data structure.

#pragma once

#include <rocksdb/slice.h>
#include <stdint.h>
#include <stdlib.h>

#include <memory>
#include <stdexcept>

namespace ROCKSDB_NAMESPACE {

class Arena;
class Allocator;
class LookupKey;
class SliceTransform;
class Logger;

typedef void* KeyHandle;

extern Slice GetLengthPrefixedSlice(const char* data);

class MemTableRep {
 public:
  // KeyComparator provides a means to compare keys, which are internal keys
  // concatenated with values.
  class KeyComparator {
   public:
    typedef ROCKSDB_NAMESPACE::Slice DecodedType;

    virtual DecodedType decode_key(const char* key) const {
      // The format of key is frozen and can be treated as part of the API
      // contract. Refer to MemTable::Add for details.
      return GetLengthPrefixedSlice(key);
    }

    // Compare a and b. Return a negative value if a is less than b, 0 if they
    // are equal, and a positive value if a is greater than b.
    virtual int operator()(const char* prefix_len_key1,
                           const char* prefix_len_key2) const = 0;

    virtual int operator()(const char* prefix_len_key,
                           const Slice& key) const = 0;

    virtual ~KeyComparator() {}
  };
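  // Example: a minimal KeyComparator sketch (illustrative only; a real
  // comparator should order internal keys with the column family's
  // rocksdb::Comparator rather than bytewise as shown here, and the class
  // name below is hypothetical):
  //
  //   class BytewiseMemTableKeyComparator : public MemTableRep::KeyComparator {
  //    public:
  //     virtual int operator()(const char* a, const char* b) const override {
  //       // decode_key() strips the length prefix written by MemTable::Add.
  //       return decode_key(a).compare(decode_key(b));
  //     }
  //     virtual int operator()(const char* a, const Slice& b) const override {
  //       return decode_key(a).compare(b);
  //     }
  //   };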
  explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {}

  // Allocate a buf of len size for storing key. The idea is that a
  // specific memtable representation knows its underlying data structure
  // better. By allowing it to allocate memory, it can possibly put
  // correlated stuff in consecutive memory areas to make processor
  // prefetching more efficient.
  virtual KeyHandle Allocate(const size_t len, char** buf);

  // Insert key into the collection. (The caller will pack key and value into a
  // single buffer and pass that in as the parameter to Insert.)
  // REQUIRES: nothing that compares equal to key is currently in the
  // collection, and no concurrent modifications to the table are in progress.
  virtual void Insert(KeyHandle handle) = 0;

  // Same as ::Insert
  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
  // the <key, seq> already exists.
  virtual bool InsertKey(KeyHandle handle) {
    Insert(handle);
    return true;
  }

  // Same as Insert(), but additionally passes a hint about the insert location
  // for the key. If hint points to nullptr, a new hint will be populated;
  // otherwise the hint will be updated to reflect the last insert location.
  //
  // Currently only skip-list based memtables implement the interface. Other
  // implementations fall back to Insert() by default.
  virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) {
    // Ignore the hint by default.
    Insert(handle);
  }

  // Same as ::InsertWithHint
  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
  // the <key, seq> already exists.
  virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) {
    InsertWithHint(handle, hint);
    return true;
  }

  // Same as ::InsertWithHint, but allows concurrent writes.
  //
  // If hint points to nullptr, a new hint will be allocated on the heap;
  // otherwise the hint will be updated to reflect the last insert location.
  // The hint is owned by the caller and it is the caller's responsibility to
  // delete it later.
  //
  // Currently only skip-list based memtables implement the interface. Other
  // implementations fall back to InsertConcurrently() by default.
  virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) {
    // Ignore the hint by default.
    InsertConcurrently(handle);
  }

  // Same as ::InsertWithHintConcurrently
  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
  // the <key, seq> already exists.
  virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) {
    InsertWithHintConcurrently(handle, hint);
    return true;
  }

  // Like Insert(handle), but may be called concurrently with other calls
  // to InsertConcurrently for other handles.
  //
  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
  // the <key, seq> already exists.
  virtual void InsertConcurrently(KeyHandle handle);

  // Same as ::InsertConcurrently
  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
  // the <key, seq> already exists.
  virtual bool InsertKeyConcurrently(KeyHandle handle) {
    InsertConcurrently(handle);
    return true;
  }
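  // Example: the single-writer insert flow (a sketch of the contract; in
  // practice MemTable::Add produces the buffer encoding, and encoded_len is
  // a placeholder):
  //
  //   char* buf = nullptr;
  //   KeyHandle handle = rep->Allocate(encoded_len, &buf);
  //   // ... pack the length-prefixed key and value into buf ...
  //   rep->Insert(handle);  // REQUIRES: no equal key present, single writer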
  // Returns true iff an entry that compares equal to key is in the collection.
  virtual bool Contains(const char* key) const = 0;

  // Notify this table rep that it will no longer be added to. By default,
  // does nothing. After MarkReadOnly() is called, this table rep will
  // not be written to (i.e. no more calls to Allocate(), Insert(),
  // or any writes done directly to entries accessed through the iterator).
  virtual void MarkReadOnly() {}

  // Notify this table rep that it has been flushed to stable storage.
  // By default, does nothing.
  //
  // Invariant: MarkReadOnly() is called before MarkFlushed().
  // Note that this method, if overridden, should not run for an extended
  // period of time. Otherwise, RocksDB may be blocked.
  virtual void MarkFlushed() {}

  // Look up key from the mem table. Starting from the first key in the mem
  // table whose user_key matches that of k, call callback_func(), with
  // callback_args directly forwarded as the first parameter and the mem table
  // key as the second parameter. If the return value is false, terminate;
  // otherwise, go on to the next key.
  //
  // It's safe for Get() to terminate after having finished all the potential
  // keys for k.user_key(), or not.
  //
  // Default: the default implementation dynamically constructs an iterator,
  // seeks, and calls the callback function.
  virtual void Get(const LookupKey& k, void* callback_args,
                   bool (*callback_func)(void* arg, const char* entry));

  // Report an approximation of the number of entries in the range
  // [start_ikey, end_ikey). Default: 0, meaning no estimate is available.
  virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
                                         const Slice& /*end_ikey*/) {
    return 0;
  }

  // Report an approximation of how much memory has been used other than memory
  // that was allocated through the allocator. Safe to call from any thread.
  virtual size_t ApproximateMemoryUsage() = 0;

  virtual ~MemTableRep() {}

  // Iteration over the contents of a collection.
  class Iterator {
   public:
    // Initialize an iterator over the specified collection.
    // The returned iterator is not valid.
    // explicit Iterator(const MemTableRep* collection);
    virtual ~Iterator() {}

    // Returns true iff the iterator is positioned at a valid node.
    virtual bool Valid() const = 0;

    // Returns the key at the current position.
    // REQUIRES: Valid()
    virtual const char* key() const = 0;

    // Advances to the next position.
    // REQUIRES: Valid()
    virtual void Next() = 0;

    // Advances to the previous position.
    // REQUIRES: Valid()
    virtual void Prev() = 0;

    // Advance to the first entry with a key >= target.
    virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;

    // Retreat to the first entry with a key <= target.
    virtual void SeekForPrev(const Slice& internal_key,
                             const char* memtable_key) = 0;

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToFirst() = 0;

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    virtual void SeekToLast() = 0;
  };
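  // Example: a full scan over a rep (a sketch; assumes rep points to a live
  // MemTableRep, that the internal Arena type is available, and follows the
  // arena contract of GetIterator() below):
  //
  //   Arena arena;
  //   MemTableRep::Iterator* iter = rep->GetIterator(&arena);
  //   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
  //     const char* entry = iter->key();  // length-prefixed internal key
  //     // ... decode entry with GetLengthPrefixedSlice() as needed ...
  //   }
  //   iter->~Iterator();  // arena-allocated: destroy in place, do not delete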
  // Return an iterator over the keys in this representation.
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        When destroying the iterator, the caller will not call "delete"
  //        but Iterator::~Iterator() directly. The destructor needs to destroy
  //        all the state except that allocated in the arena.
  virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;

  // Return an iterator that has a special Seek semantics. The result of
  // a Seek might only include keys with the same prefix as the target key.
  // arena: If not null, the arena is used to allocate the Iterator.
  //        When destroying the iterator, the caller will not call "delete"
  //        but Iterator::~Iterator() directly. The destructor needs to destroy
  //        all the state except that allocated in the arena.
  virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
    return GetIterator(arena);
  }

  // Return true if the current MemTableRep supports merge operator.
  // Default: true
  virtual bool IsMergeOperatorSupported() const { return true; }

  // Return true if the current MemTableRep supports snapshots.
  // Default: true
  virtual bool IsSnapshotSupported() const { return true; }

 protected:
  // When *key is an internal key concatenated with the value, returns the
  // user key.
  virtual Slice UserKey(const char* key) const;

  Allocator* allocator_;
};

// This is the base class for all factories that are used by RocksDB to create
// new MemTableRep objects.
class MemTableRepFactory {
 public:
  virtual ~MemTableRepFactory() {}

  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
                                         Allocator*, const SliceTransform*,
                                         Logger* logger) = 0;
  virtual MemTableRep* CreateMemTableRep(
      const MemTableRep::KeyComparator& key_cmp, Allocator* allocator,
      const SliceTransform* slice_transform, Logger* logger,
      uint32_t /* column_family_id */) {
    return CreateMemTableRep(key_cmp, allocator, slice_transform, logger);
  }

  virtual const char* Name() const = 0;

  // Return true if the current MemTableRep supports concurrent inserts.
  // Default: false
  virtual bool IsInsertConcurrentlySupported() const { return false; }

  // Return true if the current MemTableRep supports detecting duplicate
  // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns
  // false when the <key,seq> already exists.
  // Default: false
  virtual bool CanHandleDuplicatedKey() const { return false; }
};
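// Example: the skeleton of a custom factory (an illustrative sketch;
// MyRep and MyRepFactory are hypothetical names, and MyRep is assumed to
// derive from MemTableRep):
//
//   class MyRepFactory : public MemTableRepFactory {
//    public:
//     using MemTableRepFactory::CreateMemTableRep;
//     virtual MemTableRep* CreateMemTableRep(
//         const MemTableRep::KeyComparator& cmp, Allocator* allocator,
//         const SliceTransform* /*transform*/, Logger* /*logger*/) override {
//       return new MyRep(cmp, allocator);
//     }
//     virtual const char* Name() const override { return "MyRepFactory"; }
//   };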
// This uses a skip list to store keys. It is the default.
//
// Parameters:
//   lookahead: If non-zero, each iterator's seek operation will start the
//     search from the previously visited record (doing at most 'lookahead'
//     steps). This is an optimization for the access pattern including many
//     seeks with consecutive keys.
class SkipListFactory : public MemTableRepFactory {
 public:
  explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {}

  using MemTableRepFactory::CreateMemTableRep;
  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
                                         Allocator*, const SliceTransform*,
                                         Logger* logger) override;

  virtual const char* Name() const override { return "SkipListFactory"; }

  bool IsInsertConcurrentlySupported() const override { return true; }

  bool CanHandleDuplicatedKey() const override { return true; }

 private:
  const size_t lookahead_;
};

#ifndef ROCKSDB_LITE
// This creates MemTableReps that are backed by an std::vector. On iteration,
// the vector is sorted. This is useful for workloads where iteration is very
// rare and writes are generally not issued after reads begin.
//
// Parameters:
//   count: Passed to the constructor of the underlying std::vector of each
//     VectorRep. On initialization, the underlying array will have space
//     reserved for at least count entries.
class VectorRepFactory : public MemTableRepFactory {
  const size_t count_;

 public:
  explicit VectorRepFactory(size_t count = 0) : count_(count) {}

  using MemTableRepFactory::CreateMemTableRep;
  virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
                                         Allocator*, const SliceTransform*,
                                         Logger* logger) override;

  virtual const char* Name() const override { return "VectorRepFactory"; }
};

// This class contains a fixed array of buckets, each
// pointing to a skiplist (null if the bucket is empty).
// bucket_count: number of fixed array buckets
// skiplist_height: the max height of the skiplist
// skiplist_branching_factor: probabilistic size ratio between adjacent
//   link lists in the skiplist
extern MemTableRepFactory* NewHashSkipListRepFactory(
    size_t bucket_count = 1000000, int32_t skiplist_height = 4,
    int32_t skiplist_branching_factor = 4);

// The factory is to create memtables based on a hash table:
// it contains a fixed array of buckets, each pointing to either a linked list
// or a skip list if the number of entries inside the bucket exceeds
// threshold_use_skiplist.
// @bucket_count: number of fixed array buckets
// @huge_page_tlb_size: if <= 0, allocate the hash table bytes from malloc.
//   Otherwise from huge page TLB. The user needs to reserve
//   huge pages for it to be allocated, e.g.:
//     sysctl -w vm.nr_hugepages=20
//   See linux doc Documentation/vm/hugetlbpage.txt
// @bucket_entries_logging_threshold: if the number of entries in one bucket
//   exceeds this number, log about it.
// @if_log_bucket_dist_when_flash: if true, log the distribution of the number
//   of entries when flushing.
// @threshold_use_skiplist: a bucket switches to a skip list if the number of
//   entries exceeds this parameter.
extern MemTableRepFactory* NewHashLinkListRepFactory(
    size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
    int bucket_entries_logging_threshold = 4096,
    bool if_log_bucket_dist_when_flash = true,
    uint32_t threshold_use_skiplist = 256);

#endif  // ROCKSDB_LITE
}  // namespace ROCKSDB_NAMESPACE
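// Example: selecting a memtable representation through Options (an
// illustrative sketch; assumes the usual rocksdb::Options setup and that
// rocksdb/slice_transform.h is included for NewFixedPrefixTransform):
//
//   rocksdb::Options options;
//   // HashSkipListRep groups keys by prefix, so a prefix extractor is
//   // required for it to be useful.
//   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
//   options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory());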