1 // Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // This file contains the interface that must be implemented by any collection
7 // to be used as the backing store for a MemTable. Such a collection must
8 // satisfy the following properties:
9 //  (1) It does not store duplicate items.
10 //  (2) It uses MemTableRep::KeyComparator to compare items for iteration and
11 //     equality.
//  (3) It can be accessed concurrently by multiple readers and supports
//     reads that are concurrent with a (single) writer. However, it needn't
//     support multiple concurrent writes.
14 //  (4) Items are never deleted.
15 // The liberal use of assertions is encouraged to enforce (1).
16 //
// The factory will be passed a MemTableAllocator object when a new MemTableRep
// is requested.
19 //
20 // Users can implement their own memtable representations. We include three
21 // types built in:
22 //  - SkipListRep: This is the default; it is backed by a skip list.
23 //  - HashSkipListRep: The memtable rep that is best used for keys that are
24 //  structured like "prefix:suffix" where iteration within a prefix is
25 //  common and iteration across different prefixes is rare. It is backed by
26 //  a hash map where each bucket is a skip list.
27 //  - VectorRep: This is backed by an unordered std::vector. On iteration, the
28 // vector is sorted. It is intelligent about sorting; once the MarkReadOnly()
29 // has been called, the vector will only be sorted once. It is optimized for
30 // random-write-heavy workloads.
31 //
// The last two implementations are designed for situations in which
// iteration over the entire collection is rare since doing so requires all the
// keys to be copied into a sorted data structure.
35 
36 #pragma once
37 
38 #include <rocksdb/slice.h>
39 #include <stdint.h>
40 #include <stdlib.h>
41 #include <memory>
42 #include <stdexcept>
43 
44 namespace ROCKSDB_NAMESPACE {
45 
46 class Arena;
47 class Allocator;
48 class LookupKey;
49 class SliceTransform;
50 class Logger;
51 
52 typedef void* KeyHandle;
53 
54 extern Slice GetLengthPrefixedSlice(const char* data);
55 
56 class MemTableRep {
57  public:
58   // KeyComparator provides a means to compare keys, which are internal keys
59   // concatenated with values.
60   class KeyComparator {
61    public:
62     typedef ROCKSDB_NAMESPACE::Slice DecodedType;
63 
decode_key(const char * key)64     virtual DecodedType decode_key(const char* key) const {
65       // The format of key is frozen and can be terated as a part of the API
66       // contract. Refer to MemTable::Add for details.
67       return GetLengthPrefixedSlice(key);
68     }
69 
70     // Compare a and b. Return a negative value if a is less than b, 0 if they
71     // are equal, and a positive value if a is greater than b
72     virtual int operator()(const char* prefix_len_key1,
73                            const char* prefix_len_key2) const = 0;
74 
75     virtual int operator()(const char* prefix_len_key,
76                            const Slice& key) const = 0;
77 
~KeyComparator()78     virtual ~KeyComparator() {}
79   };
80 
MemTableRep(Allocator * allocator)81   explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {}
82 
83   // Allocate a buf of len size for storing key. The idea is that a
84   // specific memtable representation knows its underlying data structure
85   // better. By allowing it to allocate memory, it can possibly put
86   // correlated stuff in consecutive memory area to make processor
87   // prefetching more efficient.
88   virtual KeyHandle Allocate(const size_t len, char** buf);
89 
90   // Insert key into the collection. (The caller will pack key and value into a
91   // single buffer and pass that in as the parameter to Insert).
92   // REQUIRES: nothing that compares equal to key is currently in the
93   // collection, and no concurrent modifications to the table in progress
94   virtual void Insert(KeyHandle handle) = 0;
95 
96   // Same as ::Insert
97   // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
98   // the <key, seq> already exists.
InsertKey(KeyHandle handle)99   virtual bool InsertKey(KeyHandle handle) {
100     Insert(handle);
101     return true;
102   }
103 
104   // Same as Insert(), but in additional pass a hint to insert location for
105   // the key. If hint points to nullptr, a new hint will be populated.
106   // otherwise the hint will be updated to reflect the last insert location.
107   //
108   // Currently only skip-list based memtable implement the interface. Other
109   // implementations will fallback to Insert() by default.
InsertWithHint(KeyHandle handle,void **)110   virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) {
111     // Ignore the hint by default.
112     Insert(handle);
113   }
114 
115   // Same as ::InsertWithHint
116   // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
117   // the <key, seq> already exists.
InsertKeyWithHint(KeyHandle handle,void ** hint)118   virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) {
119     InsertWithHint(handle, hint);
120     return true;
121   }
122 
123   // Same as ::InsertWithHint, but allow concurrnet write
124   //
125   // If hint points to nullptr, a new hint will be allocated on heap, otherwise
126   // the hint will be updated to reflect the last insert location. The hint is
127   // owned by the caller and it is the caller's responsibility to delete the
128   // hint later.
129   //
130   // Currently only skip-list based memtable implement the interface. Other
131   // implementations will fallback to InsertConcurrently() by default.
InsertWithHintConcurrently(KeyHandle handle,void **)132   virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) {
133     // Ignore the hint by default.
134     InsertConcurrently(handle);
135   }
136 
137   // Same as ::InsertWithHintConcurrently
138   // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
139   // the <key, seq> already exists.
InsertKeyWithHintConcurrently(KeyHandle handle,void ** hint)140   virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) {
141     InsertWithHintConcurrently(handle, hint);
142     return true;
143   }
144 
145   // Like Insert(handle), but may be called concurrent with other calls
146   // to InsertConcurrently for other handles.
147   //
148   // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
149   // the <key, seq> already exists.
150   virtual void InsertConcurrently(KeyHandle handle);
151 
152   // Same as ::InsertConcurrently
153   // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
154   // the <key, seq> already exists.
InsertKeyConcurrently(KeyHandle handle)155   virtual bool InsertKeyConcurrently(KeyHandle handle) {
156     InsertConcurrently(handle);
157     return true;
158   }
159 
160   // Returns true iff an entry that compares equal to key is in the collection.
161   virtual bool Contains(const char* key) const = 0;
162 
163   // Notify this table rep that it will no longer be added to. By default,
164   // does nothing.  After MarkReadOnly() is called, this table rep will
165   // not be written to (ie No more calls to Allocate(), Insert(),
166   // or any writes done directly to entries accessed through the iterator.)
MarkReadOnly()167   virtual void MarkReadOnly() {}
168 
169   // Notify this table rep that it has been flushed to stable storage.
170   // By default, does nothing.
171   //
172   // Invariant: MarkReadOnly() is called, before MarkFlushed().
173   // Note that this method if overridden, should not run for an extended period
174   // of time. Otherwise, RocksDB may be blocked.
MarkFlushed()175   virtual void MarkFlushed() {}
176 
177   // Look up key from the mem table, since the first key in the mem table whose
178   // user_key matches the one given k, call the function callback_func(), with
179   // callback_args directly forwarded as the first parameter, and the mem table
180   // key as the second parameter. If the return value is false, then terminates.
181   // Otherwise, go through the next key.
182   //
183   // It's safe for Get() to terminate after having finished all the potential
184   // key for the k.user_key(), or not.
185   //
186   // Default:
187   // Get() function with a default value of dynamically construct an iterator,
188   // seek and call the call back function.
189   virtual void Get(const LookupKey& k, void* callback_args,
190                    bool (*callback_func)(void* arg, const char* entry));
191 
ApproximateNumEntries(const Slice &,const Slice &)192   virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
193                                          const Slice& /*end_key*/) {
194     return 0;
195   }
196 
197   // Report an approximation of how much memory has been used other than memory
198   // that was allocated through the allocator.  Safe to call from any thread.
199   virtual size_t ApproximateMemoryUsage() = 0;
200 
~MemTableRep()201   virtual ~MemTableRep() {}
202 
203   // Iteration over the contents of a skip collection
204   class Iterator {
205    public:
206     // Initialize an iterator over the specified collection.
207     // The returned iterator is not valid.
208     // explicit Iterator(const MemTableRep* collection);
~Iterator()209     virtual ~Iterator() {}
210 
211     // Returns true iff the iterator is positioned at a valid node.
212     virtual bool Valid() const = 0;
213 
214     // Returns the key at the current position.
215     // REQUIRES: Valid()
216     virtual const char* key() const = 0;
217 
218     // Advances to the next position.
219     // REQUIRES: Valid()
220     virtual void Next() = 0;
221 
222     // Advances to the previous position.
223     // REQUIRES: Valid()
224     virtual void Prev() = 0;
225 
226     // Advance to the first entry with a key >= target
227     virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
228 
229     // retreat to the first entry with a key <= target
230     virtual void SeekForPrev(const Slice& internal_key,
231                              const char* memtable_key) = 0;
232 
233     // Position at the first entry in collection.
234     // Final state of iterator is Valid() iff collection is not empty.
235     virtual void SeekToFirst() = 0;
236 
237     // Position at the last entry in collection.
238     // Final state of iterator is Valid() iff collection is not empty.
239     virtual void SeekToLast() = 0;
240   };
241 
242   // Return an iterator over the keys in this representation.
243   // arena: If not null, the arena needs to be used to allocate the Iterator.
244   //        When destroying the iterator, the caller will not call "delete"
245   //        but Iterator::~Iterator() directly. The destructor needs to destroy
246   //        all the states but those allocated in arena.
247   virtual Iterator* GetIterator(Arena* arena = nullptr) = 0;
248 
249   // Return an iterator that has a special Seek semantics. The result of
250   // a Seek might only include keys with the same prefix as the target key.
251   // arena: If not null, the arena is used to allocate the Iterator.
252   //        When destroying the iterator, the caller will not call "delete"
253   //        but Iterator::~Iterator() directly. The destructor needs to destroy
254   //        all the states but those allocated in arena.
255   virtual Iterator* GetDynamicPrefixIterator(Arena* arena = nullptr) {
256     return GetIterator(arena);
257   }
258 
259   // Return true if the current MemTableRep supports merge operator.
260   // Default: true
IsMergeOperatorSupported()261   virtual bool IsMergeOperatorSupported() const { return true; }
262 
263   // Return true if the current MemTableRep supports snapshot
264   // Default: true
IsSnapshotSupported()265   virtual bool IsSnapshotSupported() const { return true; }
266 
267  protected:
268   // When *key is an internal key concatenated with the value, returns the
269   // user key.
270   virtual Slice UserKey(const char* key) const;
271 
272   Allocator* allocator_;
273 };
274 
275 // This is the base class for all factories that are used by RocksDB to create
276 // new MemTableRep objects
277 class MemTableRepFactory {
278  public:
~MemTableRepFactory()279   virtual ~MemTableRepFactory() {}
280 
281   virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
282                                          Allocator*, const SliceTransform*,
283                                          Logger* logger) = 0;
CreateMemTableRep(const MemTableRep::KeyComparator & key_cmp,Allocator * allocator,const SliceTransform * slice_transform,Logger * logger,uint32_t)284   virtual MemTableRep* CreateMemTableRep(
285       const MemTableRep::KeyComparator& key_cmp, Allocator* allocator,
286       const SliceTransform* slice_transform, Logger* logger,
287       uint32_t /* column_family_id */) {
288     return CreateMemTableRep(key_cmp, allocator, slice_transform, logger);
289   }
290 
291   virtual const char* Name() const = 0;
292 
293   // Return true if the current MemTableRep supports concurrent inserts
294   // Default: false
IsInsertConcurrentlySupported()295   virtual bool IsInsertConcurrentlySupported() const { return false; }
296 
297   // Return true if the current MemTableRep supports detecting duplicate
298   // <key,seq> at insertion time. If true, then MemTableRep::Insert* returns
299   // false when if the <key,seq> already exists.
300   // Default: false
CanHandleDuplicatedKey()301   virtual bool CanHandleDuplicatedKey() const { return false; }
302 };
303 
304 // This uses a skip list to store keys. It is the default.
305 //
306 // Parameters:
307 //   lookahead: If non-zero, each iterator's seek operation will start the
308 //     search from the previously visited record (doing at most 'lookahead'
309 //     steps). This is an optimization for the access pattern including many
310 //     seeks with consecutive keys.
311 class SkipListFactory : public MemTableRepFactory {
312  public:
lookahead_(lookahead)313   explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {}
314 
315   using MemTableRepFactory::CreateMemTableRep;
316   virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
317                                          Allocator*, const SliceTransform*,
318                                          Logger* logger) override;
Name()319   virtual const char* Name() const override { return "SkipListFactory"; }
320 
IsInsertConcurrentlySupported()321   bool IsInsertConcurrentlySupported() const override { return true; }
322 
CanHandleDuplicatedKey()323   bool CanHandleDuplicatedKey() const override { return true; }
324 
325  private:
326   const size_t lookahead_;
327 };
328 
329 #ifndef ROCKSDB_LITE
330 // This creates MemTableReps that are backed by an std::vector. On iteration,
331 // the vector is sorted. This is useful for workloads where iteration is very
332 // rare and writes are generally not issued after reads begin.
333 //
334 // Parameters:
335 //   count: Passed to the constructor of the underlying std::vector of each
336 //     VectorRep. On initialization, the underlying array will be at least count
337 //     bytes reserved for usage.
338 class VectorRepFactory : public MemTableRepFactory {
339   const size_t count_;
340 
341  public:
count_(count)342   explicit VectorRepFactory(size_t count = 0) : count_(count) {}
343 
344   using MemTableRepFactory::CreateMemTableRep;
345   virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&,
346                                          Allocator*, const SliceTransform*,
347                                          Logger* logger) override;
348 
Name()349   virtual const char* Name() const override { return "VectorRepFactory"; }
350 };
351 
352 // This class contains a fixed array of buckets, each
353 // pointing to a skiplist (null if the bucket is empty).
354 // bucket_count: number of fixed array buckets
355 // skiplist_height: the max height of the skiplist
356 // skiplist_branching_factor: probabilistic size ratio between adjacent
357 //                            link lists in the skiplist
358 extern MemTableRepFactory* NewHashSkipListRepFactory(
359     size_t bucket_count = 1000000, int32_t skiplist_height = 4,
360     int32_t skiplist_branching_factor = 4);
361 
362 // The factory is to create memtables based on a hash table:
363 // it contains a fixed array of buckets, each pointing to either a linked list
364 // or a skip list if number of entries inside the bucket exceeds
365 // threshold_use_skiplist.
366 // @bucket_count: number of fixed array buckets
367 // @huge_page_tlb_size: if <=0, allocate the hash table bytes from malloc.
368 //                      Otherwise from huge page TLB. The user needs to reserve
369 //                      huge pages for it to be allocated, like:
370 //                          sysctl -w vm.nr_hugepages=20
371 //                      See linux doc Documentation/vm/hugetlbpage.txt
372 // @bucket_entries_logging_threshold: if number of entries in one bucket
373 //                                    exceeds this number, log about it.
374 // @if_log_bucket_dist_when_flash: if true, log distribution of number of
375 //                                 entries when flushing.
376 // @threshold_use_skiplist: a bucket switches to skip list if number of
377 //                          entries exceed this parameter.
378 extern MemTableRepFactory* NewHashLinkListRepFactory(
379     size_t bucket_count = 50000, size_t huge_page_tlb_size = 0,
380     int bucket_entries_logging_threshold = 4096,
381     bool if_log_bucket_dist_when_flash = true,
382     uint32_t threshold_use_skiplist = 256);
383 
384 #endif  // ROCKSDB_LITE
385 }  // namespace ROCKSDB_NAMESPACE
386