// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <memory>

#include "rocksdb/memtablerep.h"
#include "rocksdb/universal_compaction.h"

namespace ROCKSDB_NAMESPACE {

class Slice;
class SliceTransform;
enum CompressionType : unsigned char;
class TablePropertiesCollectorFactory;
class TableFactory;
struct Options;

enum CompactionStyle : char {
  // Level-based compaction style
  kCompactionStyleLevel = 0x0,
  // Universal compaction style
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleUniversal = 0x1,
  // FIFO compaction style
  // Not supported in ROCKSDB_LITE
  kCompactionStyleFIFO = 0x2,
  // Disable background compaction. Compaction jobs are submitted
  // via CompactFiles().
  // Not supported in ROCKSDB_LITE
  kCompactionStyleNone = 0x3,
};
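
// Selecting a non-default compaction style is a one-line assignment
// (a sketch; 'options' is an assumed Options or ColumnFamilyOptions
// instance configured before opening the DB):
//
//   options.compaction_style = kCompactionStyleUniversal;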

// In Level-based compaction, this determines which file from a level is
// picked to be merged into the next level. We suggest trying
// kMinOverlappingRatio first when tuning your database.
enum CompactionPri : char {
  // Slightly prioritize larger files, with sizes compensated by #deletes.
  kByCompensatedSize = 0x0,
  // First compact files whose data's latest update time is oldest.
  // Try this if you only update some hot keys in small ranges.
  kOldestLargestSeqFirst = 0x1,
  // First compact files whose range hasn't been compacted to the next level
  // for the longest. If your updates are random across the key space,
  // write amplification is slightly better with this option.
  kOldestSmallestSeqFirst = 0x2,
  // First compact files whose ratio between overlapping size in the next
  // level and its own size is the smallest. In many cases this can minimize
  // write amplification.
  kMinOverlappingRatio = 0x3,
};

struct CompactionOptionsFIFO {
  // Once the total size of all table files reaches this limit, we will
  // delete the oldest table file.
  // Default: 1GB
  uint64_t max_table_files_size;

  // If true, try to do compaction to compact smaller files into larger ones.
  // The minimum number of files to compact follows
  // options.level0_file_num_compaction_trigger, and compaction won't trigger
  // if the average compacted bytes per deleted file is larger than
  // options.write_buffer_size. This is to protect large files from being
  // compacted again.
  // Default: false
  bool allow_compaction = false;

  CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
  CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction)
      : max_table_files_size(_max_table_files_size),
        allow_compaction(_allow_compaction) {}
};
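
// A minimal usage sketch (assuming 'options' is an Options or
// ColumnFamilyOptions instance configured before opening the DB):
//
//   options.compaction_style = kCompactionStyleFIFO;
//   options.compaction_options_fifo =
//       CompactionOptionsFIFO(512 * 1024 * 1024 /* 512MB */,
//                             true /* allow_compaction */);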

// Compression options for different compression algorithms like Zlib
struct CompressionOptions {
  // RocksDB's generic default compression level. Internally it'll be
  // translated to the default compression level specific to the library being
  // used (see comment above `ColumnFamilyOptions::compression`).
  //
  // The default value is the max 16-bit int as it'll be written out in the
  // OPTIONS file, which should be portable.
  const static int kDefaultCompressionLevel = 32767;

  int window_bits;
  int level;
  int strategy;

  // Maximum size of dictionaries used to prime the compression library.
  // Enabling dictionary can improve compression ratios when there are
  // repetitions across data blocks.
  //
  // The dictionary is created by sampling the SST file data. If
  // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's
  // dictionary generator. Otherwise, the random samples are used directly as
  // the dictionary.
  //
  // When compression dictionary is disabled, we compress and write each block
  // before buffering data for the next one. When compression dictionary is
  // enabled, we buffer all SST file data in-memory so we can sample it, as
  // data can only be compressed and written after the dictionary has been
  // finalized. So users of this feature may see increased memory usage.
  //
  // Default: 0.
  uint32_t max_dict_bytes;

  // Maximum size of training data passed to zstd's dictionary trainer. Using
  // zstd's dictionary trainer can achieve even better compression ratio
  // improvements than using `max_dict_bytes` alone.
  //
  // The training data will be used to generate a dictionary of max_dict_bytes.
  //
  // Default: 0.
  uint32_t zstd_max_train_bytes;

  // When the compression options are set by the user, `enabled` is set to
  // true.
  // For bottommost_compression_opts, the user must set enabled=true to enable
  // it; otherwise, bottommost compression will use compression_opts as its
  // default compression options.
  //
  // For compression_opts, even if compression_opts.enabled=false, it is still
  // used as the compression options for the compression process.
  //
  // Default: false.
  bool enabled;

  CompressionOptions()
      : window_bits(-14),
        level(kDefaultCompressionLevel),
        strategy(0),
        max_dict_bytes(0),
        zstd_max_train_bytes(0),
        enabled(false) {}
  CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes,
                     int _zstd_max_train_bytes, bool _enabled)
      : window_bits(wbits),
        level(_lev),
        strategy(_strategy),
        max_dict_bytes(_max_dict_bytes),
        zstd_max_train_bytes(_zstd_max_train_bytes),
        enabled(_enabled) {}
};
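
// A sketch of enabling dictionary compression with zstd training (assumes
// zstd support is compiled in, and 'cf_options' is an assumed
// ColumnFamilyOptions instance):
//
//   cf_options.compression = kZSTD;
//   cf_options.compression_opts.max_dict_bytes = 16 * 1024;  // 16KB dict
//   cf_options.compression_opts.zstd_max_train_bytes = 100 * 16 * 1024;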

enum UpdateStatus {    // Return status for inplace update callback
  UPDATE_FAILED   = 0, // Nothing to update
  UPDATED_INPLACE = 1, // Value updated inplace
  UPDATED         = 2, // No inplace update. Merged value set
};

struct AdvancedColumnFamilyOptions {
  // The maximum number of write buffers that are built up in memory.
  // The default and the minimum number is 2, so that when 1 write buffer
  // is being flushed to storage, new writes can continue to the other
  // write buffer.
  // If max_write_buffer_number > 3, writing will be slowed down to
  // options.delayed_write_rate if we are writing to the last write buffer
  // allowed.
  //
  // Default: 2
  //
  // Dynamically changeable through SetOptions() API
  int max_write_buffer_number = 2;

  // The minimum number of write buffers that will be merged together
  // before writing to storage.  If set to 1, then
  // all write buffers are flushed to L0 as individual files and this increases
  // read amplification because a get request has to check all of these
  // files. Also, an in-memory merge may result in writing less
  // data to storage if there are duplicate records in each of these
  // individual write buffers.  Default: 1
  int min_write_buffer_number_to_merge = 1;

  // DEPRECATED
  // The total maximum number of write buffers to maintain in memory including
  // copies of buffers that have already been flushed.  Unlike
  // max_write_buffer_number, this parameter does not affect flushing.
  // This parameter is being replaced by max_write_buffer_size_to_maintain.
  // If both parameters are set to non-zero values, this parameter will be
  // ignored.
  int max_write_buffer_number_to_maintain = 0;

  // The total maximum size (bytes) of write buffers to maintain in memory
  // including copies of buffers that have already been flushed. This parameter
  // only affects trimming of flushed buffers and does not affect flushing.
  // This controls the maximum amount of write history that will be available
  // in memory for conflict checking when Transactions are used. The actual
  // size of write history (flushed Memtables) might be higher than this limit
  // if further trimming will reduce write history total size below this
  // limit. For example, if max_write_buffer_size_to_maintain is set to 64MB,
  // and there are three flushed Memtables with sizes of 32MB, 20MB, 20MB,
  // trimming the next Memtable of size 20MB would reduce total memory usage
  // to 52MB, which is below the limit, so RocksDB will stop trimming.
  //
  // When using an OptimisticTransactionDB:
  // If this value is too low, some transactions may fail at commit time due
  // to not being able to determine whether there were any write conflicts.
  //
  // When using a TransactionDB:
  // If Transaction::SetSnapshot is used, TransactionDB will read either
  // in-memory write buffers or SST files to do write-conflict checking.
  // Increasing this value can reduce the number of reads to SST files
  // done for conflict detection.
  //
  // Setting this value to 0 will cause write buffers to be freed immediately
  // after they are flushed. If this value is set to -1,
  // 'max_write_buffer_number * write_buffer_size' will be used.
  //
  // Default:
  // If using a TransactionDB/OptimisticTransactionDB, the default value will
  // be set to the value of 'max_write_buffer_number * write_buffer_size'
  // if it is not explicitly set by the user.  Otherwise, the default is 0.
  int64_t max_write_buffer_size_to_maintain = 0;

  // Allows thread-safe inplace updates. If this is true, there is no way to
  // achieve point-in-time consistency using snapshot or iterator (assuming
  // concurrent updates). Hence iterator and multi-get will return results
  // which are not consistent as of any point-in-time.
  // If the inplace_callback function is not set,
  //   Put(key, new_value) will update the existing_value inplace iff
  //   * key exists in current memtable
  //   * sizeof(new_value) <= sizeof(existing_value)
  //   * existing_value for that key is a put i.e. kTypeValue
  // If the inplace_callback function is set, see the doc for inplace_callback.
  // Default: false.
  bool inplace_update_support = false;

  // Number of locks used for inplace update
  // Default: 10000, if inplace_update_support = true, else 0.
  //
  // Dynamically changeable through SetOptions() API
  size_t inplace_update_num_locks = 10000;

  // existing_value - pointer to previous value (from both memtable and sst).
  //                  nullptr if key doesn't exist
  // existing_value_size - pointer to size of existing_value.
  //                       nullptr if key doesn't exist
  // delta_value - Delta value to be merged with the existing_value.
  //               Stored in transaction logs.
  // merged_value - Set when delta is applied on the previous value.

  // Applicable only when inplace_update_support is true,
  // this callback function is called at the time of updating the memtable
  // as part of a Put operation, say Put(key, delta_value). It allows the
  // 'delta_value' specified as part of the Put operation to be merged with
  // an 'existing_value' of the key in the database.

  // If the merged value is smaller in size than the 'existing_value',
  // then this function can update the 'existing_value' buffer inplace and
  // the corresponding 'existing_value_size' pointer, if it wishes to.
  // The callback should return UpdateStatus::UPDATED_INPLACE in this case.
  // (In this case, the snapshot-semantics of the rocksdb Iterator are not
  // atomic anymore).

  // If the merged value is larger in size than the 'existing_value' or the
  // application does not wish to modify the 'existing_value' buffer inplace,
  // then the merged value should be returned via *merged_value. It is set by
  // merging the 'existing_value' and the Put 'delta_value'. The callback should
  // return UpdateStatus::UPDATED in this case. This merged value will be added
  // to the memtable.

  // If merging fails or the application does not wish to take any action,
  // then the callback should return UpdateStatus::UPDATE_FAILED.

  // Please remember that the original call from the application is Put(key,
  // delta_value). So the transaction log (if enabled) will still contain (key,
  // delta_value). The 'merged_value' is not stored in the transaction log.
  // Hence the inplace_callback function should be consistent across db reopens.

  // Default: nullptr
  UpdateStatus (*inplace_callback)(char* existing_value,
                                   uint32_t* existing_value_size,
                                   Slice delta_value,
                                   std::string* merged_value) = nullptr;
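
  // A minimal sketch of such a callback (illustrative only, not part of the
  // API): it overwrites the existing buffer when the delta fits, and
  // otherwise hands back a value for a regular memtable insert.
  //
  //   UpdateStatus ExampleInplaceCallback(char* existing_value,
  //                                       uint32_t* existing_value_size,
  //                                       Slice delta_value,
  //                                       std::string* merged_value) {
  //     if (existing_value == nullptr) {
  //       // Key not present: insert the delta as the initial value.
  //       merged_value->assign(delta_value.data(), delta_value.size());
  //       return UpdateStatus::UPDATED;
  //     }
  //     if (delta_value.size() <= *existing_value_size) {
  //       memcpy(existing_value, delta_value.data(), delta_value.size());
  //       *existing_value_size = static_cast<uint32_t>(delta_value.size());
  //       return UpdateStatus::UPDATED_INPLACE;
  //     }
  //     merged_value->assign(delta_value.data(), delta_value.size());
  //     return UpdateStatus::UPDATED;  // merged value goes to the memtable
  //   }
  //
  //   options.inplace_callback = ExampleInplaceCallback;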

  // If prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
  // a prefix bloom filter is created for the memtable with a size of
  // write_buffer_size * memtable_prefix_bloom_size_ratio.
  // If it is larger than 0.25, it is sanitized to 0.25.
  //
  // Default: 0 (disable)
  //
  // Dynamically changeable through SetOptions() API
  double memtable_prefix_bloom_size_ratio = 0.0;
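
  // For example (a sketch; 'options' is an assumed ColumnFamilyOptions
  // instance, on which prefix_extractor is defined):
  //
  //   options.prefix_extractor.reset(NewFixedPrefixTransform(4));
  //   options.memtable_prefix_bloom_size_ratio = 0.1;  // 10% of write buffer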

  // Enable whole key bloom filter in memtable. Note this will only take effect
  // if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering
  // can potentially reduce CPU usage for point lookups.
  //
  // Default: false (disable)
  //
  // Dynamically changeable through SetOptions() API
  bool memtable_whole_key_filtering = false;

  // Page size for huge page for the arena used by the memtable. If <=0, it
  // won't allocate from huge page but from malloc.
  // Users are responsible for reserving huge pages for it to be allocated.
  // For example:
  //      sysctl -w vm.nr_hugepages=20
  // See linux doc Documentation/vm/hugetlbpage.txt
  // If there isn't enough free huge page available, it will fall back to
  // malloc.
  //
  // Dynamically changeable through SetOptions() API
  size_t memtable_huge_page_size = 0;

  // If non-nullptr, memtable will use the specified function to extract
  // prefixes for keys, and for each prefix maintain a hint of insert location
  // to reduce CPU usage for inserting keys with the prefix. Keys out of
  // domain of the prefix extractor will be inserted without using hints.
  //
  // Currently only the default skiplist based memtable implements the feature.
  // All other memtable implementations will ignore the option. It incurs ~250
  // additional bytes of memory overhead to store a hint for each prefix.
  // Also concurrent writes (when allow_concurrent_memtable_write is true) will
  // ignore the option.
  //
  // The option is best suited for workloads where keys are likely to be
  // inserted at a location close to the last inserted key with the same
  // prefix. One example could be inserting keys of the form
  // (prefix + timestamp), where keys of the same prefix always come in time
  // order. Another example would be updating the same key over and over
  // again, in which case the prefix can be the key itself.
  //
  // Default: nullptr (disable)
  std::shared_ptr<const SliceTransform>
      memtable_insert_with_hint_prefix_extractor = nullptr;
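
  // For example, with fixed 8-byte prefixes (a sketch; the prefix length is
  // an arbitrary choice for illustration):
  //
  //   options.memtable_insert_with_hint_prefix_extractor.reset(
  //       NewFixedPrefixTransform(8));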

  // Control locality of bloom filter probes to improve CPU cache hit rate.
  // This option now only applies to plaintable prefix bloom. This
  // optimization is turned off when set to 0, and a positive number turns
  // it on.
  // Default: 0
  uint32_t bloom_locality = 0;

  // Size of one block in arena memory allocation.
  // If <= 0, a proper value is automatically calculated (usually 1/8 of
  // write_buffer_size, rounded up to a multiple of 4KB).
  //
  // There are two additional restrictions on the specified size:
  // (1) size should be in the range of [4096, 2 << 30] and
  // (2) it should be a multiple of the CPU word size (which helps with
  // memory alignment).
  //
  // We'll automatically check and adjust the size to make sure it
  // conforms to the restrictions.
  //
  // Default: 0
  //
  // Dynamically changeable through SetOptions() API
  size_t arena_block_size = 0;

  // Different levels can have different compression policies. There
  // are cases where most lower levels would like to use quick compression
  // algorithms while the higher levels (which have more data) use
  // compression algorithms that have better compression but could
  // be slower. This array, if non-empty, should have an entry for
  // each level of the database; these override the value specified in
  // the previous field 'compression'.
  //
  // NOTICE: if level_compaction_dynamic_level_bytes=true,
  // compression_per_level[0] still determines L0, but other elements
  // of the array are based on the base level (the level L0 files are merged
  // to), and may not match the level users see from the info log for
  // metadata. If L0 files are merged to level-n, then, for i>0,
  // compression_per_level[i] determines the compression type for level
  // n+i-1.
  // For example, if we have 5 levels, and we decide to merge L0
  // data to L4 (which means L1..L3 will be empty), then new files that go to
  // L4 will use compression type compression_per_level[1].
  // If later L0 is merged to L2, data going to L2 will be compressed
  // according to compression_per_level[1], L3 using compression_per_level[2]
  // and L4 using compression_per_level[3]. So the compression type used for
  // each level can change as data grows.
  std::vector<CompressionType> compression_per_level;

  // Number of levels for this database
  int num_levels = 7;

  // Soft limit on number of level-0 files. We start slowing down writes at
  // this point. A value <0 means that no write slowdown will be triggered by
  // the number of files in level-0.
  //
  // Default: 20
  //
  // Dynamically changeable through SetOptions() API
  int level0_slowdown_writes_trigger = 20;

  // Maximum number of level-0 files.  We stop writes at this point.
  //
  // Default: 36
  //
  // Dynamically changeable through SetOptions() API
  int level0_stop_writes_trigger = 36;

  // Target file size for compaction.
  // target_file_size_base is per-file size for level-1.
  // Target file size for level L can be calculated by
  // target_file_size_base * (target_file_size_multiplier ^ (L-1))
  // For example, if target_file_size_base is 2MB and
  // target_file_size_multiplier is 10, then each file on level-1 will
  // be 2MB, each file on level-2 will be 20MB,
  // and each file on level-3 will be 200MB.
  //
  // Default: 64MB.
  //
  // Dynamically changeable through SetOptions() API
  uint64_t target_file_size_base = 64 * 1048576;

  // By default target_file_size_multiplier is 1, which means
  // by default files in different levels will have similar size.
  //
  // Dynamically changeable through SetOptions() API
  int target_file_size_multiplier = 1;

  // If true, RocksDB will pick the target size of each level dynamically.
  // We will pick a base level b >= 1. L0 will be directly merged into level b,
  // instead of always into level 1. Levels 1 to b-1 need to be empty.
  // We try to pick b and its target size so that
  // 1. target size is in the range of
  //   (max_bytes_for_level_base / max_bytes_for_level_multiplier,
  //    max_bytes_for_level_base]
  // 2. target size of the last level (level num_levels-1) equals the actual
  //    size of the level.
  // At the same time max_bytes_for_level_multiplier and
  // max_bytes_for_level_multiplier_additional are still satisfied.
  // (When L0 is too large, we make some adjustment. See below.)
  //
  // With this option on, starting from an empty DB, we make the last level
  // the base level, which means merging L0 data into the last level, until
  // it exceeds max_bytes_for_level_base. Then we make the second-to-last
  // level the base level and start to merge L0 data into it, with its
  // target size set to 1/max_bytes_for_level_multiplier of the last level's
  // actual size. As data accumulates further, we move the base level to the
  // third-to-last level, and so on.
  //
  // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
  // and max_bytes_for_level_base=10MB.
  // Target sizes of levels 1 to 5 start as:
  // [- - - - 10MB]
  // with the base level being level 5. Target sizes of levels 1 to 4 are not
  // applicable because they will not be used.
  // When the size of level 5 grows to more than 10MB, say 11MB, we make
  // level 4 the base level and now the targets look like:
  // [- - - 1.1MB 11MB]
  // As data accumulates, size targets are tuned based on the actual data
  // in level 5. When level 5 has 50MB of data, the targets are:
  // [- - - 5MB 50MB]
  // This continues until level 5's actual size is more than 100MB, say
  // 101MB. Now if we kept level 4 as the base level, its target size would
  // need to be 10.1MB, which doesn't satisfy the target size range. So now
  // we make level 3 the base level and the target sizes of the levels look
  // like:
  // [- - 1.01MB 10.1MB 101MB]
  // In the same way, while level 5 further grows, all levels' targets grow,
  // like
  // [- - 5MB 50MB 500MB]
  // When level 5 exceeds 1000MB, say it becomes 1001MB, we make level 2 the
  // base level and make the levels' target sizes like this:
  // [- 1.001MB 10.01MB 100.1MB 1001MB]
  // and so on...
  //
  // By doing this, we give max_bytes_for_level_multiplier priority over
  // max_bytes_for_level_base, for a more predictable LSM tree shape. This is
  // useful to limit worst-case space amplification.
  //
  //
  // If the compaction from L0 lags behind, a special mode will be turned
  // on to prioritize write amplification over max_bytes_for_level_multiplier
  // or max_bytes_for_level_base. L0 compaction is considered lagging based
  // on the number of L0 files and total L0 size. If the number of L0 files
  // is at least double level0_file_num_compaction_trigger, or the total size
  // is at least max_bytes_for_level_base, this mode is on. The target of L1
  // grows to the actual data size in L0, and the target for each level is
  // then determined so that each level has the same level multiplier.
  //
  // For example, when L0 size is 100MB, the size of the last level is 1600MB,
  // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10.
  // Since L0 size is larger than max_bytes_for_level_base, this is the L0
  // compaction backlogged mode, so the L1 size is set to 100MB.
  // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels
  // will be needed. The level multiplier is calculated to be 4 and the three
  // levels' targets are [100MB, 400MB, 1600MB].
  //
  // In this mode, the number of levels will be no more than in the normal
  // mode, and the level multiplier will be lower. The write amplification
  // will likely be reduced.
  //
  //
  // max_bytes_for_level_multiplier_additional is ignored with this flag on.
  //
  // Turning this feature on or off for an existing DB can cause an
  // unexpected LSM tree structure so it's not recommended.
  //
  // Default: false
  bool level_compaction_dynamic_level_bytes = false;

  // Default: 10.
  //
  // Dynamically changeable through SetOptions() API
  double max_bytes_for_level_multiplier = 10;

  // Different max-size multipliers for different levels.
  // These are multiplied by max_bytes_for_level_multiplier to arrive
  // at the max-size of each level.
  //
  // Default: 1
  //
  // Dynamically changeable through SetOptions() API
  std::vector<int> max_bytes_for_level_multiplier_additional =
      std::vector<int>(num_levels, 1);
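
  // A sketch of the arithmetic: each level's max size is roughly the
  // previous level's max size times max_bytes_for_level_multiplier times
  // that level's additional multiplier. For example, with
  // max_bytes_for_level_base = 100MB, max_bytes_for_level_multiplier = 10,
  // and all additional multipliers 1, the per-level max sizes are roughly
  // 100MB, 1GB, 10GB, ...; an additional multiplier of 5 configured for one
  // level makes that level's max size 5x what it would otherwise be.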

  // We try to limit the number of bytes in one compaction to be lower than
  // this threshold. But it's not guaranteed.
  // Value 0 will be sanitized.
  //
  // Default: target_file_size_base * 25
  //
  // Dynamically changeable through SetOptions() API
  uint64_t max_compaction_bytes = 0;

  // All writes will be slowed down to at least delayed_write_rate if the
  // estimated bytes needing compaction exceed this threshold.
  //
  // Default: 64GB
  //
  // Dynamically changeable through SetOptions() API
  uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull;

  // All writes are stopped if the estimated bytes needing compaction exceed
  // this threshold.
  //
  // Default: 256GB
  //
  // Dynamically changeable through SetOptions() API
  uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull;

  // The compaction style. Default: kCompactionStyleLevel
  CompactionStyle compaction_style = kCompactionStyleLevel;

  // If compaction_style is kCompactionStyleLevel, for each level,
  // which files are prioritized to be picked to compact.
  // Default: kMinOverlappingRatio
  CompactionPri compaction_pri = kMinOverlappingRatio;

  // The options needed to support Universal Style compactions
  //
  // Dynamically changeable through SetOptions() API
  // Dynamic change example:
  // SetOptions("compaction_options_universal", "{size_ratio=2;}")
  CompactionOptionsUniversal compaction_options_universal;

  // The options for FIFO compaction style
  //
  // Dynamically changeable through SetOptions() API
  // Dynamic change example:
  // SetOptions("compaction_options_fifo", "{max_table_files_size=100;}")
  CompactionOptionsFIFO compaction_options_fifo;

  // An iterator's Next() call sequentially skips over keys with the same
  // user-key unless this option is set. This number specifies the number
  // of keys (with the same userkey) that will be sequentially
  // skipped before a reseek is issued.
  //
  // Default: 8
  //
  // Dynamically changeable through SetOptions() API
  uint64_t max_sequential_skip_in_iterations = 8;

  // This is a factory that provides MemTableRep objects.
  // Default: a factory that provides a skip-list-based implementation of
  // MemTableRep.
  std::shared_ptr<MemTableRepFactory> memtable_factory =
      std::shared_ptr<SkipListFactory>(new SkipListFactory);
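
  // For example, a hash-skiplist based memtable could be plugged in like
  // this (a sketch; this rep also requires a prefix extractor to be
  // configured, and is not supported in ROCKSDB_LITE):
  //
  //   options.memtable_factory.reset(NewHashSkipListRepFactory());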

  // Block-based table related options are moved to BlockBasedTableOptions.
  // Related options that were originally here but now moved include:
  //   no_block_cache
  //   block_cache
  //   block_cache_compressed
  //   block_size
  //   block_size_deviation
  //   block_restart_interval
  //   filter_policy
  //   whole_key_filtering
  // If you'd like to customize some of these options, you will need to
  // use NewBlockBasedTableFactory() to construct a new table factory.

  // This option allows users to collect their own statistics of interest
  // about the tables.
  // Default: empty vector -- no user-defined statistics collection will be
  // performed.
  typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
      TablePropertiesCollectorFactories;
  TablePropertiesCollectorFactories table_properties_collector_factories;
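
  // For example (a sketch; MyCollectorFactory is a hypothetical user-defined
  // subclass of TablePropertiesCollectorFactory):
  //
  //   options.table_properties_collector_factories.emplace_back(
  //       std::make_shared<MyCollectorFactory>());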

  // Maximum number of successive merge operations on a key in the memtable.
  //
  // When a merge operation is added to the memtable and the maximum number of
  // successive merges is reached, the value of the key will be calculated and
  // inserted into the memtable instead of the merge operation. This will
  // ensure that there are never more than max_successive_merges merge
  // operations in the memtable.
  //
  // Default: 0 (disabled)
  //
  // Dynamically changeable through SetOptions() API
  size_t max_successive_merges = 0;

  // This flag specifies that the implementation should optimize the filters
  // mainly for cases where keys are found rather than also optimizing for
  // keys missed. This would be used in cases where the application knows
  // that there are very few misses or the performance in the case of misses
  // is not important.
  //
  // For now, this flag allows us to not store filters for the last level,
  // i.e. the largest level which contains data of the LSM store. For keys
  // which are hits, the filters in this level are not useful because we will
  // search for the data anyway. NOTE: the filters in other levels are still
  // useful even for key hits because they tell us whether to look in that
  // level or go to the higher level.
  //
  // Default: false
  bool optimize_filters_for_hits = false;

  // After writing every SST file, reopen it and read all the keys.
  //
  // Default: false
  //
  // Dynamically changeable through SetOptions() API
  bool paranoid_file_checks = false;

  // In debug mode, RocksDB runs consistency checks on the LSM every time the
  // LSM changes (Flush, Compaction, AddFile). These checks are disabled in
  // release mode; use this option to enable them in release mode as well.
  // Default: false
  bool force_consistency_checks = false;

  // Measure IO stats in compactions and flushes, if true.
  //
  // Default: false
  //
  // Dynamically changeable through SetOptions() API
  bool report_bg_io_stats = false;

  // Files older than TTL will go through the compaction process.
  // In Level: non-bottom-level files older than TTL will go through the
  //           compaction process.
  // In FIFO: files older than TTL will be deleted.
  // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60
  // In FIFO, this option will have the same meaning as
  // periodic_compaction_seconds. Whichever is stricter will be used.
  // 0 means disabled.
  // UINT64_MAX - 1 (0xfffffffffffffffe) is a special flag to allow RocksDB
  // to pick the default.
  //
  // Default: 30 days for leveled compaction + block based table; disabled
  //          otherwise.
  //
  // Dynamically changeable through SetOptions() API
  uint64_t ttl = 0xfffffffffffffffe;

  // Files older than this value will be picked up for compaction, and
  // re-written to the same level as they were before.
  //
  // A file's age is computed by looking at the file_creation_time or
  // creation_time table properties, in order, if they have valid non-zero
  // values; if not, the age is based on the file's last modified time (given
  // by the underlying Env).
  //
  // Supported in Level and FIFO compaction.
  // In FIFO compaction, this option has the same meaning as TTL and
  // whichever is stricter will be used.
  // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60
  //
  // Values:
  // 0: Turn off Periodic compactions.
  // UINT64_MAX - 1 (i.e. 0xfffffffffffffffe): Let RocksDB control this
  //     feature as needed. For now, RocksDB will change this value to 30
  //     days (i.e. 30 * 24 * 60 * 60) so that every file goes through the
  //     compaction process at least once every 30 days if not compacted
  //     sooner. In FIFO compaction, since the option has the same meaning as
  //     ttl, when this value is left at the default and ttl is left at 0,
  //     30 days will be used. Otherwise,
  //     min(ttl, periodic_compaction_seconds) will be used.
  //
  // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune)
  //
  // Dynamically changeable through SetOptions() API
  uint64_t periodic_compaction_seconds = 0xfffffffffffffffe;
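
  // For example, to send non-bottom-level files through compaction weekly
  // and re-write every file at least monthly (a sketch):
  //
  //   options.ttl = 7 * 24 * 60 * 60;                          // 7 days
  //   options.periodic_compaction_seconds = 30 * 24 * 60 * 60; // 30 days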

  // If this option is set then 1 in N blocks are compressed
  // using a fast (lz4) and a slow (zstd) compression algorithm.
  // The compressibility is reported as stats and the stored
  // data is left uncompressed (unless compression is also requested).
  uint64_t sample_for_compression = 0;

  // Create ColumnFamilyOptions with default values for all fields
  AdvancedColumnFamilyOptions();
  // Create ColumnFamilyOptions from Options
  explicit AdvancedColumnFamilyOptions(const Options& options);

  // ---------------- OPTIONS NOT SUPPORTED ANYMORE ----------------

  // NOT SUPPORTED ANYMORE
  // This does not do anything anymore.
  int max_mem_compaction_level;

  // NOT SUPPORTED ANYMORE -- this option is no longer used.
  // Puts are delayed to options.delayed_write_rate when any level has a
  // compaction score that exceeds soft_rate_limit. This is ignored when
  // == 0.0.
  //
  // Default: 0 (disabled)
  //
  // Dynamically changeable through SetOptions() API
  double soft_rate_limit = 0.0;

  // NOT SUPPORTED ANYMORE -- this option is no longer used.
  double hard_rate_limit = 0.0;

  // NOT SUPPORTED ANYMORE -- this option is no longer used.
  unsigned int rate_limit_delay_max_milliseconds = 100;

  // NOT SUPPORTED ANYMORE
  // Does not have any effect.
  bool purge_redundant_kvs_while_flush = true;
};

}  // namespace ROCKSDB_NAMESPACE