1 // Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE file. See the AUTHORS file for names of contributors.
8 
9 #pragma once
10 
11 #include <stddef.h>
12 #include <stdint.h>
13 #include <limits>
14 #include <memory>
15 #include <string>
16 #include <unordered_map>
17 #include <vector>
18 
19 #include "rocksdb/advanced_options.h"
20 #include "rocksdb/comparator.h"
21 #include "rocksdb/env.h"
22 #include "rocksdb/file_checksum.h"
23 #include "rocksdb/listener.h"
24 #include "rocksdb/universal_compaction.h"
25 #include "rocksdb/version.h"
26 #include "rocksdb/write_buffer_manager.h"
27 
28 #ifdef max
29 #undef max
30 #endif
31 
32 namespace ROCKSDB_NAMESPACE {
33 
34 class Cache;
35 class CompactionFilter;
36 class CompactionFilterFactory;
37 class Comparator;
38 class ConcurrentTaskLimiter;
39 class Env;
40 enum InfoLogLevel : unsigned char;
41 class SstFileManager;
42 class FilterPolicy;
43 class Logger;
44 class MergeOperator;
45 class Snapshot;
46 class MemTableRepFactory;
47 class RateLimiter;
48 class Slice;
49 class Statistics;
50 class InternalKeyComparator;
51 class WalFilter;
52 class FileSystem;
53 
54 enum class CpuPriority {
55   kIdle = 0,
56   kLow = 1,
57   kNormal = 2,
58   kHigh = 3,
59 };
60 
61 // DB contents are stored in a set of blocks, each of which holds a
62 // sequence of key,value pairs.  Each block may be compressed before
63 // being stored in a file.  The following enum describes which
64 // compression method (if any) is used to compress a block.
65 enum CompressionType : unsigned char {
66   // NOTE: do not change the values of existing entries, as these are
67   // part of the persistent format on disk.
68   kNoCompression = 0x0,
69   kSnappyCompression = 0x1,
70   kZlibCompression = 0x2,
71   kBZip2Compression = 0x3,
72   kLZ4Compression = 0x4,
73   kLZ4HCCompression = 0x5,
74   kXpressCompression = 0x6,
75   kZSTD = 0x7,
76 
77   // Only use kZSTDNotFinalCompression if you have to use ZSTD lib older than
78   // 0.8.0 or consider a possibility of downgrading the service or copying
79   // the database files to another service running with an older version of
80   // RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We will
81   // eventually remove the option from the public API.
82   kZSTDNotFinalCompression = 0x40,
83 
84   // kDisableCompressionOption is used to disable some compression options.
85   kDisableCompressionOption = 0xff,
86 };
87 
88 struct Options;
89 struct DbPath;
90 
91 struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
92   // The function recovers options to a previous version. Only 4.6 or later
93   // versions are supported.
94   ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
95                                    int rocksdb_minor_version = 6);
96 
97   // Some functions that make it easier to optimize RocksDB
98   // Use this if your DB is very small (like under 1GB) and you don't want to
99   // spend lots of memory for memtables.
100   // An optional cache object is passed in to be used as the block cache
101   ColumnFamilyOptions* OptimizeForSmallDb(
102       std::shared_ptr<Cache>* cache = nullptr);
103 
104   // Use this if you don't need to keep the data sorted, i.e. you'll never use
105   // an iterator, only Put() and Get() API calls
106   //
107   // Not supported in ROCKSDB_LITE
108   ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb);
109 
110   // Default values for some parameters in ColumnFamilyOptions are not
111   // optimized for heavy workloads and big datasets, which means you might
112   // observe write stalls under some conditions. As a starting point for tuning
113   // RocksDB options, use the following two functions:
114   // * OptimizeLevelStyleCompaction -- optimizes level style compaction
115   // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
116   // Universal style compaction is focused on reducing Write Amplification
117   // Factor for big data sets, but increases Space Amplification. You can learn
118   // more about the different styles here:
119   // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
120   // Make sure to also call IncreaseParallelism(), which will provide the
121   // biggest performance gains.
122   // Note: we might use more memory than memtable_memory_budget during periods
123   // of high write rate.
124   //
125   // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
126   ColumnFamilyOptions* OptimizeLevelStyleCompaction(
127       uint64_t memtable_memory_budget = 512 * 1024 * 1024);
128   ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
129       uint64_t memtable_memory_budget = 512 * 1024 * 1024);
130 
131   // -------------------
132   // Parameters that affect behavior
133 
134   // Comparator used to define the order of keys in the table.
135   // Default: a comparator that uses lexicographic byte-wise ordering
136   //
137   // REQUIRES: The client must ensure that the comparator supplied
138   // here has the same name and orders keys *exactly* the same as the
139   // comparator provided to previous open calls on the same DB.
140   const Comparator* comparator = BytewiseComparator();
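  //
  // Illustrative sketch for `comparator` above (not part of the original
  // header): switching a column family to reverse bytewise ordering with the
  // built-in comparator declared in rocksdb/comparator.h. Whatever comparator
  // is chosen must be used consistently across every open of the same DB.
  //
  //   ColumnFamilyOptions cf_opts;
  //   cf_opts.comparator = ReverseBytewiseComparator();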
141 
142   // REQUIRES: The client must provide a merge operator if Merge operation
143   // needs to be accessed. Calling Merge on a DB without a merge operator
144   // would result in Status::NotSupported. The client must ensure that the
145   // merge operator supplied here has the same name and *exactly* the same
146   // semantics as the merge operator provided to previous open calls on
147   // the same DB. The only exception is reserved for upgrade, where a DB
148   // previously without a merge operator is introduced to Merge operation
149   // for the first time. It's necessary to specify a merge operator when
150   // opening the DB in this case.
151   // Default: nullptr
152   std::shared_ptr<MergeOperator> merge_operator = nullptr;
153 
154   // A single CompactionFilter instance to call into during compaction.
155   // Allows an application to modify/delete a key-value during background
156   // compaction.
157   //
158   // If the client requires a new compaction filter to be used for different
159   // compaction runs, it can specify compaction_filter_factory instead of this
160   // option.  The client should specify only one of the two.
161   // compaction_filter takes precedence over compaction_filter_factory if
162   // client specifies both.
163   //
164   // If multithreaded compaction is being used, the supplied CompactionFilter
165   // instance may be used from different threads concurrently and so should be
166   // thread-safe.
167   //
168   // Default: nullptr
169   const CompactionFilter* compaction_filter = nullptr;
170 
171   // This is a factory that provides compaction filter objects which allow
172   // an application to modify/delete a key-value during background compaction.
173   //
174   // A new filter will be created on each compaction run.  If multithreaded
175   // compaction is being used, each created CompactionFilter will only be used
176   // from a single thread and so does not need to be thread-safe.
177   //
178   // Default: nullptr
179   std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
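  //
  // Illustrative sketch for `compaction_filter` / `compaction_filter_factory`
  // above (not part of the original header; the class name and key prefix are
  // made up). A minimal CompactionFilter that drops any key starting with
  // "tmp_"; it requires #include "rocksdb/compaction_filter.h", and the filter
  // object must outlive the DB if assigned to `compaction_filter`.
  //
  //   class DropTempKeysFilter : public CompactionFilter {
  //    public:
  //     bool Filter(int /*level*/, const Slice& key,
  //                 const Slice& /*existing_value*/, std::string* /*new_value*/,
  //                 bool* /*value_changed*/) const override {
  //       return key.starts_with("tmp_");  // true == drop this entry
  //     }
  //     const char* Name() const override { return "DropTempKeysFilter"; }
  //   };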
180 
181   // -------------------
182   // Parameters that affect performance
183 
184   // Amount of data to build up in memory (backed by an unsorted log
185   // on disk) before converting to a sorted on-disk file.
186   //
187   // Larger values increase performance, especially during bulk loads.
188   // Up to max_write_buffer_number write buffers may be held in memory
189   // at the same time,
190   // so you may wish to adjust this parameter to control memory usage.
191   // Also, a larger write buffer will result in a longer recovery time
192   // the next time the database is opened.
193   //
194   // Note that write_buffer_size is enforced per column family.
195   // See db_write_buffer_size for sharing memory across column families.
196   //
197   // Default: 64MB
198   //
199   // Dynamically changeable through SetOptions() API
200   size_t write_buffer_size = 64 << 20;
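  //
  // Illustrative sketch (not part of the original header): since this option
  // is dynamically changeable, it can be adjusted on a running DB through
  // DB::SetOptions(), which takes string values. The size is an example.
  //
  //   db->SetOptions({{"write_buffer_size", "134217728"}});  // 128MB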
201 
202   // Compress blocks using the specified compression algorithm.
203   //
204   // Default: kSnappyCompression, if it's supported. If snappy is not linked
205   // with the library, the default is kNoCompression.
206   //
207   // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
208   //    ~200-500MB/s compression
209   //    ~400-800MB/s decompression
210   //
211   // Note that these speeds are significantly faster than most
212   // persistent storage speeds, and therefore it is typically never
213   // worth switching to kNoCompression.  Even if the input data is
214   // incompressible, the kSnappyCompression implementation will
215   // efficiently detect that and will switch to uncompressed mode.
216   //
217   // If you do not set `compression_opts.level`, or set it to
218   // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick the
219   // default corresponding to `compression` as follows:
220   //
221   // - kZSTD: 3
222   // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
223   // - kLZ4HCCompression: 0
224   // - For all others, we do not specify a compression level
225   //
226   // Dynamically changeable through SetOptions() API
227   CompressionType compression;
228 
229   // Compression algorithm that will be used for the bottommost level that
230   // contains files.
231   //
232   // Default: kDisableCompressionOption (Disabled)
233   CompressionType bottommost_compression = kDisableCompressionOption;
234 
235   // different options for compression algorithms used by bottommost_compression
236   // if it is enabled. To enable it, please see the definition of
237   // CompressionOptions.
238   CompressionOptions bottommost_compression_opts;
239 
240   // different options for compression algorithms
241   CompressionOptions compression_opts;
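  //
  // Illustrative sketch for the compression options above (not part of the
  // original header; the algorithms and level are arbitrary examples, and the
  // `enabled` flag is assumed from the CompressionOptions definition referred
  // to above). A common layout is a cheap compressor for most levels and a
  // stronger one for the bottommost level.
  //
  //   ColumnFamilyOptions cf_opts;
  //   cf_opts.compression = kLZ4Compression;
  //   cf_opts.bottommost_compression = kZSTD;
  //   cf_opts.bottommost_compression_opts.level = 6;
  //   cf_opts.bottommost_compression_opts.enabled = true;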
242 
243   // Number of files to trigger level-0 compaction. A value <0 means that
244   // level-0 compaction will not be triggered by number of files at all.
245   //
246   // Default: 4
247   //
248   // Dynamically changeable through SetOptions() API
249   int level0_file_num_compaction_trigger = 4;
250 
251   // If non-nullptr, use the specified function to determine the
252   // prefixes for keys.  These prefixes will be placed in the filter.
253   // Depending on the workload, this can reduce the read-IOP cost of
254   // scans when a prefix is passed via ReadOptions to
255   // db.NewIterator().  For prefix filtering to work properly,
256   // "prefix_extractor" and "comparator" must be such that the following
257   // properties hold:
258   //
259   // 1) key.starts_with(prefix(key))
260   // 2) Compare(prefix(key), key) <= 0.
261   // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
262   // 4) prefix(prefix(key)) == prefix(key)
263   //
264   // Default: nullptr
265   std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
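  //
  // Illustrative sketch for `prefix_extractor` above (not part of the original
  // header; the prefix length is an arbitrary example). Uses the fixed-length
  // prefix extractor declared in rocksdb/slice_transform.h.
  //
  //   ColumnFamilyOptions cf_opts;
  //   cf_opts.prefix_extractor.reset(NewFixedPrefixTransform(4));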
266 
267   // Control maximum total data size for a level.
268   // max_bytes_for_level_base is the max total for level-1.
269   // Maximum number of bytes for level L can be calculated as
270   // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
271   // For example, if max_bytes_for_level_base is 200MB, and if
272   // max_bytes_for_level_multiplier is 10, total data size for level-1
273   // will be 200MB, total file size for level-2 will be 2GB,
274   // and total file size for level-3 will be 20GB.
275   //
276   // Default: 256MB.
277   //
278   // Dynamically changeable through SetOptions() API
279   uint64_t max_bytes_for_level_base = 256 * 1048576;
280 
281   // Deprecated.
282   uint64_t snap_refresh_nanos = 0;
283 
284   // Disable automatic compactions. Manual compactions can still
285   // be issued on this column family
286   //
287   // Dynamically changeable through SetOptions() API
288   bool disable_auto_compactions = false;
289 
290   // This is a factory that provides TableFactory objects.
291   // Default: a block-based table factory that provides a default
292   // implementation of TableBuilder and TableReader with default
293   // BlockBasedTableOptions.
294   std::shared_ptr<TableFactory> table_factory;
295 
296   // A list of paths where SST files for this column family
297   // can be put into, with its target size. Similar to db_paths,
298   // newer data is placed into paths specified earlier in the
299   // vector while older data gradually moves to paths specified
300   // later in the vector.
301   // Note that, if a path is supplied to multiple column
302   // families, it would have files and total size from all
303   // the column families combined. Users should provision for the
304   // total size (from all the column families) in such cases.
305   //
306   // If left empty, db_paths will be used.
307   // Default: empty
308   std::vector<DbPath> cf_paths;
309 
310   // Compaction concurrent thread limiter for the column family.
311   // If non-nullptr, use given concurrent thread limiter to control
312   // the max outstanding compaction tasks. Limiter can be shared with
313   // multiple column families across db instances.
314   //
315   // Default: nullptr
316   std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr;
317 
318   // Create ColumnFamilyOptions with default values for all fields
319   ColumnFamilyOptions();
320   // Create ColumnFamilyOptions from Options
321   explicit ColumnFamilyOptions(const Options& options);
322 
323   void Dump(Logger* log) const;
324 };
325 
326 enum class WALRecoveryMode : char {
327   // Original levelDB recovery
328   // We tolerate incomplete record in trailing data on all logs
329   // Use case : This is legacy behavior
330   kTolerateCorruptedTailRecords = 0x00,
331   // Recover from clean shutdown
332   // We don't expect to find any corruption in the WAL
333   // Use case : This is ideal for unit tests and rare applications that
334   // can require high consistency guarantee
335   kAbsoluteConsistency = 0x01,
336   // Recover to point-in-time consistency (default)
337   // We stop the WAL playback on discovering WAL inconsistency
338   // Use case : Ideal for systems that have disk controller cache like
339   // hard disk, SSD without super capacitor that store related data
340   kPointInTimeRecovery = 0x02,
341   // Recovery after a disaster
342   // We ignore any corruption in the WAL and try to salvage as much data as
343   // possible
344   // Use case : Ideal for last ditch effort to recover data or systems that
345   // operate with low grade unrelated data
346   kSkipAnyCorruptedRecords = 0x03,
347 };
348 
349 struct DbPath {
350   std::string path;
351   uint64_t target_size;  // Target size of total files under the path, in bytes.
352 
353   DbPath() : target_size(0) {}
354   DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
355 };
356 
357 struct DBOptions {
358   // The function recovers options to their values as of version 4.6.
359   DBOptions* OldDefaults(int rocksdb_major_version = 4,
360                          int rocksdb_minor_version = 6);
361 
362   // Some functions that make it easier to optimize RocksDB
363 
364   // Use this if your DB is very small (like under 1GB) and you don't want to
365   // spend lots of memory for memtables.
366   // An optional cache object is passed in; the memory used by the
367   // memtable will be charged (costed) against this cache.
368   DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);
369 
370 #ifndef ROCKSDB_LITE
371   // By default, RocksDB uses only one background thread for flush and
372   // compaction. Calling this function will set it up such that a total of
373   // `total_threads` is used. A good value for `total_threads` is the number of
374   // cores. You almost definitely want to call this function if your system is
375   // bottlenecked by RocksDB.
376   DBOptions* IncreaseParallelism(int total_threads = 16);
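  //
  // Illustrative sketch for IncreaseParallelism() above (not part of the
  // original header): sizing the thread pools from the machine's core count
  // (requires <thread>).
  //
  //   DBOptions db_opts;
  //   db_opts.IncreaseParallelism(
  //       static_cast<int>(std::thread::hardware_concurrency()));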
377 #endif  // ROCKSDB_LITE
378 
379   // If true, the database will be created if it is missing.
380   // Default: false
381   bool create_if_missing = false;
382 
383   // If true, missing column families will be automatically created.
384   // Default: false
385   bool create_missing_column_families = false;
386 
387   // If true, an error is raised if the database already exists.
388   // Default: false
389   bool error_if_exists = false;
390 
391   // If true, RocksDB will aggressively check consistency of the data.
392   // Also, if any of the writes to the database fails (Put, Delete, Merge,
393   // Write), the database will switch to read-only mode and fail all other
394   // Write operations.
395   // In most cases you want this to be set to true.
396   // Default: true
397   bool paranoid_checks = true;
398 
399   // Use the specified object to interact with the environment,
400   // e.g. to read/write files, schedule background work, etc. In the near
401   // future, support for doing storage operations such as read/write files
402   // through env will be deprecated in favor of file_system (see below)
403   // Default: Env::Default()
404   Env* env = Env::Default();
405 
406   // Use to control write rate of flush and compaction. Flush has higher
407   // priority than compaction. Rate limiting is disabled if nullptr.
408   // If rate limiter is enabled, bytes_per_sync is set to 1MB by default.
409   // Default: nullptr
410   std::shared_ptr<RateLimiter> rate_limiter = nullptr;
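  //
  // Illustrative sketch for `rate_limiter` above (not part of the original
  // header; the rate is an arbitrary example). Uses the generic rate limiter
  // declared in rocksdb/rate_limiter.h to cap background I/O at ~16MB/s.
  //
  //   DBOptions db_opts;
  //   db_opts.rate_limiter.reset(NewGenericRateLimiter(16 << 20));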
411 
412   // Use to track SST files and control their file deletion rate.
413   //
414   // Features:
415   //  - Throttle the deletion rate of the SST files.
416   //  - Keep track of the total size of all SST files.
417   //  - Set a maximum allowed space limit for SST files; when it is reached,
418   //    the DB won't do any further flushes or compactions and will set the
419   //    background error.
420   //  - Can be shared between multiple dbs.
421   // Limitations:
422   //  - Only track and throttle deletes of SST files in the
423   //    first db_path (db_name if db_paths is empty).
424   //
425   // Default: nullptr
426   std::shared_ptr<SstFileManager> sst_file_manager = nullptr;
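  //
  // Illustrative sketch for `sst_file_manager` above (not part of the original
  // header; the deletion rate is an arbitrary example). Uses the factory
  // declared in rocksdb/sst_file_manager.h.
  //
  //   DBOptions db_opts;
  //   db_opts.sst_file_manager.reset(NewSstFileManager(Env::Default()));
  //   db_opts.sst_file_manager->SetDeleteRateBytesPerSecond(64 << 20);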
427 
428   // Any internal progress/error information generated by the db will
429   // be written to info_log if it is non-nullptr, or to a file stored
430   // in the same directory as the DB contents if info_log is nullptr.
431   // Default: nullptr
432   std::shared_ptr<Logger> info_log = nullptr;
433 
434 #ifdef NDEBUG
435   InfoLogLevel info_log_level = INFO_LEVEL;
436 #else
437   InfoLogLevel info_log_level = DEBUG_LEVEL;
438 #endif  // NDEBUG
439 
440   // Number of open files that can be used by the DB.  You may need to
441   // increase this if your database has a large working set. Value -1 means
442   // files opened are always kept open. You can estimate number of files based
443   // on target_file_size_base and target_file_size_multiplier for level-based
444   // compaction. For universal-style compaction, you can usually set it to -1.
445   //
446   // Default: -1
447   //
448   // Dynamically changeable through SetDBOptions() API.
449   int max_open_files = -1;
450 
451   // If max_open_files is -1, DB will open all files on DB::Open(). You can
452   // use this option to increase the number of threads used to open the files.
453   // Default: 16
454   int max_file_opening_threads = 16;
455 
456   // Once write-ahead logs exceed this size, we will start forcing the flush of
457   // column families whose memtables are backed by the oldest live WAL file
458   // (i.e. the ones that are causing all the space amplification). If set to 0
459   // (default), we will dynamically choose the WAL size limit to be
460   // [sum of all write_buffer_size * max_write_buffer_number] * 4
461   // This option takes effect only when there is more than one column family, as
462   // otherwise the WAL size is dictated by the write_buffer_size.
463   //
464   // Default: 0
465   //
466   // Dynamically changeable through SetDBOptions() API.
467   uint64_t max_total_wal_size = 0;
468 
469   // If non-null, then we should collect metrics about database operations
470   std::shared_ptr<Statistics> statistics = nullptr;
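  //
  // Illustrative sketch for `statistics` above (not part of the original
  // header): enabling statistics with CreateDBStatistics() from
  // rocksdb/statistics.h and dumping them later.
  //
  //   DBOptions db_opts;
  //   db_opts.statistics = CreateDBStatistics();
  //   // ... run the workload ...
  //   std::string stats_report = db_opts.statistics->ToString();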
471 
472   // By default, writes to stable storage use fdatasync (on platforms
473   // where this function is available). If this option is true,
474   // fsync is used instead.
475   //
476   // fsync and fdatasync are equally safe for our purposes and fdatasync is
477   // faster, so it is rarely necessary to set this option. It is provided
478   // as a workaround for kernel/filesystem bugs, such as one that affected
479   // fdatasync with ext4 in kernel versions prior to 3.7.
480   bool use_fsync = false;
481 
482   // A list of paths where SST files can be put into, with its target size.
483   // Newer data is placed into paths specified earlier in the vector while
484   // older data gradually moves to paths specified later in the vector.
485   //
486   // For example, you have a flash device with 10GB allocated for the DB,
487   // as well as a hard drive of 2TB, you should config it to be:
488   //   [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
489   //
490   // The system will try to guarantee data under each path is close to but
491   // not larger than the target size. But current and future file sizes used
492   // when determining where to place a file are based on best-effort estimation,
493   // which means there is a chance that the actual size under the directory
494   // is slightly more than the target size under some workloads. Users should
495   // leave some buffer room for those cases.
496   //
497   // If none of the paths has sufficient room to place a file, the file will
498   // be placed in the last path anyway, regardless of the target size.
499   //
500   // Placing newer data in earlier paths is also best-effort. Users should
501   // expect user files to be placed in higher levels in some extreme cases.
502   //
503   // If left empty, only one path will be used, which is db_name passed when
504   // opening the DB.
505   // Default: empty
506   std::vector<DbPath> db_paths;
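  //
  // Illustrative sketch for `db_paths` above (not part of the original header;
  // paths and sizes are examples): expressing the flash + hard drive layout
  // described in the comment above.
  //
  //   DBOptions db_opts;
  //   db_opts.db_paths.emplace_back("/flash_path", 10ull << 30);  // 10GB
  //   db_opts.db_paths.emplace_back("/hard_drive", 2ull << 40);   // 2TB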
507 
508   // This specifies the info LOG dir.
509   // If it is empty, the log files will be in the same dir as data.
510   // If it is non empty, the log files will be in the specified dir,
511   // and the db data dir's absolute path will be used as the log file
512   // name's prefix.
513   std::string db_log_dir = "";
514 
515   // This specifies the absolute dir path for write-ahead logs (WAL).
516   // If it is empty, the log files will be in the same dir as data,
517   //   dbname is used as the data dir by default
518   // If it is non empty, the log files will be kept in the specified dir.
519   // When destroying the db,
520   //   all log files in wal_dir and the dir itself are deleted
521   std::string wal_dir = "";
522 
523   // The periodicity with which obsolete files get deleted. The default
524   // value is 6 hours. The files that get out of scope by the compaction
525   // process will still get automatically deleted on every compaction,
526   // regardless of this setting
527   //
528   // Default: 6 hours
529   //
530   // Dynamically changeable through SetDBOptions() API.
531   uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000;
532 
533   // Maximum number of concurrent background jobs (compactions and flushes).
534   //
535   // Default: 2
536   //
537   // Dynamically changeable through SetDBOptions() API.
538   int max_background_jobs = 2;
539 
540   // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
541   // value of max_background_jobs. This option is ignored.
542   //
543   // Dynamically changeable through SetDBOptions() API.
544   int base_background_compactions = -1;
545 
546   // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
547   // value of max_background_jobs. For backwards compatibility we will set
548   // `max_background_jobs = max_background_compactions + max_background_flushes`
549   // in the case where user sets at least one of `max_background_compactions` or
550   // `max_background_flushes` (we replace -1 by 1 in case one option is unset).
551   //
552   // Maximum number of concurrent background compaction jobs, submitted to
553   // the default LOW priority thread pool.
554   //
555   // If you're increasing this, also consider increasing number of threads in
556   // LOW priority thread pool. For more information, see
557   // Env::SetBackgroundThreads
558   //
559   // Default: -1
560   //
561   // Dynamically changeable through SetDBOptions() API.
562   int max_background_compactions = -1;
563 
564   // This value represents the maximum number of threads that will
565   // concurrently perform a compaction job by breaking it into multiple,
566   // smaller ones that are run simultaneously.
567   // Default: 1 (i.e. no subcompactions)
568   uint32_t max_subcompactions = 1;
569 
570   // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
571   // value of max_background_jobs. For backwards compatibility we will set
572   // `max_background_jobs = max_background_compactions + max_background_flushes`
573   // in the case where user sets at least one of `max_background_compactions` or
574   // `max_background_flushes`.
575   //
576   // Maximum number of concurrent background memtable flush jobs, submitted by
577   // default to the HIGH priority thread pool. If the HIGH priority thread pool
578   // is configured to have zero threads, flush jobs will share the LOW priority
579   // thread pool with compaction jobs.
580   //
581   // It is important to use both thread pools when the same Env is shared by
582   // multiple db instances. Without a separate pool, long running compaction
583   // jobs could potentially block memtable flush jobs of other db instances,
584   // leading to unnecessary Put stalls.
585   //
586   // If you're increasing this, also consider increasing number of threads in
587   // HIGH priority thread pool. For more information, see
588   // Env::SetBackgroundThreads
589   // Default: -1
590   int max_background_flushes = -1;
591 
592   // Specify the maximal size of the info log file. If the log file
593   // is larger than `max_log_file_size`, a new info log file will
594   // be created.
595   // If max_log_file_size == 0, all logs will be written to one
596   // log file.
597   size_t max_log_file_size = 0;
598 
599   // Time for the info log file to roll (in seconds).
600   // If specified with non-zero value, log file will be rolled
601   // if it has been active longer than `log_file_time_to_roll`.
602   // Default: 0 (disabled)
603   // Not supported in ROCKSDB_LITE mode!
604   size_t log_file_time_to_roll = 0;
605 
606   // Maximal info log files to be kept.
607   // Default: 1000
608   size_t keep_log_file_num = 1000;
609 
610   // Recycle log files.
611   // If non-zero, we will reuse previously written log files for new
612   // logs, overwriting the old data.  The value indicates how many
613   // such files we will keep around at any point in time for later
614   // use.  This is more efficient because the blocks are already
615   // allocated and fdatasync does not need to update the inode after
616   // each write.
617   // Default: 0
618   size_t recycle_log_file_num = 0;
619 
620   // The manifest file is rolled over on reaching this limit.
621   // The older manifest file will be deleted.
622   // The default value is 1GB so that the manifest file can grow, but not
623   // reach the limit of storage capacity.
624   uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
625 
626   // Number of shards used for table cache.
627   int table_cache_numshardbits = 6;
628 
629   // NOT SUPPORTED ANYMORE
630   // int table_cache_remove_scan_count_limit;
631 
632   // The following two fields affect how archived logs will be deleted.
633   // 1. If both set to 0, logs will be deleted asap and will not get into
634   //    the archive.
635   // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
636   //    WAL files will be checked every 10 min and if total size is greater
637   //    than WAL_size_limit_MB, they will be deleted starting with the
638   //    earliest until size_limit is met. All empty files will be deleted.
639   // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
640   //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
641   //    are older than WAL_ttl_seconds will be deleted.
642   // 4. If both are not 0, WAL files will be checked every 10 min and both
643   //    checks will be performed with ttl being first.
644   uint64_t WAL_ttl_seconds = 0;
645   uint64_t WAL_size_limit_MB = 0;
646 
647   // Number of bytes to preallocate (via fallocate) the manifest
648   // files.  Default is 4mb, which is reasonable to reduce random IO
649   // as well as prevent overallocation for mounts that preallocate
650   // large amounts of data (such as xfs's allocsize option).
651   size_t manifest_preallocation_size = 4 * 1024 * 1024;
652 
653   // Allow the OS to mmap file for reading sst tables. Default: false
654   bool allow_mmap_reads = false;
655 
656   // Allow the OS to mmap file for writing.
657   // DB::SyncWAL() only works if this is set to false.
658   // Default: false
659   bool allow_mmap_writes = false;
660 
661   // Enable direct I/O mode for reads/writes.
662   // This may or may not improve performance depending on the use case.
663   //
664   // Files will be opened in "direct I/O" mode
665   // which means that data r/w from the disk will not be cached or
666   // buffered. The hardware buffer of the devices may however still
667   // be used. Memory mapped files are not impacted by these parameters.
668 
669   // Use O_DIRECT for user and compaction reads.
670   // When true, we also force new_table_reader_for_compaction_inputs to true.
671   // Default: false
672   // Not supported in ROCKSDB_LITE mode!
673   bool use_direct_reads = false;
674 
675   // Use O_DIRECT for writes in background flush and compactions.
676   // Default: false
677   // Not supported in ROCKSDB_LITE mode!
678   bool use_direct_io_for_flush_and_compaction = false;
679 
680   // If false, fallocate() calls are bypassed
681   bool allow_fallocate = true;
682 
683   // Disable child process inherit open files. Default: true
684   bool is_fd_close_on_exec = true;
685 
686   // NOT SUPPORTED ANYMORE -- this options is no longer used
687   bool skip_log_error_on_recovery = false;
688 
689   // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
690   //
691   // Default: 600 (10 min)
692   //
693   // Dynamically changeable through SetDBOptions() API.
694   unsigned int stats_dump_period_sec = 600;
695 
696   // if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
697   // Default: 600
698   unsigned int stats_persist_period_sec = 600;
699 
700   // If true, automatically persist stats to a hidden column family (column
701   // family name: ___rocksdb_stats_history___) every
702   // stats_persist_period_sec seconds; otherwise, write to an in-memory
703   // struct. User can query through `GetStatsHistory` API.
704   // If the user attempts to create a column family with the same name on a DB
705   // which has previously set persist_stats_to_disk to true, the column family
706   // creation will fail, but the hidden column family will survive, as well as
707   // the previously persisted statistics.
708   // When persisting stats to disk, the stat name will be limited to 100 bytes.
709   // Default: false
710   bool persist_stats_to_disk = false;
711 
712   // if not zero, periodically take stats snapshots and store them in memory; the
713   // memory size for stats snapshots is capped at stats_history_buffer_size
714   // Default: 1MB
715   size_t stats_history_buffer_size = 1024 * 1024;
716 
717   // If set to true, will hint the underlying file system that the file
718   // access pattern is random, when an sst file is opened.
719   // Default: true
720   bool advise_random_on_open = true;
721 
722   // Amount of data to build up in memtables across all column
723   // families before writing to disk.
724   //
725   // This is distinct from write_buffer_size, which enforces a limit
726   // for a single memtable.
727   //
728   // This feature is disabled by default. Specify a non-zero value
729   // to enable it.
730   //
731   // Default: 0 (disabled)
732   size_t db_write_buffer_size = 0;
733 
734   // The memory usage of memtables will be reported to this object. The same object
735   // can be passed into multiple DBs and it will track the sum of size of all
736   // the DBs. If the total size of all live memtables of all the DBs exceeds
737   // a limit, a flush will be triggered in the next DB to which the next write
738   // is issued.
739   //
740   // If the object is only passed to one DB, the behavior is the same as
741   // db_write_buffer_size. When write_buffer_manager is set, the value set will
742   // override db_write_buffer_size.
743   //
744   // This feature is disabled by default. Specify a non-zero value
745   // to enable it.
746   //
747   // Default: null
748   std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr;
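  //
  // Illustrative sketch for `write_buffer_manager` above (not part of the
  // original header; the budget is an arbitrary example): sharing one 1GB
  // memtable budget across several DB instances.
  //
  //   auto wbm = std::make_shared<WriteBufferManager>(1ull << 30);
  //   DBOptions db_opts_a, db_opts_b;
  //   db_opts_a.write_buffer_manager = wbm;
  //   db_opts_b.write_buffer_manager = wbm;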
749 
750   // Specify the file access pattern once a compaction is started.
751   // It will be applied to all input files of a compaction.
752   // Default: NORMAL
753   enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
754   AccessHint access_hint_on_compaction_start = NORMAL;
755 
756   // If true, always create a new file descriptor and new table reader
757   // for compaction inputs. Turning this parameter on may introduce extra
758   // memory usage in the table reader, if it allocates extra memory
759   // for indexes. This will allow file descriptor prefetch options
760   // to be set for compaction input files and not to impact file
761   // descriptors for the same file used by user queries.
762   // We suggest enabling BlockBasedTableOptions.cache_index_and_filter_blocks
763   // for this mode if using block-based table.
764   //
765   // Default: false
766   // This flag has no effect on the behavior of compaction and is planned for
767   // removal in the future.
768   bool new_table_reader_for_compaction_inputs = false;
769 
770   // If non-zero, we perform bigger reads when doing compaction. If you're
771   // running RocksDB on spinning disks, you should set this to at least 2MB.
772   // That way RocksDB's compaction is doing sequential instead of random reads.
773   //
774   // When non-zero, we also force new_table_reader_for_compaction_inputs to
775   // true.
776   //
777   // Default: 0
778   //
779   // Dynamically changeable through SetDBOptions() API.
780   size_t compaction_readahead_size = 0;
781 
782   // This is a maximum buffer size that is used by WinMmapReadableFile in
783   // unbuffered disk I/O mode. We need to maintain an aligned buffer for
784   // reads. We allow the buffer to grow until the specified value and then
785   // for bigger requests allocate one shot buffers. In unbuffered mode we
786   // always bypass read-ahead buffer at ReadaheadRandomAccessFile
787   // When read-ahead is required we then make use of compaction_readahead_size
788   // value and always try to read ahead. With read-ahead we always
789   // pre-allocate buffer to the size instead of growing it up to a limit.
790   //
791   // This option is currently honored only on Windows
792   //
793   // Default: 1 Mb
794   //
795   // Special value: 0 - means do not maintain per instance buffer. Allocate
796   //                per request buffer and avoid locking.
797   size_t random_access_max_buffer_size = 1024 * 1024;
798 
799   // This is the maximum buffer size that is used by WritableFileWriter.
800   // On Windows, we need to maintain an aligned buffer for writes.
801   // We allow the buffer to grow until its size hits the limit in buffered
802   // IO and fix the buffer size when using direct IO to ensure alignment of
803   // write requests if the logical sector size is unusual
804   //
805   // Default: 1024 * 1024 (1 MB)
806   //
807   // Dynamically changeable through SetDBOptions() API.
808   size_t writable_file_max_buffer_size = 1024 * 1024;
809 
810   // Use adaptive mutex, which spins in the user space before resorting
811   // to kernel. This could reduce context switch when the mutex is not
812   // heavily contended. However, if the mutex is hot, we could end up
813   // wasting spin time.
814   // Default: false
815   bool use_adaptive_mutex = false;
816 
817   // Create DBOptions with default values for all fields
818   DBOptions();
819   // Create DBOptions from Options
820   explicit DBOptions(const Options& options);
821 
822   void Dump(Logger* log) const;
823 
824   // Allows OS to incrementally sync files to disk while they are being
825   // written, asynchronously, in the background. This operation can be used
826   // to smooth out write I/Os over time. Users shouldn't rely on it for
827   // persistence guarantees.
828   // Issue one request for every bytes_per_sync written. 0 turns it off.
829   //
830   // You may consider using rate_limiter to regulate write rate to device.
831   // When the rate limiter is enabled, it automatically sets bytes_per_sync
832   // to 1MB.
833   //
834   // This option applies to table files
835   //
836   // Default: 0, turned off
837   //
838   // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead
839   // Dynamically changeable through SetDBOptions() API.
840   uint64_t bytes_per_sync = 0;
841 
842   // Same as bytes_per_sync, but applies to WAL files
843   //
844   // Default: 0, turned off
845   //
846   // Dynamically changeable through SetDBOptions() API.
847   uint64_t wal_bytes_per_sync = 0;
848 
849   // When true, guarantees WAL files have at most `wal_bytes_per_sync`
850   // bytes submitted for writeback at any given time, and SST files have at most
851   // `bytes_per_sync` bytes pending writeback at any given time. This can be
852   // used to handle cases where processing speed exceeds I/O speed during file
853   // generation, which can lead to a huge sync when the file is finished, even
854   // with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
855   //
856   //  - If `sync_file_range` is supported it achieves this by waiting for any
857   //    prior `sync_file_range`s to finish before proceeding. In this way,
858   //    processing (compression, etc.) can proceed uninhibited in the gap
859   //    between `sync_file_range`s, and we block only when I/O falls behind.
860   //  - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
861   //    always blocks, thus preventing the interleaving of I/O and processing.
862   //
863   // Note: Enabling this option does not provide any additional persistence
864   // guarantees, as it may use `sync_file_range`, which does not write out
865   // metadata.
866   //
867   // Default: false
868   bool strict_bytes_per_sync = false;
869 
870   // A vector of EventListeners whose callback functions will be called
871   // when specific RocksDB events happen.
872   std::vector<std::shared_ptr<EventListener>> listeners;
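  //
  // Illustrative sketch for `listeners` above (not part of the original
  // header; the class name is made up): a listener that logs finished flushes.
  // EventListener and FlushJobInfo come from rocksdb/listener.h, which is
  // already included above.
  //
  //   class FlushLoggingListener : public EventListener {
  //    public:
  //     void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
  //       fprintf(stderr, "flushed %s\n", info.file_path.c_str());
  //     }
  //   };
  //
  //   DBOptions db_opts;
  //   db_opts.listeners.push_back(std::make_shared<FlushLoggingListener>());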
873 
874   // If true, then the status of the threads involved in this DB will
875   // be tracked and available via GetThreadList() API.
876   //
877   // Default: false
878   bool enable_thread_tracking = false;
879 
880   // The limited write rate to DB if soft_pending_compaction_bytes_limit or
881   // level0_slowdown_writes_trigger is triggered, or we are writing to the
882   // last mem table allowed and we allow more than 3 mem tables. It is
883   // calculated using size of user write requests before compression.
884   // RocksDB may decide to slow down more if the compaction still
885   // gets behind further.
886   // If the value is 0, we will infer a value from the `rate_limiter` value
887   // if it is not empty, or 16MB if `rate_limiter` is empty. Note that
888   // if users change the rate in `rate_limiter` after DB is opened,
889   // `delayed_write_rate` won't be adjusted.
890   //
891   // Unit: byte per second.
892   //
893   // Default: 0
894   //
895   // Dynamically changeable through SetDBOptions() API.
896   uint64_t delayed_write_rate = 0;
897 
898   // By default, a single write thread queue is maintained. The thread that gets
899   // to the head of the queue becomes the write batch group leader, responsible
900   // for writing to the WAL and memtable for the batch group.
901   //
902   // If enable_pipelined_write is true, separate write thread queues are
903   // maintained for WAL writes and memtable writes. A write thread first enters
904   // the WAL writer queue and then the memtable writer queue. A pending thread on
905   // the WAL writer queue thus only has to wait for previous writers to finish their
906   // WAL writing but not the memtable writing. Enabling the feature may improve
907   // write throughput and reduce latency of the prepare phase of two-phase
908   // commit.
909   //
910   // Default: false
911   bool enable_pipelined_write = false;
912 
913   // Setting unordered_write to true gains higher write throughput by
914   // relaxing the immutability guarantee of snapshots. This violates the
915   // repeatability one expects from ::Get from a snapshot, as well as
916   // ::MultiGet and Iterator's consistent-point-in-time view property.
917   // If the application cannot tolerate the relaxed guarantees, it can implement
918   // its own mechanisms to work around that and yet benefit from the higher
919   // throughput. Using TransactionDB with WRITE_PREPARED write policy and
920   // two_write_queues=true is one way to achieve immutable snapshots despite
921   // unordered_write.
922   //
923   // By default, i.e., when it is false, rocksdb does not advance the sequence
924   // number for new snapshots unless all the writes with lower sequence numbers
925   // are already finished. This provides the immutability that we expect from
926   // snapshots. Moreover, since Iterator and MultiGet internally depend on
927   // snapshots, the snapshot immutability results in Iterator and MultiGet
928   // offering consistent-point-in-time view. If set to true, although
929   // Read-Your-Own-Write property is still provided, the snapshot immutability
930   // property is relaxed: the writes issued after the snapshot is obtained (with
931   // larger sequence numbers) will be still not visible to the reads from that
932   // snapshot, however, there still might be pending writes (with lower sequence
933   // number) that will change the state visible to the snapshot after they are
934   // landed to the memtable.
935   //
936   // Default: false
937   bool unordered_write = false;
938 
939   // If true, allow multi-writers to update mem tables in parallel.
940   // Only some memtable_factory-s support concurrent writes; currently it
941   // is implemented only for SkipListFactory.  Concurrent memtable writes
942   // are not compatible with inplace_update_support or filter_deletes.
943   // It is strongly recommended to set enable_write_thread_adaptive_yield
944   // if you are going to use this feature.
945   //
946   // Default: true
947   bool allow_concurrent_memtable_write = true;
948 
949   // If true, threads synchronizing with the write batch group leader will
950   // wait for up to write_thread_max_yield_usec before blocking on a mutex.
951   // This can substantially improve throughput for concurrent workloads,
952   // regardless of whether allow_concurrent_memtable_write is enabled.
953   //
954   // Default: true
955   bool enable_write_thread_adaptive_yield = true;
956 
957   // The maximum limit of number of bytes that are written in a single batch
958   // of WAL or memtable write. It is followed when the leader write size
959   // is larger than 1/8 of this limit.
960   //
961   // Default: 1 MB
962   uint64_t max_write_batch_group_size_bytes = 1 << 20;
963 
964   // The maximum number of microseconds that a write operation will use
965   // a yielding spin loop to coordinate with other write threads before
966   // blocking on a mutex.  (Assuming write_thread_slow_yield_usec is
967   // set properly) increasing this value is likely to increase RocksDB
968   // throughput at the expense of increased CPU usage.
969   //
970   // Default: 100
971   uint64_t write_thread_max_yield_usec = 100;
972 
973   // The latency in microseconds after which a std::this_thread::yield
974   // call (sched_yield on Linux) is considered to be a signal that
975   // other processes or threads would like to use the current core.
976   // Increasing this makes writer threads more likely to take CPU
977   // by spinning, which will show up as an increase in the number of
978   // involuntary context switches.
979   //
980   // Default: 3
981   uint64_t write_thread_slow_yield_usec = 3;
982 
983   // If true, then DB::Open() will not update the statistics used to optimize
984   // compaction decision by loading table properties from many files.
985   // Turning off this feature will improve DBOpen time, especially in
986   // disk environments.
987   //
988   // Default: false
989   bool skip_stats_update_on_db_open = false;
990 
991   // If true, then DB::Open() will not fetch and check sizes of all sst files.
992   // This may significantly speed up startup if there are many sst files,
993   // especially when using non-default Env with expensive GetFileSize().
994   // We'll still check that all required sst files exist.
995   // If paranoid_checks is false, this option is ignored, and sst files are
996   // not checked at all.
997   //
998   // Default: false
999   bool skip_checking_sst_file_sizes_on_db_open = false;
1000 
1001   // Recovery mode to control the consistency while replaying WAL
1002   // Default: kPointInTimeRecovery
1003   WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
1004 
1005   // if set to false then recovery will fail when a prepared
1006   // transaction is encountered in the WAL
1007   bool allow_2pc = false;
1008 
1009   // A global cache for table-level rows.
1010   // Default: nullptr (disabled)
1011   // Not supported in ROCKSDB_LITE mode!
1012   std::shared_ptr<Cache> row_cache = nullptr;
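  //
  // Illustrative sketch for `row_cache` above (not part of the original
  // header; the size is an arbitrary example): enabling a 64MB row cache with
  // NewLRUCache() from rocksdb/cache.h.
  //
  //   DBOptions db_opts;
  //   db_opts.row_cache = NewLRUCache(64 << 20);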
1013 
1014 #ifndef ROCKSDB_LITE
1015   // A filter object supplied to be invoked while processing write-ahead-logs
1016   // (WALs) during recovery. The filter provides a way to inspect log
1017   // records, ignoring a particular record or skipping replay.
1018   // The filter is invoked at startup and is invoked from a single-thread
1019   // currently.
1020   WalFilter* wal_filter = nullptr;
1021 #endif  // ROCKSDB_LITE
1022 
1023   // If true, then DB::Open / CreateColumnFamily / DropColumnFamily
1024   // / SetOptions will fail if options file is not detected or properly
1025   // persisted.
1026   //
1027   // DEFAULT: false
1028   bool fail_if_options_file_error = false;
1029 
1030   // If true, then print malloc stats together with rocksdb.stats
1031   // when printing to LOG.
1032   // DEFAULT: false
1033   bool dump_malloc_stats = false;
1034 
1035   // By default RocksDB replays WAL logs and flushes them on DB open, which may
1036   // create very small SST files. If this option is enabled, RocksDB will try
1037   // to avoid (but not guarantee not to) flush during recovery. Also, existing
1038   // WAL logs will be kept, so that if a crash happens before flush, we still
1039   // have logs to recover from.
1040   //
1041   // DEFAULT: false
1042   bool avoid_flush_during_recovery = false;
1043 
1044   // By default RocksDB will flush all memtables on DB close if there is
1045   // unpersisted data (i.e. with WAL disabled). The flush can be skipped to speed up
1046   // DB close. Unpersisted data WILL BE LOST.
1047   //
1048   // DEFAULT: false
1049   //
1050   // Dynamically changeable through SetDBOptions() API.
1051   bool avoid_flush_during_shutdown = false;
1052 
1053   // Set this option to true during creation of database if you want
1054   // to be able to ingest behind (call IngestExternalFile() skipping keys
1055   // that already exist, rather than overwriting matching keys).
1056   // Setting this option to true will affect two things:
1057   // 1) Disable some internal optimizations around SST file compression.
1058   // 2) Reserve the bottom-most level for ingested files only.
1059   // Note that num_levels should be >= 3 if this option is turned on.
1060   //
1061   // DEFAULT: false
1062   // Immutable.
1063   bool allow_ingest_behind = false;
1064 
1065   // Needed to support differential snapshots.
1066   // If set to true then DB will only process deletes with sequence number
1067   // less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts).
1068   // Clients are responsible for periodically calling this method to advance
1069   // the cutoff time. If this method is never called and preserve_deletes
1070   // is set to true NO deletes will ever be processed.
1071   // At the moment this only keeps normal deletes, SingleDeletes will
1072   // not be preserved.
1073   // DEFAULT: false
1074   // Immutable (TODO: make it dynamically changeable)
1075   bool preserve_deletes = false;
1076 
1077   // If enabled it uses two queues for writes, one for the ones with
1078   // disable_memtable and one for the ones that also write to memtable. This
1079   // allows the memtable writes not to lag behind other writes. It can be used
1080   // to optimize MySQL 2PC in which only the commits, which are serial, write to
1081   // memtable.
1082   bool two_write_queues = false;
1083 
1084   // If true WAL is not flushed automatically after each write. Instead it
1085   // relies on manual invocation of FlushWAL to write the WAL buffer to its
1086   // file.
1087   bool manual_wal_flush = false;
1088 
1089   // If true, RocksDB supports flushing multiple column families and committing
1090   // their results atomically to MANIFEST. Note that it is not
1091   // necessary to set atomic_flush to true if WAL is always enabled since WAL
1092   // allows the database to be restored to the last persistent state in WAL.
1093   // This option is useful when there are column families with writes NOT
1094   // protected by WAL.
1095   // For manual flush, application has to specify which column families to
1096   // flush atomically in DB::Flush.
1097   // For auto-triggered flush, RocksDB atomically flushes ALL column families.
1098   //
1099   // Currently, any WAL-enabled writes after atomic flush may be replayed
1100   // independently if the process crashes later and tries to recover.
1101   bool atomic_flush = false;
1102 
1103   // If true, working threads may avoid doing unnecessary and long-latency
1104   // operations (such as deleting obsolete files directly or deleting memtables)
1105   // and will instead schedule a background job to do it.
1106   // Use it if you're latency-sensitive.
1107   // If set to true, takes precedence over
1108   // ReadOptions::background_purge_on_iterator_cleanup.
1109   bool avoid_unnecessary_blocking_io = false;
1110 
1111   // Historically DB ID has always been stored in Identity File in DB folder.
1112   // If this flag is true, the DB ID is written to Manifest file in addition
1113   // to the Identity file. By doing this 2 problems are solved
1114   // 1. We don't checksum the Identity file, whereas the Manifest file is checksummed.
1115   // 2. Since the Manifest file is the source of truth for the DB, the DB ID will
1116   //    sit with the source of truth. Previously the Identity file could be copied
1117   //    independently of the Manifest, and that can result in a wrong DB ID.
1118   // We recommend setting this flag to true.
1119   // Default: false
1120   bool write_dbid_to_manifest = false;
1121 
1122   // The number of bytes to prefetch when reading the log. This is mostly useful
1123   // for reading a remotely located log, as it can reduce the number of
1124   // round-trips. If 0, then the prefetching is disabled.
1125   //
1126   // Default: 0
1127   size_t log_readahead_size = 0;
1128 
1129   // If user does NOT provide the checksum generator factory, the file checksum
1130   // will NOT be used. A new file checksum generator object will be created
1131   // when a SST file is created. Therefore, each created FileChecksumGenerator
1132   // will only be used from a single thread and so does not need to be
1133   // thread-safe.
1134   //
1135   // Default: nullptr
1136   std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
1137 
1138   // By default, RocksDB recovery fails if any table file referenced in the
1139   // MANIFEST is missing after scanning the MANIFEST.
1140   // Best-efforts recovery is another recovery mode that
1141   // tries to restore the database to the most recent point in time without
1142   // missing files.
1143   // Currently not compatible with atomic flush. Furthermore, WAL files will
1144   // not be used for recovery if best_efforts_recovery is true.
1145   // Default: false
1146   bool best_efforts_recovery = false;
1147 };
1148 
1149 // Options to control the behavior of a database (passed to DB::Open)
1150 struct Options : public DBOptions, public ColumnFamilyOptions {
1151   // Create an Options object with default values for all fields.
1152   Options() : DBOptions(), ColumnFamilyOptions() {}
1153 
1154   Options(const DBOptions& db_options,
1155           const ColumnFamilyOptions& column_family_options)
1156       : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
1157 
1158   // The function recovers options to their values as of version 4.6.
1159   Options* OldDefaults(int rocksdb_major_version = 4,
1160                        int rocksdb_minor_version = 6);
1161 
1162   void Dump(Logger* log) const;
1163 
1164   void DumpCFOptions(Logger* log) const;
1165 
1166   // Some functions that make it easier to optimize RocksDB
1167 
1168   // Set appropriate parameters for bulk loading.
1169   // The reason that this is a function that returns "this" instead of a
1170   // constructor is to enable chaining of multiple similar calls in the future.
1171   //
1172 
1173   // All data will be in level 0 without any automatic compaction.
1174   // It's recommended to manually call CompactRange(NULL, NULL) before reading
1175   // from the database, because otherwise the read can be very slow.
1176   Options* PrepareForBulkLoad();
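  //
  // Illustrative sketch for PrepareForBulkLoad() above (not part of the
  // original header; the path is a made-up example): a typical bulk-load
  // sequence, followed by a full manual compaction before serving reads.
  //
  //   Options options;
  //   options.PrepareForBulkLoad();
  //   DB* db = nullptr;
  //   Status s = DB::Open(options, "/tmp/bulk_db", &db);
  //   // ... load the data ...
  //   db->CompactRange(CompactRangeOptions(), nullptr, nullptr);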
1177 
1178   // Use this if your DB is very small (like under 1GB) and you don't want to
1179   // spend lots of memory for memtables.
1180   Options* OptimizeForSmallDb();
1181 };
1182 
1183 //
1184 // An application can issue a read request (via Get/Iterators) and specify
1185 // if that read should process data that ALREADY resides on a specified cache
1186 // level. For example, if an application specifies kBlockCacheTier then the
1187 // Get call will process data that is already processed in the memtable or
1188 // the block cache. It will not page in data from the OS cache or data that
1189 // resides in storage.
1190 enum ReadTier {
1191   kReadAllTier = 0x0,     // data in memtable, block cache, OS cache or storage
1192   kBlockCacheTier = 0x1,  // data in memtable or block cache
1193   kPersistedTier = 0x2,   // persisted data.  When WAL is disabled, this option
1194                           // will skip data in memtable.
1195                           // Note that this ReadTier currently only supports
1196                           // Get and MultiGet and does not support iterators.
1197   kMemtableTier = 0x3     // data in memtable. used for memtable-only iterators.
1198 };
1199 
1200 // Options that control read operations
1201 struct ReadOptions {
1202   // If "snapshot" is non-nullptr, read as of the supplied snapshot
1203   // (which must belong to the DB that is being read and which must
1204   // not have been released).  If "snapshot" is nullptr, use an implicit
1205   // snapshot of the state at the beginning of this read operation.
1206   // Default: nullptr
1207   const Snapshot* snapshot;
1208 
1209   // `iterate_lower_bound` defines the smallest key at which the backward
1210   // iterator can return an entry. Once the bound is passed, Valid() will be
1211   // false. `iterate_lower_bound` is inclusive, i.e., the bound value is a valid
1212   // entry.
1213   //
1214   // If prefix_extractor is not null, the Seek target and `iterate_lower_bound`
1215   // need to have the same prefix. This is because ordering is not guaranteed
1216   // outside of prefix domain.
1217   //
1218   // Default: nullptr
1219   const Slice* iterate_lower_bound;
1220 
1221   // "iterate_upper_bound" defines the extent up to which the forward iterator
1222   // can return entries. Once the bound is reached, Valid() will be false.
1223   // "iterate_upper_bound" is exclusive, i.e., the bound value is
1224   // not a valid entry. If prefix_extractor is not null, the Seek target
1225   // and iterate_upper_bound need to have the same prefix.
1226   // This is because ordering is not guaranteed outside of prefix domain.
1227   //
1228   // Default: nullptr
1229   const Slice* iterate_upper_bound;
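  //
  // Illustrative sketch (assuming `db` is an open DB*): bounding a forward
  // scan with iterate_upper_bound. Note that the bound Slice must stay alive
  // for the lifetime of the iterator.
  //
  //   Slice upper("key_z");
  //   ReadOptions ropts;
  //   ropts.iterate_upper_bound = &upper;
  //   std::unique_ptr<Iterator> it(db->NewIterator(ropts));
  //   for (it->Seek("key_a"); it->Valid(); it->Next()) {
  //     // visits keys in ["key_a", "key_z")
  //   }
  //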
1230 
1231   // RocksDB does auto-readahead for iterators on noticing more than two reads
1232   // for a table file. The readahead starts at 8KB and doubles on every
1233   // additional read, up to 256KB.
1234   // This option can help if most of the range scans are large, and if it is
1235   // determined that a larger readahead than that enabled by auto-readahead is
1236   // needed.
1237   // Using a large readahead size (> 2MB) can typically improve the performance
1238   // of forward iteration on spinning disks.
1239   // Default: 0
1240   size_t readahead_size;
1241 
1242   // A threshold for the number of keys that can be skipped before failing an
1243   // iterator seek as incomplete. The default value of 0 should be used to
1244   // never fail a request as incomplete, even on skipping too many keys.
1245   // Default: 0
1246   uint64_t max_skippable_internal_keys;
1247 
1248   // Specify if this read request should process data that ALREADY
1249   // resides on a particular cache. If the required data is not
1250   // found at the specified cache, then Status::Incomplete is returned.
1251   // Default: kReadAllTier
1252   ReadTier read_tier;
1253 
1254   // If true, all data read from underlying storage will be
1255   // verified against corresponding checksums.
1256   // Default: true
1257   bool verify_checksums;
1258 
1259   // Should the "data block"/"index block" read for this iteration be placed in
1260   // block cache?
1261   // Callers may wish to set this field to false for bulk scans.
1262   // This helps avoid changing the eviction order of existing items in the
1263   // block cache. Default: true
1264   bool fill_cache;
1265 
1266   // Specify to create a tailing iterator -- a special iterator that has a
1267   // view of the complete database (i.e. it can also be used to read newly
1268   // added data) and is optimized for sequential reads. It will return records
1269   // that were inserted into the database after the creation of the iterator.
1270   // Default: false
1271   // Not supported in ROCKSDB_LITE mode!
1272   bool tailing;
1273 
1274   // This option is no longer used. It was used to turn on a functionality
1275   // that has since been removed.
1276   bool managed;
1277 
1278   // Enable a total order seek regardless of index format (e.g. hash index)
1279   // used in the table. Some table format (e.g. plain table) may not support
1280   // this option.
1281   // If true when calling Get(), we also skip prefix bloom when reading from
1282   // block based table. It provides a way to read existing data after
1283   // changing implementation of prefix extractor.
1284   bool total_order_seek;
1285 
1286   // When true, by default use total_order_seek = true, and RocksDB can
1287   // selectively enable prefix seek mode if it won't generate a different
1288   // result from total_order_seek, based on the seek key and iterator upper
1289   // bound. Not supported in ROCKSDB_LITE mode: even when set to true,
1290   // prefix mode is not used.
1291   bool auto_prefix_mode;
1292 
1293   // Enforce that the iterator only iterates over the same prefix as the seek.
1294   // This option is effective only for prefix seeks, i.e. prefix_extractor is
1295   // non-null for the column family and total_order_seek is false.  Unlike
1296   // iterate_upper_bound, prefix_same_as_start only works within a prefix
1297   // but in both directions.
1298   // Default: false
1299   bool prefix_same_as_start;
1300 
1301   // Keep the blocks loaded by the iterator pinned in memory as long as the
1302   // iterator is not deleted. If used when reading from tables created with
1303   // BlockBasedTableOptions::use_delta_encoding = false,
1304   // the Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
1305   // return 1.
1306   // Default: false
1307   bool pin_data;
1308 
1309   // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
1310   // schedule a background job in the flush job queue and delete obsolete files
1311   // in background.
1312   // Default: false
1313   bool background_purge_on_iterator_cleanup;
1314 
1315   // If true, keys deleted using the DeleteRange() API will be visible to
1316   // readers until they are naturally deleted during compaction. This improves
1317   // read performance in DBs with many range deletions.
1318   // Default: false
1319   bool ignore_range_deletions;
1320 
1321   // A callback to determine whether relevant keys for this scan exist in a
1322   // given table based on the table's properties. The callback is passed the
1323   // properties of each table during iteration. If the callback returns false,
1324   // the table will not be scanned. This option only affects Iterators and has
1325   // no impact on point lookups.
1326   // Default: empty (every table will be scanned)
1327   std::function<bool(const TableProperties&)> table_filter;
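  //
  // A hedged sketch of a table_filter callback (the predicate shown is
  // arbitrary and only for illustration): skip SST files whose properties
  // indicate that they contain no entries.
  //
  //   ReadOptions ropts;
  //   ropts.table_filter = [](const TableProperties& props) {
  //     return props.num_entries > 0;  // returning false skips the table
  //   };
  //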
1328 
1329   // Needed to support differential snapshots. Has 2 effects:
1330   // 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum
1331   // 2) if this param > 0 iterator will return INTERNAL keys instead of
1332   //    user keys; e.g. return tombstones as well.
1333   // Default: 0 (don't filter by seqnum, return user keys)
1334   SequenceNumber iter_start_seqnum;
1335 
1336   // Timestamp of operation. Read should return the latest data visible to the
1337   // specified timestamp. All timestamps of the same database must be of the
1338   // same length and format. The user is responsible for providing a customized
1339   // compare function via Comparator to order <key, timestamp> tuples.
1340   // The user-specified timestamp feature is still under active development,
1341   // and the API is subject to change.
1342   const Slice* timestamp;
1343 
1344   ReadOptions();
1345   ReadOptions(bool cksum, bool cache);
1346 };
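//
// Illustrative sketch (assuming `db` is an open DB*): reading from a fixed
// snapshot so that concurrent writes do not affect the result.
//
//   const Snapshot* snap = db->GetSnapshot();
//   ReadOptions ropts;
//   ropts.snapshot = snap;
//   std::string value;
//   Status s = db->Get(ropts, "some_key", &value);
//   db->ReleaseSnapshot(snap);
//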
1347 
1348 // Options that control write operations
1349 struct WriteOptions {
1350   // If true, the write will be flushed from the operating system
1351   // buffer cache (by calling WritableFile::Sync()) before the write
1352   // is considered complete.  If this flag is true, writes will be
1353   // slower.
1354   //
1355   // If this flag is false, and the machine crashes, some recent
1356   // writes may be lost.  Note that if it is just the process that
1357   // crashes (i.e., the machine does not reboot), no writes will be
1358   // lost even if sync==false.
1359   //
1360   // In other words, a DB write with sync==false has similar
1361   // crash semantics as the "write()" system call.  A DB write
1362   // with sync==true has similar crash semantics to a "write()"
1363   // system call followed by "fdatasync()".
1364   //
1365   // Default: false
1366   bool sync;
1367 
1368   // If true, writes will not first go to the write ahead log,
1369   // and the write may get lost after a crash. The backup engine
1370   // relies on write-ahead logs to back up the memtable, so if
1371   // you disable write-ahead logs, you must create backups with
1372   // flush_before_backup=true to avoid losing unflushed memtable data.
1373   // Default: false
1374   bool disableWAL;
1375 
1376   // If true and the user is trying to write to column families that don't
1377   // exist (they were dropped), ignore the write (don't return an error). If
1378   // there are multiple writes in a WriteBatch, other writes will succeed.
1379   // Default: false
1380   bool ignore_missing_column_families;
1381 
1382   // If true and we need to wait or sleep for the write request, the request
1383   // fails immediately with Status::Incomplete().
1384   // Default: false
1385   bool no_slowdown;
1386 
1387   // If true, this write request is of lower priority if compaction is
1388   // behind. In this case, if no_slowdown = true, the request will be
1389   // cancelled immediately with Status::Incomplete() returned. Otherwise, it
1390   // will be slowed down. The slowdown value is determined by RocksDB to
1391   // guarantee it introduces minimal impact on high priority writes.
1392   //
1393   // Default: false
1394   bool low_pri;
1395 
1396   // If true, this writebatch will maintain the last insert positions of each
1397   // memtable as hints for concurrent writes. It can improve write performance
1398   // in concurrent writes if keys in one writebatch are sequential. In
1399   // non-concurrent writes (when concurrent_memtable_writes is false) this
1400   // option will be ignored.
1401   //
1402   // Default: false
1403   bool memtable_insert_hint_per_batch;
1404 
1405   // Timestamp of write operation, e.g. Put. All timestamps of the same
1406   // database must share the same length and format. The user is also
1407   // responsible for providing a customized compare function via Comparator to
1408   // order <key, timestamp> tuples. If the user wants to enable timestamp, then
1409   // all write operations must be associated with a timestamp, because RocksDB,
1410   // as a single-node storage engine, currently has no knowledge of global time
1411   // and thus has to rely on the application.
1412   // The user-specified timestamp feature is still under active development,
1413   // and the API is subject to change.
1414   const Slice* timestamp;
1415 
1416   WriteOptions()
1417       : sync(false),
1418         disableWAL(false),
1419         ignore_missing_column_families(false),
1420         no_slowdown(false),
1421         low_pri(false),
1422         memtable_insert_hint_per_batch(false),
1423         timestamp(nullptr) {}
1424 };
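//
// Illustrative sketch (assuming `db` is an open DB*): a durable write that
// syncs the WAL, contrasted with a faster, less durable write that skips the
// WAL entirely.
//
//   WriteOptions durable;
//   durable.sync = true;
//   Status s1 = db->Put(durable, "key1", "value1");
//
//   WriteOptions fast;
//   fast.disableWAL = true;  // this write may be lost on a crash
//   Status s2 = db->Put(fast, "key2", "value2");
//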
1425 
1426 // Options that control flush operations
1427 struct FlushOptions {
1428   // If true, the call will wait until the flush is done.
1429   // Default: true
1430   bool wait;
1431   // If true, the flush will proceed immediately even if it means writes will
1432   // stall for the duration of the flush; if false the operation will wait
1433   // until it's possible to do flush w/o causing stall or until required flush
1434   // is performed by someone else (foreground call or background thread).
1435   // Default: false
1436   bool allow_write_stall;
1437   FlushOptions() : wait(true), allow_write_stall(false) {}
1438 };
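//
// Illustrative sketch (assuming `db` is an open DB*): flush the memtable to
// an SST file and wait for the flush to finish.
//
//   FlushOptions fopts;
//   fopts.wait = true;
//   Status s = db->Flush(fopts);
//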
1439 
1440 // Create a Logger from provided DBOptions
1441 extern Status CreateLoggerFromOptions(const std::string& dbname,
1442                                       const DBOptions& options,
1443                                       std::shared_ptr<Logger>* logger);
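//
// Illustrative sketch (the path is hypothetical): create a Logger for a DB
// directory and install it as the info log before opening the DB.
//
//   DBOptions db_opts;
//   std::shared_ptr<Logger> logger;
//   Status s = CreateLoggerFromOptions("/path/to/db", db_opts, &logger);
//   if (s.ok()) {
//     db_opts.info_log = logger;
//   }
//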
1444 
1445 // CompactionOptions are used in CompactFiles() call.
1446 struct CompactionOptions {
1447   // Compaction output compression type
1448   // Default: snappy
1449   // If set to `kDisableCompressionOption`, RocksDB will choose compression type
1450   // according to the `ColumnFamilyOptions`, taking into account the output
1451   // level if `compression_per_level` is specified.
1452   CompressionType compression;
1453   // Compaction will create files of size `output_file_size_limit`.
1454   // Default: MAX, which means that compaction will create a single file
1455   uint64_t output_file_size_limit;
1456   // If > 0, it will replace the option in the DBOptions for this compaction.
1457   uint32_t max_subcompactions;
1458 
1459   CompactionOptions()
1460       : compression(kSnappyCompression),
1461         output_file_size_limit(std::numeric_limits<uint64_t>::max()),
1462         max_subcompactions(0) {}
1463 };
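//
// Illustrative sketch (assuming `db` is an open DB* and `input_files` holds
// names of live SST files, e.g. gathered via GetColumnFamilyMetaData()):
// compact the given files into level 2 with ZSTD-compressed output.
//
//   CompactionOptions copts;
//   copts.compression = kZSTD;
//   Status s = db->CompactFiles(copts, input_files, /*output_level=*/2);
//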
1464 
1465 // For level based compaction, we can configure if we want to skip/force
1466 // bottommost level compaction.
1467 enum class BottommostLevelCompaction {
1468   // Skip bottommost level compaction
1469   kSkip,
1470   // Only compact bottommost level if there is a compaction filter
1471   // This is the default option
1472   kIfHaveCompactionFilter,
1473   // Always compact bottommost level
1474   kForce,
1475   // Always compact bottommost level but in bottommost level avoid
1476   // double-compacting files created in the same compaction
1477   kForceOptimized,
1478 };
1479 
1480 // CompactRangeOptions is used by CompactRange() call.
1481 struct CompactRangeOptions {
1482   // If true, no other compaction will run at the same time as this
1483   // manual compaction
1484   bool exclusive_manual_compaction = true;
1485   // If true, compacted files will be moved to the minimum level capable
1486   // of holding the data, or to the given level if target_level is non-negative.
1487   bool change_level = false;
1488   // If change_level is true and target_level has a non-negative value, compacted
1489   // files will be moved to target_level.
1490   int target_level = -1;
1491   // Compaction outputs will be placed in options.db_paths[target_path_id].
1492   // Behavior is undefined if target_path_id is out of range.
1493   uint32_t target_path_id = 0;
1494   // By default level based compaction will only compact the bottommost level
1495   // if there is a compaction filter
1496   BottommostLevelCompaction bottommost_level_compaction =
1497       BottommostLevelCompaction::kIfHaveCompactionFilter;
1498   // If true, will execute immediately even if doing so would cause the DB to
1499   // enter write stall mode. Otherwise, it'll sleep until load is low enough.
1500   bool allow_write_stall = false;
1501   // If > 0, it will replace the option in the DBOptions for this compaction.
1502   uint32_t max_subcompactions = 0;
1503 };
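//
// Illustrative sketch (assuming `db` is an open DB*): manually compact the
// entire key space and move the output down to level 1.
//
//   CompactRangeOptions cro;
//   cro.change_level = true;
//   cro.target_level = 1;
//   Status s = db->CompactRange(cro, nullptr, nullptr);
//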
1504 
1505 // IngestExternalFileOptions is used by IngestExternalFile()
1506 struct IngestExternalFileOptions {
1507   // Can be set to true to move the files instead of copying them.
1508   bool move_files = false;
1509   // If set to true, ingestion falls back to copy when move fails.
1510   bool failed_move_fall_back_to_copy = true;
1511   // If set to false, an ingested file's keys could appear in existing snapshots
1512   // that were created before the file was ingested.
1513   bool snapshot_consistency = true;
1514   // If set to false, IngestExternalFile() will fail if the file key range
1515   // overlaps with existing keys or tombstones in the DB.
1516   bool allow_global_seqno = true;
1517   // If set to false and the file key range overlaps with the memtable key range
1518   // (memtable flush required), IngestExternalFile will fail.
1519   bool allow_blocking_flush = true;
1520   // Set to true if you would like duplicate keys in the file being ingested
1521   // to be skipped rather than overwriting existing data under that key.
1522   // Usecase: back-fill of some historical data in the database without
1523   // over-writing existing newer version of data.
1524   // This option could only be used if the DB has been running
1525   // with allow_ingest_behind=true since the dawn of time.
1526   // All files will be ingested at the bottommost level with seqno=0.
1527   bool ingest_behind = false;
1528   // Set to true if you would like to write global_seqno to a given offset in
1529   // the external SST file for backward compatibility. Older versions of
1530   // RocksDB write a global_seqno to a given offset within ingested SST files,
1531   // and new versions of RocksDB do not. If you ingest an external SST using
1532   // new version of RocksDB and would like to be able to downgrade to an
1533   // older version of RocksDB, you should set 'write_global_seqno' to true. If
1534   // your service is just starting to use the new RocksDB, we recommend that
1535   // you set this option to false, which brings two benefits:
1536   // 1. No extra random write for global_seqno during ingestion.
1537   // 2. The external SST file is not modified, so its checksum remains verifiable.
1538   // We have a plan to set this option to false by default in the future.
1539   bool write_global_seqno = true;
1540   // Set to true if you would like to verify the checksums of each block of the
1541   // external SST file before ingestion.
1542   // Warning: setting this to true causes slowdown in file ingestion because
1543   // the external SST file has to be read.
1544   bool verify_checksums_before_ingest = false;
1545   // When verify_checksums_before_ingest = true, RocksDB uses default
1546   // readahead setting to scan the file while verifying checksums before
1547   // ingestion.
1548   // Users can override the default value using this option.
1549   // Using a large readahead size (> 2MB) can typically improve the performance
1550   // of this verification scan on spinning disks.
1551   size_t verify_checksums_readahead_size = 0;
1552 };
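//
// Illustrative sketch (assuming `db` is an open DB* and the .sst file was
// created by SstFileWriter; the path is hypothetical): ingest a file by
// moving it instead of copying, verifying block checksums first.
//
//   IngestExternalFileOptions ifo;
//   ifo.move_files = true;
//   ifo.verify_checksums_before_ingest = true;
//   Status s = db->IngestExternalFile({"/tmp/bulk_data.sst"}, ifo);
//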
1553 
1554 enum TraceFilterType : uint64_t {
1555   // Trace all the operations
1556   kTraceFilterNone = 0x0,
1557   // Do not trace the get operations
1558   kTraceFilterGet = 0x1 << 0,
1559   // Do not trace the write operations
1560   kTraceFilterWrite = 0x1 << 1
1561 };
1562 
1563 // TraceOptions is used for StartTrace
1564 struct TraceOptions {
1565   // To avoid the trace file growing larger than the available storage space,
1566   // the user can set the max trace file size in bytes. Default is 64GB.
1567   uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
1568   // Specify trace sampling option, i.e. capture one per how many requests.
1569   // Default: 1 (capture every request).
1570   uint64_t sampling_frequency = 1;
1571   // Note: The filtering happens before sampling.
1572   uint64_t filter = kTraceFilterNone;
1573 };
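//
// Illustrative sketch (assuming `db` is an open DB* and a TraceWriter has
// been created elsewhere, e.g. with NewFileTraceWriter() if available in
// your release): trace only write operations, sampling one request in ten.
//
//   TraceOptions topts;
//   topts.filter = kTraceFilterGet;   // do not trace Get operations
//   topts.sampling_frequency = 10;    // capture 1 out of every 10 requests
//   std::unique_ptr<TraceWriter> trace_writer;
//   // ... initialize trace_writer ...
//   Status s = db->StartTrace(topts, std::move(trace_writer));
//   // ... run the workload to be traced ...
//   s = db->EndTrace();
//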
1574 
1575 // ImportColumnFamilyOptions is used by ImportColumnFamily()
1576 struct ImportColumnFamilyOptions {
1577   // Can be set to true to move the files instead of copying them.
1578   bool move_files = false;
1579 };
1580 
1581 // Options used with DB::GetApproximateSizes()
1582 struct SizeApproximationOptions {
1583   // Defines whether the returned size should include the recently written
1584   // data in the mem-tables. If set to false, include_files must be true.
1585   bool include_memtabtles = false;
1586   // Defines whether the returned size should include data serialized to disk.
1587   // If set to false, include_memtabtles must be true.
1588   bool include_files = true;
1589   // When approximating the total size of the files used to store a key range
1590   // via DB::GetApproximateSizes, allow approximation with an error margin of
1591   // up to total_files_size * files_size_error_margin. This allows taking some
1592   // shortcuts in the file size approximation, resulting in better performance,
1593   // while guaranteeing the resulting error is within a reasonable margin.
1594   // E.g., if the value is 0.1, then the error margin of the returned files size
1595   // approximation will be within 10%.
1596   // If the value is non-positive, a more precise yet more CPU-intensive
1597   // estimation is performed.
1598   double files_size_error_margin = -1.0;
1599 };
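//
// Illustrative sketch (assuming `db` is an open DB*; the exact
// GetApproximateSizes() overload taking these options may vary by release):
// estimate how much space a key range occupies, including memtable data,
// allowing up to 10% error.
//
//   SizeApproximationOptions sao;
//   sao.include_memtabtles = true;
//   sao.files_size_error_margin = 0.1;
//   Range range("key_a", "key_z");
//   uint64_t size = 0;
//   db->GetApproximateSizes(sao, db->DefaultColumnFamily(), &range, 1, &size);
//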
1600 
1601 }  // namespace ROCKSDB_NAMESPACE
1602