// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <stddef.h>
#include <stdint.h>
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "rocksdb/advanced_options.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/file_checksum.h"
#include "rocksdb/listener.h"
#include "rocksdb/universal_compaction.h"
#include "rocksdb/version.h"
#include "rocksdb/write_buffer_manager.h"

#ifdef max
#undef max
#endif

namespace ROCKSDB_NAMESPACE {

class Cache;
class CompactionFilter;
class CompactionFilterFactory;
class Comparator;
class ConcurrentTaskLimiter;
class Env;
enum InfoLogLevel : unsigned char;
class SstFileManager;
class FilterPolicy;
class Logger;
class MergeOperator;
class Snapshot;
class MemTableRepFactory;
class RateLimiter;
class Slice;
class Statistics;
class InternalKeyComparator;
class WalFilter;
class FileSystem;

enum class CpuPriority {
  kIdle = 0,
  kLow = 1,
  kNormal = 2,
  kHigh = 3,
};

// DB contents are stored in a set of blocks, each of which holds a
// sequence of key,value pairs.  Each block may be compressed before
// being stored in a file. The following enum describes which
// compression method (if any) is used to compress a block.
enum CompressionType : unsigned char {
  // NOTE: do not change the values of existing entries, as these are
  // part of the persistent format on disk.
  kNoCompression = 0x0,
  kSnappyCompression = 0x1,
  kZlibCompression = 0x2,
  kBZip2Compression = 0x3,
  kLZ4Compression = 0x4,
  kLZ4HCCompression = 0x5,
  kXpressCompression = 0x6,
  kZSTD = 0x7,

  // Only use kZSTDNotFinalCompression if you have to use a ZSTD lib older
  // than 0.8.0, or if there is a possibility of downgrading the service or
  // copying the database files to another service running an older version
  // of RocksDB that doesn't have kZSTD. Otherwise, you should use kZSTD. We
  // will eventually remove the option from the public API.
  kZSTDNotFinalCompression = 0x40,

  // kDisableCompressionOption is used to disable some compression options.
  kDisableCompressionOption = 0xff,
};

struct Options;
struct DbPath;

struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
  // The function recovers options to a previous version. Only 4.6 or later
  // versions are supported.
  ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
                                   int rocksdb_minor_version = 6);

  // Some functions that make it easier to optimize RocksDB
  // Use this if your DB is very small (like under 1GB) and you don't want to
  // spend lots of memory for memtables.
  // An optional cache object is passed in to be used as the block cache.
  ColumnFamilyOptions* OptimizeForSmallDb(
      std::shared_ptr<Cache>* cache = nullptr);

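  // Illustrative sketch only (not part of this header's API surface): a
  // small DB typically shares one cache so memtable and block cache memory
  // is bounded together. NewLRUCache() comes from rocksdb/cache.h and the
  // 16MB figure is an arbitrary assumption for the example:
  //
  //   std::shared_ptr<Cache> cache = NewLRUCache(16 << 20);
  //   ColumnFamilyOptions cf_opts;
  //   cf_opts.OptimizeForSmallDb(&cache);
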
  // Use this if you don't need to keep the data sorted, i.e. you'll never use
  // an iterator, only Put() and Get() API calls
  //
  // Not supported in ROCKSDB_LITE
  ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb);

  // Default values for some parameters in ColumnFamilyOptions are not
  // optimized for heavy workloads and big datasets, which means you might
  // observe write stalls under some conditions. As a starting point for tuning
  // RocksDB options, use the following two functions:
  // * OptimizeLevelStyleCompaction -- optimizes level style compaction
  // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
  // Universal style compaction is focused on reducing Write Amplification
  // Factor for big data sets, but increases Space Amplification. You can learn
  // more about the different styles here:
  // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
  // Make sure to also call IncreaseParallelism(), which will provide the
  // biggest performance gains.
  // Note: we might use more memory than memtable_memory_budget during high
  // write rate periods.
  //
  // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
  ColumnFamilyOptions* OptimizeLevelStyleCompaction(
      uint64_t memtable_memory_budget = 512 * 1024 * 1024);
  ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
      uint64_t memtable_memory_budget = 512 * 1024 * 1024);

  // -------------------
  // Parameters that affect behavior

  // Comparator used to define the order of keys in the table.
  // Default: a comparator that uses lexicographic byte-wise ordering
  //
  // REQUIRES: The client must ensure that the comparator supplied
  // here has the same name and orders keys *exactly* the same as the
  // comparator provided to previous open calls on the same DB.
  const Comparator* comparator = BytewiseComparator();

  // REQUIRES: The client must provide a merge operator if the Merge operation
  // needs to be accessed. Calling Merge on a DB without a merge operator
  // would result in Status::NotSupported. The client must ensure that the
  // merge operator supplied here has the same name and *exactly* the same
  // semantics as the merge operator provided to previous open calls on
  // the same DB. The only exception is reserved for upgrade, where a DB
  // previously without a merge operator is introduced to the Merge operation
  // for the first time. It's necessary to specify a merge operator when
  // opening the DB in this case.
  // Default: nullptr
  std::shared_ptr<MergeOperator> merge_operator = nullptr;

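  // Illustrative sketch only: a merge operator is normally installed once,
  // before the DB is first opened with Merge() traffic. The factory call
  // below (MergeOperators::CreateStringAppendOperator, assumed to come from
  // rocksdb/merge_operators.h) is used purely for illustration; any
  // MergeOperator implementation with a stable name is set the same way:
  //
  //   ColumnFamilyOptions cf_opts;
  //   cf_opts.merge_operator = MergeOperators::CreateStringAppendOperator();
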
  // A single CompactionFilter instance to call into during compaction.
  // Allows an application to modify/delete a key-value during background
  // compaction.
  //
  // If the client requires a new compaction filter to be used for different
  // compaction runs, it can specify compaction_filter_factory instead of this
  // option. The client should specify only one of the two.
  // compaction_filter takes precedence over compaction_filter_factory if
  // client specifies both.
  //
  // If multithreaded compaction is being used, the supplied CompactionFilter
  // instance may be used from different threads concurrently and so should be
  // thread-safe.
  //
  // Default: nullptr
  const CompactionFilter* compaction_filter = nullptr;

  // This is a factory that provides compaction filter objects which allow
  // an application to modify/delete a key-value during background compaction.
  //
  // A new filter will be created on each compaction run. If multithreaded
  // compaction is being used, each created CompactionFilter will only be used
  // from a single thread and so does not need to be thread-safe.
  //
  // Default: nullptr
  std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;

  // -------------------
  // Parameters that affect performance

  // Amount of data to build up in memory (backed by an unsorted log
  // on disk) before converting to a sorted on-disk file.
  //
  // Larger values increase performance, especially during bulk loads.
  // Up to max_write_buffer_number write buffers may be held in memory
  // at the same time,
  // so you may wish to adjust this parameter to control memory usage.
  // Also, a larger write buffer will result in a longer recovery time
  // the next time the database is opened.
  //
  // Note that write_buffer_size is enforced per column family.
  // See db_write_buffer_size for sharing memory across column families.
  //
  // Default: 64MB
  //
  // Dynamically changeable through SetOptions() API
  size_t write_buffer_size = 64 << 20;

  // Compress blocks using the specified compression algorithm.
  //
  // Default: kSnappyCompression, if it's supported. If snappy is not linked
  // with the library, the default is kNoCompression.
  //
  // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
  //    ~200-500MB/s compression
  //    ~400-800MB/s decompression
  //
  // Note that these speeds are significantly faster than most
  // persistent storage speeds, and therefore it is typically never
  // worth switching to kNoCompression. Even if the input data is
  // incompressible, the kSnappyCompression implementation will
  // efficiently detect that and will switch to uncompressed mode.
  //
  // If you do not set `compression_opts.level`, or set it to
  // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick
  // the default corresponding to `compression` as follows:
  //
  // - kZSTD: 3
  // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
  // - kLZ4HCCompression: 0
  // - For all others, we do not specify a compression level
  //
  // Dynamically changeable through SetOptions() API
  CompressionType compression;

  // Compression algorithm that will be used for the bottommost level that
  // contains files.
  //
  // Default: kDisableCompressionOption (Disabled)
  CompressionType bottommost_compression = kDisableCompressionOption;

  // Different options for the compression algorithm used by
  // bottommost_compression if it is enabled. To enable it, please see the
  // definition of CompressionOptions.
  CompressionOptions bottommost_compression_opts;

  // Different options for compression algorithms
  CompressionOptions compression_opts;

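  // Illustrative sketch only: a common pattern is a cheap codec for the upper
  // levels and a stronger one for the bottommost level. The specific codecs
  // and the level value 3 below are arbitrary assumptions for the example,
  // not recommendations from this header:
  //
  //   ColumnFamilyOptions cf_opts;
  //   cf_opts.compression = kLZ4Compression;
  //   cf_opts.bottommost_compression = kZSTD;
  //   cf_opts.bottommost_compression_opts.level = 3;
  //   cf_opts.bottommost_compression_opts.enabled = true;
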
  // Number of files to trigger level-0 compaction. A value < 0 means that
  // level-0 compaction will not be triggered by number of files at all.
  //
  // Default: 4
  //
  // Dynamically changeable through SetOptions() API
  int level0_file_num_compaction_trigger = 4;

  // If non-nullptr, use the specified function to determine the
  // prefixes for keys. These prefixes will be placed in the filter.
  // Depending on the workload, this can reduce the read-IOP cost of
  // scans when a prefix is passed via ReadOptions to
  // db.NewIterator(). For prefix filtering to work properly,
  // "prefix_extractor" and "comparator" must be such that the following
  // properties hold:
  //
  // 1) key.starts_with(prefix(key))
  // 2) Compare(prefix(key), key) <= 0.
  // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
  // 4) prefix(prefix(key)) == prefix(key)
  //
  // Default: nullptr
  std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;

  // Control maximum total data size for a level.
  // max_bytes_for_level_base is the max total for level-1.
  // Maximum number of bytes for level L can be calculated as
  // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
  // For example, if max_bytes_for_level_base is 200MB, and if
  // max_bytes_for_level_multiplier is 10, total data size for level-1
  // will be 200MB, total file size for level-2 will be 2GB,
  // and total file size for level-3 will be 20GB.
  //
  // Default: 256MB.
  //
  // Dynamically changeable through SetOptions() API
  uint64_t max_bytes_for_level_base = 256 * 1048576;

  // Deprecated.
  uint64_t snap_refresh_nanos = 0;

  // Disable automatic compactions. Manual compactions can still
  // be issued on this column family
  //
  // Dynamically changeable through SetOptions() API
  bool disable_auto_compactions = false;

  // This is a factory that provides TableFactory objects.
  // Default: a block-based table factory that provides a default
  // implementation of TableBuilder and TableReader with default
  // BlockBasedTableOptions.
  std::shared_ptr<TableFactory> table_factory;

  // A list of paths where SST files for this column family
  // can be put into, with its target size. Similar to db_paths,
  // newer data is placed into paths specified earlier in the
  // vector while older data gradually moves to paths specified
  // later in the vector.
  // Note that, if a path is supplied to multiple column
  // families, it would have files and total size from all
  // the column families combined. User should provision for the
  // total size (from all the column families) in such cases.
  //
  // If left empty, db_paths will be used.
  // Default: empty
  std::vector<DbPath> cf_paths;

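  // Illustrative sketch only: two storage tiers for one column family, with
  // newer data kept on the first (smaller, faster) path. The paths and target
  // sizes are made up for the example:
  //
  //   ColumnFamilyOptions cf_opts;
  //   cf_opts.cf_paths.emplace_back("/fast_ssd/mycf", 100ull << 30);
  //   cf_opts.cf_paths.emplace_back("/big_hdd/mycf", 1ull << 40);
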
  // Compaction concurrent thread limiter for the column family.
  // If non-nullptr, use the given concurrent thread limiter to control
  // the max outstanding compaction tasks. The limiter can be shared with
  // multiple column families across db instances.
  //
  // Default: nullptr
  std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr;

  // Create ColumnFamilyOptions with default values for all fields
  ColumnFamilyOptions();
  // Create ColumnFamilyOptions from Options
  explicit ColumnFamilyOptions(const Options& options);

  void Dump(Logger* log) const;
};

enum class WALRecoveryMode : char {
  // Original LevelDB recovery
  // We tolerate incomplete records in the trailing data on all logs
  // Use case : This is legacy behavior
  kTolerateCorruptedTailRecords = 0x00,
  // Recover from clean shutdown
  // We don't expect to find any corruption in the WAL
  // Use case : This is ideal for unit tests and rare applications that
  // can require high consistency guarantees
  kAbsoluteConsistency = 0x01,
  // Recover to point-in-time consistency (default)
  // We stop the WAL playback on discovering WAL inconsistency
  // Use case : Ideal for systems that have a disk controller cache, like a
  // hard disk or an SSD without a super capacitor, that stores related data
  kPointInTimeRecovery = 0x02,
  // Recovery after a disaster
  // We ignore any corruption in the WAL and try to salvage as much data as
  // possible
  // Use case : Ideal as a last-ditch effort to recover data, or for systems
  // that operate with low-grade, unrelated data
  kSkipAnyCorruptedRecords = 0x03,
};

struct DbPath {
  std::string path;
  uint64_t target_size;  // Target size of total files under the path, in bytes.

  DbPath() : target_size(0) {}
  DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
};

struct DBOptions {
  // The function recovers options to their values as in version 4.6.
  DBOptions* OldDefaults(int rocksdb_major_version = 4,
                         int rocksdb_minor_version = 6);

  // Some functions that make it easier to optimize RocksDB

  // Use this if your DB is very small (like under 1GB) and you don't want to
  // spend lots of memory for memtables.
  // An optional cache object is passed in for the memory of the
  // memtable to be charged to.
  DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);

#ifndef ROCKSDB_LITE
  // By default, RocksDB uses only one background thread for flush and
  // compaction. Calling this function will set it up such that a total of
  // `total_threads` threads is used. A good value for `total_threads` is the
  // number of cores. You almost definitely want to call this function if your
  // system is bottlenecked by RocksDB.
  DBOptions* IncreaseParallelism(int total_threads = 16);
#endif  // ROCKSDB_LITE

  // If true, the database will be created if it is missing.
  // Default: false
  bool create_if_missing = false;

  // If true, missing column families will be automatically created.
  // Default: false
  bool create_missing_column_families = false;

  // If true, an error is raised if the database already exists.
  // Default: false
  bool error_if_exists = false;

  // If true, RocksDB will aggressively check consistency of the data.
  // Also, if any of the writes to the database fails (Put, Delete, Merge,
  // Write), the database will switch to read-only mode and fail all other
  // Write operations.
  // In most cases you want this to be set to true.
  // Default: true
  bool paranoid_checks = true;

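  // Illustrative sketch only (DB and DB::Open are declared in rocksdb/db.h,
  // not in this header): the creation flags above are usually the first
  // things set before opening a database. The path below is a placeholder:
  //
  //   Options options;
  //   options.create_if_missing = true;
  //   options.create_missing_column_families = true;
  //   DB* db = nullptr;
  //   Status s = DB::Open(options, "/tmp/testdb", &db);
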
  // Use the specified object to interact with the environment,
  // e.g. to read/write files, schedule background work, etc. In the near
  // future, support for doing storage operations such as read/write files
  // through env will be deprecated in favor of file_system (see below)
  // Default: Env::Default()
  Env* env = Env::Default();

  // Use to control the write rate of flush and compaction. Flush has higher
  // priority than compaction. Rate limiting is disabled if nullptr.
  // If the rate limiter is enabled, bytes_per_sync is set to 1MB by default.
  // Default: nullptr
  std::shared_ptr<RateLimiter> rate_limiter = nullptr;

  // Use to track SST files and control their file deletion rate.
  //
  // Features:
  //  - Throttle the deletion rate of the SST files.
  //  - Keep track of the total size of all SST files.
  //  - Set a maximum allowed space limit for SST files; when it is reached,
  //    the DB won't do any further flushes or compactions and will set the
  //    background error.
  //  - Can be shared between multiple dbs.
  // Limitations:
  //  - Only track and throttle deletes of SST files in
  //    first db_path (db_name if db_paths is empty).
  //
  // Default: nullptr
  std::shared_ptr<SstFileManager> sst_file_manager = nullptr;

  // Any internal progress/error information generated by the db will
  // be written to info_log if it is non-nullptr, or to a file stored
  // in the same directory as the DB contents if info_log is nullptr.
  // Default: nullptr
  std::shared_ptr<Logger> info_log = nullptr;

#ifdef NDEBUG
  InfoLogLevel info_log_level = INFO_LEVEL;
#else
  InfoLogLevel info_log_level = DEBUG_LEVEL;
#endif  // NDEBUG

  // Number of open files that can be used by the DB. You may need to
  // increase this if your database has a large working set. Value -1 means
  // files opened are always kept open. You can estimate the number of files
  // based on target_file_size_base and target_file_size_multiplier for
  // level-based compaction. For universal-style compaction, you can usually
  // set it to -1.
  //
  // Default: -1
  //
  // Dynamically changeable through SetDBOptions() API.
  int max_open_files = -1;

  // If max_open_files is -1, DB will open all files on DB::Open(). You can
  // use this option to increase the number of threads used to open the files.
  // Default: 16
  int max_file_opening_threads = 16;

  // Once write-ahead logs exceed this size, we will start forcing the flush
  // of column families whose memtables are backed by the oldest live WAL file
  // (i.e. the ones that are causing all the space amplification). If set to 0
  // (default), we will dynamically choose the WAL size limit to be
  // [sum of all write_buffer_size * max_write_buffer_number] * 4
  // This option takes effect only when there is more than one column family,
  // as otherwise the WAL size is dictated by the write_buffer_size.
  //
  // Default: 0
  //
  // Dynamically changeable through SetDBOptions() API.
  uint64_t max_total_wal_size = 0;

  // If non-null, then we should collect metrics about database operations
  std::shared_ptr<Statistics> statistics = nullptr;

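  // Illustrative sketch only: statistics are typically enabled with the
  // factory declared in rocksdb/statistics.h and inspected later; the ticker
  // chosen below is just an example:
  //
  //   DBOptions db_opts;
  //   db_opts.statistics = CreateDBStatistics();
  //   ...
  //   uint64_t misses = db_opts.statistics->getTickerCount(BLOCK_CACHE_MISS);
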
  // By default, writes to stable storage use fdatasync (on platforms
  // where this function is available). If this option is true,
  // fsync is used instead.
  //
  // fsync and fdatasync are equally safe for our purposes and fdatasync is
  // faster, so it is rarely necessary to set this option. It is provided
  // as a workaround for kernel/filesystem bugs, such as one that affected
  // fdatasync with ext4 in kernel versions prior to 3.7.
  bool use_fsync = false;

  // A list of paths where SST files can be put into, with their target sizes.
  // Newer data is placed into paths specified earlier in the vector while
  // older data gradually moves to paths specified later in the vector.
  //
  // For example, if you have a flash device with 10GB allocated for the DB,
  // as well as a hard drive of 2TB, you should configure it to be:
  //   [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
  //
  // The system will try to guarantee data under each path is close to but
  // not larger than the target size. But current and future file sizes used
  // in determining where to place a file are based on best-effort estimation,
  // which means there is a chance that the actual size under the directory
  // is slightly more than the target size under some workloads. User should
  // give some buffer room for those cases.
  //
  // If none of the paths has sufficient room to place a file, the file will
  // be placed in the last path anyway, regardless of the target size.
  //
  // Placing newer data in earlier paths is also best-effort. User should
  // expect user files to be placed in higher levels in some extreme cases.
  //
  // If left empty, only one path will be used, which is db_name passed when
  // opening the DB.
  // Default: empty
  std::vector<DbPath> db_paths;

  // This specifies the info LOG dir.
  // If it is empty, the log files will be in the same dir as data.
  // If it is non-empty, the log files will be in the specified dir,
  // and the db data dir's absolute path will be used as the log file
  // name's prefix.
  std::string db_log_dir = "";

  // This specifies the absolute dir path for write-ahead logs (WAL).
  // If it is empty, the log files will be in the same dir as data;
  // dbname is used as the data dir by default.
  // If it is non-empty, the log files will be kept in the specified dir.
  // When destroying the db,
  // all log files in wal_dir and the dir itself are deleted.
  std::string wal_dir = "";

  // The periodicity when obsolete files get deleted. The default
  // value is 6 hours. Files that go out of scope during the compaction
  // process will still get automatically deleted on every compaction,
  // regardless of this setting
  //
  // Default: 6 hours
  //
  // Dynamically changeable through SetDBOptions() API.
  uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000;

  // Maximum number of concurrent background jobs (compactions and flushes).
  //
  // Default: 2
  //
  // Dynamically changeable through SetDBOptions() API.
  int max_background_jobs = 2;

  // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
  // value of max_background_jobs. This option is ignored.
  //
  // Dynamically changeable through SetDBOptions() API.
  int base_background_compactions = -1;

  // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
  // value of max_background_jobs. For backwards compatibility we will set
  // `max_background_jobs = max_background_compactions + max_background_flushes`
  // in the case where user sets at least one of `max_background_compactions`
  // or `max_background_flushes` (we replace -1 by 1 in case one option is
  // unset).
  //
  // Maximum number of concurrent background compaction jobs, submitted to
  // the default LOW priority thread pool.
  //
  // If you're increasing this, also consider increasing number of threads in
  // LOW priority thread pool. For more information, see
  // Env::SetBackgroundThreads
  //
  // Default: -1
  //
  // Dynamically changeable through SetDBOptions() API.
  int max_background_compactions = -1;

  // This value represents the maximum number of threads that will
  // concurrently perform a compaction job by breaking it into multiple,
  // smaller ones that are run simultaneously.
  // Default: 1 (i.e. no subcompactions)
  uint32_t max_subcompactions = 1;

  // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
  // value of max_background_jobs. For backwards compatibility we will set
  // `max_background_jobs = max_background_compactions + max_background_flushes`
  // in the case where user sets at least one of `max_background_compactions`
  // or `max_background_flushes`.
  //
  // Maximum number of concurrent background memtable flush jobs, submitted by
  // default to the HIGH priority thread pool. If the HIGH priority thread pool
  // is configured to have zero threads, flush jobs will share the LOW priority
  // thread pool with compaction jobs.
  //
  // It is important to use both thread pools when the same Env is shared by
  // multiple db instances. Without a separate pool, long running compaction
  // jobs could potentially block memtable flush jobs of other db instances,
  // leading to unnecessary Put stalls.
  //
  // If you're increasing this, also consider increasing number of threads in
  // HIGH priority thread pool. For more information, see
  // Env::SetBackgroundThreads
  // Default: -1
  int max_background_flushes = -1;

  // Specify the maximal size of the info log file. If the log file
  // is larger than `max_log_file_size`, a new info log file will
  // be created.
  // If max_log_file_size == 0, all logs will be written to one
  // log file.
  size_t max_log_file_size = 0;

  // Time for the info log file to roll (in seconds).
  // If specified with non-zero value, log file will be rolled
  // if it has been active longer than `log_file_time_to_roll`.
  // Default: 0 (disabled)
  // Not supported in ROCKSDB_LITE mode!
  size_t log_file_time_to_roll = 0;

  // Maximum number of info log files to be kept.
  // Default: 1000
  size_t keep_log_file_num = 1000;

  // Recycle log files.
  // If non-zero, we will reuse previously written log files for new
  // logs, overwriting the old data. The value indicates how many
  // such files we will keep around at any point in time for later
  // use. This is more efficient because the blocks are already
  // allocated and fdatasync does not need to update the inode after
  // each write.
  // Default: 0
  size_t recycle_log_file_num = 0;

  // The manifest file is rolled over on reaching this limit.
  // The older manifest file will be deleted.
  // The default value is 1GB so that the manifest file can grow, but not
  // reach the limit of storage capacity.
  uint64_t max_manifest_file_size = 1024 * 1024 * 1024;

  // Number of shards used for table cache.
  int table_cache_numshardbits = 6;

  // NOT SUPPORTED ANYMORE
  // int table_cache_remove_scan_count_limit;

  // The following two fields affect how archived logs will be deleted.
  // 1. If both are set to 0, logs will be deleted asap and will not get into
  //    the archive.
  // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
  //    WAL files will be checked every 10 min and if total size is greater
  //    than WAL_size_limit_MB, they will be deleted starting with the
  //    earliest until size_limit is met. All empty files will be deleted.
  // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
  //    are older than WAL_ttl_seconds will be deleted.
  // 4. If both are not 0, WAL files will be checked every 10 min and both
  //    checks will be performed with ttl being first.
  uint64_t WAL_ttl_seconds = 0;
  uint64_t WAL_size_limit_MB = 0;

  // Number of bytes to preallocate (via fallocate) the manifest
  // files. Default is 4MB, which is reasonable to reduce random IO
  // as well as prevent overallocation for mounts that preallocate
  // large amounts of data (such as xfs's allocsize option).
  size_t manifest_preallocation_size = 4 * 1024 * 1024;

  // Allow the OS to mmap files for reading sst tables. Default: false
  bool allow_mmap_reads = false;

  // Allow the OS to mmap files for writing.
  // DB::SyncWAL() only works if this is set to false.
  // Default: false
  bool allow_mmap_writes = false;

  // Enable direct I/O mode for reads/writes.
  // This may or may not improve performance depending on the use case.
  //
  // Files will be opened in "direct I/O" mode
  // which means that data r/w from the disk will not be cached or
  // buffered. The hardware buffer of the devices may however still
  // be used. Memory mapped files are not impacted by these parameters.

  // Use O_DIRECT for user and compaction reads.
  // When true, we also force new_table_reader_for_compaction_inputs to true.
  // Default: false
  // Not supported in ROCKSDB_LITE mode!
  bool use_direct_reads = false;

  // Use O_DIRECT for writes in background flush and compactions.
  // Default: false
  // Not supported in ROCKSDB_LITE mode!
  bool use_direct_io_for_flush_and_compaction = false;

  // If false, fallocate() calls are bypassed
  bool allow_fallocate = true;

  // Do not let child processes inherit open files. Default: true
  bool is_fd_close_on_exec = true;

  // NOT SUPPORTED ANYMORE -- this option is no longer used
  bool skip_log_error_on_recovery = false;

  // If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec seconds
  //
  // Default: 600 (10 min)
  //
  // Dynamically changeable through SetDBOptions() API.
  unsigned int stats_dump_period_sec = 600;

  // If not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
  // seconds
  // Default: 600
  unsigned int stats_persist_period_sec = 600;

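  // Illustrative sketch only: the persisted or in-memory stats snapshots can
  // be read back through DB::GetStatsHistory() (declared in rocksdb/db.h).
  // The time range below is a placeholder:
  //
  //   std::unique_ptr<StatsHistoryIterator> it;
  //   db->GetStatsHistory(start_time /*secs*/, end_time /*secs*/, &it);
  //   for (; it->Valid(); it->Next()) {
  //     const auto& stats_map = it->GetStatsMap();
  //   }
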
  // If true, automatically persist stats to a hidden column family (column
  // family name: ___rocksdb_stats_history___) every
  // stats_persist_period_sec seconds; otherwise, write to an in-memory
  // struct. User can query through the `GetStatsHistory` API.
  // If user attempts to create a column family with the same name on a DB
  // which has previously set persist_stats_to_disk to true, the column family
  // creation will fail, but the hidden column family will survive, as well as
  // the previously persisted statistics.
  // When persisting stats to disk, the stat name will be limited to 100 bytes.
  // Default: false
  bool persist_stats_to_disk = false;

  // If not zero, periodically take stats snapshots and store them in memory.
  // The memory size for stats snapshots is capped at stats_history_buffer_size
  // Default: 1MB
  size_t stats_history_buffer_size = 1024 * 1024;

  // If set to true, will hint the underlying file system that the file
  // access pattern is random, when an SST file is opened.
  // Default: true
  bool advise_random_on_open = true;

  // Amount of data to build up in memtables across all column
  // families before writing to disk.
  //
  // This is distinct from write_buffer_size, which enforces a limit
  // for a single memtable.
  //
  // This feature is disabled by default. Specify a non-zero value
  // to enable it.
  //
  // Default: 0 (disabled)
  size_t db_write_buffer_size = 0;

  // The memory usage of memtables will be reported to this object. The same
  // object can be passed into multiple DBs and it will track the sum of size
  // of all the DBs. If the total size of all live memtables of all the DBs
  // exceeds a limit, a flush will be triggered in the next DB to which the
  // next write is issued.
  //
  // If the object is only passed to one DB, the behavior is the same as
  // db_write_buffer_size. When write_buffer_manager is set, the value set will
  // override db_write_buffer_size.
  //
  // This feature is disabled by default. Specify a non-zero value
  // to enable it.
  //
  // Default: null
  std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr;

  // Specify the file access pattern once a compaction is started.
  // It will be applied to all input files of a compaction.
  // Default: NORMAL
  enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
  AccessHint access_hint_on_compaction_start = NORMAL;

  // If true, always create a new file descriptor and new table reader
  // for compaction inputs. Turning this parameter on may introduce extra
  // memory usage in the table reader, if it allocates extra memory
  // for indexes. This will allow file descriptor prefetch options
  // to be set for compaction input files and not to impact file
  // descriptors for the same file used by user queries.
  // We suggest enabling BlockBasedTableOptions.cache_index_and_filter_blocks
  // for this mode if using a block-based table.
  //
  // Default: false
  // This flag has no effect on the behavior of compaction and we plan to
  // delete it in the future.
  bool new_table_reader_for_compaction_inputs = false;

  // If non-zero, we perform bigger reads when doing compaction. If you're
  // running RocksDB on spinning disks, you should set this to at least 2MB.
  // That way RocksDB's compaction is doing sequential instead of random reads.
  //
  // When non-zero, we also force new_table_reader_for_compaction_inputs to
  // true.
  //
  // Default: 0
  //
  // Dynamically changeable through SetDBOptions() API.
  size_t compaction_readahead_size = 0;

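  // Illustrative sketch only: a configuration sometimes used for spinning
  // disks; the 2MB figure simply follows the comment above and is not a
  // universal recommendation:
  //
  //   DBOptions db_opts;
  //   db_opts.compaction_readahead_size = 2 * 1024 * 1024;
  //   db_opts.use_direct_io_for_flush_and_compaction = true;  // optional
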
  // This is a maximum buffer size that is used by WinMmapReadableFile in
  // unbuffered disk I/O mode. We need to maintain an aligned buffer for
  // reads. We allow the buffer to grow until the specified value and then
  // for bigger requests allocate one-shot buffers. In unbuffered mode we
  // always bypass the read-ahead buffer at ReadaheadRandomAccessFile.
  // When read-ahead is required we then make use of the
  // compaction_readahead_size value and always try to read ahead. With
  // read-ahead we always pre-allocate the buffer to the size instead of
  // growing it up to a limit.
  //
  // This option is currently honored only on Windows
  //
  // Default: 1 MB
  //
  // Special value: 0 - means do not maintain per instance buffer. Allocate
  //                per request buffer and avoid locking.
  size_t random_access_max_buffer_size = 1024 * 1024;

  // This is the maximum buffer size that is used by WritableFileWriter.
  // On Windows, we need to maintain an aligned buffer for writes.
  // We allow the buffer to grow until its size hits the limit in buffered
  // IO and fix the buffer size when using direct IO to ensure alignment of
  // write requests if the logical sector size is unusual
  //
  // Default: 1024 * 1024 (1 MB)
  //
  // Dynamically changeable through SetDBOptions() API.
  size_t writable_file_max_buffer_size = 1024 * 1024;

  // Use adaptive mutex, which spins in the user space before resorting
  // to kernel. This could reduce context switch when the mutex is not
  // heavily contended. However, if the mutex is hot, we could end up
  // wasting spin time.
  // Default: false
  bool use_adaptive_mutex = false;

  // Create DBOptions with default values for all fields
  DBOptions();
  // Create DBOptions from Options
  explicit DBOptions(const Options& options);

  void Dump(Logger* log) const;

  // Allows OS to incrementally sync files to disk while they are being
  // written, asynchronously, in the background. This operation can be used
  // to smooth out write I/Os over time. Users shouldn't rely on it for
  // persistence guarantees.
  // Issue one request for every bytes_per_sync written. 0 turns it off.
  //
  // You may consider using rate_limiter to regulate write rate to device.
  // When rate limiter is enabled, it automatically sets bytes_per_sync
  // to 1MB.
  //
  // This option applies to table files
  //
  // Default: 0, turned off
  //
  // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead
  // Dynamically changeable through SetDBOptions() API.
  uint64_t bytes_per_sync = 0;

  // Same as bytes_per_sync, but applies to WAL files
  //
  // Default: 0, turned off
  //
  // Dynamically changeable through SetDBOptions() API.
  uint64_t wal_bytes_per_sync = 0;

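  // Illustrative sketch only: incremental syncing is usually enabled together
  // with (or instead of) a rate limiter to smooth out write I/O. The 1MB
  // values mirror the defaults mentioned above and are not mandatory:
  //
  //   DBOptions db_opts;
  //   db_opts.bytes_per_sync = 1 << 20;      // SST files
  //   db_opts.wal_bytes_per_sync = 1 << 20;  // WAL files
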
  // When true, guarantees WAL files have at most `wal_bytes_per_sync`
  // bytes submitted for writeback at any given time, and SST files have at
  // most `bytes_per_sync` bytes pending writeback at any given time. This can
  // be used to handle cases where processing speed exceeds I/O speed during
  // file generation, which can lead to a huge sync when the file is finished,
  // even with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
  //
  //  - If `sync_file_range` is supported it achieves this by waiting for any
  //    prior `sync_file_range`s to finish before proceeding. In this way,
  //    processing (compression, etc.) can proceed uninhibited in the gap
  //    between `sync_file_range`s, and we block only when I/O falls behind.
  //  - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
  //    always blocks, thus preventing the interleaving of I/O and processing.
  //
  // Note: Enabling this option does not provide any additional persistence
  // guarantees, as it may use `sync_file_range`, which does not write out
  // metadata.
  //
  // Default: false
  bool strict_bytes_per_sync = false;

  // A vector of EventListeners whose callback functions will be called
  // when specific RocksDB events happen.
  std::vector<std::shared_ptr<EventListener>> listeners;

  // If true, then the status of the threads involved in this DB will
  // be tracked and available via GetThreadList() API.
  //
  // Default: false
  bool enable_thread_tracking = false;

  // The limited write rate to DB if soft_pending_compaction_bytes_limit or
  // level0_slowdown_writes_trigger is triggered, or we are writing to the
  // last mem table allowed and we allow more than 3 mem tables. It is
  // calculated using size of user write requests before compression.
  // RocksDB may decide to slow down more if the compaction still
  // gets behind further.
  // If the value is 0, we will infer a value from the `rate_limiter` value
  // if it is not empty, or 16MB if `rate_limiter` is empty. Note that
  // if users change the rate in `rate_limiter` after DB is opened,
  // `delayed_write_rate` won't be adjusted.
  //
  // Unit: bytes per second.
  //
  // Default: 0
  //
  // Dynamically changeable through SetDBOptions() API.
  uint64_t delayed_write_rate = 0;

  // By default, a single write thread queue is maintained. The thread that
  // gets to the head of the queue becomes the write batch group leader and is
  // responsible for writing to the WAL and memtable for the batch group.
  //
  // If enable_pipelined_write is true, separate write thread queues are
  // maintained for WAL writes and memtable writes. A write thread first
  // enters the WAL writer queue and then the memtable writer queue. A pending
  // thread on the WAL writer queue thus only has to wait for previous writers
  // to finish their WAL writing but not their memtable writing. Enabling the
  // feature may improve write throughput and reduce latency of the prepare
  // phase of two-phase commit.
  //
  // Default: false
  bool enable_pipelined_write = false;

  // Setting unordered_write to true trades the immutability guarantee of
  // snapshots for higher write throughput. This violates the
  // repeatability one expects from ::Get from a snapshot, as well as
  // ::MultiGet and Iterator's consistent-point-in-time view property.
  // If the application cannot tolerate the relaxed guarantees, it can
  // implement its own mechanisms to work around that and yet benefit from the
  // higher throughput. Using TransactionDB with the WRITE_PREPARED write
  // policy and two_write_queues=true is one way to achieve immutable
  // snapshots despite unordered_write.
  //
  // By default, i.e., when it is false, rocksdb does not advance the sequence
  // number for new snapshots unless all the writes with lower sequence numbers
  // are already finished. This provides the immutability that we expect from
  // snapshots. Moreover, since Iterator and MultiGet internally depend on
  // snapshots, the snapshot immutability results in Iterator and MultiGet
  // offering a consistent-point-in-time view. If set to true, although the
  // Read-Your-Own-Write property is still provided, the snapshot immutability
  // property is relaxed: the writes issued after the snapshot is obtained
  // (with larger sequence numbers) will still not be visible to reads from
  // that snapshot; however, there still might be pending writes (with lower
  // sequence numbers) that will change the state visible to the snapshot
  // after they land in the memtable.
  //
  // Default: false
  bool unordered_write = false;

  // If true, allow multiple writers to update memtables in parallel.
  // Only some memtable_factory-s support concurrent writes; currently it
  // is implemented only for SkipListFactory. Concurrent memtable writes
  // are not compatible with inplace_update_support or filter_deletes.
  // It is strongly recommended to set enable_write_thread_adaptive_yield
  // if you are going to use this feature.
  //
  // Default: true
  bool allow_concurrent_memtable_write = true;

  // If true, threads synchronizing with the write batch group leader will
  // wait for up to write_thread_max_yield_usec before blocking on a mutex.
  // This can substantially improve throughput for concurrent workloads,
  // regardless of whether allow_concurrent_memtable_write is enabled.
  //
  // Default: true
  bool enable_write_thread_adaptive_yield = true;

  // The maximum limit of number of bytes that are written in a single batch
  // of WAL or memtable write. It is followed when the leader write size
  // is larger than 1/8 of this limit.
  //
  // Default: 1 MB
  uint64_t max_write_batch_group_size_bytes = 1 << 20;

  // The maximum number of microseconds that a write operation will use
  // a yielding spin loop to coordinate with other write threads before
  // blocking on a mutex.  (Assuming write_thread_slow_yield_usec is
  // set properly) increasing this value is likely to increase RocksDB
  // throughput at the expense of increased CPU usage.
  //
  // Default: 100
  uint64_t write_thread_max_yield_usec = 100;

  // The latency in microseconds after which a std::this_thread::yield
  // call (sched_yield on Linux) is considered to be a signal that
  // other processes or threads would like to use the current core.
  // Increasing this makes writer threads more likely to take CPU
  // by spinning, which will show up as an increase in the number of
  // involuntary context switches.
  //
  // Default: 3
  uint64_t write_thread_slow_yield_usec = 3;

  // If true, then DB::Open() will not update the statistics used to optimize
  // compaction decisions by loading table properties from many files.
  // Turning off this feature will improve DBOpen time, especially in a disk
  // environment.
  //
  // Default: false
  bool skip_stats_update_on_db_open = false;

  // If true, then DB::Open() will not fetch and check sizes of all sst files.
  // This may significantly speed up startup if there are many sst files,
  // especially when using non-default Env with expensive GetFileSize().
  // We'll still check that all required sst files exist.
  // If paranoid_checks is false, this option is ignored, and sst files are
  // not checked at all.
  //
  // Default: false
  bool skip_checking_sst_file_sizes_on_db_open = false;

  // Recovery mode to control the consistency while replaying WAL
  // Default: kPointInTimeRecovery
  WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;

  // If set to false then recovery will fail when a prepared
  // transaction is encountered in the WAL
  bool allow_2pc = false;

  // A global cache for table-level rows.
  // Default: nullptr (disabled)
  // Not supported in ROCKSDB_LITE mode!
  std::shared_ptr<Cache> row_cache = nullptr;

#ifndef ROCKSDB_LITE
  // A filter object supplied to be invoked while processing write-ahead-logs
  // (WALs) during recovery. The filter provides a way to inspect log
  // records, ignoring a particular record or skipping replay.
  // The filter is invoked at startup and is currently invoked from a single
  // thread.
  WalFilter* wal_filter = nullptr;
#endif  // ROCKSDB_LITE

  // If true, then DB::Open / CreateColumnFamily / DropColumnFamily
  // / SetOptions will fail if the options file is not detected or is not
  // properly persisted.
  //
  // DEFAULT: false
  bool fail_if_options_file_error = false;

  // If true, then print malloc stats together with rocksdb.stats
  // when printing to LOG.
  // DEFAULT: false
  bool dump_malloc_stats = false;

  // By default RocksDB replays WAL logs and flushes them on DB open, which may
  // create very small SST files. If this option is enabled, RocksDB will try
  // to avoid (but not guarantee not to) flush during recovery. Also, existing
  // WAL logs will be kept, so that if a crash happens before flush, we still
  // have logs to recover from.
  //
  // DEFAULT: false
  bool avoid_flush_during_recovery = false;

  // By default RocksDB will flush all memtables on DB close if there is
  // unpersisted data (i.e. with WAL disabled). The flush can be skipped to
  // speed up DB close. Unpersisted data WILL BE LOST.
  //
  // DEFAULT: false
  //
  // Dynamically changeable through SetDBOptions() API.
  bool avoid_flush_during_shutdown = false;

  // Set this option to true during creation of database if you want
  // to be able to ingest behind (call IngestExternalFile() skipping keys
  // that already exist, rather than overwriting matching keys).
  // Setting this option to true has the following effects:
  // 1) Disable some internal optimizations around SST file compression.
  // 2) Reserve the bottom-most level for ingested files only.
  // 3) Note that num_levels should be >= 3 if this option is turned on.
  //
  // DEFAULT: false
  // Immutable.
  bool allow_ingest_behind = false;

  // Needed to support differential snapshots.
  // If set to true then DB will only process deletes with sequence number
  // less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts).
  // Clients are responsible for periodically calling this method to advance
  // the cutoff time. If this method is never called and preserve_deletes
  // is set to true, NO deletes will ever be processed.
  // At the moment this only keeps normal deletes; SingleDeletes will
  // not be preserved.
  // DEFAULT: false
  // Immutable (TODO: make it dynamically changeable)
  bool preserve_deletes = false;

  // If enabled it uses two queues for writes, one for the ones with
  // disable_memtable and one for the ones that also write to memtable. This
  // allows the memtable writes not to lag behind other writes. It can be used
  // to optimize MySQL 2PC in which only the commits, which are serial, write
  // to memtable.
  bool two_write_queues = false;

  // If true, WAL is not flushed automatically after each write. Instead it
  // relies on manual invocation of FlushWAL to write the WAL buffer to its
  // file.
  bool manual_wal_flush = false;

  // If true, RocksDB supports flushing multiple column families and committing
  // their results atomically to MANIFEST. Note that it is not
  // necessary to set atomic_flush to true if WAL is always enabled since WAL
  // allows the database to be restored to the last persistent state in WAL.
  // This option is useful when there are column families with writes NOT
  // protected by WAL.
  // For manual flush, the application has to specify which column families to
  // flush atomically in DB::Flush.
  // For auto-triggered flush, RocksDB atomically flushes ALL column families.
  //
  // Currently, any WAL-enabled writes after atomic flush may be replayed
  // independently if the process crashes later and tries to recover.
  bool atomic_flush = false;

  // If true, working threads may avoid doing unnecessary and long-latency
  // operations (such as deleting obsolete files directly or deleting
  // memtables) and will instead schedule a background job to do it.
  // Use it if you're latency-sensitive.
  // If set to true, takes precedence over
  // ReadOptions::background_purge_on_iterator_cleanup.
  bool avoid_unnecessary_blocking_io = false;

  // Historically the DB ID has always been stored in the Identity file in the
  // DB folder. If this flag is true, the DB ID is written to the Manifest
  // file in addition to the Identity file. By doing this, two problems are
  // solved:
  // 1. The Identity file is not checksummed, whereas the Manifest file is.
  // 2. Since the Manifest file is the source of truth for the DB, the DB ID
  //    will sit with the source of truth. Previously the Identity file could
  //    be copied independently of the Manifest, and that can result in a
  //    wrong DB ID.
  // We recommend setting this flag to true.
  // Default: false
  bool write_dbid_to_manifest = false;

  // The number of bytes to prefetch when reading the log. This is mostly
  // useful for reading a remotely located log, as it can reduce the number of
  // round-trips. If 0, then the prefetching is disabled.
  //
  // Default: 0
  size_t log_readahead_size = 0;

  // If the user does NOT provide a checksum generator factory, file checksums
  // will NOT be used. A new file checksum generator object will be created
  // when an SST file is created. Therefore, each created FileChecksumGenerator
  // will only be used from a single thread and so does not need to be
  // thread-safe.
  //
  // Default: nullptr
  std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;

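  // Illustrative sketch only: a CRC32c-based factory (e.g.
  // GetFileChecksumGenCrc32cFactory(), if available in your build of
  // rocksdb/file_checksum.h) or any FileChecksumGenFactory implementation
  // can be plugged in the same way:
  //
  //   DBOptions db_opts;
  //   db_opts.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
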
  // By default, RocksDB recovery fails if any table file referenced in the
  // MANIFEST is missing after scanning the MANIFEST.
  // Best-efforts recovery is another recovery mode that
  // tries to restore the database to the most recent point in time without
  // missing files.
  // Currently not compatible with atomic flush. Furthermore, WAL files will
  // not be used for recovery if best_efforts_recovery is true.
  // Default: false
  bool best_efforts_recovery = false;
};

// Options to control the behavior of a database (passed to DB::Open)
struct Options : public DBOptions, public ColumnFamilyOptions {
  // Create an Options object with default values for all fields.
  Options() : DBOptions(), ColumnFamilyOptions() {}

  Options(const DBOptions& db_options,
          const ColumnFamilyOptions& column_family_options)
      : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}

  // The function recovers options to their values as in version 4.6.
  Options* OldDefaults(int rocksdb_major_version = 4,
                       int rocksdb_minor_version = 6);

  void Dump(Logger* log) const;

  void DumpCFOptions(Logger* log) const;

  // Some functions that make it easier to optimize RocksDB

  // Set appropriate parameters for bulk loading.
  // The reason that this is a function that returns "this" instead of a
  // constructor is to enable chaining of multiple similar calls in the future.
  //
  // All data will be in level 0 without any automatic compaction.
  // It's recommended to manually call CompactRange(NULL, NULL) before reading
  // from the database, because otherwise the read can be very slow.
  Options* PrepareForBulkLoad();

  // Use this if your DB is very small (like under 1GB) and you don't want to
  // spend lots of memory for memtables.
  Options* OptimizeForSmallDb();
};

//
// An application can issue a read request (via Get/Iterators) and specify
// if that read should process data that ALREADY resides on a specified cache
// level. For example, if an application specifies kBlockCacheTier then the
// Get call will process data that is already present in the memtable or
// the block cache. It will not page in data from the OS cache or data that
// resides in storage.
enum ReadTier {
  kReadAllTier = 0x0,     // data in memtable, block cache, OS cache or storage
  kBlockCacheTier = 0x1,  // data in memtable or block cache
  kPersistedTier = 0x2,   // persisted data.  When WAL is disabled, this option
                          // will skip data in memtable.
                          // Note that this ReadTier currently only supports
                          // Get and MultiGet and does not support iterators.
  kMemtableTier = 0x3     // data in memtable. used for memtable-only iterators.
};

// Options that control read operations
struct ReadOptions {
  // If "snapshot" is non-nullptr, read as of the supplied snapshot
  // (which must belong to the DB that is being read and which must
  // not have been released).  If "snapshot" is nullptr, use an implicit
  // snapshot of the state at the beginning of this read operation.
  // Default: nullptr
  const Snapshot* snapshot;

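  // Illustrative sketch only (GetSnapshot/ReleaseSnapshot are DB methods from
  // rocksdb/db.h): pin a snapshot, read against it, then release it:
  //
  //   ReadOptions read_opts;
  //   read_opts.snapshot = db->GetSnapshot();
  //   std::string value;
  //   Status s = db->Get(read_opts, "key", &value);
  //   db->ReleaseSnapshot(read_opts.snapshot);
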
  // `iterate_lower_bound` defines the smallest key at which the backward
  // iterator can return an entry. Once the bound is passed, Valid() will be
  // false. `iterate_lower_bound` is inclusive, i.e. the bound value is a
  // valid entry.
  //
  // If prefix_extractor is not null, the Seek target and `iterate_lower_bound`
  // need to have the same prefix. This is because ordering is not guaranteed
  // outside of the prefix domain.
  //
  // Default: nullptr
  const Slice* iterate_lower_bound;

  // "iterate_upper_bound" defines the extent up to which the forward iterator
  // can return entries. Once the bound is reached, Valid() will be false.
  // "iterate_upper_bound" is exclusive, i.e. the bound value is
  // not a valid entry. If prefix_extractor is not null, the Seek target
  // and iterate_upper_bound need to have the same prefix.
  // This is because ordering is not guaranteed outside of the prefix domain.
  //
  // Default: nullptr
  const Slice* iterate_upper_bound;

  // RocksDB does auto-readahead for iterators on noticing more than two reads
  // for a table file. The readahead starts at 8KB and doubles on every
  // additional read up to 256KB.
  // This option can help if most of the range scans are large, and if it is
  // determined that a larger readahead than that enabled by auto-readahead is
  // needed.
  // Using a large readahead size (> 2MB) can typically improve the performance
  // of forward iteration on spinning disks.
  // Default: 0
  size_t readahead_size;

  // A threshold for the number of keys that can be skipped before failing an
  // iterator seek as incomplete. The default value of 0 should be used to
  // never fail a request as incomplete, even on skipping too many keys.
  // Default: 0
  uint64_t max_skippable_internal_keys;

  // Specify if this read request should process data that ALREADY
  // resides on a particular cache. If the required data is not
  // found at the specified cache, then Status::Incomplete is returned.
  // Default: kReadAllTier
  ReadTier read_tier;

  // If true, all data read from underlying storage will be
  // verified against corresponding checksums.
  // Default: true
  bool verify_checksums;

  // Should the "data block"/"index block" read for this iteration be placed in
  // block cache?
  // Callers may wish to set this field to false for bulk scans.
  // This would help to not change the eviction order of existing items in the
  // block cache.
  // Default: true
  bool fill_cache;

  // Specify to create a tailing iterator -- a special iterator that has a
  // view of the complete database (i.e. it can also be used to read newly
  // added data) and is optimized for sequential reads. It will return records
  // that were inserted into the database after the creation of the iterator.
  // Default: false
  // Not supported in ROCKSDB_LITE mode!
  bool tailing;

  // This option is not used anymore. It was to turn on a functionality that
  // has been removed.
  bool managed;

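  // Illustrative sketch only: a bounded forward scan using the
  // iterate_upper_bound field above. The bound Slice must stay alive for the
  // whole lifetime of the iterator:
  //
  //   ReadOptions read_opts;
  //   Slice upper("key9");
  //   read_opts.iterate_upper_bound = &upper;
  //   std::unique_ptr<Iterator> it(db->NewIterator(read_opts));
  //   for (it->Seek("key0"); it->Valid(); it->Next()) {
  //     // keys in ["key0", "key9")
  //   }
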

  // When true, RocksDB uses total_order_seek = true by default, and can
  // selectively enable prefix seek mode if it won't generate a different
  // result from total_order_seek, based on the seek key and the iterator
  // upper bound.
  // Not supported in ROCKSDB_LITE mode: even with the value set to true,
  // prefix mode is not used.
  bool auto_prefix_mode;

  // Enforce that the iterator only iterates over the same prefix as the seek.
  // This option is effective only for prefix seeks, i.e. prefix_extractor is
  // non-null for the column family and total_order_seek is false. Unlike
  // iterate_upper_bound, prefix_same_as_start only works within a prefix
  // but in both directions.
  // Default: false
  bool prefix_same_as_start;

  // Keep the blocks loaded by the iterator pinned in memory as long as the
  // iterator is not deleted. If used when reading from tables created with
  // BlockBasedTableOptions::use_delta_encoding = false, the
  // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
  // return 1.
  // Default: false
  bool pin_data;

  // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
  // schedule a background job in the flush job queue and delete obsolete files
  // in the background.
  // Default: false
  bool background_purge_on_iterator_cleanup;

  // If true, keys deleted using the DeleteRange() API will be visible to
  // readers until they are naturally deleted during compaction. This improves
  // read performance in DBs with many range deletions.
  // Default: false
  bool ignore_range_deletions;

  // A callback to determine whether relevant keys for this scan exist in a
  // given table based on the table's properties. The callback is passed the
  // properties of each table during iteration. If the callback returns false,
  // the table will not be scanned. This option only affects Iterators and has
  // no impact on point lookups.
  // Default: empty (every table will be scanned)
  std::function<bool(const TableProperties&)> table_filter;

  // Needed to support differential snapshots. Has 2 effects:
  // 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum
  // 2) if this param > 0, the iterator will return INTERNAL keys instead of
  //    user keys; e.g. tombstones are returned as well.
  // Default: 0 (don't filter by seqnum, return user keys)
  SequenceNumber iter_start_seqnum;

  // Timestamp of operation. Read should return the latest data visible to the
  // specified timestamp. All timestamps of the same database must be of the
  // same length and format. The user is responsible for providing a customized
  // compare function via Comparator to order <key, timestamp> tuples.
  // The user-specified timestamp feature is still under active development,
  // and the API is subject to change.
  const Slice* timestamp;

  ReadOptions();
  ReadOptions(bool cksum, bool cache);
};
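
// A minimal usage sketch (illustrative only, not part of the API): a point
// lookup pinned to a snapshot and restricted to data already in memory via
// kBlockCacheTier. Assumes an open DB* named `db`; the key is hypothetical.
//
//   const Snapshot* snap = db->GetSnapshot();
//   ReadOptions ro;
//   ro.snapshot = snap;
//   ro.read_tier = kBlockCacheTier;  // never touch OS cache or storage
//   std::string value;
//   Status s = db->Get(ro, "my_key", &value);
//   if (s.IsIncomplete()) { /* data was not in memtable or block cache */ }
//   db->ReleaseSnapshot(snap);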

// Options that control write operations
struct WriteOptions {
  // If true, the write will be flushed from the operating system
  // buffer cache (by calling WritableFile::Sync()) before the write
  // is considered complete. If this flag is true, writes will be
  // slower.
  //
  // If this flag is false, and the machine crashes, some recent
  // writes may be lost. Note that if it is just the process that
  // crashes (i.e., the machine does not reboot), no writes will be
  // lost even if sync==false.
  //
  // In other words, a DB write with sync==false has similar
  // crash semantics as the "write()" system call. A DB write
  // with sync==true has similar crash semantics to a "write()"
  // system call followed by "fdatasync()".
  //
  // Default: false
  bool sync;

  // If true, writes will not first go to the write ahead log,
  // and the write may get lost after a crash. The backup engine
  // relies on write-ahead logs to back up the memtable, so if
  // you disable write-ahead logs, you must create backups with
  // flush_before_backup=true to avoid losing unflushed memtable data.
  // Default: false
  bool disableWAL;

  // If true and if the user is trying to write to column families that don't
  // exist (they were dropped), ignore the write (don't return an error). If
  // there are multiple writes in a WriteBatch, other writes will succeed.
  // Default: false
  bool ignore_missing_column_families;

  // If true and we need to wait or sleep for the write request, the request
  // fails immediately with Status::Incomplete().
  // Default: false
  bool no_slowdown;

  // If true, this write request is of lower priority if compaction is
  // behind. In this case, if no_slowdown is also true, the request will be
  // cancelled immediately with Status::Incomplete() returned. Otherwise, it
  // will be slowed down. The slowdown value is determined by RocksDB to
  // guarantee it introduces minimum impacts to high priority writes.
  //
  // Default: false
  bool low_pri;

  // If true, this writebatch will maintain the last insert positions of each
  // memtable as hints in concurrent write. It can improve write performance
  // in concurrent writes if keys in one writebatch are sequential. In
  // non-concurrent writes (when concurrent_memtable_writes is false) this
  // option will be ignored.
  //
  // Default: false
  bool memtable_insert_hint_per_batch;

  // Timestamp of write operation, e.g. Put. All timestamps of the same
  // database must share the same length and format. The user is also
  // responsible for providing a customized compare function via Comparator to
  // order <key, timestamp> tuples. If the user wants to enable timestamp, then
  // all write operations must be associated with a timestamp because RocksDB,
  // as a single-node storage engine, currently has no knowledge of global time
  // and thus has to rely on the application.
  // The user-specified timestamp feature is still under active development,
  // and the API is subject to change.
  const Slice* timestamp;

  WriteOptions()
      : sync(false),
        disableWAL(false),
        ignore_missing_column_families(false),
        no_slowdown(false),
        low_pri(false),
        memtable_insert_hint_per_batch(false),
        timestamp(nullptr) {}
};
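
// A minimal usage sketch (illustrative only, not part of the API): a durable
// write that requests an fdatasync-like guarantee, and a fire-and-forget
// write that skips the WAL. Assumes an open DB* named `db`; keys and values
// are hypothetical.
//
//   WriteOptions durable;
//   durable.sync = true;
//   Status s1 = db->Put(durable, "balance", "100");
//
//   WriteOptions fast;
//   fast.disableWAL = true;  // may lose this write on a crash
//   Status s2 = db->Put(fast, "cache_entry", "tmp");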

// Options that control flush operations
struct FlushOptions {
  // If true, the flush will wait until the flush is done.
  // Default: true
  bool wait;
  // If true, the flush will proceed immediately even if it means writes will
  // stall for the duration of the flush; if false, the operation will wait
  // until it's possible to do the flush without causing a stall or until the
  // required flush is performed by someone else (foreground call or background
  // thread).
  // Default: false
  bool allow_write_stall;
  FlushOptions() : wait(true), allow_write_stall(false) {}
};

// Create a Logger from provided DBOptions
extern Status CreateLoggerFromOptions(const std::string& dbname,
                                      const DBOptions& options,
                                      std::shared_ptr<Logger>* logger);

// CompactionOptions are used in CompactFiles() call.
struct CompactionOptions {
  // Compaction output compression type
  // Default: snappy
  // If set to `kDisableCompressionOption`, RocksDB will choose the compression
  // type according to the `ColumnFamilyOptions`, taking into account the
  // output level if `compression_per_level` is specified.
  CompressionType compression;
  // Compaction will create files of size `output_file_size_limit`.
  // Default: MAX, which means that compaction will create a single file
  uint64_t output_file_size_limit;
  // If > 0, it will replace the option in the DBOptions for this compaction.
  uint32_t max_subcompactions;

  CompactionOptions()
      : compression(kSnappyCompression),
        output_file_size_limit(std::numeric_limits<uint64_t>::max()),
        max_subcompactions(0) {}
};

// For level based compaction, we can configure if we want to skip/force
// bottommost level compaction.
enum class BottommostLevelCompaction {
  // Skip bottommost level compaction
  kSkip,
  // Only compact bottommost level if there is a compaction filter
  // This is the default option
  kIfHaveCompactionFilter,
  // Always compact bottommost level
  kForce,
  // Always compact bottommost level but in bottommost level avoid
  // double-compacting files created in the same compaction
  kForceOptimized,
};

// CompactRangeOptions is used by CompactRange() call.
struct CompactRangeOptions {
  // If true, no other compaction will run at the same time as this
  // manual compaction
  bool exclusive_manual_compaction = true;
  // If true, compacted files will be moved to the minimum level capable
  // of holding the data, or to a given level (specified by a non-negative
  // target_level).
  bool change_level = false;
  // If change_level is true and target_level has a non-negative value,
  // compacted files will be moved to target_level.
  int target_level = -1;
  // Compaction outputs will be placed in options.db_paths[target_path_id].
  // Behavior is undefined if target_path_id is out of range.
  uint32_t target_path_id = 0;
  // By default level based compaction will only compact the bottommost level
  // if there is a compaction filter
  BottommostLevelCompaction bottommost_level_compaction =
      BottommostLevelCompaction::kIfHaveCompactionFilter;
  // If true, will execute immediately even if doing so would cause the DB to
  // enter write stall mode. Otherwise, it'll sleep until load is low enough.
  bool allow_write_stall = false;
  // If > 0, it will replace the option in the DBOptions for this compaction.
  uint32_t max_subcompactions = 0;
};
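
// A minimal usage sketch (illustrative only, not part of the API): a full
// manual compaction that also forces the output down to the bottommost level.
// Assumes an open DB* named `db`.
//
//   CompactRangeOptions cro;
//   cro.change_level = true;  // move output to the minimum level that fits
//   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
//   Status s = db->CompactRange(cro, nullptr, nullptr);  // whole key space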

// IngestExternalFileOptions is used by IngestExternalFile()
struct IngestExternalFileOptions {
  // Can be set to true to move the files instead of copying them.
  bool move_files = false;
  // If set to true, ingestion falls back to copy when move fails.
  bool failed_move_fall_back_to_copy = true;
  // If set to false, keys of an ingested file could appear in existing
  // snapshots that were created before the file was ingested.
  bool snapshot_consistency = true;
  // If set to false, IngestExternalFile() will fail if the file key range
  // overlaps with existing keys or tombstones in the DB.
  bool allow_global_seqno = true;
  // If set to false and the file key range overlaps with the memtable key
  // range (a memtable flush is required), IngestExternalFile will fail.
  bool allow_blocking_flush = true;
  // Set to true if you would like duplicate keys in the file being ingested
  // to be skipped rather than overwriting existing data under that key.
  // Use case: back-filling some historical data in the database without
  // over-writing an existing newer version of the data.
  // This option can only be used if the DB has been running
  // with allow_ingest_behind=true since the dawn of time.
  // All files will be ingested at the bottommost level with seqno=0.
  bool ingest_behind = false;
  // Set to true if you would like to write the global_seqno to a given offset
  // in the external SST file for backward compatibility. Older versions of
  // RocksDB write a global_seqno to a given offset within ingested SST files,
  // and new versions of RocksDB do not. If you ingest an external SST using a
  // new version of RocksDB and would like to be able to downgrade to an
  // older version of RocksDB, you should set 'write_global_seqno' to true. If
  // your service is just starting to use the new RocksDB, we recommend that
  // you set this option to false, which brings two benefits:
  // 1. No extra random write for global_seqno during ingestion.
  // 2. Without modifying the external SST file, its checksum can still be
  //    verified.
  // We have a plan to set this option to false by default in the future.
  bool write_global_seqno = true;
  // Set to true if you would like to verify the checksums of each block of the
  // external SST file before ingestion.
  // Warning: setting this to true causes a slowdown in file ingestion because
  // the external SST file has to be read.
  bool verify_checksums_before_ingest = false;
  // When verify_checksums_before_ingest = true, RocksDB uses the default
  // readahead setting to scan the file while verifying checksums before
  // ingestion.
  // Users can override the default value using this option.
  // Using a large readahead size (> 2MB) can typically improve the performance
  // of forward iteration on spinning disks.
  size_t verify_checksums_readahead_size = 0;
};
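
// A minimal usage sketch (illustrative only, not part of the API): ingest an
// externally built SST file, moving it instead of copying and verifying block
// checksums first. Assumes an open DB* named `db`; the file path is
// hypothetical.
//
//   IngestExternalFileOptions ifo;
//   ifo.move_files = true;
//   ifo.verify_checksums_before_ingest = true;
//   Status s = db->IngestExternalFile({"/tmp/batch_001.sst"}, ifo);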

enum TraceFilterType : uint64_t {
  // Trace all the operations
  kTraceFilterNone = 0x0,
  // Do not trace the get operations
  kTraceFilterGet = 0x1 << 0,
  // Do not trace the write operations
  kTraceFilterWrite = 0x1 << 1
};

// TraceOptions is used for StartTrace
struct TraceOptions {
  // To avoid the trace file growing larger than the available storage space,
  // the user can set the maximum trace file size in bytes. Default is 64GB.
  uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
  // Specify the trace sampling option, i.e. capture one out of every how many
  // requests. Defaults to 1 (capture every request).
  uint64_t sampling_frequency = 1;
  // Note: The filtering happens before sampling.
  uint64_t filter = kTraceFilterNone;
};

// ImportColumnFamilyOptions is used by ImportColumnFamily()
struct ImportColumnFamilyOptions {
  // Can be set to true to move the files instead of copying them.
  bool move_files = false;
};

// Options used with DB::GetApproximateSizes()
struct SizeApproximationOptions {
  // Defines whether the returned size should include the recently written
  // data in the mem-tables. If set to false, include_files must be true.
  bool include_memtabtles = false;
  // Defines whether the returned size should include data serialized to disk.
  // If set to false, include_memtabtles must be true.
  bool include_files = true;
  // When approximating the total size of the files used to store a key range
  // via DB::GetApproximateSizes, allow approximation with an error margin of
  // up to total_files_size * files_size_error_margin. This allows taking some
  // shortcuts in file size approximation, resulting in better performance,
  // while guaranteeing the resulting error is within a reasonable margin.
  // E.g., if the value is 0.1, then the error margin of the returned file size
  // approximation will be within 10%.
  // If the value is non-positive, a more precise yet more CPU-intensive
  // estimation is performed.
  double files_size_error_margin = -1.0;
};

}  // namespace ROCKSDB_NAMESPACE
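
// A minimal usage sketch (illustrative only, not part of the API): estimate
// the on-disk size of a key range with a 10% error margin, excluding
// memtables. Assumes an open DB* named `db`; identifiers are in
// ROCKSDB_NAMESPACE and the key range is hypothetical.
//
//   SizeApproximationOptions sao;
//   sao.include_files = true;
//   sao.files_size_error_margin = 0.1;
//   Range r("a", "z");
//   uint64_t size = 0;
//   db->GetApproximateSizes(sao, db->DefaultColumnFamily(), &r, 1, &size);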