// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <memory>

#include "rocksdb/memtablerep.h"
#include "rocksdb/universal_compaction.h"

namespace ROCKSDB_NAMESPACE {

class Slice;
class SliceTransform;
enum CompressionType : unsigned char;
class TablePropertiesCollectorFactory;
class TableFactory;
struct Options;

enum CompactionStyle : char {
  // Level-based compaction style
  kCompactionStyleLevel = 0x0,
  // Universal compaction style
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleUniversal = 0x1,
  // FIFO compaction style
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleFIFO = 0x2,
  // Disable background compaction. Compaction jobs are submitted
  // via CompactFiles().
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleNone = 0x3,
};

// In Level-based compaction, it determines which file from a level is
// picked to merge into the next level. We suggest trying
// kMinOverlappingRatio first when tuning your database.
enum CompactionPri : char {
  // Slightly prioritize larger files by size compensated by #deletes.
  kByCompensatedSize = 0x0,
  // First compact files whose data's latest update time is oldest.
  // Try this if you only update some hot keys in small ranges.
  kOldestLargestSeqFirst = 0x1,
  // First compact files whose range hasn't been compacted to the next level
  // for the longest. If your updates are random across the key space,
  // write amplification is slightly better with this option.
  kOldestSmallestSeqFirst = 0x2,
  // First compact files whose ratio between overlapping size in the next
  // level and its own size is the smallest. In many cases it can optimize
  // write amplification.
  kMinOverlappingRatio = 0x3,
};

struct CompactionOptionsFIFO {
  // Once the total sum of table files reaches this, we will delete the
  // oldest table file.
  // Default: 1GB
  uint64_t max_table_files_size;

  // If true, try to do compaction to compact smaller files into larger ones.
  // The minimum number of files to compact follows
  // options.level0_file_num_compaction_trigger, and compaction won't trigger
  // if the average compacted bytes per deleted file would be larger than
  // options.write_buffer_size. This is to protect large files from being
  // compacted again.
  // Default: false
  bool allow_compaction = false;

  CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
  CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction)
      : max_table_files_size(_max_table_files_size),
        allow_compaction(_allow_compaction) {}
};
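
// A minimal configuration sketch for FIFO compaction using the struct above.
// The 512MB budget and the surrounding setup are illustrative assumptions,
// not a recommendation:
//
//   #include "rocksdb/options.h"
//
//   ROCKSDB_NAMESPACE::Options options;
//   options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleFIFO;
//   // Keep at most ~512MB of table files; delete the oldest beyond that.
//   options.compaction_options_fifo = ROCKSDB_NAMESPACE::CompactionOptionsFIFO(
//       512ull * 1024 * 1024 /* max_table_files_size */,
//       true /* allow_compaction */);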

// Compression options for different compression algorithms like Zlib
struct CompressionOptions {
  // RocksDB's generic default compression level. Internally it'll be
  // translated to the default compression level specific to the library being
  // used (see comment above `ColumnFamilyOptions::compression`).
  //
  // The default value is the max 16-bit int as it'll be written out in the
  // OPTIONS file, which should be portable.
  const static int kDefaultCompressionLevel = 32767;

  int window_bits;
  int level;
  int strategy;

  // Maximum size of dictionaries used to prime the compression library.
  // Enabling dictionary can improve compression ratios when there are
  // repetitions across data blocks.
  //
  // The dictionary is created by sampling the SST file data. If
  // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's
  // dictionary generator. Otherwise, the random samples are used directly as
  // the dictionary.
  //
  // When compression dictionary is disabled, we compress and write each block
  // before buffering data for the next one. When compression dictionary is
  // enabled, we buffer all SST file data in-memory so we can sample it, as
  // data can only be compressed and written after the dictionary has been
  // finalized. So users of this feature may see increased memory usage.
  //
  // Default: 0.
  uint32_t max_dict_bytes;

  // Maximum size of training data passed to zstd's dictionary trainer. Using
  // zstd's dictionary trainer can achieve even better compression ratio
  // improvements than using `max_dict_bytes` alone.
  //
  // The training data will be used to generate a dictionary of max_dict_bytes.
  //
  // Default: 0.
  uint32_t zstd_max_train_bytes;

  // This will be set to "true" when compression options are set by the user.
  // For bottommost_compression_opts, the user must set enabled=true to enable
  // it. Otherwise, bottommost compression will use compression_opts as the
  // default compression options.
  //
  // For compression_opts, it is used as the compression options for the
  // compression process even if compression_opts.enabled=false.
  //
  // Default: false.
  bool enabled;

  CompressionOptions()
      : window_bits(-14),
        level(kDefaultCompressionLevel),
        strategy(0),
        max_dict_bytes(0),
        zstd_max_train_bytes(0),
        enabled(false) {}
  CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes,
                     int _zstd_max_train_bytes, bool _enabled)
      : window_bits(wbits),
        level(_lev),
        strategy(_strategy),
        max_dict_bytes(_max_dict_bytes),
        zstd_max_train_bytes(_zstd_max_train_bytes),
        enabled(_enabled) {}
};
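
// A sketch of enabling zstd with dictionary compression via these options.
// The sizes follow the common guidance of zstd_max_train_bytes being roughly
// 100x max_dict_bytes; the exact numbers are illustrative assumptions:
//
//   ROCKSDB_NAMESPACE::ColumnFamilyOptions cf_opts;
//   cf_opts.compression = ROCKSDB_NAMESPACE::kZSTD;
//   cf_opts.compression_opts.max_dict_bytes = 16 * 1024;          // 16KB dict
//   cf_opts.compression_opts.zstd_max_train_bytes = 1600 * 1024;  // ~100x
//   cf_opts.compression_opts.enabled = true;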

enum UpdateStatus {  // Return status for inplace update callback
  UPDATE_FAILED = 0,    // Nothing to update
  UPDATED_INPLACE = 1,  // Value updated inplace
  UPDATED = 2,          // No inplace update. Merged value set
};

struct AdvancedColumnFamilyOptions {
  // The maximum number of write buffers that are built up in memory.
  // The default and the minimum number is 2, so that when 1 write buffer
  // is being flushed to storage, new writes can continue to the other
  // write buffer.
  // If max_write_buffer_number > 3, writing will be slowed down to
  // options.delayed_write_rate if we are writing to the last write buffer
  // allowed.
  //
  // Default: 2
  //
  // Dynamically changeable through SetOptions() API
  int max_write_buffer_number = 2;

  // The minimum number of write buffers that will be merged together
  // before writing to storage. If set to 1, then all write buffers are
  // flushed to L0 as individual files, and this increases read amplification
  // because a get request has to check all of these files. Also, an
  // in-memory merge may result in writing less data to storage if there are
  // duplicate records in each of these individual write buffers.
  //
  // Default: 1
  int min_write_buffer_number_to_merge = 1;

  // DEPRECATED
  // The total maximum number of write buffers to maintain in memory including
  // copies of buffers that have already been flushed. Unlike
  // max_write_buffer_number, this parameter does not affect flushing.
  // This parameter is being replaced by max_write_buffer_size_to_maintain.
  // If both parameters are set to non-zero values, this parameter will be
  // ignored.
  int max_write_buffer_number_to_maintain = 0;

  // The total maximum size (bytes) of write buffers to maintain in memory,
  // including copies of buffers that have already been flushed. This
  // parameter only affects trimming of flushed buffers and does not affect
  // flushing.
  // This controls the maximum amount of write history that will be available
  // in memory for conflict checking when Transactions are used. The actual
  // size of write history (flushed Memtables) might be higher than this limit
  // if further trimming would reduce the write history total size below this
  // limit. For example, if max_write_buffer_size_to_maintain is set to 64MB,
  // and there are three flushed Memtables with sizes of 32MB, 20MB, and 20MB,
  // then, because trimming the next Memtable of size 20MB would reduce total
  // memory usage to 52MB, which is below the limit, RocksDB will stop
  // trimming.
  //
  // When using an OptimisticTransactionDB:
  // If this value is too low, some transactions may fail at commit time due
  // to not being able to determine whether there were any write conflicts.
  //
  // When using a TransactionDB:
  // If Transaction::SetSnapshot is used, TransactionDB will read either
  // in-memory write buffers or SST files to do write-conflict checking.
  // Increasing this value can reduce the number of reads to SST files
  // done for conflict detection.
  //
  // Setting this value to 0 will cause write buffers to be freed immediately
  // after they are flushed. If this value is set to -1,
  // 'max_write_buffer_number * write_buffer_size' will be used.
  //
  // Default:
  // If using a TransactionDB/OptimisticTransactionDB, the default value will
  // be set to the value of 'max_write_buffer_number * write_buffer_size'
  // if it is not explicitly set by the user. Otherwise, the default is 0.
  int64_t max_write_buffer_size_to_maintain = 0;
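
  // A sketch of tuning write-buffer history for transactional conflict
  // checking; the 64MB figure is an illustrative assumption:
  //
  //   ROCKSDB_NAMESPACE::Options options;
  //   options.max_write_buffer_number = 4;
  //   // Keep ~64MB of flushed memtable history for write-conflict checks.
  //   options.max_write_buffer_size_to_maintain = 64LL * 1024 * 1024;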

  // Allows thread-safe inplace updates. If this is true, there is no way to
  // achieve point-in-time consistency using snapshot or iterator (assuming
  // concurrent updates). Hence iterator and multi-get will return results
  // which are not consistent as of any point-in-time.
  // If the inplace_callback function is not set,
  // Put(key, new_value) will update the existing_value in place iff
  // * key exists in current memtable
  // * new sizeof(new_value) <= sizeof(existing_value)
  // * existing_value for that key is a put i.e. kTypeValue
  // If the inplace_callback function is set, see the doc for inplace_callback.
  // Default: false.
  bool inplace_update_support = false;

  // Number of locks used for inplace update.
  // Default: 10000 if inplace_update_support = true, else 0.
  //
  // Dynamically changeable through SetOptions() API
  size_t inplace_update_num_locks = 10000;

  // existing_value - pointer to previous value (from both memtable and sst).
  //                  nullptr if key doesn't exist
  // existing_value_size - pointer to size of existing_value.
  //                       nullptr if key doesn't exist
  // delta_value - Delta value to be merged with the existing_value.
  //               Stored in transaction logs.
  // merged_value - Set when delta is applied on the previous value.
  //
  // Applicable only when inplace_update_support is true,
  // this callback function is called at the time of updating the memtable
  // as part of a Put operation, let's say Put(key, delta_value). It allows
  // the 'delta_value' specified as part of the Put operation to be merged
  // with an 'existing_value' of the key in the database.
  //
  // If the merged value is smaller in size than the 'existing_value',
  // then this function can update the 'existing_value' buffer inplace and
  // the corresponding 'existing_value_size' pointer, if it wishes to.
  // The callback should return UpdateStatus::UPDATED_INPLACE in this case.
  // (In this case, the snapshot-semantics of the rocksdb Iterator are not
  // atomic anymore.)
  //
  // If the merged value is larger in size than the 'existing_value' or the
  // application does not wish to modify the 'existing_value' buffer inplace,
  // then the merged value should be returned via *merged_value. It is set by
  // merging the 'existing_value' and the Put 'delta_value'. The callback
  // should return UpdateStatus::UPDATED in this case. This merged value will
  // be added to the memtable.
  //
  // If merging fails or the application does not wish to take any action,
  // then the callback should return UpdateStatus::UPDATE_FAILED.
  //
  // Please remember that the original call from the application is Put(key,
  // delta_value). So the transaction log (if enabled) will still contain
  // (key, delta_value). The 'merged_value' is not stored in the transaction
  // log. Hence the inplace_callback function should be consistent across db
  // reopens.
  //
  // Default: nullptr
  UpdateStatus (*inplace_callback)(char* existing_value,
                                   uint32_t* existing_value_size,
                                   Slice delta_value,
                                   std::string* merged_value) = nullptr;
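
  // A sketch of a callback honoring the contract above. It implements simple
  // replace-if-fits semantics; the function name and policy are illustrative
  // assumptions, not a library API:
  //
  //   #include <cstring>
  //
  //   using namespace ROCKSDB_NAMESPACE;
  //
  //   UpdateStatus ReplaceIfFits(char* existing_value,
  //                              uint32_t* existing_value_size,
  //                              Slice delta_value,
  //                              std::string* merged_value) {
  //     if (existing_value == nullptr) {
  //       return UPDATE_FAILED;  // key not present: nothing to update
  //     }
  //     if (delta_value.size() <= *existing_value_size) {
  //       // New value fits: overwrite the existing buffer in place.
  //       memcpy(existing_value, delta_value.data(), delta_value.size());
  //       *existing_value_size = static_cast<uint32_t>(delta_value.size());
  //       return UPDATED_INPLACE;
  //     }
  //     // Too large for in-place update: hand back a merged value instead.
  //     merged_value->assign(delta_value.data(), delta_value.size());
  //     return UPDATED;
  //   }
  //
  //   // options.inplace_update_support = true;
  //   // options.inplace_callback = ReplaceIfFits;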

  // If prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
  // create a prefix bloom filter for the memtable with the size of
  // write_buffer_size * memtable_prefix_bloom_size_ratio.
  // If it is larger than 0.25, it is sanitized to 0.25.
  //
  // Default: 0 (disabled)
  //
  // Dynamically changeable through SetOptions() API
  double memtable_prefix_bloom_size_ratio = 0.0;

  // Enable whole key bloom filter in memtable. Note this will only take
  // effect if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key
  // filtering can potentially reduce CPU usage for point lookups.
  //
  // Default: false (disabled)
  //
  // Dynamically changeable through SetOptions() API
  bool memtable_whole_key_filtering = false;
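
  // A sketch of enabling the memtable prefix bloom filter together with a
  // prefix extractor. NewFixedPrefixTransform comes from
  // "rocksdb/slice_transform.h"; the 4-byte prefix length and 0.1 ratio are
  // illustrative assumptions:
  //
  //   #include "rocksdb/slice_transform.h"
  //
  //   ROCKSDB_NAMESPACE::ColumnFamilyOptions cf_opts;
  //   cf_opts.prefix_extractor.reset(
  //       ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4));
  //   // Bloom filter sized at 10% of write_buffer_size.
  //   cf_opts.memtable_prefix_bloom_size_ratio = 0.1;
  //   cf_opts.memtable_whole_key_filtering = true;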

  // Page size for huge pages for the arena used by the memtable. If <= 0, it
  // won't allocate from huge pages but from malloc.
  // Users are responsible for reserving huge pages for it to allocate from.
  // For example:
  //   sysctl -w vm.nr_hugepages=20
  // See the linux doc Documentation/vm/hugetlbpage.txt.
  // If there aren't enough free huge pages available, it will fall back to
  // malloc.
  //
  // Dynamically changeable through SetOptions() API
  size_t memtable_huge_page_size = 0;

  // If non-nullptr, the memtable will use the specified function to extract
  // prefixes for keys, and for each prefix maintain a hint of insert location
  // to reduce CPU usage for inserting keys with the prefix. Keys out of the
  // domain of the prefix extractor will be inserted without using hints.
  //
  // Currently only the default skiplist-based memtable implements the
  // feature. All other memtable implementations will ignore the option. It
  // incurs ~250 additional bytes of memory overhead to store a hint for each
  // prefix. Concurrent writes (when allow_concurrent_memtable_write is true)
  // will also ignore the option.
  //
  // The option is best suited for workloads where keys are likely to be
  // inserted at a location close to the last inserted key with the same
  // prefix. One example could be inserting keys of the form
  // (prefix + timestamp), where keys of the same prefix always arrive in
  // time order. Another example would be updating the same key over and over
  // again, in which case the prefix can be the key itself.
  //
  // Default: nullptr (disabled)
  std::shared_ptr<const SliceTransform>
      memtable_insert_with_hint_prefix_extractor = nullptr;

  // Control locality of bloom filter probes to improve CPU cache hit rate.
  // This option now only applies to plaintable prefix bloom. The
  // optimization is turned off when set to 0; set a positive number to turn
  // it on.
  // Default: 0
  uint32_t bloom_locality = 0;

  // Size of one block in arena memory allocation.
  // If <= 0, a proper value is automatically calculated (usually 1/8 of
  // write_buffer_size, rounded up to a multiple of 4KB).
  //
  // There are two additional restrictions on the specified size:
  // (1) size should be in the range of [4096, 2 << 30] and
  // (2) it should be a multiple of the CPU word size (which helps with
  //     memory alignment).
  //
  // We'll automatically check and adjust the size number to make sure it
  // conforms to the restrictions.
  //
  // Default: 0
  //
  // Dynamically changeable through SetOptions() API
  size_t arena_block_size = 0;

  // Different levels can have different compression policies. There
  // are cases where most lower levels would like to use quick compression
  // algorithms while the higher levels (which have more data) use
  // compression algorithms that have better compression but could
  // be slower. This array, if non-empty, should have an entry for
  // each level of the database; these override the value specified in
  // the previous field 'compression'.
  //
  // NOTICE: if level_compaction_dynamic_level_bytes=true,
  // compression_per_level[0] still determines L0, but the other elements
  // of the array are based on the base level (the level L0 files are merged
  // to), and may not match the level users see from the info log for
  // metadata. If L0 files are merged to level-n, then, for i>0,
  // compression_per_level[i] determines the compression type for level n+i-1.
  // For example, if we have 5 levels, and we determine to merge L0
  // data to L4 (which means L1..L3 will be empty), then new files going to
  // L4 use compression type compression_per_level[1].
  // If later L0 is merged to L2, data going to L2 will be compressed
  // according to compression_per_level[1], L3 using compression_per_level[2],
  // and L4 using compression_per_level[3]. The compression type for each
  // level can change as data grows.
  std::vector<CompressionType> compression_per_level;

  // Number of levels for this database
  int num_levels = 7;
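
  // A sketch of per-level compression: no compression for the upper levels,
  // LZ4 in the middle, and zstd at the bottom. The exact mix is an
  // illustrative assumption:
  //
  //   ROCKSDB_NAMESPACE::ColumnFamilyOptions cf_opts;
  //   cf_opts.num_levels = 7;
  //   cf_opts.compression_per_level = {
  //       ROCKSDB_NAMESPACE::kNoCompression,
  //       ROCKSDB_NAMESPACE::kNoCompression,
  //       ROCKSDB_NAMESPACE::kLZ4Compression,
  //       ROCKSDB_NAMESPACE::kLZ4Compression,
  //       ROCKSDB_NAMESPACE::kLZ4Compression,
  //       ROCKSDB_NAMESPACE::kZSTD,
  //       ROCKSDB_NAMESPACE::kZSTD};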

  // Soft limit on the number of level-0 files. We start slowing down writes
  // at this point. A value < 0 means that no write slowdown will be
  // triggered by the number of files in level-0.
  //
  // Default: 20
  //
  // Dynamically changeable through SetOptions() API
  int level0_slowdown_writes_trigger = 20;

  // Maximum number of level-0 files. We stop writes at this point.
  //
  // Default: 36
  //
  // Dynamically changeable through SetOptions() API
  int level0_stop_writes_trigger = 36;

  // Target file size for compaction.
  // target_file_size_base is the per-file size for level-1.
  // The target file size for level L can be calculated by
  // target_file_size_base * (target_file_size_multiplier ^ (L-1)).
  // For example, if target_file_size_base is 2MB and
  // target_file_size_multiplier is 10, then each file on level-1 will
  // be 2MB, each file on level-2 will be 20MB,
  // and each file on level-3 will be 200MB.
  //
  // Default: 64MB.
  //
  // Dynamically changeable through SetOptions() API
  uint64_t target_file_size_base = 64 * 1048576;

  // By default target_file_size_multiplier is 1, which means
  // by default files in different levels will have similar size.
  //
  // Dynamically changeable through SetOptions() API
  int target_file_size_multiplier = 1;

  // If true, RocksDB will pick the target size of each level dynamically.
  // We will pick a base level b >= 1. L0 will be directly merged into level
  // b, instead of always into level 1. Levels 1 to b-1 need to be empty.
  // We try to pick b and its target size so that
  // 1. the target size is in the range of
  //    (max_bytes_for_level_base / max_bytes_for_level_multiplier,
  //     max_bytes_for_level_base], and
  // 2. the target size of the last level (level num_levels-1) equals the
  //    actual size of the level.
  // At the same time max_bytes_for_level_multiplier and
  // max_bytes_for_level_multiplier_additional are still satisfied.
  // (When L0 is too large, we make some adjustment. See below.)
  //
  // With this option on, starting from an empty DB, we make the last level
  // the base level, which means merging L0 data into the last level, until
  // it exceeds max_bytes_for_level_base. Then we make the second-to-last
  // level the base level, and start merging L0 data into it, with its
  // target size set to 1/max_bytes_for_level_multiplier of the last level's
  // actual size. As the data accumulates further, we move the base level to
  // the third-to-last one, and so on.
  //
  // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
  // and max_bytes_for_level_base=10MB.
  // The target sizes of levels 1 to 5 start as:
  //   [- - - - 10MB]
  // with the base level being level 5. The target sizes of levels 1 to 4 are
  // not applicable because they will not be used.
  // When the size of level 5 grows past 10MB, say to 11MB, we move the
  // base level to level 4 and the targets become:
  //   [- - - 1.1MB 11MB]
  // While data accumulates, size targets are tuned based on the actual data
  // in level 5. When level 5 has 50MB of data, the targets are:
  //   [- - - 5MB 50MB]
  // until level 5's actual size exceeds 100MB, say reaching 101MB. If we
  // kept level 4 as the base level, its target size would need to be 10.1MB,
  // which doesn't satisfy the target size range. So now we make level 3 the
  // base level and the level targets become:
  //   [- - 1.01MB 10.1MB 101MB]
  // In the same way, as level 5 grows further, all levels' targets grow,
  // e.g.
  //   [- - 5MB 50MB 500MB]
  // When level 5 exceeds 1000MB, say reaching 1001MB, we make level 2 the
  // base level and the target sizes become:
  //   [- 1.001MB 10.01MB 100.1MB 1001MB]
  // and so on...
  //
  // By doing this, we give max_bytes_for_level_multiplier priority over
  // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
  // useful to limit worst-case space amplification.
  //
  // If compaction from L0 lags behind, a special mode is turned on that
  // prioritizes write amplification over max_bytes_for_level_multiplier and
  // max_bytes_for_level_base. Whether L0 compaction lags behind is
  // determined from the number of L0 files and the total L0 size: if the
  // number of L0 files is at least double level0_file_num_compaction_trigger,
  // or the total size is at least max_bytes_for_level_base, this mode is on.
  // The target of L1 grows to the actual data size in L0, and the targets of
  // the other levels are then determined so that each level has the same
  // level multiplier.
  //
  // For example, say L0 size is 100MB, the size of the last level is 1600MB,
  // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10.
  // Since the L0 size is larger than max_bytes_for_level_base, this is the
  // L0 compaction backlogged mode, so the L1 target is set to 100MB.
  // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels
  // will be needed. The level multiplier is calculated to be 4 and the three
  // levels' targets are [100MB, 400MB, 1600MB].
  //
  // In this mode, the number of levels will be no more than in the normal
  // mode, and the level multiplier will be lower. Write amplification will
  // likely be reduced.
  //
  // max_bytes_for_level_multiplier_additional is ignored with this flag on.
  //
  // Turning this feature on or off for an existing DB can cause an
  // unexpected LSM tree structure, so it's not recommended.
  //
  // Default: false
  bool level_compaction_dynamic_level_bytes = false;

  // Default: 10.
  //
  // Dynamically changeable through SetOptions() API
  double max_bytes_for_level_multiplier = 10;

  // Different max-size multipliers for different levels.
  // These are multiplied by max_bytes_for_level_multiplier to arrive
  // at the max-size of each level.
  //
  // Default: 1
  //
  // Dynamically changeable through SetOptions() API
  std::vector<int> max_bytes_for_level_multiplier_additional =
      std::vector<int>(num_levels, 1);
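
  // A sketch of enabling dynamic level sizing with the parameters used in
  // the example above (10MB base, 10x multiplier); the values are
  // illustrative assumptions:
  //
  //   ROCKSDB_NAMESPACE::Options options;
  //   options.level_compaction_dynamic_level_bytes = true;
  //   options.max_bytes_for_level_base = 10 * 1048576;  // 10MB
  //   options.max_bytes_for_level_multiplier = 10;
  //   options.num_levels = 6;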

  // We try to limit the number of bytes in one compaction to be lower than
  // this threshold, but it's not guaranteed.
  // A value of 0 will be sanitized.
  //
  // Default: target_file_size_base * 25
  //
  // Dynamically changeable through SetOptions() API
  uint64_t max_compaction_bytes = 0;

  // All writes will be slowed down to at least delayed_write_rate if the
  // estimated bytes pending compaction exceed this threshold.
  //
  // Default: 64GB
  //
  // Dynamically changeable through SetOptions() API
  uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull;

  // All writes are stopped if the estimated bytes pending compaction exceed
  // this threshold.
  //
  // Default: 256GB
  //
  // Dynamically changeable through SetOptions() API
  uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull;

  // The compaction style. Default: kCompactionStyleLevel
  CompactionStyle compaction_style = kCompactionStyleLevel;

  // If compaction_style = kCompactionStyleLevel, for each level,
  // which files are prioritized to be picked to compact.
  // Default: kMinOverlappingRatio
  CompactionPri compaction_pri = kMinOverlappingRatio;

  // The options needed to support Universal Style compactions
  //
  // Dynamically changeable through SetOptions() API
  // Dynamic change example:
  //   SetOptions("compaction_options_universal", "{size_ratio=2;}")
  CompactionOptionsUniversal compaction_options_universal;

  // The options for FIFO compaction style
  //
  // Dynamically changeable through SetOptions() API
  // Dynamic change example:
  //   SetOptions("compaction_options_fifo", "{max_table_files_size=100;}")
  CompactionOptionsFIFO compaction_options_fifo;

  // An iterator's Next() sequentially skips over keys with the same
  // user key unless this option is set. This number specifies the number
  // of keys (with the same user key) that will be sequentially
  // skipped before a reseek is issued.
  //
  // Default: 8
  //
  // Dynamically changeable through SetOptions() API
  uint64_t max_sequential_skip_in_iterations = 8;

  // This is a factory that provides MemTableRep objects.
  // Default: a factory that provides a skip-list-based implementation of
  // MemTableRep.
  std::shared_ptr<MemTableRepFactory> memtable_factory =
      std::shared_ptr<SkipListFactory>(new SkipListFactory);

  // Block-based table related options are moved to BlockBasedTableOptions.
  // Related options that were originally here but are now moved include:
  //   no_block_cache
  //   block_cache
  //   block_cache_compressed
  //   block_size
  //   block_size_deviation
  //   block_restart_interval
  //   filter_policy
  //   whole_key_filtering
  // If you'd like to customize some of these options, you will need to
  // use NewBlockBasedTableFactory() to construct a new table factory.

  // This option allows users to collect their own statistics of interest
  // about the tables.
  // Default: empty vector -- no user-defined statistics collection will be
  // performed.
  typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
      TablePropertiesCollectorFactories;
  TablePropertiesCollectorFactories table_properties_collector_factories;
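
  // Many of the fields above are marked "Dynamically changeable through
  // SetOptions() API". A sketch of such a call on a live DB, where `db` is
  // an already-open rocksdb::DB* and the values are illustrative
  // assumptions:
  //
  //   #include "rocksdb/db.h"
  //
  //   ROCKSDB_NAMESPACE::Status s = db->SetOptions(
  //       {{"level0_slowdown_writes_trigger", "30"},
  //        {"compaction_options_fifo", "{max_table_files_size=100;}"}});
  //   assert(s.ok());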

  // Maximum number of successive merge operations on a key in the memtable.
  //
  // When a merge operation is added to the memtable and the maximum number
  // of successive merges is reached, the value of the key will be calculated
  // and inserted into the memtable instead of the merge operation. This will
  // ensure that there are never more than max_successive_merges merge
  // operations in the memtable.
  //
  // Default: 0 (disabled)
  //
  // Dynamically changeable through SetOptions() API
  size_t max_successive_merges = 0;

  // This flag specifies that the implementation should optimize the filters
  // mainly for cases where keys are found rather than also optimizing for
  // keys missed. This would be used in cases where the application knows
  // that there are very few misses or the performance in the case of misses
  // is not important.
  //
  // For now, this flag allows us to not store filters for the last level,
  // i.e. the largest level which contains data of the LSM store. For keys
  // which are hits, the filters in this level are not useful because we will
  // search for the data anyway. NOTE: the filters in other levels are still
  // useful even for key hits because they tell us whether to look in that
  // level or go to the higher level.
  //
  // Default: false
  bool optimize_filters_for_hits = false;

  // After writing every SST file, reopen it and read all the keys.
  //
  // Default: false
  //
  // Dynamically changeable through SetOptions() API
  bool paranoid_file_checks = false;

  // In debug mode, RocksDB runs consistency checks on the LSM every time the
  // LSM changes (Flush, Compaction, AddFile). These checks are disabled in
  // release mode; use this option to enable them in release mode as well.
  // Default: false
  bool force_consistency_checks = false;

  // Measure IO stats in compactions and flushes, if true.
  //
  // Default: false
  //
  // Dynamically changeable through SetOptions() API
  bool report_bg_io_stats = false;

  // Files older than TTL will go through the compaction process.
  // In Level compaction: non-bottom-level files older than TTL will go
  // through the compaction process.
  // In FIFO compaction: files older than TTL will be deleted.
  // Unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60
  // In FIFO, this option has the same meaning as
  // periodic_compaction_seconds. Whichever is stricter will be used.
  // 0 means disabled.
  // UINT64_MAX - 1 (0xfffffffffffffffe) is a special flag to allow RocksDB
  // to pick the default.
  //
  // Default: 30 days for leveled compaction + block-based table; disabled
  // otherwise.
  //
  // Dynamically changeable through SetOptions() API
  uint64_t ttl = 0xfffffffffffffffe;
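
  // A sketch of a TTL policy: under leveled compaction, rewrite
  // non-bottom-level files older than 7 days. The 7-day figure is an
  // illustrative assumption:
  //
  //   ROCKSDB_NAMESPACE::Options options;
  //   options.ttl = 7 * 24 * 60 * 60;  // 7 days, in seconds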

  // Files older than this value will be picked up for compaction, and
  // re-written to the same level as they were before.
  //
  // A file's age is computed by looking at the file_creation_time or
  // creation_time table properties, in that order, if they have valid
  // non-zero values; if not, the age is based on the file's last modified
  // time (given by the underlying Env).
  //
  // Supported in Level and FIFO compaction.
  // In FIFO compaction, this option has the same meaning as TTL, and
  // whichever is stricter will be used.
  // Unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60
  //
  // Values:
  // 0: Turn off Periodic compactions.
  // UINT64_MAX - 1 (i.e. 0xfffffffffffffffe): Let RocksDB control this
  //   feature as needed. For now, RocksDB will change this value to 30 days
  //   (i.e. 30 * 24 * 60 * 60) so that every file goes through the
  //   compaction process at least once every 30 days if not compacted
  //   sooner.
  //   In FIFO compaction, since the option has the same meaning as ttl,
  //   when this value is left at the default and ttl is left at 0, 30 days
  //   will be used. Otherwise, min(ttl, periodic_compaction_seconds) will
  //   be used.
  //
  // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune)
  //
  // Dynamically changeable through SetOptions() API
  uint64_t periodic_compaction_seconds = 0xfffffffffffffffe;

  // If this option is set, then 1 in N blocks is compressed
  // using a fast (lz4) and a slow (zstd) compression algorithm.
  // The compressibility is reported as stats, and the stored
  // data is left uncompressed (unless compression is also requested).
  uint64_t sample_for_compression = 0;

  // Create ColumnFamilyOptions with default values for all fields
  AdvancedColumnFamilyOptions();
  // Create ColumnFamilyOptions from Options
  explicit AdvancedColumnFamilyOptions(const Options& options);

  // ---------------- OPTIONS NOT SUPPORTED ANYMORE ----------------

  // NOT SUPPORTED ANYMORE
  // This does not do anything anymore.
  int max_mem_compaction_level;

  // NOT SUPPORTED ANYMORE -- this option is no longer used.
  // Puts are delayed to options.delayed_write_rate when any level has a
  // compaction score that exceeds soft_rate_limit. This is ignored when
  // == 0.0.
  //
  // Default: 0 (disabled)
  //
  // Dynamically changeable through SetOptions() API
  double soft_rate_limit = 0.0;

  // NOT SUPPORTED ANYMORE -- this option is no longer used.
  double hard_rate_limit = 0.0;

  // NOT SUPPORTED ANYMORE -- this option is no longer used.
  unsigned int rate_limit_delay_max_milliseconds = 100;

  // NOT SUPPORTED ANYMORE
  // Does not have any effect.
  bool purge_redundant_kvs_while_flush = true;
};

}  // namespace ROCKSDB_NAMESPACE