1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 // This source code is licensed under both the GPLv2 (found in the 3 // COPYING file in the root directory) and Apache 2.0 License 4 // (found in the LICENSE.Apache file in the root directory). 5 // 6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 7 // Use of this source code is governed by a BSD-style license that can be 8 // found in the LICENSE file. See the AUTHORS file for names of contributors. 9 10 #pragma once 11 #include <algorithm> 12 #include <set> 13 #include <string> 14 #include <utility> 15 #include <vector> 16 #include "db/blob/blob_file_addition.h" 17 #include "db/blob/blob_file_garbage.h" 18 #include "db/dbformat.h" 19 #include "memory/arena.h" 20 #include "rocksdb/cache.h" 21 #include "table/table_reader.h" 22 #include "util/autovector.h" 23 24 namespace ROCKSDB_NAMESPACE { 25 26 class VersionSet; 27 28 constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; 29 constexpr uint64_t kUnknownOldestAncesterTime = 0; 30 constexpr uint64_t kUnknownFileCreationTime = 0; 31 32 extern const std::string kUnknownFileChecksum; 33 extern const std::string kUnknownFileChecksumFuncName; 34 35 extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); 36 37 // A copyable structure contains information needed to read data from an SST 38 // file. It can contain a pointer to a table reader opened for the file, or 39 // file number and size, which can be used to create a new table reader for it. 40 // The behavior is undefined when a copied of the structure is used when the 41 // file is not in any live version any more. 42 struct FileDescriptor { 43 // Table reader in table_reader_handle 44 TableReader* table_reader; 45 uint64_t packed_number_and_path_id; 46 uint64_t file_size; // File size in bytes 47 SequenceNumber smallest_seqno; // The smallest seqno in this file 48 SequenceNumber largest_seqno; // The largest seqno in this file 49 FileDescriptorFileDescriptor50 FileDescriptor() : FileDescriptor(0, 0, 0) {} 51 FileDescriptorFileDescriptor52 FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size) 53 : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {} 54 FileDescriptorFileDescriptor55 FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size, 56 SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno) 57 : table_reader(nullptr), 58 packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)), 59 file_size(_file_size), 60 smallest_seqno(_smallest_seqno), 61 largest_seqno(_largest_seqno) {} 62 FileDescriptorFileDescriptor63 FileDescriptor(const FileDescriptor& fd) { *this = fd; } 64 65 FileDescriptor& operator=(const FileDescriptor& fd) { 66 table_reader = fd.table_reader; 67 packed_number_and_path_id = fd.packed_number_and_path_id; 68 file_size = fd.file_size; 69 smallest_seqno = fd.smallest_seqno; 70 largest_seqno = fd.largest_seqno; 71 return *this; 72 } 73 GetNumberFileDescriptor74 uint64_t GetNumber() const { 75 return packed_number_and_path_id & kFileNumberMask; 76 } GetPathIdFileDescriptor77 uint32_t GetPathId() const { 78 return static_cast<uint32_t>( 79 packed_number_and_path_id / (kFileNumberMask + 1)); 80 } GetFileSizeFileDescriptor81 uint64_t GetFileSize() const { return file_size; } 82 }; 83 84 struct FileSampledStats { FileSampledStatsFileSampledStats85 FileSampledStats() : num_reads_sampled(0) {} FileSampledStatsFileSampledStats86 FileSampledStats(const FileSampledStats& other) { *this = other; } 87 FileSampledStats& operator=(const FileSampledStats& other) { 88 num_reads_sampled = other.num_reads_sampled.load(); 89 return *this; 90 } 91 92 // number of user reads to this file. 93 mutable std::atomic<uint64_t> num_reads_sampled; 94 }; 95 96 struct FileMetaData { 97 FileDescriptor fd; 98 InternalKey smallest; // Smallest internal key served by table 99 InternalKey largest; // Largest internal key served by table 100 101 // Needs to be disposed when refs becomes 0. 102 Cache::Handle* table_reader_handle = nullptr; 103 104 FileSampledStats stats; 105 106 // Stats for compensating deletion entries during compaction 107 108 // File size compensated by deletion entry. 109 // This is updated in Version::UpdateAccumulatedStats() first time when the 110 // file is created or loaded. After it is updated (!= 0), it is immutable. 111 uint64_t compensated_file_size = 0; 112 // These values can mutate, but they can only be read or written from 113 // single-threaded LogAndApply thread 114 uint64_t num_entries = 0; // the number of entries. 115 uint64_t num_deletions = 0; // the number of deletion entries. 116 uint64_t raw_key_size = 0; // total uncompressed key size. 117 uint64_t raw_value_size = 0; // total uncompressed value size. 118 119 int refs = 0; // Reference count 120 121 bool being_compacted = false; // Is this file undergoing compaction? 122 bool init_stats_from_file = false; // true if the data-entry stats of this 123 // file has initialized from file. 124 125 bool marked_for_compaction = false; // True if client asked us nicely to 126 // compact this file. 127 128 // Used only in BlobDB. The file number of the oldest blob file this SST file 129 // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. 130 uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; 131 132 // The file could be the compaction output from other SST files, which could 133 // in turn be outputs for compact older SST files. We track the memtable 134 // flush timestamp for the oldest SST file that eventaully contribute data 135 // to this file. 0 means the information is not available. 136 uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; 137 138 // Unix time when the SST file is created. 139 uint64_t file_creation_time = kUnknownFileCreationTime; 140 141 // File checksum 142 std::string file_checksum = kUnknownFileChecksum; 143 144 // File checksum function name 145 std::string file_checksum_func_name = kUnknownFileChecksumFuncName; 146 147 FileMetaData() = default; 148 FileMetaDataFileMetaData149 FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, 150 const InternalKey& smallest_key, const InternalKey& largest_key, 151 const SequenceNumber& smallest_seq, 152 const SequenceNumber& largest_seq, bool marked_for_compact, 153 uint64_t oldest_blob_file, uint64_t _oldest_ancester_time, 154 uint64_t _file_creation_time, const std::string& _file_checksum, 155 const std::string& _file_checksum_func_name) 156 : fd(file, file_path_id, file_size, smallest_seq, largest_seq), 157 smallest(smallest_key), 158 largest(largest_key), 159 marked_for_compaction(marked_for_compact), 160 oldest_blob_file_number(oldest_blob_file), 161 oldest_ancester_time(_oldest_ancester_time), 162 file_creation_time(_file_creation_time), 163 file_checksum(_file_checksum), 164 file_checksum_func_name(_file_checksum_func_name) { 165 TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); 166 } 167 168 // REQUIRED: Keys must be given to the function in sorted order (it expects 169 // the last key to be the largest). 170 void UpdateBoundaries(const Slice& key, const Slice& value, 171 SequenceNumber seqno, ValueType value_type); 172 173 // Unlike UpdateBoundaries, ranges do not need to be presented in any 174 // particular order. UpdateBoundariesForRangeFileMetaData175 void UpdateBoundariesForRange(const InternalKey& start, 176 const InternalKey& end, SequenceNumber seqno, 177 const InternalKeyComparator& icmp) { 178 if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) { 179 smallest = start; 180 } 181 if (largest.size() == 0 || icmp.Compare(largest, end) < 0) { 182 largest = end; 183 } 184 fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); 185 fd.largest_seqno = std::max(fd.largest_seqno, seqno); 186 } 187 188 // Try to get oldest ancester time from the class itself or table properties 189 // if table reader is already pinned. 190 // 0 means the information is not available. TryGetOldestAncesterTimeFileMetaData191 uint64_t TryGetOldestAncesterTime() { 192 if (oldest_ancester_time != kUnknownOldestAncesterTime) { 193 return oldest_ancester_time; 194 } else if (fd.table_reader != nullptr && 195 fd.table_reader->GetTableProperties() != nullptr) { 196 return fd.table_reader->GetTableProperties()->creation_time; 197 } 198 return kUnknownOldestAncesterTime; 199 } 200 TryGetFileCreationTimeFileMetaData201 uint64_t TryGetFileCreationTime() { 202 if (file_creation_time != kUnknownFileCreationTime) { 203 return file_creation_time; 204 } else if (fd.table_reader != nullptr && 205 fd.table_reader->GetTableProperties() != nullptr) { 206 return fd.table_reader->GetTableProperties()->file_creation_time; 207 } 208 return kUnknownFileCreationTime; 209 } 210 }; 211 212 // A compressed copy of file meta data that just contain minimum data needed 213 // to server read operations, while still keeping the pointer to full metadata 214 // of the file in case it is needed. 215 struct FdWithKeyRange { 216 FileDescriptor fd; 217 FileMetaData* file_metadata; // Point to all metadata 218 Slice smallest_key; // slice that contain smallest key 219 Slice largest_key; // slice that contain largest key 220 FdWithKeyRangeFdWithKeyRange221 FdWithKeyRange() 222 : fd(), 223 file_metadata(nullptr), 224 smallest_key(), 225 largest_key() { 226 } 227 FdWithKeyRangeFdWithKeyRange228 FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key, 229 FileMetaData* _file_metadata) 230 : fd(_fd), 231 file_metadata(_file_metadata), 232 smallest_key(_smallest_key), 233 largest_key(_largest_key) {} 234 }; 235 236 // Data structure to store an array of FdWithKeyRange in one level 237 // Actual data is guaranteed to be stored closely 238 struct LevelFilesBrief { 239 size_t num_files; 240 FdWithKeyRange* files; LevelFilesBriefLevelFilesBrief241 LevelFilesBrief() { 242 num_files = 0; 243 files = nullptr; 244 } 245 }; 246 247 // The state of a DB at any given time is referred to as a Version. 248 // Any modification to the Version is considered a Version Edit. A Version is 249 // constructed by joining a sequence of Version Edits. Version Edits are written 250 // to the MANIFEST file. 251 class VersionEdit { 252 public: 253 void Clear(); 254 SetDBId(const std::string & db_id)255 void SetDBId(const std::string& db_id) { 256 has_db_id_ = true; 257 db_id_ = db_id; 258 } HasDbId()259 bool HasDbId() const { return has_db_id_; } GetDbId()260 const std::string& GetDbId() const { return db_id_; } 261 SetComparatorName(const Slice & name)262 void SetComparatorName(const Slice& name) { 263 has_comparator_ = true; 264 comparator_ = name.ToString(); 265 } HasComparatorName()266 bool HasComparatorName() const { return has_comparator_; } GetComparatorName()267 const std::string& GetComparatorName() const { return comparator_; } 268 SetLogNumber(uint64_t num)269 void SetLogNumber(uint64_t num) { 270 has_log_number_ = true; 271 log_number_ = num; 272 } HasLogNumber()273 bool HasLogNumber() const { return has_log_number_; } GetLogNumber()274 uint64_t GetLogNumber() const { return log_number_; } 275 SetPrevLogNumber(uint64_t num)276 void SetPrevLogNumber(uint64_t num) { 277 has_prev_log_number_ = true; 278 prev_log_number_ = num; 279 } HasPrevLogNumber()280 bool HasPrevLogNumber() const { return has_prev_log_number_; } GetPrevLogNumber()281 uint64_t GetPrevLogNumber() const { return prev_log_number_; } 282 SetNextFile(uint64_t num)283 void SetNextFile(uint64_t num) { 284 has_next_file_number_ = true; 285 next_file_number_ = num; 286 } HasNextFile()287 bool HasNextFile() const { return has_next_file_number_; } GetNextFile()288 uint64_t GetNextFile() const { return next_file_number_; } 289 SetMaxColumnFamily(uint32_t max_column_family)290 void SetMaxColumnFamily(uint32_t max_column_family) { 291 has_max_column_family_ = true; 292 max_column_family_ = max_column_family; 293 } HasMaxColumnFamily()294 bool HasMaxColumnFamily() const { return has_max_column_family_; } GetMaxColumnFamily()295 uint32_t GetMaxColumnFamily() const { return max_column_family_; } 296 SetMinLogNumberToKeep(uint64_t num)297 void SetMinLogNumberToKeep(uint64_t num) { 298 has_min_log_number_to_keep_ = true; 299 min_log_number_to_keep_ = num; 300 } HasMinLogNumberToKeep()301 bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; } GetMinLogNumberToKeep()302 uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; } 303 SetLastSequence(SequenceNumber seq)304 void SetLastSequence(SequenceNumber seq) { 305 has_last_sequence_ = true; 306 last_sequence_ = seq; 307 } HasLastSequence()308 bool HasLastSequence() const { return has_last_sequence_; } GetLastSequence()309 SequenceNumber GetLastSequence() const { return last_sequence_; } 310 311 // Delete the specified table file from the specified level. DeleteFile(int level,uint64_t file)312 void DeleteFile(int level, uint64_t file) { 313 deleted_files_.emplace(level, file); 314 } 315 316 // Retrieve the table files deleted as well as their associated levels. 317 using DeletedFiles = std::set<std::pair<int, uint64_t>>; GetDeletedFiles()318 const DeletedFiles& GetDeletedFiles() const { return deleted_files_; } 319 320 // Add the specified table file at the specified level. 321 // REQUIRES: This version has not been saved (see VersionSet::SaveTo) 322 // REQUIRES: "smallest" and "largest" are smallest and largest keys in file 323 // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file 324 // referred to by this file if any, kInvalidBlobFileNumber otherwise. AddFile(int level,uint64_t file,uint32_t file_path_id,uint64_t file_size,const InternalKey & smallest,const InternalKey & largest,const SequenceNumber & smallest_seqno,const SequenceNumber & largest_seqno,bool marked_for_compaction,uint64_t oldest_blob_file_number,uint64_t oldest_ancester_time,uint64_t file_creation_time,const std::string & file_checksum,const std::string & file_checksum_func_name)325 void AddFile(int level, uint64_t file, uint32_t file_path_id, 326 uint64_t file_size, const InternalKey& smallest, 327 const InternalKey& largest, const SequenceNumber& smallest_seqno, 328 const SequenceNumber& largest_seqno, bool marked_for_compaction, 329 uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time, 330 uint64_t file_creation_time, const std::string& file_checksum, 331 const std::string& file_checksum_func_name) { 332 assert(smallest_seqno <= largest_seqno); 333 new_files_.emplace_back( 334 level, FileMetaData(file, file_path_id, file_size, smallest, largest, 335 smallest_seqno, largest_seqno, 336 marked_for_compaction, oldest_blob_file_number, 337 oldest_ancester_time, file_creation_time, 338 file_checksum, file_checksum_func_name)); 339 } 340 AddFile(int level,const FileMetaData & f)341 void AddFile(int level, const FileMetaData& f) { 342 assert(f.fd.smallest_seqno <= f.fd.largest_seqno); 343 new_files_.emplace_back(level, f); 344 } 345 346 // Retrieve the table files added as well as their associated levels. 347 using NewFiles = std::vector<std::pair<int, FileMetaData>>; GetNewFiles()348 const NewFiles& GetNewFiles() const { return new_files_; } 349 350 // Add a new blob file. AddBlobFile(uint64_t blob_file_number,uint64_t total_blob_count,uint64_t total_blob_bytes,std::string checksum_method,std::string checksum_value)351 void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_count, 352 uint64_t total_blob_bytes, std::string checksum_method, 353 std::string checksum_value) { 354 blob_file_additions_.emplace_back( 355 blob_file_number, total_blob_count, total_blob_bytes, 356 std::move(checksum_method), std::move(checksum_value)); 357 } 358 359 // Retrieve all the blob files added. 360 using BlobFileAdditions = std::vector<BlobFileAddition>; GetBlobFileAdditions()361 const BlobFileAdditions& GetBlobFileAdditions() const { 362 return blob_file_additions_; 363 } 364 365 // Add garbage for an existing blob file. Note: intentionally broken English 366 // follows. AddBlobFileGarbage(uint64_t blob_file_number,uint64_t garbage_blob_count,uint64_t garbage_blob_bytes)367 void AddBlobFileGarbage(uint64_t blob_file_number, 368 uint64_t garbage_blob_count, 369 uint64_t garbage_blob_bytes) { 370 blob_file_garbages_.emplace_back(blob_file_number, garbage_blob_count, 371 garbage_blob_bytes); 372 } 373 374 // Retrieve all the blob file garbage added. 375 using BlobFileGarbages = std::vector<BlobFileGarbage>; GetBlobFileGarbages()376 const BlobFileGarbages& GetBlobFileGarbages() const { 377 return blob_file_garbages_; 378 } 379 380 // Number of edits NumEntries()381 size_t NumEntries() const { 382 return new_files_.size() + deleted_files_.size() + 383 blob_file_additions_.size() + blob_file_garbages_.size(); 384 } 385 SetColumnFamily(uint32_t column_family_id)386 void SetColumnFamily(uint32_t column_family_id) { 387 column_family_ = column_family_id; 388 } GetColumnFamily()389 uint32_t GetColumnFamily() const { return column_family_; } 390 391 // set column family ID by calling SetColumnFamily() AddColumnFamily(const std::string & name)392 void AddColumnFamily(const std::string& name) { 393 assert(!is_column_family_drop_); 394 assert(!is_column_family_add_); 395 assert(NumEntries() == 0); 396 is_column_family_add_ = true; 397 column_family_name_ = name; 398 } 399 400 // set column family ID by calling SetColumnFamily() DropColumnFamily()401 void DropColumnFamily() { 402 assert(!is_column_family_drop_); 403 assert(!is_column_family_add_); 404 assert(NumEntries() == 0); 405 is_column_family_drop_ = true; 406 } 407 IsColumnFamilyManipulation()408 bool IsColumnFamilyManipulation() const { 409 return is_column_family_add_ || is_column_family_drop_; 410 } 411 MarkAtomicGroup(uint32_t remaining_entries)412 void MarkAtomicGroup(uint32_t remaining_entries) { 413 is_in_atomic_group_ = true; 414 remaining_entries_ = remaining_entries; 415 } IsInAtomicGroup()416 bool IsInAtomicGroup() const { return is_in_atomic_group_; } GetRemainingEntries()417 uint32_t GetRemainingEntries() const { return remaining_entries_; } 418 419 // return true on success. 420 bool EncodeTo(std::string* dst) const; 421 Status DecodeFrom(const Slice& src); 422 423 std::string DebugString(bool hex_key = false) const; 424 std::string DebugJSON(int edit_num, bool hex_key = false) const; 425 426 private: 427 friend class ReactiveVersionSet; 428 friend class VersionEditHandler; 429 friend class VersionEditHandlerPointInTime; 430 friend class VersionSet; 431 friend class Version; 432 friend class AtomicGroupReadBuffer; 433 434 bool GetLevel(Slice* input, int* level, const char** msg); 435 436 const char* DecodeNewFile4From(Slice* input); 437 438 int max_level_ = 0; 439 std::string db_id_; 440 std::string comparator_; 441 uint64_t log_number_ = 0; 442 uint64_t prev_log_number_ = 0; 443 uint64_t next_file_number_ = 0; 444 uint32_t max_column_family_ = 0; 445 // The most recent WAL log number that is deleted 446 uint64_t min_log_number_to_keep_ = 0; 447 SequenceNumber last_sequence_ = 0; 448 bool has_db_id_ = false; 449 bool has_comparator_ = false; 450 bool has_log_number_ = false; 451 bool has_prev_log_number_ = false; 452 bool has_next_file_number_ = false; 453 bool has_max_column_family_ = false; 454 bool has_min_log_number_to_keep_ = false; 455 bool has_last_sequence_ = false; 456 457 DeletedFiles deleted_files_; 458 NewFiles new_files_; 459 460 BlobFileAdditions blob_file_additions_; 461 BlobFileGarbages blob_file_garbages_; 462 463 // Each version edit record should have column_family_ set 464 // If it's not set, it is default (0) 465 uint32_t column_family_ = 0; 466 // a version edit can be either column_family add or 467 // column_family drop. If it's column family add, 468 // it also includes column family name. 469 bool is_column_family_drop_ = false; 470 bool is_column_family_add_ = false; 471 std::string column_family_name_; 472 473 bool is_in_atomic_group_ = false; 474 uint32_t remaining_entries_ = 0; 475 }; 476 477 } // namespace ROCKSDB_NAMESPACE 478