1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #pragma once
11 #include <algorithm>
12 #include <set>
13 #include <string>
14 #include <utility>
15 #include <vector>
16 #include "db/blob/blob_file_addition.h"
17 #include "db/blob/blob_file_garbage.h"
18 #include "db/dbformat.h"
19 #include "memory/arena.h"
20 #include "rocksdb/cache.h"
21 #include "table/table_reader.h"
22 #include "util/autovector.h"
23 
24 namespace ROCKSDB_NAMESPACE {
25 
26 class VersionSet;
27 
28 constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
29 constexpr uint64_t kUnknownOldestAncesterTime = 0;
30 constexpr uint64_t kUnknownFileCreationTime = 0;
31 
32 extern const std::string kUnknownFileChecksum;
33 extern const std::string kUnknownFileChecksumFuncName;
34 
35 extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
36 
37 // A copyable structure contains information needed to read data from an SST
38 // file. It can contain a pointer to a table reader opened for the file, or
39 // file number and size, which can be used to create a new table reader for it.
40 // The behavior is undefined when a copied of the structure is used when the
41 // file is not in any live version any more.
42 struct FileDescriptor {
43   // Table reader in table_reader_handle
44   TableReader* table_reader;
45   uint64_t packed_number_and_path_id;
46   uint64_t file_size;  // File size in bytes
47   SequenceNumber smallest_seqno;  // The smallest seqno in this file
48   SequenceNumber largest_seqno;   // The largest seqno in this file
49 
FileDescriptorFileDescriptor50   FileDescriptor() : FileDescriptor(0, 0, 0) {}
51 
FileDescriptorFileDescriptor52   FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
53       : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
54 
FileDescriptorFileDescriptor55   FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
56                  SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
57       : table_reader(nullptr),
58         packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
59         file_size(_file_size),
60         smallest_seqno(_smallest_seqno),
61         largest_seqno(_largest_seqno) {}
62 
FileDescriptorFileDescriptor63   FileDescriptor(const FileDescriptor& fd) { *this = fd; }
64 
65   FileDescriptor& operator=(const FileDescriptor& fd) {
66     table_reader = fd.table_reader;
67     packed_number_and_path_id = fd.packed_number_and_path_id;
68     file_size = fd.file_size;
69     smallest_seqno = fd.smallest_seqno;
70     largest_seqno = fd.largest_seqno;
71     return *this;
72   }
73 
GetNumberFileDescriptor74   uint64_t GetNumber() const {
75     return packed_number_and_path_id & kFileNumberMask;
76   }
GetPathIdFileDescriptor77   uint32_t GetPathId() const {
78     return static_cast<uint32_t>(
79         packed_number_and_path_id / (kFileNumberMask + 1));
80   }
GetFileSizeFileDescriptor81   uint64_t GetFileSize() const { return file_size; }
82 };
83 
84 struct FileSampledStats {
FileSampledStatsFileSampledStats85   FileSampledStats() : num_reads_sampled(0) {}
FileSampledStatsFileSampledStats86   FileSampledStats(const FileSampledStats& other) { *this = other; }
87   FileSampledStats& operator=(const FileSampledStats& other) {
88     num_reads_sampled = other.num_reads_sampled.load();
89     return *this;
90   }
91 
92   // number of user reads to this file.
93   mutable std::atomic<uint64_t> num_reads_sampled;
94 };
95 
96 struct FileMetaData {
97   FileDescriptor fd;
98   InternalKey smallest;            // Smallest internal key served by table
99   InternalKey largest;             // Largest internal key served by table
100 
101   // Needs to be disposed when refs becomes 0.
102   Cache::Handle* table_reader_handle = nullptr;
103 
104   FileSampledStats stats;
105 
106   // Stats for compensating deletion entries during compaction
107 
108   // File size compensated by deletion entry.
109   // This is updated in Version::UpdateAccumulatedStats() first time when the
110   // file is created or loaded.  After it is updated (!= 0), it is immutable.
111   uint64_t compensated_file_size = 0;
112   // These values can mutate, but they can only be read or written from
113   // single-threaded LogAndApply thread
114   uint64_t num_entries = 0;     // the number of entries.
115   uint64_t num_deletions = 0;   // the number of deletion entries.
116   uint64_t raw_key_size = 0;    // total uncompressed key size.
117   uint64_t raw_value_size = 0;  // total uncompressed value size.
118 
119   int refs = 0;  // Reference count
120 
121   bool being_compacted = false;       // Is this file undergoing compaction?
122   bool init_stats_from_file = false;  // true if the data-entry stats of this
123                                       // file has initialized from file.
124 
125   bool marked_for_compaction = false;  // True if client asked us nicely to
126                                        // compact this file.
127 
128   // Used only in BlobDB. The file number of the oldest blob file this SST file
129   // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
130   uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
131 
132   // The file could be the compaction output from other SST files, which could
133   // in turn be outputs for compact older SST files. We track the memtable
134   // flush timestamp for the oldest SST file that eventaully contribute data
135   // to this file. 0 means the information is not available.
136   uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
137 
138   // Unix time when the SST file is created.
139   uint64_t file_creation_time = kUnknownFileCreationTime;
140 
141   // File checksum
142   std::string file_checksum = kUnknownFileChecksum;
143 
144   // File checksum function name
145   std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
146 
147   FileMetaData() = default;
148 
FileMetaDataFileMetaData149   FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
150                const InternalKey& smallest_key, const InternalKey& largest_key,
151                const SequenceNumber& smallest_seq,
152                const SequenceNumber& largest_seq, bool marked_for_compact,
153                uint64_t oldest_blob_file, uint64_t _oldest_ancester_time,
154                uint64_t _file_creation_time, const std::string& _file_checksum,
155                const std::string& _file_checksum_func_name)
156       : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
157         smallest(smallest_key),
158         largest(largest_key),
159         marked_for_compaction(marked_for_compact),
160         oldest_blob_file_number(oldest_blob_file),
161         oldest_ancester_time(_oldest_ancester_time),
162         file_creation_time(_file_creation_time),
163         file_checksum(_file_checksum),
164         file_checksum_func_name(_file_checksum_func_name) {
165     TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
166   }
167 
168   // REQUIRED: Keys must be given to the function in sorted order (it expects
169   // the last key to be the largest).
170   void UpdateBoundaries(const Slice& key, const Slice& value,
171                         SequenceNumber seqno, ValueType value_type);
172 
173   // Unlike UpdateBoundaries, ranges do not need to be presented in any
174   // particular order.
UpdateBoundariesForRangeFileMetaData175   void UpdateBoundariesForRange(const InternalKey& start,
176                                 const InternalKey& end, SequenceNumber seqno,
177                                 const InternalKeyComparator& icmp) {
178     if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
179       smallest = start;
180     }
181     if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
182       largest = end;
183     }
184     fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
185     fd.largest_seqno = std::max(fd.largest_seqno, seqno);
186   }
187 
188   // Try to get oldest ancester time from the class itself or table properties
189   // if table reader is already pinned.
190   // 0 means the information is not available.
TryGetOldestAncesterTimeFileMetaData191   uint64_t TryGetOldestAncesterTime() {
192     if (oldest_ancester_time != kUnknownOldestAncesterTime) {
193       return oldest_ancester_time;
194     } else if (fd.table_reader != nullptr &&
195                fd.table_reader->GetTableProperties() != nullptr) {
196       return fd.table_reader->GetTableProperties()->creation_time;
197     }
198     return kUnknownOldestAncesterTime;
199   }
200 
TryGetFileCreationTimeFileMetaData201   uint64_t TryGetFileCreationTime() {
202     if (file_creation_time != kUnknownFileCreationTime) {
203       return file_creation_time;
204     } else if (fd.table_reader != nullptr &&
205                fd.table_reader->GetTableProperties() != nullptr) {
206       return fd.table_reader->GetTableProperties()->file_creation_time;
207     }
208     return kUnknownFileCreationTime;
209   }
210 };
211 
212 // A compressed copy of file meta data that just contain minimum data needed
213 // to server read operations, while still keeping the pointer to full metadata
214 // of the file in case it is needed.
215 struct FdWithKeyRange {
216   FileDescriptor fd;
217   FileMetaData* file_metadata;  // Point to all metadata
218   Slice smallest_key;    // slice that contain smallest key
219   Slice largest_key;     // slice that contain largest key
220 
FdWithKeyRangeFdWithKeyRange221   FdWithKeyRange()
222       : fd(),
223         file_metadata(nullptr),
224         smallest_key(),
225         largest_key() {
226   }
227 
FdWithKeyRangeFdWithKeyRange228   FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
229                  FileMetaData* _file_metadata)
230       : fd(_fd),
231         file_metadata(_file_metadata),
232         smallest_key(_smallest_key),
233         largest_key(_largest_key) {}
234 };
235 
236 // Data structure to store an array of FdWithKeyRange in one level
237 // Actual data is guaranteed to be stored closely
238 struct LevelFilesBrief {
239   size_t num_files;
240   FdWithKeyRange* files;
LevelFilesBriefLevelFilesBrief241   LevelFilesBrief() {
242     num_files = 0;
243     files = nullptr;
244   }
245 };
246 
247 // The state of a DB at any given time is referred to as a Version.
248 // Any modification to the Version is considered a Version Edit. A Version is
249 // constructed by joining a sequence of Version Edits. Version Edits are written
250 // to the MANIFEST file.
251 class VersionEdit {
252  public:
253   void Clear();
254 
SetDBId(const std::string & db_id)255   void SetDBId(const std::string& db_id) {
256     has_db_id_ = true;
257     db_id_ = db_id;
258   }
HasDbId()259   bool HasDbId() const { return has_db_id_; }
GetDbId()260   const std::string& GetDbId() const { return db_id_; }
261 
SetComparatorName(const Slice & name)262   void SetComparatorName(const Slice& name) {
263     has_comparator_ = true;
264     comparator_ = name.ToString();
265   }
HasComparatorName()266   bool HasComparatorName() const { return has_comparator_; }
GetComparatorName()267   const std::string& GetComparatorName() const { return comparator_; }
268 
SetLogNumber(uint64_t num)269   void SetLogNumber(uint64_t num) {
270     has_log_number_ = true;
271     log_number_ = num;
272   }
HasLogNumber()273   bool HasLogNumber() const { return has_log_number_; }
GetLogNumber()274   uint64_t GetLogNumber() const { return log_number_; }
275 
SetPrevLogNumber(uint64_t num)276   void SetPrevLogNumber(uint64_t num) {
277     has_prev_log_number_ = true;
278     prev_log_number_ = num;
279   }
HasPrevLogNumber()280   bool HasPrevLogNumber() const { return has_prev_log_number_; }
GetPrevLogNumber()281   uint64_t GetPrevLogNumber() const { return prev_log_number_; }
282 
SetNextFile(uint64_t num)283   void SetNextFile(uint64_t num) {
284     has_next_file_number_ = true;
285     next_file_number_ = num;
286   }
HasNextFile()287   bool HasNextFile() const { return has_next_file_number_; }
GetNextFile()288   uint64_t GetNextFile() const { return next_file_number_; }
289 
SetMaxColumnFamily(uint32_t max_column_family)290   void SetMaxColumnFamily(uint32_t max_column_family) {
291     has_max_column_family_ = true;
292     max_column_family_ = max_column_family;
293   }
HasMaxColumnFamily()294   bool HasMaxColumnFamily() const { return has_max_column_family_; }
GetMaxColumnFamily()295   uint32_t GetMaxColumnFamily() const { return max_column_family_; }
296 
SetMinLogNumberToKeep(uint64_t num)297   void SetMinLogNumberToKeep(uint64_t num) {
298     has_min_log_number_to_keep_ = true;
299     min_log_number_to_keep_ = num;
300   }
HasMinLogNumberToKeep()301   bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; }
GetMinLogNumberToKeep()302   uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; }
303 
SetLastSequence(SequenceNumber seq)304   void SetLastSequence(SequenceNumber seq) {
305     has_last_sequence_ = true;
306     last_sequence_ = seq;
307   }
HasLastSequence()308   bool HasLastSequence() const { return has_last_sequence_; }
GetLastSequence()309   SequenceNumber GetLastSequence() const { return last_sequence_; }
310 
311   // Delete the specified table file from the specified level.
DeleteFile(int level,uint64_t file)312   void DeleteFile(int level, uint64_t file) {
313     deleted_files_.emplace(level, file);
314   }
315 
316   // Retrieve the table files deleted as well as their associated levels.
317   using DeletedFiles = std::set<std::pair<int, uint64_t>>;
GetDeletedFiles()318   const DeletedFiles& GetDeletedFiles() const { return deleted_files_; }
319 
320   // Add the specified table file at the specified level.
321   // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
322   // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
323   // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
324   // referred to by this file if any, kInvalidBlobFileNumber otherwise.
AddFile(int level,uint64_t file,uint32_t file_path_id,uint64_t file_size,const InternalKey & smallest,const InternalKey & largest,const SequenceNumber & smallest_seqno,const SequenceNumber & largest_seqno,bool marked_for_compaction,uint64_t oldest_blob_file_number,uint64_t oldest_ancester_time,uint64_t file_creation_time,const std::string & file_checksum,const std::string & file_checksum_func_name)325   void AddFile(int level, uint64_t file, uint32_t file_path_id,
326                uint64_t file_size, const InternalKey& smallest,
327                const InternalKey& largest, const SequenceNumber& smallest_seqno,
328                const SequenceNumber& largest_seqno, bool marked_for_compaction,
329                uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time,
330                uint64_t file_creation_time, const std::string& file_checksum,
331                const std::string& file_checksum_func_name) {
332     assert(smallest_seqno <= largest_seqno);
333     new_files_.emplace_back(
334         level, FileMetaData(file, file_path_id, file_size, smallest, largest,
335                             smallest_seqno, largest_seqno,
336                             marked_for_compaction, oldest_blob_file_number,
337                             oldest_ancester_time, file_creation_time,
338                             file_checksum, file_checksum_func_name));
339   }
340 
AddFile(int level,const FileMetaData & f)341   void AddFile(int level, const FileMetaData& f) {
342     assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
343     new_files_.emplace_back(level, f);
344   }
345 
346   // Retrieve the table files added as well as their associated levels.
347   using NewFiles = std::vector<std::pair<int, FileMetaData>>;
GetNewFiles()348   const NewFiles& GetNewFiles() const { return new_files_; }
349 
350   // Add a new blob file.
AddBlobFile(uint64_t blob_file_number,uint64_t total_blob_count,uint64_t total_blob_bytes,std::string checksum_method,std::string checksum_value)351   void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_count,
352                    uint64_t total_blob_bytes, std::string checksum_method,
353                    std::string checksum_value) {
354     blob_file_additions_.emplace_back(
355         blob_file_number, total_blob_count, total_blob_bytes,
356         std::move(checksum_method), std::move(checksum_value));
357   }
358 
359   // Retrieve all the blob files added.
360   using BlobFileAdditions = std::vector<BlobFileAddition>;
GetBlobFileAdditions()361   const BlobFileAdditions& GetBlobFileAdditions() const {
362     return blob_file_additions_;
363   }
364 
365   // Add garbage for an existing blob file.  Note: intentionally broken English
366   // follows.
AddBlobFileGarbage(uint64_t blob_file_number,uint64_t garbage_blob_count,uint64_t garbage_blob_bytes)367   void AddBlobFileGarbage(uint64_t blob_file_number,
368                           uint64_t garbage_blob_count,
369                           uint64_t garbage_blob_bytes) {
370     blob_file_garbages_.emplace_back(blob_file_number, garbage_blob_count,
371                                      garbage_blob_bytes);
372   }
373 
374   // Retrieve all the blob file garbage added.
375   using BlobFileGarbages = std::vector<BlobFileGarbage>;
GetBlobFileGarbages()376   const BlobFileGarbages& GetBlobFileGarbages() const {
377     return blob_file_garbages_;
378   }
379 
380   // Number of edits
NumEntries()381   size_t NumEntries() const {
382     return new_files_.size() + deleted_files_.size() +
383            blob_file_additions_.size() + blob_file_garbages_.size();
384   }
385 
SetColumnFamily(uint32_t column_family_id)386   void SetColumnFamily(uint32_t column_family_id) {
387     column_family_ = column_family_id;
388   }
GetColumnFamily()389   uint32_t GetColumnFamily() const { return column_family_; }
390 
391   // set column family ID by calling SetColumnFamily()
AddColumnFamily(const std::string & name)392   void AddColumnFamily(const std::string& name) {
393     assert(!is_column_family_drop_);
394     assert(!is_column_family_add_);
395     assert(NumEntries() == 0);
396     is_column_family_add_ = true;
397     column_family_name_ = name;
398   }
399 
400   // set column family ID by calling SetColumnFamily()
DropColumnFamily()401   void DropColumnFamily() {
402     assert(!is_column_family_drop_);
403     assert(!is_column_family_add_);
404     assert(NumEntries() == 0);
405     is_column_family_drop_ = true;
406   }
407 
IsColumnFamilyManipulation()408   bool IsColumnFamilyManipulation() const {
409     return is_column_family_add_ || is_column_family_drop_;
410   }
411 
MarkAtomicGroup(uint32_t remaining_entries)412   void MarkAtomicGroup(uint32_t remaining_entries) {
413     is_in_atomic_group_ = true;
414     remaining_entries_ = remaining_entries;
415   }
IsInAtomicGroup()416   bool IsInAtomicGroup() const { return is_in_atomic_group_; }
GetRemainingEntries()417   uint32_t GetRemainingEntries() const { return remaining_entries_; }
418 
419   // return true on success.
420   bool EncodeTo(std::string* dst) const;
421   Status DecodeFrom(const Slice& src);
422 
423   std::string DebugString(bool hex_key = false) const;
424   std::string DebugJSON(int edit_num, bool hex_key = false) const;
425 
426  private:
427   friend class ReactiveVersionSet;
428   friend class VersionEditHandler;
429   friend class VersionEditHandlerPointInTime;
430   friend class VersionSet;
431   friend class Version;
432   friend class AtomicGroupReadBuffer;
433 
434   bool GetLevel(Slice* input, int* level, const char** msg);
435 
436   const char* DecodeNewFile4From(Slice* input);
437 
438   int max_level_ = 0;
439   std::string db_id_;
440   std::string comparator_;
441   uint64_t log_number_ = 0;
442   uint64_t prev_log_number_ = 0;
443   uint64_t next_file_number_ = 0;
444   uint32_t max_column_family_ = 0;
445   // The most recent WAL log number that is deleted
446   uint64_t min_log_number_to_keep_ = 0;
447   SequenceNumber last_sequence_ = 0;
448   bool has_db_id_ = false;
449   bool has_comparator_ = false;
450   bool has_log_number_ = false;
451   bool has_prev_log_number_ = false;
452   bool has_next_file_number_ = false;
453   bool has_max_column_family_ = false;
454   bool has_min_log_number_to_keep_ = false;
455   bool has_last_sequence_ = false;
456 
457   DeletedFiles deleted_files_;
458   NewFiles new_files_;
459 
460   BlobFileAdditions blob_file_additions_;
461   BlobFileGarbages blob_file_garbages_;
462 
463   // Each version edit record should have column_family_ set
464   // If it's not set, it is default (0)
465   uint32_t column_family_ = 0;
466   // a version edit can be either column_family add or
467   // column_family drop. If it's column family add,
468   // it also includes column family name.
469   bool is_column_family_drop_ = false;
470   bool is_column_family_add_ = false;
471   std::string column_family_name_;
472 
473   bool is_in_atomic_group_ = false;
474   uint32_t remaining_entries_ = 0;
475 };
476 
477 }  // namespace ROCKSDB_NAMESPACE
478