1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #include "table/format.h"
11 
12 #include <cinttypes>
13 #include <string>
14 
15 #include "block_fetcher.h"
16 #include "file/random_access_file_reader.h"
17 #include "logging/logging.h"
18 #include "memory/memory_allocator.h"
19 #include "monitoring/perf_context_imp.h"
20 #include "monitoring/statistics.h"
21 #include "rocksdb/env.h"
22 #include "table/block_based/block.h"
23 #include "table/block_based/block_based_table_reader.h"
24 #include "table/persistent_cache_helper.h"
25 #include "util/coding.h"
26 #include "util/compression.h"
27 #include "util/crc32c.h"
28 #include "util/stop_watch.h"
29 #include "util/string_util.h"
30 
31 namespace ROCKSDB_NAMESPACE {
32 
33 extern const uint64_t kLegacyBlockBasedTableMagicNumber;
34 extern const uint64_t kBlockBasedTableMagicNumber;
35 
36 #ifndef ROCKSDB_LITE
37 extern const uint64_t kLegacyPlainTableMagicNumber;
38 extern const uint64_t kPlainTableMagicNumber;
39 #else
40 // ROCKSDB_LITE doesn't have plain table
41 const uint64_t kLegacyPlainTableMagicNumber = 0;
42 const uint64_t kPlainTableMagicNumber = 0;
43 #endif
44 
ShouldReportDetailedTime(Env * env,Statistics * stats)45 bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
46   return env != nullptr && stats != nullptr &&
47          stats->get_stats_level() > kExceptDetailedTimers;
48 }
49 
EncodeTo(std::string * dst) const50 void BlockHandle::EncodeTo(std::string* dst) const {
51   // Sanity check that all fields have been set
52   assert(offset_ != ~static_cast<uint64_t>(0));
53   assert(size_ != ~static_cast<uint64_t>(0));
54   PutVarint64Varint64(dst, offset_, size_);
55 }
56 
DecodeFrom(Slice * input)57 Status BlockHandle::DecodeFrom(Slice* input) {
58   if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) {
59     return Status::OK();
60   } else {
61     // reset in case failure after partially decoding
62     offset_ = 0;
63     size_ = 0;
64     return Status::Corruption("bad block handle");
65   }
66 }
67 
DecodeSizeFrom(uint64_t _offset,Slice * input)68 Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
69   if (GetVarint64(input, &size_)) {
70     offset_ = _offset;
71     return Status::OK();
72   } else {
73     // reset in case failure after partially decoding
74     offset_ = 0;
75     size_ = 0;
76     return Status::Corruption("bad block handle");
77   }
78 }
79 
80 // Return a string that contains the copy of handle.
ToString(bool hex) const81 std::string BlockHandle::ToString(bool hex) const {
82   std::string handle_str;
83   EncodeTo(&handle_str);
84   if (hex) {
85     return Slice(handle_str).ToString(true);
86   } else {
87     return handle_str;
88   }
89 }
90 
91 const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
92 
EncodeTo(std::string * dst,bool have_first_key,const BlockHandle * previous_handle) const93 void IndexValue::EncodeTo(std::string* dst, bool have_first_key,
94                           const BlockHandle* previous_handle) const {
95   if (previous_handle) {
96     assert(handle.offset() == previous_handle->offset() +
97                                   previous_handle->size() + kBlockTrailerSize);
98     PutVarsignedint64(dst, handle.size() - previous_handle->size());
99   } else {
100     handle.EncodeTo(dst);
101   }
102   assert(dst->size() != 0);
103 
104   if (have_first_key) {
105     PutLengthPrefixedSlice(dst, first_internal_key);
106   }
107 }
108 
DecodeFrom(Slice * input,bool have_first_key,const BlockHandle * previous_handle)109 Status IndexValue::DecodeFrom(Slice* input, bool have_first_key,
110                               const BlockHandle* previous_handle) {
111   if (previous_handle) {
112     int64_t delta;
113     if (!GetVarsignedint64(input, &delta)) {
114       return Status::Corruption("bad delta-encoded index value");
115     }
116     handle = BlockHandle(
117         previous_handle->offset() + previous_handle->size() + kBlockTrailerSize,
118         previous_handle->size() + delta);
119   } else {
120     Status s = handle.DecodeFrom(input);
121     if (!s.ok()) {
122       return s;
123     }
124   }
125 
126   if (!have_first_key) {
127     first_internal_key = Slice();
128   } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) {
129     return Status::Corruption("bad first key in block info");
130   }
131 
132   return Status::OK();
133 }
134 
ToString(bool hex,bool have_first_key) const135 std::string IndexValue::ToString(bool hex, bool have_first_key) const {
136   std::string s;
137   EncodeTo(&s, have_first_key, nullptr);
138   if (hex) {
139     return Slice(s).ToString(true);
140   } else {
141     return s;
142   }
143 }
144 
145 namespace {
IsLegacyFooterFormat(uint64_t magic_number)146 inline bool IsLegacyFooterFormat(uint64_t magic_number) {
147   return magic_number == kLegacyBlockBasedTableMagicNumber ||
148          magic_number == kLegacyPlainTableMagicNumber;
149 }
UpconvertLegacyFooterFormat(uint64_t magic_number)150 inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
151   if (magic_number == kLegacyBlockBasedTableMagicNumber) {
152     return kBlockBasedTableMagicNumber;
153   }
154   if (magic_number == kLegacyPlainTableMagicNumber) {
155     return kPlainTableMagicNumber;
156   }
157   assert(false);
158   return 0;
159 }
160 }  // namespace
161 
162 // legacy footer format:
163 //    metaindex handle (varint64 offset, varint64 size)
164 //    index handle     (varint64 offset, varint64 size)
165 //    <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
166 //    table_magic_number (8 bytes)
167 // new footer format:
168 //    checksum type (char, 1 byte)
169 //    metaindex handle (varint64 offset, varint64 size)
170 //    index handle     (varint64 offset, varint64 size)
171 //    <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
172 //    footer version (4 bytes)
173 //    table_magic_number (8 bytes)
EncodeTo(std::string * dst) const174 void Footer::EncodeTo(std::string* dst) const {
175   assert(HasInitializedTableMagicNumber());
176   if (IsLegacyFooterFormat(table_magic_number())) {
177     // has to be default checksum with legacy footer
178     assert(checksum_ == kCRC32c);
179     const size_t original_size = dst->size();
180     metaindex_handle_.EncodeTo(dst);
181     index_handle_.EncodeTo(dst);
182     dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength);  // Padding
183     PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
184     PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
185     assert(dst->size() == original_size + kVersion0EncodedLength);
186   } else {
187     const size_t original_size = dst->size();
188     dst->push_back(static_cast<char>(checksum_));
189     metaindex_handle_.EncodeTo(dst);
190     index_handle_.EncodeTo(dst);
191     dst->resize(original_size + kNewVersionsEncodedLength - 12);  // Padding
192     PutFixed32(dst, version());
193     PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
194     PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
195     assert(dst->size() == original_size + kNewVersionsEncodedLength);
196   }
197 }
198 
Footer(uint64_t _table_magic_number,uint32_t _version)199 Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
200     : version_(_version),
201       checksum_(kCRC32c),
202       table_magic_number_(_table_magic_number) {
203   // This should be guaranteed by constructor callers
204   assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
205 }
206 
DecodeFrom(Slice * input)207 Status Footer::DecodeFrom(Slice* input) {
208   assert(!HasInitializedTableMagicNumber());
209   assert(input != nullptr);
210   assert(input->size() >= kMinEncodedLength);
211 
212   const char* magic_ptr =
213       input->data() + input->size() - kMagicNumberLengthByte;
214   const uint32_t magic_lo = DecodeFixed32(magic_ptr);
215   const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
216   uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
217                     (static_cast<uint64_t>(magic_lo)));
218 
219   // We check for legacy formats here and silently upconvert them
220   bool legacy = IsLegacyFooterFormat(magic);
221   if (legacy) {
222     magic = UpconvertLegacyFooterFormat(magic);
223   }
224   set_table_magic_number(magic);
225 
226   if (legacy) {
227     // The size is already asserted to be at least kMinEncodedLength
228     // at the beginning of the function
229     input->remove_prefix(input->size() - kVersion0EncodedLength);
230     version_ = 0 /* legacy */;
231     checksum_ = kCRC32c;
232   } else {
233     version_ = DecodeFixed32(magic_ptr - 4);
234     // Footer version 1 and higher will always occupy exactly this many bytes.
235     // It consists of the checksum type, two block handles, padding,
236     // a version number, and a magic number
237     if (input->size() < kNewVersionsEncodedLength) {
238       return Status::Corruption("input is too short to be an sstable");
239     } else {
240       input->remove_prefix(input->size() - kNewVersionsEncodedLength);
241     }
242     uint32_t chksum;
243     if (!GetVarint32(input, &chksum)) {
244       return Status::Corruption("bad checksum type");
245     }
246     checksum_ = static_cast<ChecksumType>(chksum);
247   }
248 
249   Status result = metaindex_handle_.DecodeFrom(input);
250   if (result.ok()) {
251     result = index_handle_.DecodeFrom(input);
252   }
253   if (result.ok()) {
254     // We skip over any leftover data (just padding for now) in "input"
255     const char* end = magic_ptr + kMagicNumberLengthByte;
256     *input = Slice(end, input->data() + input->size() - end);
257   }
258   return result;
259 }
260 
ToString() const261 std::string Footer::ToString() const {
262   std::string result;
263   result.reserve(1024);
264 
265   bool legacy = IsLegacyFooterFormat(table_magic_number_);
266   if (legacy) {
267     result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n  ");
268     result.append("index handle: " + index_handle_.ToString() + "\n  ");
269     result.append("table_magic_number: " +
270                   ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n  ");
271   } else {
272     result.append("checksum: " + ROCKSDB_NAMESPACE::ToString(checksum_) +
273                   "\n  ");
274     result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n  ");
275     result.append("index handle: " + index_handle_.ToString() + "\n  ");
276     result.append("footer version: " + ROCKSDB_NAMESPACE::ToString(version_) +
277                   "\n  ");
278     result.append("table_magic_number: " +
279                   ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n  ");
280   }
281   return result;
282 }
283 
ReadFooterFromFile(RandomAccessFileReader * file,FilePrefetchBuffer * prefetch_buffer,uint64_t file_size,Footer * footer,uint64_t enforce_table_magic_number)284 Status ReadFooterFromFile(RandomAccessFileReader* file,
285                           FilePrefetchBuffer* prefetch_buffer,
286                           uint64_t file_size, Footer* footer,
287                           uint64_t enforce_table_magic_number) {
288   if (file_size < Footer::kMinEncodedLength) {
289     return Status::Corruption("file is too short (" + ToString(file_size) +
290                               " bytes) to be an "
291                               "sstable: " +
292                               file->file_name());
293   }
294 
295   std::string footer_buf;
296   std::unique_ptr<const char[]> internal_buf;
297   Slice footer_input;
298   size_t read_offset =
299       (file_size > Footer::kMaxEncodedLength)
300           ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength)
301           : 0;
302   Status s;
303   if (prefetch_buffer == nullptr ||
304       !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength,
305                                          &footer_input)) {
306     if (file->use_direct_io()) {
307       s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
308                     nullptr, &internal_buf);
309     } else {
310       footer_buf.reserve(Footer::kMaxEncodedLength);
311       s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
312                     &footer_buf[0], nullptr);
313     }
314     if (!s.ok()) return s;
315   }
316 
317   // Check that we actually read the whole footer from the file. It may be
318   // that size isn't correct.
319   if (footer_input.size() < Footer::kMinEncodedLength) {
320     return Status::Corruption("file is too short (" + ToString(file_size) +
321                               " bytes) to be an "
322                               "sstable" +
323                               file->file_name());
324   }
325 
326   s = footer->DecodeFrom(&footer_input);
327   if (!s.ok()) {
328     return s;
329   }
330   if (enforce_table_magic_number != 0 &&
331       enforce_table_magic_number != footer->table_magic_number()) {
332     return Status::Corruption(
333         "Bad table magic number: expected " +
334         ToString(enforce_table_magic_number) + ", found " +
335         ToString(footer->table_magic_number()) + " in " + file->file_name());
336   }
337   return Status::OK();
338 }
339 
UncompressBlockContentsForCompressionType(const UncompressionInfo & uncompression_info,const char * data,size_t n,BlockContents * contents,uint32_t format_version,const ImmutableCFOptions & ioptions,MemoryAllocator * allocator)340 Status UncompressBlockContentsForCompressionType(
341     const UncompressionInfo& uncompression_info, const char* data, size_t n,
342     BlockContents* contents, uint32_t format_version,
343     const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) {
344   CacheAllocationPtr ubuf;
345 
346   assert(uncompression_info.type() != kNoCompression &&
347          "Invalid compression type");
348 
349   StopWatchNano timer(ioptions.env, ShouldReportDetailedTime(
350                                         ioptions.env, ioptions.statistics));
351   int decompress_size = 0;
352   switch (uncompression_info.type()) {
353     case kSnappyCompression: {
354       size_t ulength = 0;
355       static char snappy_corrupt_msg[] =
356           "Snappy not supported or corrupted Snappy compressed block contents";
357       if (!Snappy_GetUncompressedLength(data, n, &ulength)) {
358         return Status::Corruption(snappy_corrupt_msg);
359       }
360       ubuf = AllocateBlock(ulength, allocator);
361       if (!Snappy_Uncompress(data, n, ubuf.get())) {
362         return Status::Corruption(snappy_corrupt_msg);
363       }
364       *contents = BlockContents(std::move(ubuf), ulength);
365       break;
366     }
367     case kZlibCompression:
368       ubuf = Zlib_Uncompress(
369           uncompression_info, data, n, &decompress_size,
370           GetCompressFormatForVersion(kZlibCompression, format_version),
371           allocator);
372       if (!ubuf) {
373         static char zlib_corrupt_msg[] =
374             "Zlib not supported or corrupted Zlib compressed block contents";
375         return Status::Corruption(zlib_corrupt_msg);
376       }
377       *contents = BlockContents(std::move(ubuf), decompress_size);
378       break;
379     case kBZip2Compression:
380       ubuf = BZip2_Uncompress(
381           data, n, &decompress_size,
382           GetCompressFormatForVersion(kBZip2Compression, format_version),
383           allocator);
384       if (!ubuf) {
385         static char bzip2_corrupt_msg[] =
386             "Bzip2 not supported or corrupted Bzip2 compressed block contents";
387         return Status::Corruption(bzip2_corrupt_msg);
388       }
389       *contents = BlockContents(std::move(ubuf), decompress_size);
390       break;
391     case kLZ4Compression:
392       ubuf = LZ4_Uncompress(
393           uncompression_info, data, n, &decompress_size,
394           GetCompressFormatForVersion(kLZ4Compression, format_version),
395           allocator);
396       if (!ubuf) {
397         static char lz4_corrupt_msg[] =
398             "LZ4 not supported or corrupted LZ4 compressed block contents";
399         return Status::Corruption(lz4_corrupt_msg);
400       }
401       *contents = BlockContents(std::move(ubuf), decompress_size);
402       break;
403     case kLZ4HCCompression:
404       ubuf = LZ4_Uncompress(
405           uncompression_info, data, n, &decompress_size,
406           GetCompressFormatForVersion(kLZ4HCCompression, format_version),
407           allocator);
408       if (!ubuf) {
409         static char lz4hc_corrupt_msg[] =
410             "LZ4HC not supported or corrupted LZ4HC compressed block contents";
411         return Status::Corruption(lz4hc_corrupt_msg);
412       }
413       *contents = BlockContents(std::move(ubuf), decompress_size);
414       break;
415     case kXpressCompression:
416       // XPRESS allocates memory internally, thus no support for custom
417       // allocator.
418       ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size));
419       if (!ubuf) {
420         static char xpress_corrupt_msg[] =
421             "XPRESS not supported or corrupted XPRESS compressed block "
422             "contents";
423         return Status::Corruption(xpress_corrupt_msg);
424       }
425       *contents = BlockContents(std::move(ubuf), decompress_size);
426       break;
427     case kZSTD:
428     case kZSTDNotFinalCompression:
429       ubuf = ZSTD_Uncompress(uncompression_info, data, n, &decompress_size,
430                              allocator);
431       if (!ubuf) {
432         static char zstd_corrupt_msg[] =
433             "ZSTD not supported or corrupted ZSTD compressed block contents";
434         return Status::Corruption(zstd_corrupt_msg);
435       }
436       *contents = BlockContents(std::move(ubuf), decompress_size);
437       break;
438     default:
439       return Status::Corruption("bad block type");
440   }
441 
442   if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) {
443     RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS,
444                           timer.ElapsedNanos());
445   }
446   RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED,
447                         contents->data.size());
448   RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED);
449 
450   return Status::OK();
451 }
452 
453 //
454 // The 'data' points to the raw block contents that was read in from file.
455 // This method allocates a new heap buffer and the raw block
456 // contents are uncompresed into this buffer. This
457 // buffer is returned via 'result' and it is upto the caller to
458 // free this buffer.
459 // format_version is the block format as defined in include/rocksdb/table.h
UncompressBlockContents(const UncompressionInfo & uncompression_info,const char * data,size_t n,BlockContents * contents,uint32_t format_version,const ImmutableCFOptions & ioptions,MemoryAllocator * allocator)460 Status UncompressBlockContents(const UncompressionInfo& uncompression_info,
461                                const char* data, size_t n,
462                                BlockContents* contents, uint32_t format_version,
463                                const ImmutableCFOptions& ioptions,
464                                MemoryAllocator* allocator) {
465   assert(data[n] != kNoCompression);
466   assert(data[n] == uncompression_info.type());
467   return UncompressBlockContentsForCompressionType(uncompression_info, data, n,
468                                                    contents, format_version,
469                                                    ioptions, allocator);
470 }
471 
472 }  // namespace ROCKSDB_NAMESPACE
473