1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #include "table/format.h"
11
12 #include <cinttypes>
13 #include <string>
14
15 #include "block_fetcher.h"
16 #include "file/random_access_file_reader.h"
17 #include "logging/logging.h"
18 #include "memory/memory_allocator.h"
19 #include "monitoring/perf_context_imp.h"
20 #include "monitoring/statistics.h"
21 #include "rocksdb/env.h"
22 #include "table/block_based/block.h"
23 #include "table/block_based/block_based_table_reader.h"
24 #include "table/persistent_cache_helper.h"
25 #include "util/coding.h"
26 #include "util/compression.h"
27 #include "util/crc32c.h"
28 #include "util/stop_watch.h"
29 #include "util/string_util.h"
30
31 namespace ROCKSDB_NAMESPACE {
32
33 extern const uint64_t kLegacyBlockBasedTableMagicNumber;
34 extern const uint64_t kBlockBasedTableMagicNumber;
35
36 #ifndef ROCKSDB_LITE
37 extern const uint64_t kLegacyPlainTableMagicNumber;
38 extern const uint64_t kPlainTableMagicNumber;
39 #else
40 // ROCKSDB_LITE doesn't have plain table
41 const uint64_t kLegacyPlainTableMagicNumber = 0;
42 const uint64_t kPlainTableMagicNumber = 0;
43 #endif
44
ShouldReportDetailedTime(Env * env,Statistics * stats)45 bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
46 return env != nullptr && stats != nullptr &&
47 stats->get_stats_level() > kExceptDetailedTimers;
48 }
49
EncodeTo(std::string * dst) const50 void BlockHandle::EncodeTo(std::string* dst) const {
51 // Sanity check that all fields have been set
52 assert(offset_ != ~static_cast<uint64_t>(0));
53 assert(size_ != ~static_cast<uint64_t>(0));
54 PutVarint64Varint64(dst, offset_, size_);
55 }
56
DecodeFrom(Slice * input)57 Status BlockHandle::DecodeFrom(Slice* input) {
58 if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) {
59 return Status::OK();
60 } else {
61 // reset in case failure after partially decoding
62 offset_ = 0;
63 size_ = 0;
64 return Status::Corruption("bad block handle");
65 }
66 }
67
DecodeSizeFrom(uint64_t _offset,Slice * input)68 Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
69 if (GetVarint64(input, &size_)) {
70 offset_ = _offset;
71 return Status::OK();
72 } else {
73 // reset in case failure after partially decoding
74 offset_ = 0;
75 size_ = 0;
76 return Status::Corruption("bad block handle");
77 }
78 }
79
80 // Return a string that contains the copy of handle.
ToString(bool hex) const81 std::string BlockHandle::ToString(bool hex) const {
82 std::string handle_str;
83 EncodeTo(&handle_str);
84 if (hex) {
85 return Slice(handle_str).ToString(true);
86 } else {
87 return handle_str;
88 }
89 }
90
91 const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
92
EncodeTo(std::string * dst,bool have_first_key,const BlockHandle * previous_handle) const93 void IndexValue::EncodeTo(std::string* dst, bool have_first_key,
94 const BlockHandle* previous_handle) const {
95 if (previous_handle) {
96 assert(handle.offset() == previous_handle->offset() +
97 previous_handle->size() + kBlockTrailerSize);
98 PutVarsignedint64(dst, handle.size() - previous_handle->size());
99 } else {
100 handle.EncodeTo(dst);
101 }
102 assert(dst->size() != 0);
103
104 if (have_first_key) {
105 PutLengthPrefixedSlice(dst, first_internal_key);
106 }
107 }
108
DecodeFrom(Slice * input,bool have_first_key,const BlockHandle * previous_handle)109 Status IndexValue::DecodeFrom(Slice* input, bool have_first_key,
110 const BlockHandle* previous_handle) {
111 if (previous_handle) {
112 int64_t delta;
113 if (!GetVarsignedint64(input, &delta)) {
114 return Status::Corruption("bad delta-encoded index value");
115 }
116 handle = BlockHandle(
117 previous_handle->offset() + previous_handle->size() + kBlockTrailerSize,
118 previous_handle->size() + delta);
119 } else {
120 Status s = handle.DecodeFrom(input);
121 if (!s.ok()) {
122 return s;
123 }
124 }
125
126 if (!have_first_key) {
127 first_internal_key = Slice();
128 } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) {
129 return Status::Corruption("bad first key in block info");
130 }
131
132 return Status::OK();
133 }
134
ToString(bool hex,bool have_first_key) const135 std::string IndexValue::ToString(bool hex, bool have_first_key) const {
136 std::string s;
137 EncodeTo(&s, have_first_key, nullptr);
138 if (hex) {
139 return Slice(s).ToString(true);
140 } else {
141 return s;
142 }
143 }
144
145 namespace {
IsLegacyFooterFormat(uint64_t magic_number)146 inline bool IsLegacyFooterFormat(uint64_t magic_number) {
147 return magic_number == kLegacyBlockBasedTableMagicNumber ||
148 magic_number == kLegacyPlainTableMagicNumber;
149 }
UpconvertLegacyFooterFormat(uint64_t magic_number)150 inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
151 if (magic_number == kLegacyBlockBasedTableMagicNumber) {
152 return kBlockBasedTableMagicNumber;
153 }
154 if (magic_number == kLegacyPlainTableMagicNumber) {
155 return kPlainTableMagicNumber;
156 }
157 assert(false);
158 return 0;
159 }
160 } // namespace
161
162 // legacy footer format:
163 // metaindex handle (varint64 offset, varint64 size)
164 // index handle (varint64 offset, varint64 size)
165 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength
166 // table_magic_number (8 bytes)
167 // new footer format:
168 // checksum type (char, 1 byte)
169 // metaindex handle (varint64 offset, varint64 size)
170 // index handle (varint64 offset, varint64 size)
171 // <padding> to make the total size 2 * BlockHandle::kMaxEncodedLength + 1
172 // footer version (4 bytes)
173 // table_magic_number (8 bytes)
EncodeTo(std::string * dst) const174 void Footer::EncodeTo(std::string* dst) const {
175 assert(HasInitializedTableMagicNumber());
176 if (IsLegacyFooterFormat(table_magic_number())) {
177 // has to be default checksum with legacy footer
178 assert(checksum_ == kCRC32c);
179 const size_t original_size = dst->size();
180 metaindex_handle_.EncodeTo(dst);
181 index_handle_.EncodeTo(dst);
182 dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength); // Padding
183 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
184 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
185 assert(dst->size() == original_size + kVersion0EncodedLength);
186 } else {
187 const size_t original_size = dst->size();
188 dst->push_back(static_cast<char>(checksum_));
189 metaindex_handle_.EncodeTo(dst);
190 index_handle_.EncodeTo(dst);
191 dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding
192 PutFixed32(dst, version());
193 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu));
194 PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32));
195 assert(dst->size() == original_size + kNewVersionsEncodedLength);
196 }
197 }
198
Footer(uint64_t _table_magic_number,uint32_t _version)199 Footer::Footer(uint64_t _table_magic_number, uint32_t _version)
200 : version_(_version),
201 checksum_(kCRC32c),
202 table_magic_number_(_table_magic_number) {
203 // This should be guaranteed by constructor callers
204 assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0);
205 }
206
DecodeFrom(Slice * input)207 Status Footer::DecodeFrom(Slice* input) {
208 assert(!HasInitializedTableMagicNumber());
209 assert(input != nullptr);
210 assert(input->size() >= kMinEncodedLength);
211
212 const char* magic_ptr =
213 input->data() + input->size() - kMagicNumberLengthByte;
214 const uint32_t magic_lo = DecodeFixed32(magic_ptr);
215 const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
216 uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
217 (static_cast<uint64_t>(magic_lo)));
218
219 // We check for legacy formats here and silently upconvert them
220 bool legacy = IsLegacyFooterFormat(magic);
221 if (legacy) {
222 magic = UpconvertLegacyFooterFormat(magic);
223 }
224 set_table_magic_number(magic);
225
226 if (legacy) {
227 // The size is already asserted to be at least kMinEncodedLength
228 // at the beginning of the function
229 input->remove_prefix(input->size() - kVersion0EncodedLength);
230 version_ = 0 /* legacy */;
231 checksum_ = kCRC32c;
232 } else {
233 version_ = DecodeFixed32(magic_ptr - 4);
234 // Footer version 1 and higher will always occupy exactly this many bytes.
235 // It consists of the checksum type, two block handles, padding,
236 // a version number, and a magic number
237 if (input->size() < kNewVersionsEncodedLength) {
238 return Status::Corruption("input is too short to be an sstable");
239 } else {
240 input->remove_prefix(input->size() - kNewVersionsEncodedLength);
241 }
242 uint32_t chksum;
243 if (!GetVarint32(input, &chksum)) {
244 return Status::Corruption("bad checksum type");
245 }
246 checksum_ = static_cast<ChecksumType>(chksum);
247 }
248
249 Status result = metaindex_handle_.DecodeFrom(input);
250 if (result.ok()) {
251 result = index_handle_.DecodeFrom(input);
252 }
253 if (result.ok()) {
254 // We skip over any leftover data (just padding for now) in "input"
255 const char* end = magic_ptr + kMagicNumberLengthByte;
256 *input = Slice(end, input->data() + input->size() - end);
257 }
258 return result;
259 }
260
ToString() const261 std::string Footer::ToString() const {
262 std::string result;
263 result.reserve(1024);
264
265 bool legacy = IsLegacyFooterFormat(table_magic_number_);
266 if (legacy) {
267 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
268 result.append("index handle: " + index_handle_.ToString() + "\n ");
269 result.append("table_magic_number: " +
270 ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n ");
271 } else {
272 result.append("checksum: " + ROCKSDB_NAMESPACE::ToString(checksum_) +
273 "\n ");
274 result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
275 result.append("index handle: " + index_handle_.ToString() + "\n ");
276 result.append("footer version: " + ROCKSDB_NAMESPACE::ToString(version_) +
277 "\n ");
278 result.append("table_magic_number: " +
279 ROCKSDB_NAMESPACE::ToString(table_magic_number_) + "\n ");
280 }
281 return result;
282 }
283
ReadFooterFromFile(RandomAccessFileReader * file,FilePrefetchBuffer * prefetch_buffer,uint64_t file_size,Footer * footer,uint64_t enforce_table_magic_number)284 Status ReadFooterFromFile(RandomAccessFileReader* file,
285 FilePrefetchBuffer* prefetch_buffer,
286 uint64_t file_size, Footer* footer,
287 uint64_t enforce_table_magic_number) {
288 if (file_size < Footer::kMinEncodedLength) {
289 return Status::Corruption("file is too short (" + ToString(file_size) +
290 " bytes) to be an "
291 "sstable: " +
292 file->file_name());
293 }
294
295 std::string footer_buf;
296 std::unique_ptr<const char[]> internal_buf;
297 Slice footer_input;
298 size_t read_offset =
299 (file_size > Footer::kMaxEncodedLength)
300 ? static_cast<size_t>(file_size - Footer::kMaxEncodedLength)
301 : 0;
302 Status s;
303 if (prefetch_buffer == nullptr ||
304 !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength,
305 &footer_input)) {
306 if (file->use_direct_io()) {
307 s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
308 nullptr, &internal_buf);
309 } else {
310 footer_buf.reserve(Footer::kMaxEncodedLength);
311 s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input,
312 &footer_buf[0], nullptr);
313 }
314 if (!s.ok()) return s;
315 }
316
317 // Check that we actually read the whole footer from the file. It may be
318 // that size isn't correct.
319 if (footer_input.size() < Footer::kMinEncodedLength) {
320 return Status::Corruption("file is too short (" + ToString(file_size) +
321 " bytes) to be an "
322 "sstable" +
323 file->file_name());
324 }
325
326 s = footer->DecodeFrom(&footer_input);
327 if (!s.ok()) {
328 return s;
329 }
330 if (enforce_table_magic_number != 0 &&
331 enforce_table_magic_number != footer->table_magic_number()) {
332 return Status::Corruption(
333 "Bad table magic number: expected " +
334 ToString(enforce_table_magic_number) + ", found " +
335 ToString(footer->table_magic_number()) + " in " + file->file_name());
336 }
337 return Status::OK();
338 }
339
UncompressBlockContentsForCompressionType(const UncompressionInfo & uncompression_info,const char * data,size_t n,BlockContents * contents,uint32_t format_version,const ImmutableCFOptions & ioptions,MemoryAllocator * allocator)340 Status UncompressBlockContentsForCompressionType(
341 const UncompressionInfo& uncompression_info, const char* data, size_t n,
342 BlockContents* contents, uint32_t format_version,
343 const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) {
344 CacheAllocationPtr ubuf;
345
346 assert(uncompression_info.type() != kNoCompression &&
347 "Invalid compression type");
348
349 StopWatchNano timer(ioptions.env, ShouldReportDetailedTime(
350 ioptions.env, ioptions.statistics));
351 int decompress_size = 0;
352 switch (uncompression_info.type()) {
353 case kSnappyCompression: {
354 size_t ulength = 0;
355 static char snappy_corrupt_msg[] =
356 "Snappy not supported or corrupted Snappy compressed block contents";
357 if (!Snappy_GetUncompressedLength(data, n, &ulength)) {
358 return Status::Corruption(snappy_corrupt_msg);
359 }
360 ubuf = AllocateBlock(ulength, allocator);
361 if (!Snappy_Uncompress(data, n, ubuf.get())) {
362 return Status::Corruption(snappy_corrupt_msg);
363 }
364 *contents = BlockContents(std::move(ubuf), ulength);
365 break;
366 }
367 case kZlibCompression:
368 ubuf = Zlib_Uncompress(
369 uncompression_info, data, n, &decompress_size,
370 GetCompressFormatForVersion(kZlibCompression, format_version),
371 allocator);
372 if (!ubuf) {
373 static char zlib_corrupt_msg[] =
374 "Zlib not supported or corrupted Zlib compressed block contents";
375 return Status::Corruption(zlib_corrupt_msg);
376 }
377 *contents = BlockContents(std::move(ubuf), decompress_size);
378 break;
379 case kBZip2Compression:
380 ubuf = BZip2_Uncompress(
381 data, n, &decompress_size,
382 GetCompressFormatForVersion(kBZip2Compression, format_version),
383 allocator);
384 if (!ubuf) {
385 static char bzip2_corrupt_msg[] =
386 "Bzip2 not supported or corrupted Bzip2 compressed block contents";
387 return Status::Corruption(bzip2_corrupt_msg);
388 }
389 *contents = BlockContents(std::move(ubuf), decompress_size);
390 break;
391 case kLZ4Compression:
392 ubuf = LZ4_Uncompress(
393 uncompression_info, data, n, &decompress_size,
394 GetCompressFormatForVersion(kLZ4Compression, format_version),
395 allocator);
396 if (!ubuf) {
397 static char lz4_corrupt_msg[] =
398 "LZ4 not supported or corrupted LZ4 compressed block contents";
399 return Status::Corruption(lz4_corrupt_msg);
400 }
401 *contents = BlockContents(std::move(ubuf), decompress_size);
402 break;
403 case kLZ4HCCompression:
404 ubuf = LZ4_Uncompress(
405 uncompression_info, data, n, &decompress_size,
406 GetCompressFormatForVersion(kLZ4HCCompression, format_version),
407 allocator);
408 if (!ubuf) {
409 static char lz4hc_corrupt_msg[] =
410 "LZ4HC not supported or corrupted LZ4HC compressed block contents";
411 return Status::Corruption(lz4hc_corrupt_msg);
412 }
413 *contents = BlockContents(std::move(ubuf), decompress_size);
414 break;
415 case kXpressCompression:
416 // XPRESS allocates memory internally, thus no support for custom
417 // allocator.
418 ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size));
419 if (!ubuf) {
420 static char xpress_corrupt_msg[] =
421 "XPRESS not supported or corrupted XPRESS compressed block "
422 "contents";
423 return Status::Corruption(xpress_corrupt_msg);
424 }
425 *contents = BlockContents(std::move(ubuf), decompress_size);
426 break;
427 case kZSTD:
428 case kZSTDNotFinalCompression:
429 ubuf = ZSTD_Uncompress(uncompression_info, data, n, &decompress_size,
430 allocator);
431 if (!ubuf) {
432 static char zstd_corrupt_msg[] =
433 "ZSTD not supported or corrupted ZSTD compressed block contents";
434 return Status::Corruption(zstd_corrupt_msg);
435 }
436 *contents = BlockContents(std::move(ubuf), decompress_size);
437 break;
438 default:
439 return Status::Corruption("bad block type");
440 }
441
442 if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) {
443 RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS,
444 timer.ElapsedNanos());
445 }
446 RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED,
447 contents->data.size());
448 RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED);
449
450 return Status::OK();
451 }
452
453 //
454 // The 'data' points to the raw block contents that was read in from file.
455 // This method allocates a new heap buffer and the raw block
456 // contents are uncompresed into this buffer. This
457 // buffer is returned via 'result' and it is upto the caller to
458 // free this buffer.
459 // format_version is the block format as defined in include/rocksdb/table.h
UncompressBlockContents(const UncompressionInfo & uncompression_info,const char * data,size_t n,BlockContents * contents,uint32_t format_version,const ImmutableCFOptions & ioptions,MemoryAllocator * allocator)460 Status UncompressBlockContents(const UncompressionInfo& uncompression_info,
461 const char* data, size_t n,
462 BlockContents* contents, uint32_t format_version,
463 const ImmutableCFOptions& ioptions,
464 MemoryAllocator* allocator) {
465 assert(data[n] != kNoCompression);
466 assert(data[n] == uncompression_info.type());
467 return UncompressBlockContentsForCompressionType(uncompression_info, data, n,
468 contents, format_version,
469 ioptions, allocator);
470 }
471
472 } // namespace ROCKSDB_NAMESPACE
473