// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based/block_based_table_reader.h"

#include <algorithm>
#include <array>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "cache/simple_deleter.h"

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"

#include "file/file_prefetch_buffer.h"
#include "file/random_access_file_reader.h"

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"

#include "table/block_based/binary_search_index_reader.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_based_table_iterator.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/full_filter_block.h"
#include "table/block_based/hash_index_reader.h"
#include "table/block_based/partitioned_filter_block.h"
#include "table/block_based/partitioned_index_reader.h"
#include "table/block_fetcher.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/sst_file_writer_collectors.h"
#include "table/two_level_iterator.h"

#include "monitoring/perf_context_imp.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/util.h"
#include "util/xxhash.h"

namespace ROCKSDB_NAMESPACE {

extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

typedef BlockBasedTable::IndexReader IndexReader;

// Based on experiments, a 256 KB readahead size provides the best performance
// for auto readahead. Experiment data is in PR #3282.
const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;

BlockBasedTable::~BlockBasedTable() {
  delete rep_;
}

std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);

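// BlocklikeTraits<TBlocklike> adapts the block-like types that can be read
// from an SST file and cached -- raw BlockContents, parsed full filter
// blocks, data/index Blocks, and uncompression dictionaries -- to a common
// interface: Create() builds the in-memory object from raw BlockContents,
// and GetNumRestarts() reports the restart count (zero for types without a
// restart array). ReadBlockFromFile() and the block cache helpers below are
// templated on these traits.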
template <typename TBlocklike>
class BlocklikeTraits;

template <>
class BlocklikeTraits<BlockContents> {
 public:
  static BlockContents* Create(BlockContents&& contents,
                               size_t /* read_amp_bytes_per_bit */,
                               Statistics* /* statistics */,
                               bool /* using_zstd */,
                               const FilterPolicy* /* filter_policy */) {
    return new BlockContents(std::move(contents));
  }

  static uint32_t GetNumRestarts(const BlockContents& /* contents */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<ParsedFullFilterBlock> {
 public:
  static ParsedFullFilterBlock* Create(BlockContents&& contents,
                                       size_t /* read_amp_bytes_per_bit */,
                                       Statistics* /* statistics */,
                                       bool /* using_zstd */,
                                       const FilterPolicy* filter_policy) {
    return new ParsedFullFilterBlock(filter_policy, std::move(contents));
  }

  static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<Block> {
 public:
  static Block* Create(BlockContents&& contents, size_t read_amp_bytes_per_bit,
                       Statistics* statistics, bool /* using_zstd */,
                       const FilterPolicy* /* filter_policy */) {
    return new Block(std::move(contents), read_amp_bytes_per_bit, statistics);
  }

  static uint32_t GetNumRestarts(const Block& block) {
    return block.NumRestarts();
  }
};

template <>
class BlocklikeTraits<UncompressionDict> {
 public:
  static UncompressionDict* Create(BlockContents&& contents,
                                   size_t /* read_amp_bytes_per_bit */,
                                   Statistics* /* statistics */,
                                   bool using_zstd,
                                   const FilterPolicy* /* filter_policy */) {
    return new UncompressionDict(contents.data, std::move(contents.allocation),
                                 using_zstd);
  }

  static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) {
    return 0;
  }
};

namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success fill *result and return OK - caller owns *result
// @param uncompression_dict Data for presetting the compression library's
//    dictionary.
template <typename TBlocklike>
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<TBlocklike>* result, const ImmutableCFOptions& ioptions,
    bool do_uncompress, bool maybe_compressed, BlockType block_type,
    const UncompressionDict& uncompression_dict,
    const PersistentCacheOptions& cache_options, size_t read_amp_bytes_per_bit,
    MemoryAllocator* memory_allocator, bool for_compaction, bool using_zstd,
    const FilterPolicy* filter_policy) {
167 assert(result);
168
169 BlockContents contents;
170 BlockFetcher block_fetcher(
171 file, prefetch_buffer, footer, options, handle, &contents, ioptions,
172 do_uncompress, maybe_compressed, block_type, uncompression_dict,
173 cache_options, memory_allocator, nullptr, for_compaction);
174 Status s = block_fetcher.ReadBlockContents();
175 if (s.ok()) {
176 result->reset(BlocklikeTraits<TBlocklike>::Create(
177 std::move(contents), read_amp_bytes_per_bit, ioptions.statistics,
178 using_zstd, filter_policy));
179 }
180
181 return s;
182 }
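// Note: the concrete type of *result is produced via
// BlocklikeTraits<TBlocklike>::Create(), so the same helper serves data and
// index blocks (TBlocklike = Block), full filter blocks
// (TBlocklike = ParsedFullFilterBlock), compression dictionaries
// (TBlocklike = UncompressionDict), and raw contents (TBlocklike =
// BlockContents).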
183
// Release the cached entry and decrement its ref count.
// Do not force erase
void ReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, false /* force_erase */);
}

// For a hash-based index, return true if prefix_extractor and
// prefix_extractor_block mismatch, false otherwise. This flag will be used
// as total_order_seek via NewIndexIterator
bool PrefixExtractorChanged(const TableProperties* table_properties,
                            const SliceTransform* prefix_extractor) {
  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
  // Turn off the hash index if prefix_extractor is not set; if
  // prefix_extractor is set but prefix_extractor_block is not set, also
  // disable the hash index.
  if (prefix_extractor == nullptr || table_properties == nullptr ||
      table_properties->prefix_extractor_name.empty()) {
    return true;
  }

  // prefix_extractor and prefix_extractor_block are both non-empty
  if (table_properties->prefix_extractor_name.compare(
          prefix_extractor->Name()) != 0) {
    return true;
  } else {
    return false;
  }
}
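// For example, if the table was built with prefix extractor
// "rocksdb.FixedPrefix.8" but the current column family is configured with
// "rocksdb.CappedPrefix.16" (or with no prefix extractor at all), this
// returns true and the hash index is bypassed via total-order seek.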

CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
  CacheAllocationPtr heap_buf;
  heap_buf = AllocateBlock(buf.size(), allocator);
  memcpy(heap_buf.get(), buf.data(), buf.size());
  return heap_buf;
}
}  // namespace

void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
                                            GetContext* get_context,
                                            size_t usage) const {
225 Statistics* const statistics = rep_->ioptions.statistics;
226
227 PERF_COUNTER_ADD(block_cache_hit_count, 1);
228 PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
229 static_cast<uint32_t>(rep_->level));
230
231 if (get_context) {
232 ++get_context->get_context_stats_.num_cache_hit;
233 get_context->get_context_stats_.num_cache_bytes_read += usage;
234 } else {
235 RecordTick(statistics, BLOCK_CACHE_HIT);
236 RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
237 }
238
239 switch (block_type) {
240 case BlockType::kFilter:
241 PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);
242
243 if (get_context) {
244 ++get_context->get_context_stats_.num_cache_filter_hit;
245 } else {
246 RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
247 }
248 break;
249
250 case BlockType::kCompressionDictionary:
251 // TODO: introduce perf counter for compression dictionary hit count
252 if (get_context) {
253 ++get_context->get_context_stats_.num_cache_compression_dict_hit;
254 } else {
255 RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
256 }
257 break;
258
259 case BlockType::kIndex:
260 PERF_COUNTER_ADD(block_cache_index_hit_count, 1);
261
262 if (get_context) {
263 ++get_context->get_context_stats_.num_cache_index_hit;
264 } else {
265 RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
266 }
267 break;
268
269 default:
270 // TODO: introduce dedicated tickers/statistics/counters
271 // for range tombstones
272 if (get_context) {
273 ++get_context->get_context_stats_.num_cache_data_hit;
274 } else {
275 RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
276 }
277 break;
278 }
279 }
280
void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
                                             GetContext* get_context) const {
283 Statistics* const statistics = rep_->ioptions.statistics;
284
285 // TODO: introduce aggregate (not per-level) block cache miss count
286 PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
287 static_cast<uint32_t>(rep_->level));
288
289 if (get_context) {
290 ++get_context->get_context_stats_.num_cache_miss;
291 } else {
292 RecordTick(statistics, BLOCK_CACHE_MISS);
293 }
294
295 // TODO: introduce perf counters for misses per block type
296 switch (block_type) {
297 case BlockType::kFilter:
298 if (get_context) {
299 ++get_context->get_context_stats_.num_cache_filter_miss;
300 } else {
301 RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
302 }
303 break;
304
305 case BlockType::kCompressionDictionary:
306 if (get_context) {
307 ++get_context->get_context_stats_.num_cache_compression_dict_miss;
308 } else {
309 RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
310 }
311 break;
312
313 case BlockType::kIndex:
314 if (get_context) {
315 ++get_context->get_context_stats_.num_cache_index_miss;
316 } else {
317 RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
318 }
319 break;
320
321 default:
322 // TODO: introduce dedicated tickers/statistics/counters
323 // for range tombstones
324 if (get_context) {
325 ++get_context->get_context_stats_.num_cache_data_miss;
326 } else {
327 RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
328 }
329 break;
330 }
331 }
332
void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type,
                                                  GetContext* get_context,
                                                  size_t usage) const {
336 Statistics* const statistics = rep_->ioptions.statistics;
337
338 // TODO: introduce perf counters for block cache insertions
339 if (get_context) {
340 ++get_context->get_context_stats_.num_cache_add;
341 get_context->get_context_stats_.num_cache_bytes_write += usage;
342 } else {
343 RecordTick(statistics, BLOCK_CACHE_ADD);
344 RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
345 }
346
347 switch (block_type) {
348 case BlockType::kFilter:
349 if (get_context) {
350 ++get_context->get_context_stats_.num_cache_filter_add;
351 get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
352 } else {
353 RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
354 RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
355 }
356 break;
357
358 case BlockType::kCompressionDictionary:
359 if (get_context) {
360 ++get_context->get_context_stats_.num_cache_compression_dict_add;
361 get_context->get_context_stats_
362 .num_cache_compression_dict_bytes_insert += usage;
363 } else {
364 RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
365 RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
366 usage);
367 }
368 break;
369
370 case BlockType::kIndex:
371 if (get_context) {
372 ++get_context->get_context_stats_.num_cache_index_add;
373 get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
374 } else {
375 RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
376 RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
377 }
378 break;
379
380 default:
381 // TODO: introduce dedicated tickers/statistics/counters
382 // for range tombstones
383 if (get_context) {
384 ++get_context->get_context_stats_.num_cache_data_add;
385 get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
386 } else {
387 RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
388 RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
389 }
390 break;
391 }
392 }
393
Cache::Handle* BlockBasedTable::GetEntryFromCache(
    Cache* block_cache, const Slice& key, BlockType block_type,
    GetContext* get_context) const {
397 auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics);
398
399 if (cache_handle != nullptr) {
400 UpdateCacheHitMetrics(block_type, get_context,
401 block_cache->GetUsage(cache_handle));
402 } else {
403 UpdateCacheMissMetrics(block_type, get_context);
404 }
405
406 return cache_handle;
407 }
408
409 // Helper function to setup the cache key's prefix for the Table.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
411 assert(kMaxCacheKeyPrefixSize >= 10);
412 rep->cache_key_prefix_size = 0;
413 rep->compressed_cache_key_prefix_size = 0;
414 if (rep->table_options.block_cache != nullptr) {
415 GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
416 &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
417 }
418 if (rep->table_options.persistent_cache != nullptr) {
419 GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
420 &rep->persistent_cache_key_prefix[0],
421 &rep->persistent_cache_key_prefix_size);
422 }
423 if (rep->table_options.block_cache_compressed != nullptr) {
424 GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
425 rep->file->file(), &rep->compressed_cache_key_prefix[0],
426 &rep->compressed_cache_key_prefix_size);
427 }
428 }
429
void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
                                          char* buffer, size_t* size) {
432 // generate an id from the file
433 *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
434
435 // If the prefix wasn't generated or was too long,
436 // create one from the cache.
437 if (cc != nullptr && *size == 0) {
438 char* end = EncodeVarint64(buffer, cc->NewId());
439 *size = static_cast<size_t>(end - buffer);
440 }
441 }
442
void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file,
                                          char* buffer, size_t* size) {
445 // generate an id from the file
446 *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
447
448 // If the prefix wasn't generated or was too long,
449 // create one from the cache.
450 if (cc != nullptr && *size == 0) {
451 char* end = EncodeVarint64(buffer, cc->NewId());
452 *size = static_cast<size_t>(end - buffer);
453 }
454 }
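// Note: when the file cannot supply a unique ID (or the ID is too long), the
// prefix falls back to a fresh ID from the block cache. Such a prefix is
// unique for the lifetime of the process but is not stable across re-opens,
// so previously cached blocks for the same file will not be found again.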
455
456 namespace {
// Return true if table_properties maps `user_prop_name` to a `true` value,
// or if it doesn't contain this property (for backward compatibility).
bool IsFeatureSupported(const TableProperties& table_properties,
                        const std::string& user_prop_name, Logger* info_log) {
461 auto& props = table_properties.user_collected_properties;
462 auto pos = props.find(user_prop_name);
463 // Older version doesn't have this value set. Skip this check.
464 if (pos != props.end()) {
465 if (pos->second == kPropFalse) {
466 return false;
467 } else if (pos->second != kPropTrue) {
      ROCKS_LOG_WARN(info_log, "Property %s has an invalid value %s",
                     user_prop_name.c_str(), pos->second.c_str());
470 }
471 }
472 return true;
473 }
474
475 // Caller has to ensure seqno is not nullptr.
Status GetGlobalSequenceNumber(const TableProperties& table_properties,
                               SequenceNumber largest_seqno,
                               SequenceNumber* seqno) {
479 const auto& props = table_properties.user_collected_properties;
480 const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
481 const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);
482
483 *seqno = kDisableGlobalSequenceNumber;
484 if (version_pos == props.end()) {
485 if (seqno_pos != props.end()) {
486 std::array<char, 200> msg_buf;
487 // This is not an external sst file, global_seqno is not supported.
488 snprintf(
489 msg_buf.data(), msg_buf.max_size(),
          "A non-external sst file has a global seqno property with value %s",
491 seqno_pos->second.c_str());
492 return Status::Corruption(msg_buf.data());
493 }
494 return Status::OK();
495 }
496
497 uint32_t version = DecodeFixed32(version_pos->second.c_str());
498 if (version < 2) {
499 if (seqno_pos != props.end() || version != 1) {
500 std::array<char, 200> msg_buf;
501 // This is a v1 external sst file, global_seqno is not supported.
502 snprintf(msg_buf.data(), msg_buf.max_size(),
               "An external sst file with version %u has a global seqno "
               "property with value %s",
505 version, seqno_pos->second.c_str());
506 return Status::Corruption(msg_buf.data());
507 }
508 return Status::OK();
509 }
510
511 // Since we have a plan to deprecate global_seqno, we do not return failure
512 // if seqno_pos == props.end(). We rely on version_pos to detect whether the
513 // SST is external.
514 SequenceNumber global_seqno(0);
515 if (seqno_pos != props.end()) {
516 global_seqno = DecodeFixed64(seqno_pos->second.c_str());
517 }
  // SstTableReader opens the table reader with kMaxSequenceNumber as
  // largest_seqno to denote that it is unknown.
520 if (largest_seqno < kMaxSequenceNumber) {
521 if (global_seqno == 0) {
522 global_seqno = largest_seqno;
523 }
524 if (global_seqno != largest_seqno) {
525 std::array<char, 200> msg_buf;
526 snprintf(
527 msg_buf.data(), msg_buf.max_size(),
          "An external sst file with version %u has a global seqno property "
          "with value %s, while the largest seqno in the file is %llu",
530 version, seqno_pos->second.c_str(),
531 static_cast<unsigned long long>(largest_seqno));
532 return Status::Corruption(msg_buf.data());
533 }
534 }
535 *seqno = global_seqno;
536
537 if (global_seqno > kMaxSequenceNumber) {
538 std::array<char, 200> msg_buf;
539 snprintf(msg_buf.data(), msg_buf.max_size(),
             "An external sst file with version %u has a global seqno property "
             "with value %llu, which is greater than kMaxSequenceNumber",
542 version, static_cast<unsigned long long>(global_seqno));
543 return Status::Corruption(msg_buf.data());
544 }
545
546 return Status::OK();
547 }
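// Summary of the cases handled above (version = kVersion property,
// seqno = kGlobalSeqno property):
//   - no version, no seqno   -> not an external file; seqno stays disabled
//   - no version, seqno set  -> Corruption
//   - version < 2            -> Corruption unless version == 1 with no seqno
//   - version >= 2           -> use the stored seqno (a stored 0 is replaced
//                               by largest_seqno when largest_seqno is known,
//                               and the two must then agree)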
548 } // namespace
549
Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
                                   size_t cache_key_prefix_size,
                                   const BlockHandle& handle, char* cache_key) {
553 assert(cache_key != nullptr);
554 assert(cache_key_prefix_size != 0);
555 assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
556 memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
557 char* end =
558 EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
559 return Slice(cache_key, static_cast<size_t>(end - cache_key));
560 }
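// The block cache key is simply the table's cache key prefix followed by the
// varint64-encoded block offset. A minimal usage sketch, mirroring the
// callers below (buffer and handle names are illustrative):
//
//   char buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
//   Slice key = GetCacheKey(rep_->cache_key_prefix,
//                           rep_->cache_key_prefix_size, handle, buf);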
561
Status BlockBasedTable::Open(
    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
    const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    std::unique_ptr<TableReader>* table_reader,
    const SliceTransform* prefix_extractor,
    const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
    const int level, const bool immortal_table,
    const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats,
    BlockCacheTracer* const block_cache_tracer) {
573 table_reader->reset();
574
575 Status s;
576 Footer footer;
577 std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
578
579 // prefetch both index and filters, down to all partitions
580 const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
581 const bool preload_all = !table_options.cache_index_and_filter_blocks;
582
583 if (!ioptions.allow_mmap_reads) {
584 s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
585 preload_all, &prefetch_buffer);
586 } else {
587 // Should not prefetch for mmap mode.
588 prefetch_buffer.reset(new FilePrefetchBuffer(
589 nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
590 }
591
592 // Read in the following order:
593 // 1. Footer
594 // 2. [metaindex block]
595 // 3. [meta block: properties]
596 // 4. [meta block: range deletion tombstone]
597 // 5. [meta block: compression dictionary]
598 // 6. [meta block: index]
599 // 7. [meta block: filter]
600 s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
601 kBlockBasedTableMagicNumber);
602 if (!s.ok()) {
603 return s;
604 }
605 if (!BlockBasedTableSupportedVersion(footer.version())) {
606 return Status::Corruption(
607 "Unknown Footer version. Maybe this file was created with newer "
608 "version of RocksDB?");
609 }
610
  // We've successfully read the footer. We are ready to serve requests.
  // Better not to mutate rep_ after creation; e.g. the raw pointer held by
  // internal_prefix_transform will be used to create HashIndexReader, whose
  // reset may otherwise access a dangling pointer.
615 BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
616 Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
617 internal_comparator, skip_filters, level,
618 immortal_table);
619 rep->file = std::move(file);
620 rep->footer = footer;
621 rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
622 // We need to wrap data with internal_prefix_transform to make sure it can
623 // handle prefix correctly.
624 if (prefix_extractor != nullptr) {
625 rep->internal_prefix_transform.reset(
626 new InternalKeySliceTransform(prefix_extractor));
627 }
628 SetupCacheKeyPrefix(rep);
629 std::unique_ptr<BlockBasedTable> new_table(
630 new BlockBasedTable(rep, block_cache_tracer));
631
632 // page cache options
633 rep->persistent_cache_options =
634 PersistentCacheOptions(rep->table_options.persistent_cache,
635 std::string(rep->persistent_cache_key_prefix,
636 rep->persistent_cache_key_prefix_size),
637 rep->ioptions.statistics);
638
639 // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
640 // handle to null, otherwise it may be seen as uninitialized during the below
641 // meta-block reads.
642 rep->compression_dict_handle = BlockHandle::NullBlockHandle();
643
644 // Read metaindex
645 std::unique_ptr<Block> metaindex;
646 std::unique_ptr<InternalIterator> metaindex_iter;
647 s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex,
648 &metaindex_iter);
649 if (!s.ok()) {
650 return s;
651 }
652
653 // Populates table_properties and some fields that depend on it,
654 // such as index_type.
655 s = new_table->ReadPropertiesBlock(prefetch_buffer.get(),
656 metaindex_iter.get(), largest_seqno);
657 if (!s.ok()) {
658 return s;
659 }
660 s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(),
661 internal_comparator, &lookup_context);
662 if (!s.ok()) {
663 return s;
664 }
665 s = new_table->PrefetchIndexAndFilterBlocks(
666 prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
667 prefetch_all, table_options, level, &lookup_context);
668
669 if (s.ok()) {
670 // Update tail prefetch stats
671 assert(prefetch_buffer.get() != nullptr);
672 if (tail_prefetch_stats != nullptr) {
673 assert(prefetch_buffer->min_offset_read() < file_size);
674 tail_prefetch_stats->RecordEffectiveSize(
675 static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
676 }
677
678 *table_reader = std::move(new_table);
679 }
680
681 return s;
682 }
683
Status BlockBasedTable::PrefetchTail(
    RandomAccessFileReader* file, uint64_t file_size,
    TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
    const bool preload_all,
    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
689 size_t tail_prefetch_size = 0;
690 if (tail_prefetch_stats != nullptr) {
691 // Multiple threads may get a 0 (no history) when running in parallel,
692 // but it will get cleared after the first of them finishes.
693 tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
694 }
695 if (tail_prefetch_size == 0) {
    // Before reading the footer, read ahead backwards to prefetch data. Do
    // more readahead if we're going to read the index/filter.
698 // TODO: This may incorrectly select small readahead in case partitioned
699 // index/filter is enabled and top-level partition pinning is enabled.
700 // That's because we need to issue readahead before we read the properties,
701 // at which point we don't yet know the index type.
702 tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
703 }
704 size_t prefetch_off;
705 size_t prefetch_len;
706 if (file_size < tail_prefetch_size) {
707 prefetch_off = 0;
708 prefetch_len = static_cast<size_t>(file_size);
709 } else {
710 prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
711 prefetch_len = tail_prefetch_size;
712 }
713 TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
714 &tail_prefetch_size);
715 Status s;
716 // TODO should not have this special logic in the future.
717 if (!file->use_direct_io()) {
718 prefetch_buffer->reset(new FilePrefetchBuffer(
719 nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
720 s = file->Prefetch(prefetch_off, prefetch_len);
721 } else {
722 prefetch_buffer->reset(new FilePrefetchBuffer(
723 nullptr, 0, 0, true /* enable */, true /* track_min_offset */));
724 s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
725 }
726 return s;
727 }
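// For example, with no TailPrefetchStats history, opening a table with
// prefetch_all or preload_all set prefetches the last 512 KB of the file
// (or the whole file if it is smaller); otherwise only the last 4 KB is
// prefetched, which typically covers the footer and metaindex.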
728
729
Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
    FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
    TableProperties** table_properties) {
733 assert(table_properties != nullptr);
734 // If this is an external SST file ingested with write_global_seqno set to
735 // true, then we expect the checksum mismatch because checksum was written
736 // by SstFileWriter, but its global seqno in the properties block may have
737 // been changed during ingestion. In this case, we read the properties
738 // block, copy it to a memory buffer, change the global seqno to its
739 // original value, i.e. 0, and verify the checksum again.
740 BlockHandle props_block_handle;
741 CacheAllocationPtr tmp_buf;
742 Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
743 rep_->footer, rep_->ioptions, table_properties,
744 false /* verify_checksum */, &props_block_handle,
745 &tmp_buf, false /* compression_type_missing */,
746 nullptr /* memory_allocator */);
747 if (s.ok() && tmp_buf) {
748 const auto seqno_pos_iter =
749 (*table_properties)
750 ->properties_offsets.find(
751 ExternalSstFilePropertyNames::kGlobalSeqno);
752 size_t block_size = static_cast<size_t>(props_block_handle.size());
753 if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
754 uint64_t global_seqno_offset = seqno_pos_iter->second;
755 EncodeFixed64(
756 tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
757 }
758 uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
759 s = ROCKSDB_NAMESPACE::VerifyChecksum(rep_->footer.checksum(),
760 tmp_buf.get(), block_size + 1, value);
761 }
762 return s;
763 }
764
Status BlockBasedTable::ReadPropertiesBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const SequenceNumber largest_seqno) {
768 bool found_properties_block = true;
769 Status s;
770 s = SeekToPropertiesBlock(meta_iter, &found_properties_block);
771
772 if (!s.ok()) {
773 ROCKS_LOG_WARN(rep_->ioptions.info_log,
774 "Error when seeking to properties block from file: %s",
775 s.ToString().c_str());
776 } else if (found_properties_block) {
777 s = meta_iter->status();
778 TableProperties* table_properties = nullptr;
779 if (s.ok()) {
780 s = ReadProperties(
781 meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
782 rep_->ioptions, &table_properties, true /* verify_checksum */,
783 nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
784 false /* compression_type_missing */, nullptr /* memory_allocator */);
785 }
786
787 if (s.IsCorruption()) {
788 s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
789 &table_properties);
790 }
791 std::unique_ptr<TableProperties> props_guard;
792 if (table_properties != nullptr) {
793 props_guard.reset(table_properties);
794 }
795
796 if (!s.ok()) {
797 ROCKS_LOG_WARN(rep_->ioptions.info_log,
798 "Encountered error while reading data from properties "
799 "block %s",
800 s.ToString().c_str());
801 } else {
802 assert(table_properties != nullptr);
803 rep_->table_properties.reset(props_guard.release());
804 rep_->blocks_maybe_compressed =
805 rep_->table_properties->compression_name !=
806 CompressionTypeToString(kNoCompression);
807 rep_->blocks_definitely_zstd_compressed =
808 (rep_->table_properties->compression_name ==
809 CompressionTypeToString(kZSTD) ||
810 rep_->table_properties->compression_name ==
811 CompressionTypeToString(kZSTDNotFinalCompression));
812 }
813 } else {
814 ROCKS_LOG_ERROR(rep_->ioptions.info_log,
815 "Cannot find Properties block from file.");
816 }
817 #ifndef ROCKSDB_LITE
818 if (rep_->table_properties) {
819 ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
820 &(rep_->table_prefix_extractor));
821 }
822 #endif // ROCKSDB_LITE
823
824 // Read the table properties, if provided.
825 if (rep_->table_properties) {
826 rep_->whole_key_filtering &=
827 IsFeatureSupported(*(rep_->table_properties),
828 BlockBasedTablePropertyNames::kWholeKeyFiltering,
829 rep_->ioptions.info_log);
830 rep_->prefix_filtering &=
831 IsFeatureSupported(*(rep_->table_properties),
832 BlockBasedTablePropertyNames::kPrefixFiltering,
833 rep_->ioptions.info_log);
834
835 rep_->index_key_includes_seq =
836 rep_->table_properties->index_key_is_user_key == 0;
837 rep_->index_value_is_full =
838 rep_->table_properties->index_value_is_delta_encoded == 0;
839
840 // Update index_type with the true type.
841 // If table properties don't contain index type, we assume that the table
842 // is in very old format and has kBinarySearch index type.
843 auto& props = rep_->table_properties->user_collected_properties;
844 auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
845 if (pos != props.end()) {
846 rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
847 DecodeFixed32(pos->second.c_str()));
848 }
849
850 rep_->index_has_first_key =
851 rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
852
853 s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
854 &(rep_->global_seqno));
855 if (!s.ok()) {
856 ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
857 }
858 }
859 return s;
860 }
861
Status BlockBasedTable::ReadRangeDelBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const InternalKeyComparator& internal_comparator,
    BlockCacheLookupContext* lookup_context) {
866 Status s;
867 bool found_range_del_block;
868 BlockHandle range_del_handle;
869 s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
870 if (!s.ok()) {
871 ROCKS_LOG_WARN(
872 rep_->ioptions.info_log,
873 "Error when seeking to range delete tombstones block from file: %s",
874 s.ToString().c_str());
875 } else if (found_range_del_block && !range_del_handle.IsNull()) {
876 ReadOptions read_options;
877 std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
878 read_options, range_del_handle,
879 /*input_iter=*/nullptr, BlockType::kRangeDeletion,
880 /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
881 assert(iter != nullptr);
882 s = iter->status();
883 if (!s.ok()) {
884 ROCKS_LOG_WARN(
885 rep_->ioptions.info_log,
886 "Encountered error while reading data from range del block %s",
887 s.ToString().c_str());
888 } else {
889 rep_->fragmented_range_dels =
890 std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
891 internal_comparator);
892 }
893 }
894 return s;
895 }
896
Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    BlockBasedTable* new_table, bool prefetch_all,
    const BlockBasedTableOptions& table_options, const int level,
    BlockCacheLookupContext* lookup_context) {
902 Status s;
903
904 // Find filter handle and filter type
905 if (rep_->filter_policy) {
906 for (auto filter_type :
907 {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
908 Rep::FilterType::kBlockFilter}) {
909 std::string prefix;
910 switch (filter_type) {
911 case Rep::FilterType::kFullFilter:
912 prefix = kFullFilterBlockPrefix;
913 break;
914 case Rep::FilterType::kPartitionedFilter:
915 prefix = kPartitionedFilterBlockPrefix;
916 break;
917 case Rep::FilterType::kBlockFilter:
918 prefix = kFilterBlockPrefix;
919 break;
920 default:
921 assert(0);
922 }
923 std::string filter_block_key = prefix;
924 filter_block_key.append(rep_->filter_policy->Name());
925 if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
926 .ok()) {
927 rep_->filter_type = filter_type;
928 break;
929 }
930 }
931 }
932
933 // Find compression dictionary handle
934 bool found_compression_dict = false;
935 s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
936 &rep_->compression_dict_handle);
937 if (!s.ok()) {
938 return s;
939 }
940
941 BlockBasedTableOptions::IndexType index_type = rep_->index_type;
942
943 const bool use_cache = table_options.cache_index_and_filter_blocks;
944
945 // pin both index and filters, down to all partitions
946 const bool pin_all =
947 rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
948
949 // prefetch the first level of index
950 const bool prefetch_index =
951 prefetch_all ||
952 (table_options.pin_top_level_index_and_filter &&
953 index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
954 // pin the first level of index
955 const bool pin_index =
956 pin_all || (table_options.pin_top_level_index_and_filter &&
957 index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
958
959 std::unique_ptr<IndexReader> index_reader;
960 s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
961 prefetch_index, pin_index, lookup_context,
962 &index_reader);
963 if (!s.ok()) {
964 return s;
965 }
966
967 rep_->index_reader = std::move(index_reader);
968
  // The partitions of a partitioned index are always stored in the block
  // cache. They therefore follow the pin and prefetch configuration
  // regardless of the value of cache_index_and_filter_blocks.
972 if (prefetch_all) {
973 rep_->index_reader->CacheDependencies(pin_all);
974 }
975
976 // prefetch the first level of filter
977 const bool prefetch_filter =
978 prefetch_all ||
979 (table_options.pin_top_level_index_and_filter &&
980 rep_->filter_type == Rep::FilterType::kPartitionedFilter);
  // Partitioned filters cannot be enabled without a partitioned index
982 assert(!prefetch_filter || prefetch_index);
983 // pin the first level of filter
984 const bool pin_filter =
985 pin_all || (table_options.pin_top_level_index_and_filter &&
986 rep_->filter_type == Rep::FilterType::kPartitionedFilter);
987
988 if (rep_->filter_policy) {
989 auto filter = new_table->CreateFilterBlockReader(
990 prefetch_buffer, use_cache, prefetch_filter, pin_filter,
991 lookup_context);
992 if (filter) {
      // Refer to the comment above about partitioned indexes always being
      // cached
994 if (prefetch_all) {
995 filter->CacheDependencies(pin_all);
996 }
997
998 rep_->filter = std::move(filter);
999 }
1000 }
1001
1002 if (!rep_->compression_dict_handle.IsNull()) {
1003 std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
1004 s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache,
1005 prefetch_all, pin_all, lookup_context,
1006 &uncompression_dict_reader);
1007 if (!s.ok()) {
1008 return s;
1009 }
1010
1011 rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
1012 }
1013
1014 assert(s.ok());
1015 return s;
1016 }
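// A sketch of the policy above: prefetch_all (set for level-0 tables or when
// prefetch_index_and_filter_in_cache is true) warms the cache with the index
// and filter, including their partitions; pin_l0_filter_and_index_blocks_in_cache
// additionally pins them for level-0 tables; and with
// pin_top_level_index_and_filter only the top level of a partitioned
// index/filter is prefetched and pinned.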
1017
void BlockBasedTable::SetupForCompaction() {
1019 switch (rep_->ioptions.access_hint_on_compaction_start) {
1020 case Options::NONE:
1021 break;
1022 case Options::NORMAL:
1023 rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
1024 break;
1025 case Options::SEQUENTIAL:
1026 rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
1027 break;
1028 case Options::WILLNEED:
1029 rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
1030 break;
1031 default:
1032 assert(false);
1033 }
1034 }
1035
std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
    const {
1038 return rep_->table_properties;
1039 }
1040
size_t BlockBasedTable::ApproximateMemoryUsage() const {
1042 size_t usage = 0;
1043 if (rep_->filter) {
1044 usage += rep_->filter->ApproximateMemoryUsage();
1045 }
1046 if (rep_->index_reader) {
1047 usage += rep_->index_reader->ApproximateMemoryUsage();
1048 }
1049 if (rep_->uncompression_dict_reader) {
1050 usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
1051 }
1052 return usage;
1053 }
1054
// Load the metaindex block from the file. On success, return the loaded
// metaindex block and its iterator.
Status BlockBasedTable::ReadMetaIndexBlock(
    FilePrefetchBuffer* prefetch_buffer,
    std::unique_ptr<Block>* metaindex_block,
    std::unique_ptr<InternalIterator>* iter) {
1062 // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
1063 // it is an empty block.
1064 std::unique_ptr<Block> metaindex;
1065 Status s = ReadBlockFromFile(
1066 rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
1067 rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
1068 true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
1069 UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
1070 0 /* read_amp_bytes_per_bit */, GetMemoryAllocator(rep_->table_options),
1071 false /* for_compaction */, rep_->blocks_definitely_zstd_compressed,
1072 nullptr /* filter_policy */);
1073
1074 if (!s.ok()) {
    ROCKS_LOG_ERROR(rep_->ioptions.info_log,
                    "Encountered error while reading data from the metaindex"
                    " block %s",
                    s.ToString().c_str());
1079 return s;
1080 }
1081
1082 *metaindex_block = std::move(metaindex);
1083 // meta block uses bytewise comparator.
1084 iter->reset(metaindex_block->get()->NewDataIterator(
1085 BytewiseComparator(), BytewiseComparator(),
1086 kDisableGlobalSequenceNumber));
1087 return Status::OK();
1088 }
1089
template <typename TBlocklike>
Status BlockBasedTable::GetDataBlockFromCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
    const UncompressionDict& uncompression_dict, BlockType block_type,
    GetContext* get_context) const {
1097 const size_t read_amp_bytes_per_bit =
1098 block_type == BlockType::kData
1099 ? rep_->table_options.read_amp_bytes_per_bit
1100 : 0;
1101 assert(block);
1102 assert(block->IsEmpty());
1103
1104 Status s;
1105 BlockContents* compressed_block = nullptr;
1106 Cache::Handle* block_cache_compressed_handle = nullptr;
1107
1108 // Lookup uncompressed cache first
1109 if (block_cache != nullptr) {
1110 auto cache_handle = GetEntryFromCache(block_cache, block_cache_key,
1111 block_type, get_context);
1112 if (cache_handle != nullptr) {
1113 block->SetCachedValue(
1114 reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)),
1115 block_cache, cache_handle);
1116 return s;
1117 }
1118 }
1119
1120 // If not found, search from the compressed block cache.
1121 assert(block->IsEmpty());
1122
1123 if (block_cache_compressed == nullptr) {
1124 return s;
1125 }
1126
1127 assert(!compressed_block_cache_key.empty());
1128 block_cache_compressed_handle =
1129 block_cache_compressed->Lookup(compressed_block_cache_key);
1130
1131 Statistics* statistics = rep_->ioptions.statistics;
1132
  // If we find the block in the compressed cache, uncompress it and insert
  // it into the uncompressed cache.
1135 if (block_cache_compressed_handle == nullptr) {
1136 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
1137 return s;
1138 }
1139
1140 // found compressed block
1141 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
1142 compressed_block = reinterpret_cast<BlockContents*>(
1143 block_cache_compressed->Value(block_cache_compressed_handle));
1144 CompressionType compression_type = compressed_block->get_compression_type();
1145 assert(compression_type != kNoCompression);
1146
1147 // Retrieve the uncompressed contents into a new buffer
1148 BlockContents contents;
1149 UncompressionContext context(compression_type);
1150 UncompressionInfo info(context, uncompression_dict, compression_type);
1151 s = UncompressBlockContents(
1152 info, compressed_block->data.data(), compressed_block->data.size(),
1153 &contents, rep_->table_options.format_version, rep_->ioptions,
1154 GetMemoryAllocator(rep_->table_options));
1155
1156 // Insert uncompressed block into block cache
1157 if (s.ok()) {
1158 std::unique_ptr<TBlocklike> block_holder(
1159 BlocklikeTraits<TBlocklike>::Create(
1160 std::move(contents), read_amp_bytes_per_bit, statistics,
1161 rep_->blocks_definitely_zstd_compressed,
1162 rep_->table_options.filter_policy.get())); // uncompressed block
1163
1164 if (block_cache != nullptr && block_holder->own_bytes() &&
1165 read_options.fill_cache) {
1166 size_t charge = block_holder->ApproximateMemoryUsage();
1167 Cache::Handle* cache_handle = nullptr;
1168 s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
1169 SimpleDeleter<TBlocklike>::GetInstance(),
1170 &cache_handle);
1171 if (s.ok()) {
1172 assert(cache_handle != nullptr);
1173 block->SetCachedValue(block_holder.release(), block_cache,
1174 cache_handle);
1175
1176 UpdateCacheInsertionMetrics(block_type, get_context, charge);
1177 } else {
1178 RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1179 }
1180 } else {
1181 block->SetOwnedValue(block_holder.release());
1182 }
1183 }
1184
1185 // Release hold on compressed cache entry
1186 block_cache_compressed->Release(block_cache_compressed_handle);
1187 return s;
1188 }
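// Lookup order used above: the uncompressed block cache is consulted first;
// on a miss, the compressed block cache is consulted. A compressed-cache hit
// is uncompressed and, if read_options.fill_cache is set, the resulting
// object is also inserted into the uncompressed cache.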
1189
template <typename TBlocklike>
Status BlockBasedTable::PutDataBlockToCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    CachableEntry<TBlocklike>* cached_block, BlockContents* raw_block_contents,
    CompressionType raw_block_comp_type,
    const UncompressionDict& uncompression_dict,
    MemoryAllocator* memory_allocator, BlockType block_type,
    GetContext* get_context) const {
1199 const ImmutableCFOptions& ioptions = rep_->ioptions;
1200 const uint32_t format_version = rep_->table_options.format_version;
1201 const size_t read_amp_bytes_per_bit =
1202 block_type == BlockType::kData
1203 ? rep_->table_options.read_amp_bytes_per_bit
1204 : 0;
1205 const Cache::Priority priority =
1206 rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
1207 (block_type == BlockType::kFilter ||
1208 block_type == BlockType::kCompressionDictionary ||
1209 block_type == BlockType::kIndex)
1210 ? Cache::Priority::HIGH
1211 : Cache::Priority::LOW;
1212 assert(cached_block);
1213 assert(cached_block->IsEmpty());
1214
1215 Status s;
1216 Statistics* statistics = ioptions.statistics;
1217
1218 std::unique_ptr<TBlocklike> block_holder;
1219 if (raw_block_comp_type != kNoCompression) {
1220 // Retrieve the uncompressed contents into a new buffer
1221 BlockContents uncompressed_block_contents;
1222 UncompressionContext context(raw_block_comp_type);
1223 UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
1224 s = UncompressBlockContents(info, raw_block_contents->data.data(),
1225 raw_block_contents->data.size(),
1226 &uncompressed_block_contents, format_version,
1227 ioptions, memory_allocator);
1228 if (!s.ok()) {
1229 return s;
1230 }
1231
1232 block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1233 std::move(uncompressed_block_contents), read_amp_bytes_per_bit,
1234 statistics, rep_->blocks_definitely_zstd_compressed,
1235 rep_->table_options.filter_policy.get()));
1236 } else {
1237 block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1238 std::move(*raw_block_contents), read_amp_bytes_per_bit, statistics,
1239 rep_->blocks_definitely_zstd_compressed,
1240 rep_->table_options.filter_policy.get()));
1241 }
1242
1243 // Insert compressed block into compressed block cache.
1244 // Release the hold on the compressed cache entry immediately.
1245 if (block_cache_compressed != nullptr &&
1246 raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
1247 raw_block_contents->own_bytes()) {
1248 #ifndef NDEBUG
1249 assert(raw_block_contents->is_raw_block);
1250 #endif // NDEBUG
1251
    // We cannot directly put raw_block_contents because this could point to
    // an object on the stack.
1254 BlockContents* block_cont_for_comp_cache =
1255 new BlockContents(std::move(*raw_block_contents));
1256 s = block_cache_compressed->Insert(
1257 compressed_block_cache_key, block_cont_for_comp_cache,
1258 block_cont_for_comp_cache->ApproximateMemoryUsage(),
1259 SimpleDeleter<BlockContents>::GetInstance());
1260 if (s.ok()) {
1261 // Avoid the following code to delete this cached block.
1262 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
1263 } else {
1264 RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
1265 delete block_cont_for_comp_cache;
1266 }
1267 }
1268
1269 // insert into uncompressed block cache
1270 if (block_cache != nullptr && block_holder->own_bytes()) {
1271 size_t charge = block_holder->ApproximateMemoryUsage();
1272 Cache::Handle* cache_handle = nullptr;
1273 s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
1274 SimpleDeleter<TBlocklike>::GetInstance(),
1275 &cache_handle, priority);
1276 if (s.ok()) {
1277 assert(cache_handle != nullptr);
1278 cached_block->SetCachedValue(block_holder.release(), block_cache,
1279 cache_handle);
1280
1281 UpdateCacheInsertionMetrics(block_type, get_context, charge);
1282 } else {
1283 RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1284 }
1285 } else {
1286 cached_block->SetOwnedValue(block_holder.release());
1287 }
1288
1289 return s;
1290 }
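// Note on the insertion above: the parsed, uncompressed object goes into
// block_cache under block_cache_key, while the still-compressed raw contents
// (when applicable) go into block_cache_compressed under
// compressed_block_cache_key. Only blocks that own their bytes are inserted;
// otherwise the object is returned as an owned, uncached value via
// SetOwnedValue().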
1291
std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
    bool pin, BlockCacheLookupContext* lookup_context) {
1295 auto& rep = rep_;
1296 auto filter_type = rep->filter_type;
1297 if (filter_type == Rep::FilterType::kNoFilter) {
1298 return std::unique_ptr<FilterBlockReader>();
1299 }
1300
1301 assert(rep->filter_policy);
1302
1303 switch (filter_type) {
1304 case Rep::FilterType::kPartitionedFilter:
1305 return PartitionedFilterBlockReader::Create(
1306 this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1307
1308 case Rep::FilterType::kBlockFilter:
1309 return BlockBasedFilterBlockReader::Create(
1310 this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1311
1312 case Rep::FilterType::kFullFilter:
1313 return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache,
1314 prefetch, pin, lookup_context);
1315
1316 default:
1317 // filter_type is either kNoFilter (exited the function at the first if),
1318 // or it must be covered in this switch block
1319 assert(false);
1320 return std::unique_ptr<FilterBlockReader>();
1321 }
1322 }
1323
// disable_prefix_seek should be set to true when the prefix_extractor found
// in the SST file differs from the one in mutable_cf_options and the index
// type is HashBasedIndex.
InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
    const ReadOptions& read_options, bool disable_prefix_seek,
    IndexBlockIter* input_iter, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) const {
1330 assert(rep_ != nullptr);
1331 assert(rep_->index_reader != nullptr);
1332
1333 // We don't return pinned data from index blocks, so no need
1334 // to set `block_contents_pinned`.
1335 return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
1336 input_iter, get_context,
1337 lookup_context);
1338 }
1339
1340
template <>
DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
    const Rep* rep, Block* block, BlockType block_type,
    DataBlockIter* input_iter, bool block_contents_pinned) {
1345 return block->NewDataIterator(
1346 &rep->internal_comparator, rep->internal_comparator.user_comparator(),
1347 rep->get_global_seqno(block_type), input_iter, rep->ioptions.statistics,
1348 block_contents_pinned);
1349 }
1350
template <>
IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
    const Rep* rep, Block* block, BlockType block_type,
    IndexBlockIter* input_iter, bool block_contents_pinned) {
1355 return block->NewIndexIterator(
1356 &rep->internal_comparator, rep->internal_comparator.user_comparator(),
1357 rep->get_global_seqno(block_type), input_iter, rep->ioptions.statistics,
1358 /* total_order_seek */ true, rep->index_has_first_key,
1359 rep->index_key_includes_seq, rep->index_value_is_full,
1360 block_contents_pinned);
1361 }
1362
1363
// If contents is nullptr, this function looks up the block caches for the
// data block referenced by handle, and reads the block from disk if
// necessary. If contents is non-null, it skips the cache lookup and disk
// read, since the caller has already read it. In both cases, if
// ro.fill_cache is true, it inserts the block into the block cache.
template <typename TBlocklike>
Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
    CachableEntry<TBlocklike>* block_entry, BlockType block_type,
    GetContext* get_context, BlockCacheLookupContext* lookup_context,
    BlockContents* contents) const {
1376 assert(block_entry != nullptr);
1377 const bool no_io = (ro.read_tier == kBlockCacheTier);
1378 Cache* block_cache = rep_->table_options.block_cache.get();
  // No point in caching compressed blocks if the table never goes away
1380 Cache* block_cache_compressed =
1381 rep_->immortal_table ? nullptr
1382 : rep_->table_options.block_cache_compressed.get();
1383
1384 // First, try to get the block from the cache
1385 //
1386 // If either block cache is enabled, we'll try to read from it.
1387 Status s;
1388 char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
1389 char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
1390 Slice key /* key to the block cache */;
1391 Slice ckey /* key to the compressed block cache */;
1392 bool is_cache_hit = false;
1393 if (block_cache != nullptr || block_cache_compressed != nullptr) {
1394 // create key for block cache
1395 if (block_cache != nullptr) {
1396 key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
1397 handle, cache_key);
1398 }
1399
1400 if (block_cache_compressed != nullptr) {
1401 ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
1402 rep_->compressed_cache_key_prefix_size, handle,
1403 compressed_cache_key);
1404 }
1405
1406 if (!contents) {
1407 s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
1408 ro, block_entry, uncompression_dict, block_type,
1409 get_context);
1410 if (block_entry->GetValue()) {
1411 // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
1412 // compressed block cache.
1413 is_cache_hit = true;
1414 }
1415 }
1416
1417     // Couldn't find the block in the cache. If I/O is allowed, read it from
1418     // the file.
1419 if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
1420 Statistics* statistics = rep_->ioptions.statistics;
1421 const bool maybe_compressed =
1422 block_type != BlockType::kFilter &&
1423 block_type != BlockType::kCompressionDictionary &&
1424 rep_->blocks_maybe_compressed;
1425 const bool do_uncompress = maybe_compressed && !block_cache_compressed;
1426 CompressionType raw_block_comp_type;
1427 BlockContents raw_block_contents;
1428 if (!contents) {
1429 StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
1430 BlockFetcher block_fetcher(
1431 rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
1432 &raw_block_contents, rep_->ioptions, do_uncompress,
1433 maybe_compressed, block_type, uncompression_dict,
1434 rep_->persistent_cache_options,
1435 GetMemoryAllocator(rep_->table_options),
1436 GetMemoryAllocatorForCompressedBlock(rep_->table_options));
1437 s = block_fetcher.ReadBlockContents();
1438 raw_block_comp_type = block_fetcher.get_compression_type();
1439 contents = &raw_block_contents;
1440 } else {
1441 raw_block_comp_type = contents->get_compression_type();
1442 }
1443
1444 if (s.ok()) {
1445         // If filling cache is allowed and a cache is configured, try to put
1446         // the block into the cache.
1447 s = PutDataBlockToCache(
1448 key, ckey, block_cache, block_cache_compressed, block_entry,
1449 contents, raw_block_comp_type, uncompression_dict,
1450 GetMemoryAllocator(rep_->table_options), block_type, get_context);
1451 }
1452 }
1453 }
1454
1455 // Fill lookup_context.
1456 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
1457 lookup_context) {
1458 size_t usage = 0;
1459 uint64_t nkeys = 0;
1460 if (block_entry->GetValue()) {
1461 // Approximate the number of keys in the block using restarts.
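      // For example (illustrative arithmetic): with the default
      // block_restart_interval of 16 and a block containing 8 restart points,
      // nkeys is approximated as 16 * 8 = 128.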
1462 nkeys =
1463 rep_->table_options.block_restart_interval *
1464 BlocklikeTraits<TBlocklike>::GetNumRestarts(*block_entry->GetValue());
1465 usage = block_entry->GetValue()->ApproximateMemoryUsage();
1466 }
1467 TraceType trace_block_type = TraceType::kTraceMax;
1468 switch (block_type) {
1469 case BlockType::kData:
1470 trace_block_type = TraceType::kBlockTraceDataBlock;
1471 break;
1472 case BlockType::kFilter:
1473 trace_block_type = TraceType::kBlockTraceFilterBlock;
1474 break;
1475 case BlockType::kCompressionDictionary:
1476 trace_block_type = TraceType::kBlockTraceUncompressionDictBlock;
1477 break;
1478 case BlockType::kRangeDeletion:
1479 trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
1480 break;
1481 case BlockType::kIndex:
1482 trace_block_type = TraceType::kBlockTraceIndexBlock;
1483 break;
1484 default:
1485 // This cannot happen.
1486 assert(false);
1487 break;
1488 }
1489 bool no_insert = no_io || !ro.fill_cache;
1490 if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
1491 trace_block_type, lookup_context->caller)) {
1492 // Defer logging the access to Get() and MultiGet() to trace additional
1493 // information, e.g., referenced_key_exist_in_block.
1494
1495 // Make a copy of the block key here since it will be logged later.
1496 lookup_context->FillLookupContext(
1497 is_cache_hit, no_insert, trace_block_type,
1498 /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
1499 } else {
1500       // Avoid making copies of block_key and cf_name when constructing the
1501       // access record.
1502 BlockCacheTraceRecord access_record(
1503 rep_->ioptions.env->NowMicros(),
1504 /*block_key=*/"", trace_block_type,
1505 /*block_size=*/usage, rep_->cf_id_for_tracing(),
1506 /*cf_name=*/"", rep_->level_for_tracing(),
1507 rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
1508 no_insert, lookup_context->get_id,
1509 lookup_context->get_from_user_specified_snapshot,
1510 /*referenced_key=*/"");
1511 block_cache_tracer_->WriteBlockAccess(access_record, key,
1512 rep_->cf_name_for_tracing(),
1513 lookup_context->referenced_key);
1514 }
1515 }
1516
1517 assert(s.ok() || block_entry->GetValue() == nullptr);
1518 return s;
1519 }
1520
1521 // This function reads multiple data blocks from disk using Env::MultiRead()
1522 // and optionally inserts them into the block cache. It uses the scratch
1523 // buffer provided by the caller, which is contiguous. If scratch is a nullptr
1524 // it allocates a separate buffer for each block. Typically, if the blocks
1525 // need to be uncompressed and there is no compressed block cache, callers
1526 // can allocate a temporary scratch buffer in order to minimize memory
1527 // allocations.
1528 // If options.fill_cache is true, it inserts the blocks into cache. If it is
1529 // false and scratch is non-null and the blocks are uncompressed, it copies
1530 // the buffers to heap. In any case, the CachableEntry<Block> returned will
1531 // own the data bytes.
1532 // If compression is enabled and also there is no compressed block cache,
1533 // the adjacent blocks are read out in one IO (combined read)
1534 // batch - A MultiGetRange with only those keys with unique data blocks not
1535 // found in cache
1536 // handles - A vector of block handles. Some of them may be null handles
1537 // scratch - An optional contiguous buffer to read compressed blocks into
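// Illustrative sketch of the combined read (offsets assumed for the example):
// three cache-missed blocks laid out back to back in the file, each with a
// 4096-byte data portion followed by a 5-byte trailer, sit at
//   handle0: offset 0, handle1: offset 4101, handle2: offset 8202.
// With scratch != nullptr (compression enabled, no compressed block cache),
// the loop below coalesces them into a single FSReadRequest, and
// req_offset_for_block records each block's offset inside that request.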
1538 void BlockBasedTable::RetrieveMultipleBlocks(
1539 const ReadOptions& options, const MultiGetRange* batch,
1540 const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
1541 autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
1542 autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results,
1543 char* scratch, const UncompressionDict& uncompression_dict) const {
1544 RandomAccessFileReader* file = rep_->file.get();
1545 const Footer& footer = rep_->footer;
1546 const ImmutableCFOptions& ioptions = rep_->ioptions;
1547 size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
1548 MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
1549
1550 if (file->use_direct_io() || ioptions.allow_mmap_reads) {
1551 size_t idx_in_batch = 0;
1552 for (auto mget_iter = batch->begin(); mget_iter != batch->end();
1553 ++mget_iter, ++idx_in_batch) {
1554 BlockCacheLookupContext lookup_data_block_context(
1555 TableReaderCaller::kUserMultiGet);
1556 const BlockHandle& handle = (*handles)[idx_in_batch];
1557 if (handle.IsNull()) {
1558 continue;
1559 }
1560
1561 (*statuses)[idx_in_batch] =
1562 RetrieveBlock(nullptr, options, handle, uncompression_dict,
1563 &(*results)[idx_in_batch], BlockType::kData,
1564 mget_iter->get_context, &lookup_data_block_context,
1565 /* for_compaction */ false, /* use_cache */ true);
1566 }
1567 return;
1568 }
1569
1570 autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs;
1571 size_t buf_offset = 0;
1572 size_t idx_in_batch = 0;
1573
1574 uint64_t prev_offset = 0;
1575 size_t prev_len = 0;
1576 autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block;
1577 autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block;
1578 for (auto mget_iter = batch->begin(); mget_iter != batch->end();
1579 ++mget_iter, ++idx_in_batch) {
1580 const BlockHandle& handle = (*handles)[idx_in_batch];
1581 if (handle.IsNull()) {
1582 continue;
1583 }
1584
1585 size_t prev_end = static_cast<size_t>(prev_offset) + prev_len;
1586
1587 // If current block is adjacent to the previous one, at the same time,
1588 // compression is enabled and there is no compressed cache, we combine
1589 // the two block read as one.
1590 if (scratch != nullptr && prev_end == handle.offset()) {
1591 req_offset_for_block.emplace_back(prev_len);
1592 prev_len += block_size(handle);
1593 } else {
1594       // No compression, or the current and previous blocks are not adjacent:
1595 // Step 1, create a new request for previous blocks
1596 if (prev_len != 0) {
1597 FSReadRequest req;
1598 req.offset = prev_offset;
1599 req.len = prev_len;
1600 if (scratch == nullptr) {
1601 req.scratch = new char[req.len];
1602 } else {
1603 req.scratch = scratch + buf_offset;
1604 buf_offset += req.len;
1605 }
1606 read_reqs.emplace_back(req);
1607 }
1608
1609       // Step 2, remember the previous block info
1610 prev_offset = handle.offset();
1611 prev_len = block_size(handle);
1612 req_offset_for_block.emplace_back(0);
1613 }
1614 req_idx_for_block.emplace_back(read_reqs.size());
1615 }
1616 // Handle the last block and process the pending last request
1617 if (prev_len != 0) {
1618 FSReadRequest req;
1619 req.offset = prev_offset;
1620 req.len = prev_len;
1621 if (scratch == nullptr) {
1622 req.scratch = new char[req.len];
1623 } else {
1624 req.scratch = scratch + buf_offset;
1625 }
1626 read_reqs.emplace_back(req);
1627 }
1628
1629 file->MultiRead(&read_reqs[0], read_reqs.size(), nullptr);
1630
1631 idx_in_batch = 0;
1632 size_t valid_batch_idx = 0;
1633 for (auto mget_iter = batch->begin(); mget_iter != batch->end();
1634 ++mget_iter, ++idx_in_batch) {
1635 const BlockHandle& handle = (*handles)[idx_in_batch];
1636
1637 if (handle.IsNull()) {
1638 continue;
1639 }
1640
1641 assert(valid_batch_idx < req_idx_for_block.size());
1642 assert(valid_batch_idx < req_offset_for_block.size());
1643 assert(req_idx_for_block[valid_batch_idx] < read_reqs.size());
1644 size_t& req_idx = req_idx_for_block[valid_batch_idx];
1645 size_t& req_offset = req_offset_for_block[valid_batch_idx];
1646 valid_batch_idx++;
1647 FSReadRequest& req = read_reqs[req_idx];
1648 Status s = req.status;
1649 if (s.ok()) {
1650 if (req.result.size() != req.len) {
1651 s = Status::Corruption(
1652 "truncated block read from " + rep_->file->file_name() +
1653 " offset " + ToString(handle.offset()) + ", expected " +
1654 ToString(req.len) + " bytes, got " + ToString(req.result.size()));
1655 }
1656 }
1657
1658 BlockContents raw_block_contents;
1659 size_t cur_read_end = req_offset + block_size(handle);
1660 if (cur_read_end > req.result.size()) {
1661 s = Status::Corruption(
1662 "truncated block read from " + rep_->file->file_name() + " offset " +
1663 ToString(handle.offset()) + ", expected " + ToString(req.len) +
1664 " bytes, got " + ToString(req.result.size()));
1665 }
1666
1667 bool blocks_share_read_buffer = (req.result.size() != block_size(handle));
1668 if (s.ok()) {
1669 if (scratch == nullptr && !blocks_share_read_buffer) {
1670 // We allocated a buffer for this block. Give ownership of it to
1671 // BlockContents so it can free the memory
1672 assert(req.result.data() == req.scratch);
1673 std::unique_ptr<char[]> raw_block(req.scratch + req_offset);
1674 raw_block_contents = BlockContents(std::move(raw_block), handle.size());
1675 } else {
1676       // We used the scratch buffer, which is shared by the blocks.
1677       // raw_block_contents does not take ownership of it.
1678 raw_block_contents =
1679 BlockContents(Slice(req.scratch + req_offset, handle.size()));
1680 }
1681
1682 #ifndef NDEBUG
1683 raw_block_contents.is_raw_block = true;
1684 #endif
1685 if (options.verify_checksums) {
1686 PERF_TIMER_GUARD(block_checksum_time);
1687 const char* data = req.result.data();
1688 uint32_t expected =
1689 DecodeFixed32(data + req_offset + handle.size() + 1);
1690       // Since the scratch buffer might be shared, the offset of the data
1691       // block in the buffer might not be 0. req.result.data() only points
1692       // to the start of each read request, so we must add the block's
1693       // offset within the request. The checksum sits in the block trailer,
1694       // right after the compression type byte, i.e. at offset handle.size() + 1.
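      // Illustrative layout of one block inside the shared request buffer
      // (offsets relative to req_offset):
      //   [0, handle.size())                      block data
      //   [handle.size()]                         compression type byte
      //   [handle.size() + 1, handle.size() + 5)  32-bit checksum
      // so the verification below covers handle.size() + 1 bytes (data + type).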
1695 s = ROCKSDB_NAMESPACE::VerifyChecksum(footer.checksum(),
1696 data + req_offset,
1697 handle.size() + 1, expected);
1698 TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
1699 }
1700 }
1701
1702 if (s.ok()) {
1703       // It handles a rare case: compression is set and there is no compressed
1704       // cache (combined read is enabled). In this case, scratch != nullptr.
1705       // At the same time, some blocks are actually not compressed,
1706       // since their compression space saving is smaller than the threshold. In
1707       // this case, if the block shares the scratch memory, we need to copy it
1708       // to the heap so that it can be added to the regular block cache.
1709 CompressionType compression_type =
1710 raw_block_contents.get_compression_type();
1711 if (scratch != nullptr && compression_type == kNoCompression) {
1712 Slice raw = Slice(req.scratch + req_offset, block_size(handle));
1713 raw_block_contents = BlockContents(
1714 CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw),
1715 handle.size());
1716 #ifndef NDEBUG
1717 raw_block_contents.is_raw_block = true;
1718 #endif
1719 }
1720 }
1721
1722 if (s.ok()) {
1723 if (options.fill_cache) {
1724 BlockCacheLookupContext lookup_data_block_context(
1725 TableReaderCaller::kUserMultiGet);
1726 CachableEntry<Block>* block_entry = &(*results)[idx_in_batch];
1727 // MaybeReadBlockAndLoadToCache will insert into the block caches if
1728 // necessary. Since we're passing the raw block contents, it will
1729 // avoid looking up the block cache
1730 s = MaybeReadBlockAndLoadToCache(
1731 nullptr, options, handle, uncompression_dict, block_entry,
1732 BlockType::kData, mget_iter->get_context,
1733 &lookup_data_block_context, &raw_block_contents);
1734
1735 // block_entry value could be null if no block cache is present, i.e
1736 // BlockBasedTableOptions::no_block_cache is true and no compressed
1737 // block cache is configured. In that case, fall
1738 // through and set up the block explicitly
1739 if (block_entry->GetValue() != nullptr) {
1740 continue;
1741 }
1742 }
1743
1744 CompressionType compression_type =
1745 raw_block_contents.get_compression_type();
1746 BlockContents contents;
1747 if (compression_type != kNoCompression) {
1748 UncompressionContext context(compression_type);
1749 UncompressionInfo info(context, uncompression_dict, compression_type);
1750 s = UncompressBlockContents(info, req.result.data() + req_offset,
1751 handle.size(), &contents, footer.version(),
1752 rep_->ioptions, memory_allocator);
1753 } else {
1754         // There are two cases here: 1) caller uses the scratch buffer; 2) we
1755         // use the request buffer. If the scratch buffer is used, we ensure
1756         // that all raw blocks are copied to the heap as single blocks. If the
1757         // scratch buffer is not used, there is also no combined read, so the
1758         // raw block can be used directly.
1759 contents = std::move(raw_block_contents);
1760 }
1761 if (s.ok()) {
1762 (*results)[idx_in_batch].SetOwnedValue(new Block(
1763 std::move(contents), read_amp_bytes_per_bit, ioptions.statistics));
1764 }
1765 }
1766 (*statuses)[idx_in_batch] = s;
1767 }
1768 }
1769
1770 template <typename TBlocklike>
1771 Status BlockBasedTable::RetrieveBlock(
1772 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1773 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1774 CachableEntry<TBlocklike>* block_entry, BlockType block_type,
1775 GetContext* get_context, BlockCacheLookupContext* lookup_context,
1776 bool for_compaction, bool use_cache) const {
1777 assert(block_entry);
1778 assert(block_entry->IsEmpty());
1779
1780 Status s;
1781 if (use_cache) {
1782 s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
1783 uncompression_dict, block_entry,
1784 block_type, get_context, lookup_context,
1785 /*contents=*/nullptr);
1786
1787 if (!s.ok()) {
1788 return s;
1789 }
1790
1791 if (block_entry->GetValue() != nullptr) {
1792 assert(s.ok());
1793 return s;
1794 }
1795 }
1796
1797 assert(block_entry->IsEmpty());
1798
1799 const bool no_io = ro.read_tier == kBlockCacheTier;
1800 if (no_io) {
1801 return Status::Incomplete("no blocking io");
1802 }
1803
1804 const bool maybe_compressed =
1805 block_type != BlockType::kFilter &&
1806 block_type != BlockType::kCompressionDictionary &&
1807 rep_->blocks_maybe_compressed;
1808 const bool do_uncompress = maybe_compressed;
1809 std::unique_ptr<TBlocklike> block;
1810
1811 {
1812 StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
1813 READ_BLOCK_GET_MICROS);
1814 s = ReadBlockFromFile(
1815 rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
1816 rep_->ioptions, do_uncompress, maybe_compressed, block_type,
1817 uncompression_dict, rep_->persistent_cache_options,
1818 block_type == BlockType::kData
1819 ? rep_->table_options.read_amp_bytes_per_bit
1820 : 0,
1821 GetMemoryAllocator(rep_->table_options), for_compaction,
1822 rep_->blocks_definitely_zstd_compressed,
1823 rep_->table_options.filter_policy.get());
1824 }
1825
1826 if (!s.ok()) {
1827 return s;
1828 }
1829
1830 block_entry->SetOwnedValue(block.release());
1831
1832 assert(s.ok());
1833 return s;
1834 }
1835
1836 // Explicitly instantiate templates for both "blocklike" types we use.
1837 // This makes it possible to keep the template definitions in the .cc file.
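// (Intent, as a sketch: code in other translation units that only sees the
// declaration in the header can still call, e.g., RetrieveBlock<Block>(...);
// it links against the instantiation emitted here.)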
1838 template Status BlockBasedTable::RetrieveBlock<BlockContents>(
1839 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1840 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1841 CachableEntry<BlockContents>* block_entry, BlockType block_type,
1842 GetContext* get_context, BlockCacheLookupContext* lookup_context,
1843 bool for_compaction, bool use_cache) const;
1844
1845 template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>(
1846 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1847 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1848 CachableEntry<ParsedFullFilterBlock>* block_entry, BlockType block_type,
1849 GetContext* get_context, BlockCacheLookupContext* lookup_context,
1850 bool for_compaction, bool use_cache) const;
1851
1852 template Status BlockBasedTable::RetrieveBlock<Block>(
1853 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1854 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1855 CachableEntry<Block>* block_entry, BlockType block_type,
1856 GetContext* get_context, BlockCacheLookupContext* lookup_context,
1857 bool for_compaction, bool use_cache) const;
1858
1859 template Status BlockBasedTable::RetrieveBlock<UncompressionDict>(
1860 FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1861 const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1862 CachableEntry<UncompressionDict>* block_entry, BlockType block_type,
1863 GetContext* get_context, BlockCacheLookupContext* lookup_context,
1864 bool for_compaction, bool use_cache) const;
1865
1866 BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
1867 const BlockBasedTable* table,
1868 std::unordered_map<uint64_t, CachableEntry<Block>>* block_map)
1869 : table_(table), block_map_(block_map) {}
1870
1871 InternalIteratorBase<IndexValue>*
1872 BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
1873 const BlockHandle& handle) {
1874 // Return a block iterator on the index partition
1875 auto block = block_map_->find(handle.offset());
1876   // The partition may be missing from the map since the block cache might
1877   // not have had space for it; in that case return an empty iterator below.
1878 if (block != block_map_->end()) {
1879 const Rep* rep = table_->get_rep();
1880 assert(rep);
1881
1882 Statistics* kNullStats = nullptr;
1883 // We don't return pinned data from index blocks, so no need
1884 // to set `block_contents_pinned`.
1885 return block->second.GetValue()->NewIndexIterator(
1886 &rep->internal_comparator, rep->internal_comparator.user_comparator(),
1887 rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
1888 rep->index_has_first_key, rep->index_key_includes_seq,
1889 rep->index_value_is_full);
1890 }
1891 // Create an empty iterator
1892 return new IndexBlockIter();
1893 }
1894
1895 // This will be broken if the user specifies an unusual implementation
1896 // of Options.comparator, or if the user specifies an unusual
1897 // definition of prefixes in BlockBasedTableOptions.filter_policy.
1898 // In particular, we require the following three properties:
1899 //
1900 // 1) key.starts_with(prefix(key))
1901 // 2) Compare(prefix(key), key) <= 0.
1902 // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
1903 //
1904 // If read_options.read_tier == kBlockCacheTier, this method will do no I/O and
1905 // will return true if the filter block is not in memory and not found in block
1906 // cache.
1907 //
1908 // REQUIRES: this method shouldn't be called while the DB lock is held.
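// For example (illustrative, assuming the default bytewise comparator), a
// fixed-length prefix extractor with prefix(key) = key[0..4) satisfies all
// three properties: every key starts with its own 4-byte prefix, the prefix
// compares <= the full key, and taking prefixes preserves key ordering.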
1909 bool BlockBasedTable::PrefixMayMatch(
1910 const Slice& internal_key, const ReadOptions& read_options,
1911 const SliceTransform* options_prefix_extractor,
1912 const bool need_upper_bound_check,
1913 BlockCacheLookupContext* lookup_context) const {
1914 if (!rep_->filter_policy) {
1915 return true;
1916 }
1917
1918 const SliceTransform* prefix_extractor;
1919
1920 if (rep_->table_prefix_extractor == nullptr) {
1921 if (need_upper_bound_check) {
1922 return true;
1923 }
1924 prefix_extractor = options_prefix_extractor;
1925 } else {
1926 prefix_extractor = rep_->table_prefix_extractor.get();
1927 }
1928 auto user_key = ExtractUserKey(internal_key);
1929 if (!prefix_extractor->InDomain(user_key)) {
1930 return true;
1931 }
1932
1933 bool may_match = true;
1934 Status s;
1935
1936   // First, try checking with the full filter
1937 FilterBlockReader* const filter = rep_->filter.get();
1938 bool filter_checked = true;
1939 if (filter != nullptr) {
1940 const bool no_io = read_options.read_tier == kBlockCacheTier;
1941
1942 if (!filter->IsBlockBased()) {
1943 const Slice* const const_ikey_ptr = &internal_key;
1944 may_match = filter->RangeMayExist(
1945 read_options.iterate_upper_bound, user_key, prefix_extractor,
1946 rep_->internal_comparator.user_comparator(), const_ikey_ptr,
1947 &filter_checked, need_upper_bound_check, no_io, lookup_context);
1948 } else {
1949 // if prefix_extractor changed for block based filter, skip filter
1950 if (need_upper_bound_check) {
1951 return true;
1952 }
1953 auto prefix = prefix_extractor->Transform(user_key);
1954 InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
1955 auto internal_prefix = internal_key_prefix.Encode();
1956
1957 // To prevent any io operation in this method, we set `read_tier` to make
1958 // sure we always read index or filter only when they have already been
1959 // loaded to memory.
1960 ReadOptions no_io_read_options;
1961 no_io_read_options.read_tier = kBlockCacheTier;
1962
1963       // Then, try to find it within each block.
1964       // We already know prefix_extractor and prefix_extractor_name must match
1965       // because `CheckPrefixMayMatch` first checks `check_filter_ == true`
1966 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
1967 no_io_read_options,
1968 /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
1969 /*get_context=*/nullptr, lookup_context));
1970 iiter->Seek(internal_prefix);
1971
1972 if (!iiter->Valid()) {
1973 // we're past end of file
1974 // if it's incomplete, it means that we avoided I/O
1975 // and we're not really sure that we're past the end
1976 // of the file
1977 may_match = iiter->status().IsIncomplete();
1978 } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key())
1979 : iiter->key())
1980 .starts_with(ExtractUserKey(internal_prefix))) {
1981 // we need to check for this subtle case because our only
1982 // guarantee is that "the key is a string >= last key in that data
1983 // block" according to the doc/table_format.txt spec.
1984 //
1985 // Suppose iiter->key() starts with the desired prefix; it is not
1986 // necessarily the case that the corresponding data block will
1987 // contain the prefix, since iiter->key() need not be in the
1988 // block. However, the next data block may contain the prefix, so
1989 // we return true to play it safe.
1990 may_match = true;
1991 } else if (filter->IsBlockBased()) {
1992 // iiter->key() does NOT start with the desired prefix. Because
1993 // Seek() finds the first key that is >= the seek target, this
1994 // means that iiter->key() > prefix. Thus, any data blocks coming
1995 // after the data block corresponding to iiter->key() cannot
1996 // possibly contain the key. Thus, the corresponding data block
1997         // is the only one that could potentially contain the prefix.
1998 BlockHandle handle = iiter->value().handle;
1999 may_match = filter->PrefixMayMatch(
2000 prefix, prefix_extractor, handle.offset(), no_io,
2001 /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context);
2002 }
2003 }
2004 }
2005
2006 if (filter_checked) {
2007 Statistics* statistics = rep_->ioptions.statistics;
2008 RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
2009 if (!may_match) {
2010 RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
2011 }
2012 }
2013
2014 return may_match;
2015 }
2016
2017
2018 InternalIterator* BlockBasedTable::NewIterator(
2019 const ReadOptions& read_options, const SliceTransform* prefix_extractor,
2020 Arena* arena, bool skip_filters, TableReaderCaller caller,
2021 size_t compaction_readahead_size) {
2022 BlockCacheLookupContext lookup_context{caller};
2023 bool need_upper_bound_check =
2024 read_options.auto_prefix_mode ||
2025 PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
2026 std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(NewIndexIterator(
2027 read_options,
2028 need_upper_bound_check &&
2029 rep_->index_type == BlockBasedTableOptions::kHashSearch,
2030 /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context));
2031 if (arena == nullptr) {
2032 return new BlockBasedTableIterator(
2033 this, read_options, rep_->internal_comparator, std::move(index_iter),
2034 !skip_filters && !read_options.total_order_seek &&
2035 prefix_extractor != nullptr,
2036 need_upper_bound_check, prefix_extractor, caller,
2037 compaction_readahead_size);
2038 } else {
2039 auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
2040 return new (mem) BlockBasedTableIterator(
2041 this, read_options, rep_->internal_comparator, std::move(index_iter),
2042 !skip_filters && !read_options.total_order_seek &&
2043 prefix_extractor != nullptr,
2044 need_upper_bound_check, prefix_extractor, caller,
2045 compaction_readahead_size);
2046 }
2047 }
2048
2049 FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
2050 const ReadOptions& read_options) {
2051 if (rep_->fragmented_range_dels == nullptr) {
2052 return nullptr;
2053 }
2054 SequenceNumber snapshot = kMaxSequenceNumber;
2055 if (read_options.snapshot != nullptr) {
2056 snapshot = read_options.snapshot->GetSequenceNumber();
2057 }
2058 return new FragmentedRangeTombstoneIterator(
2059 rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
2060 }
2061
2062 bool BlockBasedTable::FullFilterKeyMayMatch(
2063 const ReadOptions& read_options, FilterBlockReader* filter,
2064 const Slice& internal_key, const bool no_io,
2065 const SliceTransform* prefix_extractor, GetContext* get_context,
2066 BlockCacheLookupContext* lookup_context) const {
2067 if (filter == nullptr || filter->IsBlockBased()) {
2068 return true;
2069 }
2070 Slice user_key = ExtractUserKey(internal_key);
2071 const Slice* const const_ikey_ptr = &internal_key;
2072 bool may_match = true;
2073 if (rep_->whole_key_filtering) {
2074 size_t ts_sz =
2075 rep_->internal_comparator.user_comparator()->timestamp_size();
2076 Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
2077 may_match =
2078 filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
2079 no_io, const_ikey_ptr, get_context, lookup_context);
2080 } else if (!read_options.total_order_seek && prefix_extractor &&
2081 rep_->table_properties->prefix_extractor_name.compare(
2082 prefix_extractor->Name()) == 0 &&
2083 prefix_extractor->InDomain(user_key) &&
2084 !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
2085 prefix_extractor, kNotValid, no_io,
2086 const_ikey_ptr, get_context,
2087 lookup_context)) {
2088 may_match = false;
2089 }
2090 if (may_match) {
2091 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
2092 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
2093 }
2094 return may_match;
2095 }
2096
2097 void BlockBasedTable::FullFilterKeysMayMatch(
2098 const ReadOptions& read_options, FilterBlockReader* filter,
2099 MultiGetRange* range, const bool no_io,
2100 const SliceTransform* prefix_extractor,
2101 BlockCacheLookupContext* lookup_context) const {
2102 if (filter == nullptr || filter->IsBlockBased()) {
2103 return;
2104 }
2105 if (rep_->whole_key_filtering) {
2106 filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
2107 lookup_context);
2108 } else if (!read_options.total_order_seek && prefix_extractor &&
2109 rep_->table_properties->prefix_extractor_name.compare(
2110 prefix_extractor->Name()) == 0) {
2111 filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
2112 lookup_context);
2113 }
2114 }
2115
2116 Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
2117 GetContext* get_context,
2118 const SliceTransform* prefix_extractor,
2119 bool skip_filters) {
2120 assert(key.size() >= 8); // key must be internal key
2121 assert(get_context != nullptr);
2122 Status s;
2123 const bool no_io = read_options.read_tier == kBlockCacheTier;
2124
2125 FilterBlockReader* const filter =
2126 !skip_filters ? rep_->filter.get() : nullptr;
2127
2128   // First check the full filter.
2129   // If the full filter is not useful, then go into each block.
2130 uint64_t tracing_get_id = get_context->get_tracing_get_id();
2131 BlockCacheLookupContext lookup_context{
2132 TableReaderCaller::kUserGet, tracing_get_id,
2133 /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
2134 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
2135 // Trace the key since it contains both user key and sequence number.
2136 lookup_context.referenced_key = key.ToString();
2137 lookup_context.get_from_user_specified_snapshot =
2138 read_options.snapshot != nullptr;
2139 }
2140 const bool may_match =
2141 FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor,
2142 get_context, &lookup_context);
2143 if (!may_match) {
2144 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
2145 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
2146 } else {
2147 IndexBlockIter iiter_on_stack;
2148 // if prefix_extractor found in block differs from options, disable
2149 // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
2150 bool need_upper_bound_check = false;
2151 if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
2152 need_upper_bound_check = PrefixExtractorChanged(
2153 rep_->table_properties.get(), prefix_extractor);
2154 }
2155 auto iiter =
2156 NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
2157 get_context, &lookup_context);
2158 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2159 if (iiter != &iiter_on_stack) {
2160 iiter_unique_ptr.reset(iiter);
2161 }
2162
2163 size_t ts_sz =
2164 rep_->internal_comparator.user_comparator()->timestamp_size();
2165     bool matched = false;  // if such user key matched a key in SST
2166 bool done = false;
2167 for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
2168 IndexValue v = iiter->value();
2169
2170 bool not_exist_in_filter =
2171 filter != nullptr && filter->IsBlockBased() == true &&
2172 !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
2173 prefix_extractor, v.handle.offset(), no_io,
2174 /*const_ikey_ptr=*/nullptr, get_context,
2175 &lookup_context);
2176
2177 if (not_exist_in_filter) {
2178 // Not found
2179         // TODO: think about interaction with Merge. If a user key cannot
2180         // span more than one data block, we should be fine.
2181 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
2182 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
2183 break;
2184 }
2185
2186 if (!v.first_internal_key.empty() && !skip_filters &&
2187 UserComparatorWrapper(rep_->internal_comparator.user_comparator())
2188 .Compare(ExtractUserKey(key),
2189 ExtractUserKey(v.first_internal_key)) < 0) {
2190 // The requested key falls between highest key in previous block and
2191 // lowest key in current block.
2192 break;
2193 }
2194
2195 BlockCacheLookupContext lookup_data_block_context{
2196 TableReaderCaller::kUserGet, tracing_get_id,
2197 /*get_from_user_specified_snapshot=*/read_options.snapshot !=
2198 nullptr};
2199 bool does_referenced_key_exist = false;
2200 DataBlockIter biter;
2201 uint64_t referenced_data_size = 0;
2202 NewDataBlockIterator<DataBlockIter>(
2203 read_options, v.handle, &biter, BlockType::kData, get_context,
2204 &lookup_data_block_context,
2205 /*s=*/Status(), /*prefetch_buffer*/ nullptr);
2206
2207 if (no_io && biter.status().IsIncomplete()) {
2208 // couldn't get block from block_cache
2209 // Update Saver.state to Found because we are only looking for
2210 // whether we can guarantee the key is not there when "no_io" is set
2211 get_context->MarkKeyMayExist();
2212 break;
2213 }
2214 if (!biter.status().ok()) {
2215 s = biter.status();
2216 break;
2217 }
2218
2219 bool may_exist = biter.SeekForGet(key);
2220 // If user-specified timestamp is supported, we cannot end the search
2221 // just because hash index lookup indicates the key+ts does not exist.
2222 if (!may_exist && ts_sz == 0) {
2223         // HashSeek cannot find the key in this block, and the iter is not
2224         // at the end of the block, i.e. the key cannot be in the following
2225         // blocks either. In this case, the seek_key cannot be found, so we
2226         // break from the top level for-loop.
2227 done = true;
2228 } else {
2229 // Call the *saver function on each entry/block until it returns false
2230 for (; biter.Valid(); biter.Next()) {
2231 ParsedInternalKey parsed_key;
2232 if (!ParseInternalKey(biter.key(), &parsed_key)) {
2233 s = Status::Corruption(Slice());
2234 }
2235
2236 if (!get_context->SaveValue(
2237 parsed_key, biter.value(), &matched,
2238 biter.IsValuePinned() ? &biter : nullptr)) {
2239 if (get_context->State() == GetContext::GetState::kFound) {
2240 does_referenced_key_exist = true;
2241 referenced_data_size = biter.key().size() + biter.value().size();
2242 }
2243 done = true;
2244 break;
2245 }
2246 }
2247 s = biter.status();
2248 }
2249 // Write the block cache access record.
2250 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
2251         // Avoid making copies of block_key, cf_name, and referenced_key when
2252         // constructing the access record.
2253 Slice referenced_key;
2254 if (does_referenced_key_exist) {
2255 referenced_key = biter.key();
2256 } else {
2257 referenced_key = key;
2258 }
2259 BlockCacheTraceRecord access_record(
2260 rep_->ioptions.env->NowMicros(),
2261 /*block_key=*/"", lookup_data_block_context.block_type,
2262 lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
2263 /*cf_name=*/"", rep_->level_for_tracing(),
2264 rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
2265 lookup_data_block_context.is_cache_hit,
2266 lookup_data_block_context.no_insert,
2267 lookup_data_block_context.get_id,
2268 lookup_data_block_context.get_from_user_specified_snapshot,
2269 /*referenced_key=*/"", referenced_data_size,
2270 lookup_data_block_context.num_keys_in_block,
2271 does_referenced_key_exist);
2272 block_cache_tracer_->WriteBlockAccess(
2273 access_record, lookup_data_block_context.block_key,
2274 rep_->cf_name_for_tracing(), referenced_key);
2275 }
2276
2277 if (done) {
2278 // Avoid the extra Next which is expensive in two-level indexes
2279 break;
2280 }
2281 }
2282 if (matched && filter != nullptr && !filter->IsBlockBased()) {
2283 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
2284 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
2285 rep_->level);
2286 }
2287 if (s.ok() && !iiter->status().IsNotFound()) {
2288 s = iiter->status();
2289 }
2290 }
2291
2292 return s;
2293 }
2294
2295 using MultiGetRange = MultiGetContext::Range;
2296 void BlockBasedTable::MultiGet(const ReadOptions& read_options,
2297 const MultiGetRange* mget_range,
2298 const SliceTransform* prefix_extractor,
2299 bool skip_filters) {
2300 FilterBlockReader* const filter =
2301 !skip_filters ? rep_->filter.get() : nullptr;
2302 MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
2303 mget_range->end());
2304
2305   // First check the full filter.
2306   // If the full filter is not useful, then go into each block.
2307 const bool no_io = read_options.read_tier == kBlockCacheTier;
2308 uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
2309 if (!sst_file_range.empty() && sst_file_range.begin()->get_context) {
2310 tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id();
2311 }
2312 BlockCacheLookupContext lookup_context{
2313 TableReaderCaller::kUserMultiGet, tracing_mget_id,
2314 /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
2315 FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
2316 prefix_extractor, &lookup_context);
2317
2318 if (skip_filters || !sst_file_range.empty()) {
2319 IndexBlockIter iiter_on_stack;
2320 // if prefix_extractor found in block differs from options, disable
2321 // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
2322 bool need_upper_bound_check = false;
2323 if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
2324 need_upper_bound_check = PrefixExtractorChanged(
2325 rep_->table_properties.get(), prefix_extractor);
2326 }
2327 auto iiter =
2328 NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
2329 sst_file_range.begin()->get_context, &lookup_context);
2330 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2331 if (iiter != &iiter_on_stack) {
2332 iiter_unique_ptr.reset(iiter);
2333 }
2334
2335 uint64_t offset = std::numeric_limits<uint64_t>::max();
2336 autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles;
2337 autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results;
2338 autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses;
2339 char stack_buf[kMultiGetReadStackBufSize];
2340 std::unique_ptr<char[]> block_buf;
2341 {
2342 MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
2343 sst_file_range.end());
2344
2345 CachableEntry<UncompressionDict> uncompression_dict;
2346 Status uncompression_dict_status;
2347 if (rep_->uncompression_dict_reader) {
2348 uncompression_dict_status =
2349 rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
2350 nullptr /* prefetch_buffer */, no_io,
2351 sst_file_range.begin()->get_context, &lookup_context,
2352 &uncompression_dict);
2353 }
2354
2355 const UncompressionDict& dict = uncompression_dict.GetValue()
2356 ? *uncompression_dict.GetValue()
2357 : UncompressionDict::GetEmptyDict();
2358
2359 size_t total_len = 0;
2360 ReadOptions ro = read_options;
2361 ro.read_tier = kBlockCacheTier;
2362
2363 for (auto miter = data_block_range.begin();
2364 miter != data_block_range.end(); ++miter) {
2365 const Slice& key = miter->ikey;
2366 iiter->Seek(miter->ikey);
2367
2368 IndexValue v;
2369 if (iiter->Valid()) {
2370 v = iiter->value();
2371 }
2372 if (!iiter->Valid() ||
2373 (!v.first_internal_key.empty() && !skip_filters &&
2374 UserComparatorWrapper(rep_->internal_comparator.user_comparator())
2375 .Compare(ExtractUserKey(key),
2376 ExtractUserKey(v.first_internal_key)) < 0)) {
2377 // The requested key falls between highest key in previous block and
2378 // lowest key in current block.
2379 *(miter->s) = iiter->status();
2380 data_block_range.SkipKey(miter);
2381 sst_file_range.SkipKey(miter);
2382 continue;
2383 }
2384
2385 if (!uncompression_dict_status.ok()) {
2386 *(miter->s) = uncompression_dict_status;
2387 data_block_range.SkipKey(miter);
2388 sst_file_range.SkipKey(miter);
2389 continue;
2390 }
2391
2392 statuses.emplace_back();
2393 results.emplace_back();
2394 if (v.handle.offset() == offset) {
2395 // We're going to reuse the block for this key later on. No need to
2396 // look it up now. Place a null handle
2397 block_handles.emplace_back(BlockHandle::NullBlockHandle());
2398 continue;
2399 }
2400 // Lookup the cache for the given data block referenced by an index
2401 // iterator value (i.e BlockHandle). If it exists in the cache,
2402 // initialize block to the contents of the data block.
2403 offset = v.handle.offset();
2404 BlockHandle handle = v.handle;
2405 BlockCacheLookupContext lookup_data_block_context(
2406 TableReaderCaller::kUserMultiGet);
2407 Status s = RetrieveBlock(
2408 nullptr, ro, handle, dict, &(results.back()), BlockType::kData,
2409 miter->get_context, &lookup_data_block_context,
2410 /* for_compaction */ false, /* use_cache */ true);
2411 if (s.IsIncomplete()) {
2412 s = Status::OK();
2413 }
2414 if (s.ok() && !results.back().IsEmpty()) {
2415 // Found it in the cache. Add NULL handle to indicate there is
2416 // nothing to read from disk
2417 block_handles.emplace_back(BlockHandle::NullBlockHandle());
2418 } else {
2419 block_handles.emplace_back(handle);
2420 total_len += block_size(handle);
2421 }
2422 }
2423
2424 if (total_len) {
2425 char* scratch = nullptr;
2426 // If the blocks need to be uncompressed and we don't need the
2427 // compressed blocks, then we can use a contiguous block of
2428 // memory to read in all the blocks as it will be temporary
2429 // storage
2430 // 1. If blocks are compressed and compressed block cache is there,
2431 // alloc heap bufs
2432 // 2. If blocks are uncompressed, alloc heap bufs
2433 // 3. If blocks are compressed and no compressed block cache, use
2434 // stack buf
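        // Example (illustrative): with compression enabled and no compressed
        // block cache, a small batch whose total_len fits in stack_buf is read
        // into the stack buffer; a larger batch falls back to a single heap
        // allocation owned by block_buf.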
2435 if (rep_->table_options.block_cache_compressed == nullptr &&
2436 rep_->blocks_maybe_compressed) {
2437 if (total_len <= kMultiGetReadStackBufSize) {
2438 scratch = stack_buf;
2439 } else {
2440 scratch = new char[total_len];
2441 block_buf.reset(scratch);
2442 }
2443 }
2444 RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles,
2445 &statuses, &results, scratch, dict);
2446 }
2447 }
2448
2449 DataBlockIter first_biter;
2450 DataBlockIter next_biter;
2451 size_t idx_in_batch = 0;
2452 for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
2453 ++miter) {
2454 Status s;
2455 GetContext* get_context = miter->get_context;
2456 const Slice& key = miter->ikey;
2457 bool matched = false; // if such user key matched a key in SST
2458 bool done = false;
2459 bool first_block = true;
2460 do {
2461 DataBlockIter* biter = nullptr;
2462 bool reusing_block = true;
2463 uint64_t referenced_data_size = 0;
2464 bool does_referenced_key_exist = false;
2465 BlockCacheLookupContext lookup_data_block_context(
2466 TableReaderCaller::kUserMultiGet, tracing_mget_id,
2467 /*get_from_user_specified_snapshot=*/read_options.snapshot !=
2468 nullptr);
2469 if (first_block) {
2470 if (!block_handles[idx_in_batch].IsNull() ||
2471 !results[idx_in_batch].IsEmpty()) {
2472 first_biter.Invalidate(Status::OK());
2473 NewDataBlockIterator<DataBlockIter>(
2474 read_options, results[idx_in_batch], &first_biter,
2475 statuses[idx_in_batch]);
2476 reusing_block = false;
2477 }
2478 biter = &first_biter;
2479 idx_in_batch++;
2480 } else {
2481 IndexValue v = iiter->value();
2482 if (!v.first_internal_key.empty() && !skip_filters &&
2483 UserComparatorWrapper(rep_->internal_comparator.user_comparator())
2484 .Compare(ExtractUserKey(key),
2485 ExtractUserKey(v.first_internal_key)) < 0) {
2486 // The requested key falls between highest key in previous block and
2487 // lowest key in current block.
2488 break;
2489 }
2490
2491 next_biter.Invalidate(Status::OK());
2492 NewDataBlockIterator<DataBlockIter>(
2493 read_options, iiter->value().handle, &next_biter,
2494 BlockType::kData, get_context, &lookup_data_block_context,
2495 Status(), nullptr);
2496 biter = &next_biter;
2497 reusing_block = false;
2498 }
2499
2500 if (read_options.read_tier == kBlockCacheTier &&
2501 biter->status().IsIncomplete()) {
2502 // couldn't get block from block_cache
2503 // Update Saver.state to Found because we are only looking for
2504 // whether we can guarantee the key is not there when "no_io" is set
2505 get_context->MarkKeyMayExist();
2506 break;
2507 }
2508 if (!biter->status().ok()) {
2509 s = biter->status();
2510 break;
2511 }
2512
2513 bool may_exist = biter->SeekForGet(key);
2514 if (!may_exist) {
2515           // HashSeek cannot find the key in this block, and the iter is not
2516           // at the end of the block, i.e. the key cannot be in the following
2517           // blocks either. In this case, the seek_key cannot be found, so we
2518           // break from the top level for-loop.
2519 break;
2520 }
2521
2522 // Call the *saver function on each entry/block until it returns false
2523 for (; biter->Valid(); biter->Next()) {
2524 ParsedInternalKey parsed_key;
2525 Cleanable dummy;
2526 Cleanable* value_pinner = nullptr;
2527 if (!ParseInternalKey(biter->key(), &parsed_key)) {
2528 s = Status::Corruption(Slice());
2529 }
2530 if (biter->IsValuePinned()) {
2531 if (reusing_block) {
2532 Cache* block_cache = rep_->table_options.block_cache.get();
2533 assert(biter->cache_handle() != nullptr);
2534 block_cache->Ref(biter->cache_handle());
2535 dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
2536 biter->cache_handle());
2537 value_pinner = &dummy;
2538 } else {
2539 value_pinner = biter;
2540 }
2541 }
2542 if (!get_context->SaveValue(parsed_key, biter->value(), &matched,
2543 value_pinner)) {
2544 if (get_context->State() == GetContext::GetState::kFound) {
2545 does_referenced_key_exist = true;
2546 referenced_data_size =
2547 biter->key().size() + biter->value().size();
2548 }
2549 done = true;
2550 break;
2551 }
2552 s = biter->status();
2553 }
2554 // Write the block cache access.
2555 if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
2556           // Avoid making copies of block_key, cf_name, and referenced_key when
2557           // constructing the access record.
2558 Slice referenced_key;
2559 if (does_referenced_key_exist) {
2560 referenced_key = biter->key();
2561 } else {
2562 referenced_key = key;
2563 }
2564 BlockCacheTraceRecord access_record(
2565 rep_->ioptions.env->NowMicros(),
2566 /*block_key=*/"", lookup_data_block_context.block_type,
2567 lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
2568 /*cf_name=*/"", rep_->level_for_tracing(),
2569 rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
2570 lookup_data_block_context.is_cache_hit,
2571 lookup_data_block_context.no_insert,
2572 lookup_data_block_context.get_id,
2573 lookup_data_block_context.get_from_user_specified_snapshot,
2574 /*referenced_key=*/"", referenced_data_size,
2575 lookup_data_block_context.num_keys_in_block,
2576 does_referenced_key_exist);
2577 block_cache_tracer_->WriteBlockAccess(
2578 access_record, lookup_data_block_context.block_key,
2579 rep_->cf_name_for_tracing(), referenced_key);
2580 }
2581 s = biter->status();
2582 if (done) {
2583 // Avoid the extra Next which is expensive in two-level indexes
2584 break;
2585 }
2586 if (first_block) {
2587 iiter->Seek(key);
2588 }
2589 first_block = false;
2590 iiter->Next();
2591 } while (iiter->Valid());
2592
2593 if (matched && filter != nullptr && !filter->IsBlockBased()) {
2594 RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
2595 PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
2596 rep_->level);
2597 }
2598 if (s.ok()) {
2599 s = iiter->status();
2600 }
2601 *(miter->s) = s;
2602 }
2603 }
2604 }
2605
2606 Status BlockBasedTable::Prefetch(const Slice* const begin,
2607 const Slice* const end) {
2608 auto& comparator = rep_->internal_comparator;
2609 UserComparatorWrapper user_comparator(comparator.user_comparator());
2610 // pre-condition
2611 if (begin && end && comparator.Compare(*begin, *end) > 0) {
2612 return Status::InvalidArgument(*begin, *end);
2613 }
2614 BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
2615 IndexBlockIter iiter_on_stack;
2616 auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
2617 &iiter_on_stack, /*get_context=*/nullptr,
2618 &lookup_context);
2619 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2620 if (iiter != &iiter_on_stack) {
2621 iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
2622 }
2623
2624 if (!iiter->status().ok()) {
2625 // error opening index iterator
2626 return iiter->status();
2627 }
2628
2629   // indicates if we are on the last page that needs to be pre-fetched
2630 bool prefetching_boundary_page = false;
2631
2632 for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
2633 iiter->Next()) {
2634 BlockHandle block_handle = iiter->value().handle;
2635 const bool is_user_key = !rep_->index_key_includes_seq;
2636 if (end &&
2637 ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
2638 (is_user_key &&
2639 user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
2640 if (prefetching_boundary_page) {
2641 break;
2642 }
2643
2644 // The index entry represents the last key in the data block.
2645 // We should load this page into memory as well, but no more
2646 prefetching_boundary_page = true;
2647 }
2648
2649 // Load the block specified by the block_handle into the block cache
2650 DataBlockIter biter;
2651
2652 NewDataBlockIterator<DataBlockIter>(
2653 ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
2654 /*get_context=*/nullptr, &lookup_context, Status(),
2655 /*prefetch_buffer=*/nullptr);
2656
2657 if (!biter.status().ok()) {
2658 // there was an unexpected error while pre-fetching
2659 return biter.status();
2660 }
2661 }
2662
2663 return Status::OK();
2664 }
2665
2666 Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
2667 TableReaderCaller caller) {
2668 Status s;
2669 // Check Meta blocks
2670 std::unique_ptr<Block> metaindex;
2671 std::unique_ptr<InternalIterator> metaindex_iter;
2672 s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex,
2673 &metaindex_iter);
2674 if (s.ok()) {
2675 s = VerifyChecksumInMetaBlocks(metaindex_iter.get());
2676 if (!s.ok()) {
2677 return s;
2678 }
2679 } else {
2680 return s;
2681 }
2682 // Check Data blocks
2683 IndexBlockIter iiter_on_stack;
2684 BlockCacheLookupContext context{caller};
2685 InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
2686 read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
2687 /*get_context=*/nullptr, &context);
2688 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2689 if (iiter != &iiter_on_stack) {
2690 iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
2691 }
2692 if (!iiter->status().ok()) {
2693 // error opening index iterator
2694 return iiter->status();
2695 }
2696 s = VerifyChecksumInBlocks(read_options, iiter);
2697 return s;
2698 }
2699
2700 Status BlockBasedTable::VerifyChecksumInBlocks(
2701 const ReadOptions& read_options,
2702 InternalIteratorBase<IndexValue>* index_iter) {
2703 Status s;
2704   // We are scanning the whole file, so there is no need to exponentially
2705   // increase the buffer size.
2706 size_t readahead_size = (read_options.readahead_size != 0)
2707 ? read_options.readahead_size
2708 : kMaxAutoReadaheadSize;
2709 // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
2710 // needed there.
2711 FilePrefetchBuffer prefetch_buffer(
2712       rep_->file.get(), readahead_size /* readahead_size */,
2713 readahead_size /* max_readahead_size */,
2714 !rep_->ioptions.allow_mmap_reads /* enable */);
2715
2716 for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
2717 s = index_iter->status();
2718 if (!s.ok()) {
2719 break;
2720 }
2721 BlockHandle handle = index_iter->value().handle;
2722 BlockContents contents;
2723 BlockFetcher block_fetcher(
2724 rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
2725 &contents, rep_->ioptions, false /* decompress */,
2726 false /*maybe_compressed*/, BlockType::kData,
2727 UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
2728 s = block_fetcher.ReadBlockContents();
2729 if (!s.ok()) {
2730 break;
2731 }
2732 }
2733 return s;
2734 }
2735
2736 BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
2737 const Slice& meta_block_name) {
2738 if (meta_block_name.starts_with(kFilterBlockPrefix) ||
2739 meta_block_name.starts_with(kFullFilterBlockPrefix) ||
2740 meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
2741 return BlockType::kFilter;
2742 }
2743
2744 if (meta_block_name == kPropertiesBlock) {
2745 return BlockType::kProperties;
2746 }
2747
2748 if (meta_block_name == kCompressionDictBlock) {
2749 return BlockType::kCompressionDictionary;
2750 }
2751
2752 if (meta_block_name == kRangeDelBlock) {
2753 return BlockType::kRangeDeletion;
2754 }
2755
2756 if (meta_block_name == kHashIndexPrefixesBlock) {
2757 return BlockType::kHashIndexPrefixes;
2758 }
2759
2760 if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
2761 return BlockType::kHashIndexMetadata;
2762 }
2763
2764 assert(false);
2765 return BlockType::kInvalid;
2766 }
2767
2768 Status BlockBasedTable::VerifyChecksumInMetaBlocks(
2769 InternalIteratorBase<Slice>* index_iter) {
2770 Status s;
2771 for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
2772 s = index_iter->status();
2773 if (!s.ok()) {
2774 break;
2775 }
2776 BlockHandle handle;
2777 Slice input = index_iter->value();
2778 s = handle.DecodeFrom(&input);
2779 BlockContents contents;
2780 const Slice meta_block_name = index_iter->key();
2781 BlockFetcher block_fetcher(
2782 rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
2783 ReadOptions(), handle, &contents, rep_->ioptions,
2784 false /* decompress */, false /*maybe_compressed*/,
2785 GetBlockTypeForMetaBlockByName(meta_block_name),
2786 UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
2787 s = block_fetcher.ReadBlockContents();
2788 if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
2789 TableProperties* table_properties;
2790 s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
2791 index_iter->value(),
2792 &table_properties);
2793 delete table_properties;
2794 }
2795 if (!s.ok()) {
2796 break;
2797 }
2798 }
2799 return s;
2800 }
2801
2802 bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
2803 assert(rep_ != nullptr);
2804
2805 Cache* const cache = rep_->table_options.block_cache.get();
2806 if (cache == nullptr) {
2807 return false;
2808 }
2809
2810 char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2811 Slice cache_key =
2812 GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
2813 cache_key_storage);
2814
2815 Cache::Handle* const cache_handle = cache->Lookup(cache_key);
2816 if (cache_handle == nullptr) {
2817 return false;
2818 }
2819
2820 cache->Release(cache_handle);
2821
2822 return true;
2823 }
2824
2825 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
2826 const Slice& key) {
2827 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
2828 options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
2829 /*get_context=*/nullptr, /*lookup_context=*/nullptr));
2830 iiter->Seek(key);
2831 assert(iiter->Valid());
2832
2833 return TEST_BlockInCache(iiter->value().handle);
2834 }
2835
2836 // REQUIRES: The following fields of rep_ should have already been populated:
2837 // 1. file
2838 //  2. index_handle
2839 // 3. options
2840 // 4. internal_comparator
2841 // 5. index_type
2842 Status BlockBasedTable::CreateIndexReader(
2843 FilePrefetchBuffer* prefetch_buffer,
2844 InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
2845 bool pin, BlockCacheLookupContext* lookup_context,
2846 std::unique_ptr<IndexReader>* index_reader) {
2847   // kHashSearch requires a non-empty prefix_extractor, but we skip checking
2848   // it here because we have no access to MutableCFOptions. Instead, a
2849   // need_upper_bound_check flag is used in BlockBasedTable::NewIndexIterator:
2850   // if prefix_extractor does not match prefix_extractor_name from the table
2851   // properties, hash index is turned off by setting total_order_seek to true.
2852
2853 switch (rep_->index_type) {
2854 case BlockBasedTableOptions::kTwoLevelIndexSearch: {
2855 return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
2856 prefetch, pin, lookup_context,
2857 index_reader);
2858 }
2859 case BlockBasedTableOptions::kBinarySearch:
2860 FALLTHROUGH_INTENDED;
2861 case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
2862 return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
2863 prefetch, pin, lookup_context,
2864 index_reader);
2865 }
2866 case BlockBasedTableOptions::kHashSearch: {
2867 std::unique_ptr<Block> metaindex_guard;
2868 std::unique_ptr<InternalIterator> metaindex_iter_guard;
2869 auto meta_index_iter = preloaded_meta_index_iter;
2870 bool should_fallback = false;
2871 if (rep_->internal_prefix_transform.get() == nullptr) {
2872 ROCKS_LOG_WARN(rep_->ioptions.info_log,
2873 "No prefix extractor passed in. Fall back to binary"
2874 " search index.");
2875 should_fallback = true;
2876 } else if (meta_index_iter == nullptr) {
2877 auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard,
2878 &metaindex_iter_guard);
2879 if (!s.ok()) {
2880         // Simply fall back to binary search if there is any problem
2881         // loading the prefix hash index.
2882 ROCKS_LOG_WARN(rep_->ioptions.info_log,
2883 "Unable to read the metaindex block."
2884 " Fall back to binary search index.");
2885 should_fallback = true;
2886 }
2887 meta_index_iter = metaindex_iter_guard.get();
2888 }
2889
2890 if (should_fallback) {
2891 return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
2892 prefetch, pin, lookup_context,
2893 index_reader);
2894 } else {
2895 return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
2896 use_cache, prefetch, pin, lookup_context,
2897 index_reader);
2898 }
2899 }
2900 default: {
2901 std::string error_message =
2902 "Unrecognized index type: " + ToString(rep_->index_type);
2903 return Status::InvalidArgument(error_message.c_str());
2904 }
2905 }
2906 }
2907
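// Returns the offset of the block the index iterator currently points to,
// or an end-of-data offset (data_size from the table properties, falling
// back to the metaindex handle's offset) when the iterator is exhausted.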
2908 uint64_t BlockBasedTable::ApproximateOffsetOf(
2909 const InternalIteratorBase<IndexValue>& index_iter) const {
2910 uint64_t result = 0;
2911 if (index_iter.Valid()) {
2912 BlockHandle handle = index_iter.value().handle;
2913 result = handle.offset();
2914 } else {
2915 // The iterator is past the last key in the file. If table_properties is not
2916 // available, approximate the offset by returning the offset of the
2917 // metaindex block (which is right near the end of the file).
2918 if (rep_->table_properties) {
2919 result = rep_->table_properties->data_size;
2920 }
2921     // table_properties is not present in the table, or data_size is 0.
2922 if (result == 0) {
2923 result = rep_->footer.metaindex_handle().offset();
2924 }
2925 }
2926
2927 return result;
2928 }
2929
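// Approximates the file offset at which `key` would reside by seeking the
// index with total-order seek and reading the matching block's offset.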
2930 uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
2931 TableReaderCaller caller) {
2932 BlockCacheLookupContext context(caller);
2933 IndexBlockIter iiter_on_stack;
2934 ReadOptions ro;
2935 ro.total_order_seek = true;
2936 auto index_iter =
2937 NewIndexIterator(ro, /*disable_prefix_seek=*/true,
2938 /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
2939 /*lookup_context=*/&context);
2940 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2941 if (index_iter != &iiter_on_stack) {
2942 iiter_unique_ptr.reset(index_iter);
2943 }
2944
2945 index_iter->Seek(key);
2946 return ApproximateOffsetOf(*index_iter);
2947 }
2948
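// Approximates the number of data bytes between `start` and `end` as the
// difference of the two keys' approximate offsets.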
2949 uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
2950 TableReaderCaller caller) {
2951 assert(rep_->internal_comparator.Compare(start, end) <= 0);
2952
2953 BlockCacheLookupContext context(caller);
2954 IndexBlockIter iiter_on_stack;
2955 ReadOptions ro;
2956 ro.total_order_seek = true;
2957 auto index_iter =
2958 NewIndexIterator(ro, /*disable_prefix_seek=*/true,
2959 /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
2960 /*lookup_context=*/&context);
2961 std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2962 if (index_iter != &iiter_on_stack) {
2963 iiter_unique_ptr.reset(index_iter);
2964 }
2965
2966 index_iter->Seek(start);
2967 uint64_t start_offset = ApproximateOffsetOf(*index_iter);
2968 index_iter->Seek(end);
2969 uint64_t end_offset = ApproximateOffsetOf(*index_iter);
2970
2971 assert(end_offset >= start_offset);
2972 return end_offset - start_offset;
2973 }
2974
2975 bool BlockBasedTable::TEST_FilterBlockInCache() const {
2976 assert(rep_ != nullptr);
2977 return TEST_BlockInCache(rep_->filter_handle);
2978 }
2979
2980 bool BlockBasedTable::TEST_IndexBlockInCache() const {
2981 assert(rep_ != nullptr);
2982
2983 return TEST_BlockInCache(rep_->footer.index_handle());
2984 }
2985
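// Materializes the whole table into memory as one KVPairBlock (a vector of
// key/value string pairs) per data block. Blocks that fail to read are
// skipped rather than failing the call.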
2986 Status BlockBasedTable::GetKVPairsFromDataBlocks(
2987 std::vector<KVPairBlock>* kv_pair_blocks) {
2988 std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
2989 NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
2990 /*input_iter=*/nullptr, /*get_context=*/nullptr,
2991                      /*lookup_context=*/nullptr));
2992
2993 Status s = blockhandles_iter->status();
2994 if (!s.ok()) {
2995 // Cannot read Index Block
2996 return s;
2997 }
2998
2999 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
3000 blockhandles_iter->Next()) {
3001 s = blockhandles_iter->status();
3002
3003 if (!s.ok()) {
3004 break;
3005 }
3006
3007 std::unique_ptr<InternalIterator> datablock_iter;
3008 datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
3009 ReadOptions(), blockhandles_iter->value().handle,
3010 /*input_iter=*/nullptr, /*type=*/BlockType::kData,
3011 /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
3012 /*prefetch_buffer=*/nullptr));
3013 s = datablock_iter->status();
3014
3015 if (!s.ok()) {
3016 // Error reading the block - Skipped
3017 continue;
3018 }
3019
3020 KVPairBlock kv_pair_block;
3021 for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
3022 datablock_iter->Next()) {
3023 s = datablock_iter->status();
3024 if (!s.ok()) {
3025 // Error reading the block - Skipped
3026 break;
3027 }
3028 const Slice& key = datablock_iter->key();
3029 const Slice& value = datablock_iter->value();
3030 std::string key_copy = std::string(key.data(), key.size());
3031 std::string value_copy = std::string(value.data(), value.size());
3032
3033 kv_pair_block.push_back(
3034 std::make_pair(std::move(key_copy), std::move(value_copy)));
3035 }
3036 kv_pair_blocks->push_back(std::move(kv_pair_block));
3037 }
3038 return Status::OK();
3039 }
3040
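// Writes a human-readable dump of the table to out_file: footer, metaindex,
// table properties, filter, index, compression dictionary, range deletions,
// and finally every data block.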
3041 Status BlockBasedTable::DumpTable(WritableFile* out_file) {
3042 // Output Footer
3043 out_file->Append(
3044 "Footer Details:\n"
3045 "--------------------------------------\n"
3046 " ");
3047 out_file->Append(rep_->footer.ToString().c_str());
3048 out_file->Append("\n");
3049
3050 // Output MetaIndex
3051 out_file->Append(
3052 "Metaindex Details:\n"
3053 "--------------------------------------\n");
3054 std::unique_ptr<Block> metaindex;
3055 std::unique_ptr<InternalIterator> metaindex_iter;
3056 Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex,
3057 &metaindex_iter);
3058 if (s.ok()) {
3059 for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
3060 metaindex_iter->Next()) {
3061 s = metaindex_iter->status();
3062 if (!s.ok()) {
3063 return s;
3064 }
3065 if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kPropertiesBlock) {
3066 out_file->Append(" Properties block handle: ");
3067 out_file->Append(metaindex_iter->value().ToString(true).c_str());
3068 out_file->Append("\n");
3069 } else if (metaindex_iter->key() ==
3070 ROCKSDB_NAMESPACE::kCompressionDictBlock) {
3071 out_file->Append(" Compression dictionary block handle: ");
3072 out_file->Append(metaindex_iter->value().ToString(true).c_str());
3073 out_file->Append("\n");
3074 } else if (strstr(metaindex_iter->key().ToString().c_str(),
3075 "filter.rocksdb.") != nullptr) {
3076 out_file->Append(" Filter block handle: ");
3077 out_file->Append(metaindex_iter->value().ToString(true).c_str());
3078 out_file->Append("\n");
3079 } else if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kRangeDelBlock) {
3080 out_file->Append(" Range deletion block handle: ");
3081 out_file->Append(metaindex_iter->value().ToString(true).c_str());
3082 out_file->Append("\n");
3083 }
3084 }
3085 out_file->Append("\n");
3086 } else {
3087 return s;
3088 }
3089
3090 // Output TableProperties
3091 const ROCKSDB_NAMESPACE::TableProperties* table_properties;
3092 table_properties = rep_->table_properties.get();
3093
3094 if (table_properties != nullptr) {
3095 out_file->Append(
3096 "Table Properties:\n"
3097 "--------------------------------------\n"
3098 " ");
3099 out_file->Append(table_properties->ToString("\n ", ": ").c_str());
3100 out_file->Append("\n");
3101 }
3102
3103 if (rep_->filter) {
3104 out_file->Append(
3105 "Filter Details:\n"
3106 "--------------------------------------\n"
3107 " ");
3108 out_file->Append(rep_->filter->ToString().c_str());
3109 out_file->Append("\n");
3110 }
3111
3112 // Output Index block
3113 s = DumpIndexBlock(out_file);
3114 if (!s.ok()) {
3115 return s;
3116 }
3117
3118 // Output compression dictionary
3119 if (rep_->uncompression_dict_reader) {
3120 CachableEntry<UncompressionDict> uncompression_dict;
3121 s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
3122 nullptr /* prefetch_buffer */, false /* no_io */,
3123 nullptr /* get_context */, nullptr /* lookup_context */,
3124 &uncompression_dict);
3125 if (!s.ok()) {
3126 return s;
3127 }
3128
3129 assert(uncompression_dict.GetValue());
3130
3131 const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict();
3132 out_file->Append(
3133 "Compression Dictionary:\n"
3134 "--------------------------------------\n");
3135 out_file->Append(" size (bytes): ");
3136 out_file->Append(ROCKSDB_NAMESPACE::ToString(raw_dict.size()));
3137 out_file->Append("\n\n");
3138 out_file->Append(" HEX ");
3139 out_file->Append(raw_dict.ToString(true).c_str());
3140 out_file->Append("\n\n");
3141 }
3142
3143 // Output range deletions block
3144 auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
3145 if (range_del_iter != nullptr) {
3146 range_del_iter->SeekToFirst();
3147 if (range_del_iter->Valid()) {
3148 out_file->Append(
3149 "Range deletions:\n"
3150 "--------------------------------------\n"
3151 " ");
3152 for (; range_del_iter->Valid(); range_del_iter->Next()) {
3153 DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
3154 }
3155 out_file->Append("\n");
3156 }
3157 delete range_del_iter;
3158 }
3159 // Output Data blocks
3160 s = DumpDataBlocks(out_file);
3161
3162 return s;
3163 }
3164
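// Dumps each index entry: the user key in hex, the data block handle it
// points to, and an ASCII rendering of the key.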
3165 Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
3166 out_file->Append(
3167 "Index Details:\n"
3168 "--------------------------------------\n");
3169 std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
3170 NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
3171 /*input_iter=*/nullptr, /*get_context=*/nullptr,
3172                      /*lookup_context=*/nullptr));
3173 Status s = blockhandles_iter->status();
3174 if (!s.ok()) {
3175     out_file->Append("Cannot read Index Block \n\n");
3176 return s;
3177 }
3178
3179 out_file->Append(" Block key hex dump: Data block handle\n");
3180 out_file->Append(" Block key ascii\n\n");
3181 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
3182 blockhandles_iter->Next()) {
3183 s = blockhandles_iter->status();
3184 if (!s.ok()) {
3185 break;
3186 }
3187 Slice key = blockhandles_iter->key();
3188 Slice user_key;
3189 InternalKey ikey;
3190 if (!rep_->index_key_includes_seq) {
3191 user_key = key;
3192 } else {
3193 ikey.DecodeFrom(key);
3194 user_key = ikey.user_key();
3195 }
3196
3197 out_file->Append(" HEX ");
3198 out_file->Append(user_key.ToString(true).c_str());
3199 out_file->Append(": ");
3200 out_file->Append(blockhandles_iter->value()
3201 .ToString(true, rep_->index_has_first_key)
3202 .c_str());
3203 out_file->Append("\n");
3204
3205 std::string str_key = user_key.ToString();
3206 std::string res_key("");
3207 char cspace = ' ';
3208 for (size_t i = 0; i < str_key.size(); i++) {
3209 res_key.append(&str_key[i], 1);
3210 res_key.append(1, cspace);
3211 }
3212 out_file->Append(" ASCII ");
3213 out_file->Append(res_key.c_str());
3214 out_file->Append("\n ------\n");
3215 }
3216 out_file->Append("\n");
3217 return Status::OK();
3218 }
3219
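// Dumps every key/value pair of every data block, followed by summary
// statistics (block count and min/max/avg block size).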
3220 Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
3221 std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
3222 NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
3223 /*input_iter=*/nullptr, /*get_context=*/nullptr,
3224                      /*lookup_context=*/nullptr));
3225 Status s = blockhandles_iter->status();
3226 if (!s.ok()) {
3227     out_file->Append("Cannot read Index Block \n\n");
3228 return s;
3229 }
3230
3231 uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
3232 uint64_t datablock_size_max = 0;
3233 uint64_t datablock_size_sum = 0;
3234
3235 size_t block_id = 1;
3236 for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
3237 block_id++, blockhandles_iter->Next()) {
3238 s = blockhandles_iter->status();
3239 if (!s.ok()) {
3240 break;
3241 }
3242
3243 BlockHandle bh = blockhandles_iter->value().handle;
3244 uint64_t datablock_size = bh.size();
3245 datablock_size_min = std::min(datablock_size_min, datablock_size);
3246 datablock_size_max = std::max(datablock_size_max, datablock_size);
3247 datablock_size_sum += datablock_size;
3248
3249 out_file->Append("Data Block # ");
3250 out_file->Append(ROCKSDB_NAMESPACE::ToString(block_id));
3251 out_file->Append(" @ ");
3252 out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str());
3253 out_file->Append("\n");
3254 out_file->Append("--------------------------------------\n");
3255
3256 std::unique_ptr<InternalIterator> datablock_iter;
3257 datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
3258 ReadOptions(), blockhandles_iter->value().handle,
3259 /*input_iter=*/nullptr, /*type=*/BlockType::kData,
3260 /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
3261 /*prefetch_buffer=*/nullptr));
3262 s = datablock_iter->status();
3263
3264 if (!s.ok()) {
3265 out_file->Append("Error reading the block - Skipped \n\n");
3266 continue;
3267 }
3268
3269 for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
3270 datablock_iter->Next()) {
3271 s = datablock_iter->status();
3272 if (!s.ok()) {
3273 out_file->Append("Error reading the block - Skipped \n");
3274 break;
3275 }
3276 DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
3277 }
3278 out_file->Append("\n");
3279 }
3280
3281 uint64_t num_datablocks = block_id - 1;
3282 if (num_datablocks) {
3283 double datablock_size_avg =
3284 static_cast<double>(datablock_size_sum) / num_datablocks;
3285 out_file->Append("Data Block Summary:\n");
3286 out_file->Append("--------------------------------------");
3287 out_file->Append("\n # data blocks: ");
3288 out_file->Append(ROCKSDB_NAMESPACE::ToString(num_datablocks));
3289 out_file->Append("\n min data block size: ");
3290 out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_min));
3291 out_file->Append("\n max data block size: ");
3292 out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_max));
3293 out_file->Append("\n avg data block size: ");
3294 out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_avg));
3295 out_file->Append("\n");
3296 }
3297
3298 return Status::OK();
3299 }
3300
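// Prints a single key/value pair in hex and in ASCII, rendering embedded
// NUL bytes as "\0".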
3301 void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
3302 WritableFile* out_file) {
3303 InternalKey ikey;
3304 ikey.DecodeFrom(key);
3305
3306 out_file->Append(" HEX ");
3307 out_file->Append(ikey.user_key().ToString(true).c_str());
3308 out_file->Append(": ");
3309 out_file->Append(value.ToString(true).c_str());
3310 out_file->Append("\n");
3311
3312 std::string str_key = ikey.user_key().ToString();
3313 std::string str_value = value.ToString();
3314 std::string res_key(""), res_value("");
3315 char cspace = ' ';
3316 for (size_t i = 0; i < str_key.size(); i++) {
3317 if (str_key[i] == '\0') {
3318 res_key.append("\\0", 2);
3319 } else {
3320 res_key.append(&str_key[i], 1);
3321 }
3322 res_key.append(1, cspace);
3323 }
3324 for (size_t i = 0; i < str_value.size(); i++) {
3325 if (str_value[i] == '\0') {
3326 res_value.append("\\0", 2);
3327 } else {
3328 res_value.append(&str_value[i], 1);
3329 }
3330 res_value.append(1, cspace);
3331 }
3332
3333 out_file->Append(" ASCII ");
3334 out_file->Append(res_key.c_str());
3335 out_file->Append(": ");
3336 out_file->Append(res_value.c_str());
3337 out_file->Append("\n ------\n");
3338 }
3339
3340 } // namespace ROCKSDB_NAMESPACE
3341