//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based/block_based_table_reader.h"

#include <algorithm>
#include <array>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "cache/simple_deleter.h"

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"

#include "file/file_prefetch_buffer.h"
#include "file/random_access_file_reader.h"

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"

#include "table/block_based/binary_search_index_reader.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_based_table_iterator.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/full_filter_block.h"
#include "table/block_based/hash_index_reader.h"
#include "table/block_based/partitioned_filter_block.h"
#include "table/block_based/partitioned_index_reader.h"
#include "table/block_fetcher.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/sst_file_writer_collectors.h"
#include "table/two_level_iterator.h"

#include "monitoring/perf_context_imp.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/util.h"
#include "util/xxhash.h"

namespace ROCKSDB_NAMESPACE {

extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

typedef BlockBasedTable::IndexReader IndexReader;

// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;

BlockBasedTable::~BlockBasedTable() {
  delete rep_;
}

std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);

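// BlocklikeTraits adapts the block-like types that can be stored in the block
// cache (BlockContents, ParsedFullFilterBlock, Block, UncompressionDict) to a
// common construction interface, so that templated helpers such as
// ReadBlockFromFile() below can build any of them from raw BlockContents
// without knowing the concrete type. Parameters a given type does not need
// (e.g. read_amp_bytes_per_bit for filter blocks) are ignored by that
// specialization.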
template <typename TBlocklike>
class BlocklikeTraits;

template <>
class BlocklikeTraits<BlockContents> {
 public:
  static BlockContents* Create(BlockContents&& contents,
                               size_t /* read_amp_bytes_per_bit */,
                               Statistics* /* statistics */,
                               bool /* using_zstd */,
                               const FilterPolicy* /* filter_policy */) {
    return new BlockContents(std::move(contents));
  }

  static uint32_t GetNumRestarts(const BlockContents& /* contents */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<ParsedFullFilterBlock> {
 public:
  static ParsedFullFilterBlock* Create(BlockContents&& contents,
                                       size_t /* read_amp_bytes_per_bit */,
                                       Statistics* /* statistics */,
                                       bool /* using_zstd */,
                                       const FilterPolicy* filter_policy) {
    return new ParsedFullFilterBlock(filter_policy, std::move(contents));
  }

  static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<Block> {
 public:
  static Block* Create(BlockContents&& contents, size_t read_amp_bytes_per_bit,
                       Statistics* statistics, bool /* using_zstd */,
                       const FilterPolicy* /* filter_policy */) {
    return new Block(std::move(contents), read_amp_bytes_per_bit, statistics);
  }

  static uint32_t GetNumRestarts(const Block& block) {
    return block.NumRestarts();
  }
};

template <>
class BlocklikeTraits<UncompressionDict> {
 public:
  static UncompressionDict* Create(BlockContents&& contents,
                                   size_t /* read_amp_bytes_per_bit */,
                                   Statistics* /* statistics */,
                                   bool using_zstd,
                                   const FilterPolicy* /* filter_policy */) {
    return new UncompressionDict(contents.data, std::move(contents.allocation),
                                 using_zstd);
  }

  static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) {
    return 0;
  }
};

namespace {
// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success, fill *result and return OK - caller owns *result.
// @param uncompression_dict Data for presetting the compression library's
//    dictionary.
template <typename TBlocklike>
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<TBlocklike>* result, const ImmutableCFOptions& ioptions,
    bool do_uncompress, bool maybe_compressed, BlockType block_type,
    const UncompressionDict& uncompression_dict,
    const PersistentCacheOptions& cache_options, size_t read_amp_bytes_per_bit,
    MemoryAllocator* memory_allocator, bool for_compaction, bool using_zstd,
    const FilterPolicy* filter_policy) {
  assert(result);

  BlockContents contents;
  BlockFetcher block_fetcher(
      file, prefetch_buffer, footer, options, handle, &contents, ioptions,
      do_uncompress, maybe_compressed, block_type, uncompression_dict,
      cache_options, memory_allocator, nullptr, for_compaction);
  Status s = block_fetcher.ReadBlockContents();
  if (s.ok()) {
    result->reset(BlocklikeTraits<TBlocklike>::Create(
        std::move(contents), read_amp_bytes_per_bit, ioptions.statistics,
        using_zstd, filter_policy));
  }

  return s;
}

// Release the cached entry and decrement its ref count.
// Do not force erase.
void ReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, false /* force_erase */);
}

// For a hash-based index, return true if prefix_extractor and
// prefix_extractor_block mismatch, false otherwise. This flag will be used
// as total_order_seek via NewIndexIterator.
bool PrefixExtractorChanged(const TableProperties* table_properties,
                            const SliceTransform* prefix_extractor) {
  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
  // Turn off hash index if prefix_extractor is not set; if prefix_extractor
  // is set but prefix_extractor_block is not set, also disable hash index.
  if (prefix_extractor == nullptr || table_properties == nullptr ||
      table_properties->prefix_extractor_name.empty()) {
    return true;
  }

  // prefix_extractor and prefix_extractor_block are both non-empty
  if (table_properties->prefix_extractor_name.compare(
          prefix_extractor->Name()) != 0) {
    return true;
  } else {
    return false;
  }
}

CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
  CacheAllocationPtr heap_buf;
  heap_buf = AllocateBlock(buf.size(), allocator);
  memcpy(heap_buf.get(), buf.data(), buf.size());
  return heap_buf;
}
}  // namespace

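// Update block cache hit statistics. When a GetContext is supplied, the
// counters are accumulated in its get_context_stats_; otherwise the tickers of
// the table's Statistics object are bumped directly. The switch below picks
// the per-block-type counters for filter, compression dictionary, index and
// data blocks.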
void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
                                            GetContext* get_context,
                                            size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  PERF_COUNTER_ADD(block_cache_hit_count, 1);
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_hit;
    get_context->get_context_stats_.num_cache_bytes_read += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_HIT);
    RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
      }
      break;

    case BlockType::kCompressionDictionary:
      // TODO: introduce perf counter for compression dictionary hit count
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
      }
      break;

    case BlockType::kIndex:
      PERF_COUNTER_ADD(block_cache_index_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
                                             GetContext* get_context) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce aggregate (not per-level) block cache miss count
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
                            static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_miss;
  } else {
    RecordTick(statistics, BLOCK_CACHE_MISS);
  }

  // TODO: introduce perf counters for misses per block type
  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type,
                                                  GetContext* get_context,
                                                  size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce perf counters for block cache insertions
  if (get_context) {
    ++get_context->get_context_stats_.num_cache_add;
    get_context->get_context_stats_.num_cache_bytes_write += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_ADD);
    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_add;
        get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
        RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_add;
        get_context->get_context_stats_
            .num_cache_compression_dict_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
                   usage);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_add;
        get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_add;
        get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
      }
      break;
  }
}

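// Look up a block in the (uncompressed) block cache and record the hit or miss
// in the metrics above. Returns the cache handle on a hit and nullptr on a
// miss; the caller is responsible for releasing the handle.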
Cache::Handle* BlockBasedTable::GetEntryFromCache(
    Cache* block_cache, const Slice& key, BlockType block_type,
    GetContext* get_context) const {
  auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics);

  if (cache_handle != nullptr) {
    UpdateCacheHitMetrics(block_type, get_context,
                          block_cache->GetUsage(cache_handle));
  } else {
    UpdateCacheMissMetrics(block_type, get_context);
  }

  return cache_handle;
}

// Helper function to setup the cache key's prefix for the Table.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
  assert(kMaxCacheKeyPrefixSize >= 10);
  rep->cache_key_prefix_size = 0;
  rep->compressed_cache_key_prefix_size = 0;
  if (rep->table_options.block_cache != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
                        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
  }
  if (rep->table_options.persistent_cache != nullptr) {
    GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
                        &rep->persistent_cache_key_prefix[0],
                        &rep->persistent_cache_key_prefix_size);
  }
  if (rep->table_options.block_cache_compressed != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
                        rep->file->file(), &rep->compressed_cache_key_prefix[0],
                        &rep->compressed_cache_key_prefix_size);
  }
}

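// Derive the cache key prefix from the file. The preferred source is the file
// system's unique file id; if the file cannot provide one (GetUniqueId()
// returns 0), fall back to a fresh id from the cache so that different table
// readers still get distinct prefixes.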
void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc != nullptr && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file,
                                          char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc != nullptr && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

namespace {
// Return true if `user_prop_name` in table_properties has a `true` value, or
// if the property is not present at all (for backward compatibility).
bool IsFeatureSupported(const TableProperties& table_properties,
                        const std::string& user_prop_name, Logger* info_log) {
  auto& props = table_properties.user_collected_properties;
  auto pos = props.find(user_prop_name);
  // Older version doesn't have this value set. Skip this check.
  if (pos != props.end()) {
    if (pos->second == kPropFalse) {
      return false;
    } else if (pos->second != kPropTrue) {
      ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s",
                     user_prop_name.c_str(), pos->second.c_str());
    }
  }
  return true;
}

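// Determine the global sequence number of an externally ingested SST file from
// its user-collected properties. Non-external files and v1 external files must
// not carry a global seqno property; for v2+ external files the property (or
// largest_seqno, if the property is absent) is validated and returned.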
// Caller has to ensure seqno is not nullptr.
Status GetGlobalSequenceNumber(const TableProperties& table_properties,
                               SequenceNumber largest_seqno,
                               SequenceNumber* seqno) {
  const auto& props = table_properties.user_collected_properties;
  const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
  const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);

  *seqno = kDisableGlobalSequenceNumber;
  if (version_pos == props.end()) {
    if (seqno_pos != props.end()) {
      std::array<char, 200> msg_buf;
      // This is not an external sst file, global_seqno is not supported.
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "A non-external sst file has a global seqno property with value %s",
          seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  uint32_t version = DecodeFixed32(version_pos->second.c_str());
  if (version < 2) {
    if (seqno_pos != props.end() || version != 1) {
      std::array<char, 200> msg_buf;
      // This is a v1 external sst file, global_seqno is not supported.
      snprintf(msg_buf.data(), msg_buf.max_size(),
               "An external sst file with version %u has a global seqno "
               "property with value %s",
               version, seqno_pos->second.c_str());
      return Status::Corruption(msg_buf.data());
    }
    return Status::OK();
  }

  // Since we have a plan to deprecate global_seqno, we do not return failure
  // if seqno_pos == props.end(). We rely on version_pos to detect whether the
  // SST is external.
  SequenceNumber global_seqno(0);
  if (seqno_pos != props.end()) {
    global_seqno = DecodeFixed64(seqno_pos->second.c_str());
  }
  // SstTableReader opens the table reader with kMaxSequenceNumber as
  // largest_seqno to denote that it is unknown.
  if (largest_seqno < kMaxSequenceNumber) {
    if (global_seqno == 0) {
      global_seqno = largest_seqno;
    }
    if (global_seqno != largest_seqno) {
      std::array<char, 200> msg_buf;
      snprintf(
          msg_buf.data(), msg_buf.max_size(),
          "An external sst file with version %u has a global seqno property "
          "with value %s, while the largest seqno in the file is %llu",
          version, seqno_pos->second.c_str(),
          static_cast<unsigned long long>(largest_seqno));
      return Status::Corruption(msg_buf.data());
    }
  }
  *seqno = global_seqno;

  if (global_seqno > kMaxSequenceNumber) {
    std::array<char, 200> msg_buf;
    snprintf(msg_buf.data(), msg_buf.max_size(),
             "An external sst file with version %u has a global seqno "
             "property with value %llu, which is greater than "
             "kMaxSequenceNumber",
             version, static_cast<unsigned long long>(global_seqno));
    return Status::Corruption(msg_buf.data());
  }

  return Status::OK();
}
}  // namespace

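// Build the block cache key for a block: the per-file cache key prefix
// followed by the varint64-encoded offset of the block within the file.
// cache_key must point to a buffer large enough to hold the prefix plus an
// encoded varint64.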
Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
                                   size_t cache_key_prefix_size,
                                   const BlockHandle& handle, char* cache_key) {
  assert(cache_key != nullptr);
  assert(cache_key_prefix_size != 0);
  assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
  memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  char* end =
      EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
  return Slice(cache_key, static_cast<size_t>(end - cache_key));
}

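// Open the table for reading: prefetch the file tail, read and validate the
// footer, then load the metaindex, properties, range deletion, index, filter
// and compression dictionary metadata. On success, *table_reader owns the new
// BlockBasedTable; on failure it is left empty.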
Status BlockBasedTable::Open(
    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
    const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    std::unique_ptr<TableReader>* table_reader,
    const SliceTransform* prefix_extractor,
    const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
    const int level, const bool immortal_table,
    const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats,
    BlockCacheTracer* const block_cache_tracer) {
  table_reader->reset();

  Status s;
  Footer footer;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;

  // prefetch both index and filters, down to all partitions
  const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
  const bool preload_all = !table_options.cache_index_and_filter_blocks;

  if (!ioptions.allow_mmap_reads) {
    s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
                     preload_all, &prefetch_buffer);
  } else {
    // Should not prefetch for mmap mode.
    prefetch_buffer.reset(new FilePrefetchBuffer(
        nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
  }

  // Read in the following order:
  //    1. Footer
  //    2. [metaindex block]
  //    3. [meta block: properties]
  //    4. [meta block: range deletion tombstone]
  //    5. [meta block: compression dictionary]
  //    6. [meta block: index]
  //    7. [meta block: filter]
  s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
                         kBlockBasedTableMagicNumber);
  if (!s.ok()) {
    return s;
  }
  if (!BlockBasedTableSupportedVersion(footer.version())) {
    return Status::Corruption(
        "Unknown Footer version. Maybe this file was created with newer "
        "version of RocksDB?");
  }

  // We've successfully read the footer. We are ready to serve requests.
  // Better not to mutate rep_ after the creation; e.g. the raw pointer of
  // internal_prefix_transform will be used to create HashIndexReader, whose
  // reset may access a dangling pointer.
  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
  Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
                                      internal_comparator, skip_filters, level,
                                      immortal_table);
  rep->file = std::move(file);
  rep->footer = footer;
  rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
  // We need to wrap data with internal_prefix_transform to make sure it can
  // handle prefixes correctly.
  if (prefix_extractor != nullptr) {
    rep->internal_prefix_transform.reset(
        new InternalKeySliceTransform(prefix_extractor));
  }
  SetupCacheKeyPrefix(rep);
  std::unique_ptr<BlockBasedTable> new_table(
      new BlockBasedTable(rep, block_cache_tracer));

  // page cache options
  rep->persistent_cache_options =
      PersistentCacheOptions(rep->table_options.persistent_cache,
                             std::string(rep->persistent_cache_key_prefix,
                                         rep->persistent_cache_key_prefix_size),
                             rep->ioptions.statistics);

  // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
  // handle to null, otherwise it may be seen as uninitialized during the below
  // meta-block reads.
  rep->compression_dict_handle = BlockHandle::NullBlockHandle();

  // Read metaindex
  std::unique_ptr<Block> metaindex;
  std::unique_ptr<InternalIterator> metaindex_iter;
  s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex,
                                    &metaindex_iter);
  if (!s.ok()) {
    return s;
  }

  // Populates table_properties and some fields that depend on it,
  // such as index_type.
  s = new_table->ReadPropertiesBlock(prefetch_buffer.get(),
                                     metaindex_iter.get(), largest_seqno);
  if (!s.ok()) {
    return s;
  }
  s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(),
                                   internal_comparator, &lookup_context);
  if (!s.ok()) {
    return s;
  }
  s = new_table->PrefetchIndexAndFilterBlocks(
      prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
      prefetch_all, table_options, level, &lookup_context);

  if (s.ok()) {
    // Update tail prefetch stats
    assert(prefetch_buffer.get() != nullptr);
    if (tail_prefetch_stats != nullptr) {
      assert(prefetch_buffer->min_offset_read() < file_size);
      tail_prefetch_stats->RecordEffectiveSize(
          static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
    }

    *table_reader = std::move(new_table);
  }

  return s;
}

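// Prefetch the tail of the file (footer plus meta blocks) into a
// FilePrefetchBuffer before Open() starts issuing reads. The prefetch size
// comes from TailPrefetchStats history when available; otherwise a heuristic
// of 512 KB (when index/filter blocks will also be read) or 4 KB is used.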
Status BlockBasedTable::PrefetchTail(
    RandomAccessFileReader* file, uint64_t file_size,
    TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
    const bool preload_all,
    std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
  size_t tail_prefetch_size = 0;
  if (tail_prefetch_stats != nullptr) {
    // Multiple threads may get a 0 (no history) when running in parallel,
    // but it will get cleared after the first of them finishes.
    tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
  }
  if (tail_prefetch_size == 0) {
    // Before reading the footer, read backwards to prefetch data. Do more
    // readahead if we're going to read index/filter.
    // TODO: This may incorrectly select small readahead in case partitioned
    // index/filter is enabled and top-level partition pinning is enabled.
    // That's because we need to issue readahead before we read the properties,
    // at which point we don't yet know the index type.
    tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
  }
  size_t prefetch_off;
  size_t prefetch_len;
  if (file_size < tail_prefetch_size) {
    prefetch_off = 0;
    prefetch_len = static_cast<size_t>(file_size);
  } else {
    prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
    prefetch_len = tail_prefetch_size;
  }
  TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
                           &tail_prefetch_size);
  Status s;
  // TODO: should not have this special logic in the future.
  if (!file->use_direct_io()) {
    prefetch_buffer->reset(new FilePrefetchBuffer(
        nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
    s = file->Prefetch(prefetch_off, prefetch_len);
  } else {
    prefetch_buffer->reset(new FilePrefetchBuffer(
        nullptr, 0, 0, true /* enable */, true /* track_min_offset */));
    s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
  }
  return s;
}

Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
    FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
    TableProperties** table_properties) {
  assert(table_properties != nullptr);
  // If this is an external SST file ingested with write_global_seqno set to
  // true, then we expect the checksum mismatch because checksum was written
  // by SstFileWriter, but its global seqno in the properties block may have
  // been changed during ingestion. In this case, we read the properties
  // block, copy it to a memory buffer, change the global seqno to its
  // original value, i.e. 0, and verify the checksum again.
  BlockHandle props_block_handle;
  CacheAllocationPtr tmp_buf;
  Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
                            rep_->footer, rep_->ioptions, table_properties,
                            false /* verify_checksum */, &props_block_handle,
                            &tmp_buf, false /* compression_type_missing */,
                            nullptr /* memory_allocator */);
  if (s.ok() && tmp_buf) {
    const auto seqno_pos_iter =
        (*table_properties)
            ->properties_offsets.find(
                ExternalSstFilePropertyNames::kGlobalSeqno);
    size_t block_size = static_cast<size_t>(props_block_handle.size());
    if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
      uint64_t global_seqno_offset = seqno_pos_iter->second;
      EncodeFixed64(
          tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
    }
    uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
    s = ROCKSDB_NAMESPACE::VerifyChecksum(rep_->footer.checksum(),
                                          tmp_buf.get(), block_size + 1, value);
  }
  return s;
}

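// Locate and parse the properties meta block. On a checksum mismatch, fall
// back to TryReadPropertiesWithGlobalSeqno(), since ingested files may have
// had their global seqno rewritten in place after the checksum was computed.
// The parsed properties populate several Rep fields: compression hints, index
// type and key format, filtering flags and the global sequence number.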
Status BlockBasedTable::ReadPropertiesBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const SequenceNumber largest_seqno) {
  bool found_properties_block = true;
  Status s;
  s = SeekToPropertiesBlock(meta_iter, &found_properties_block);

  if (!s.ok()) {
    ROCKS_LOG_WARN(rep_->ioptions.info_log,
                   "Error when seeking to properties block from file: %s",
                   s.ToString().c_str());
  } else if (found_properties_block) {
    s = meta_iter->status();
    TableProperties* table_properties = nullptr;
    if (s.ok()) {
      s = ReadProperties(
          meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
          rep_->ioptions, &table_properties, true /* verify_checksum */,
          nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
          false /* compression_type_missing */, nullptr /* memory_allocator */);
    }

    if (s.IsCorruption()) {
      s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
                                           &table_properties);
    }
    std::unique_ptr<TableProperties> props_guard;
    if (table_properties != nullptr) {
      props_guard.reset(table_properties);
    }

    if (!s.ok()) {
      ROCKS_LOG_WARN(rep_->ioptions.info_log,
                     "Encountered error while reading data from properties "
                     "block %s",
                     s.ToString().c_str());
    } else {
      assert(table_properties != nullptr);
      rep_->table_properties.reset(props_guard.release());
      rep_->blocks_maybe_compressed =
          rep_->table_properties->compression_name !=
          CompressionTypeToString(kNoCompression);
      rep_->blocks_definitely_zstd_compressed =
          (rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTD) ||
           rep_->table_properties->compression_name ==
               CompressionTypeToString(kZSTDNotFinalCompression));
    }
  } else {
    ROCKS_LOG_ERROR(rep_->ioptions.info_log,
                    "Cannot find Properties block from file.");
  }
#ifndef ROCKSDB_LITE
  if (rep_->table_properties) {
    ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
                        &(rep_->table_prefix_extractor));
  }
#endif  // ROCKSDB_LITE

  // Read the table properties, if provided.
  if (rep_->table_properties) {
    rep_->whole_key_filtering &=
        IsFeatureSupported(*(rep_->table_properties),
                           BlockBasedTablePropertyNames::kWholeKeyFiltering,
                           rep_->ioptions.info_log);
    rep_->prefix_filtering &=
        IsFeatureSupported(*(rep_->table_properties),
                           BlockBasedTablePropertyNames::kPrefixFiltering,
                           rep_->ioptions.info_log);

    rep_->index_key_includes_seq =
        rep_->table_properties->index_key_is_user_key == 0;
    rep_->index_value_is_full =
        rep_->table_properties->index_value_is_delta_encoded == 0;

    // Update index_type with the true type.
    // If table properties don't contain index type, we assume that the table
    // is in very old format and has kBinarySearch index type.
    auto& props = rep_->table_properties->user_collected_properties;
    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
    if (pos != props.end()) {
      rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
          DecodeFixed32(pos->second.c_str()));
    }

    rep_->index_has_first_key =
        rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;

    s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
                                &(rep_->global_seqno));
    if (!s.ok()) {
      ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
    }
  }
  return s;
}

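// Read the range deletion meta block, if present, and build the
// FragmentedRangeTombstoneList that backs range tombstone iterators for this
// table. Errors are logged and propagated to the caller.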
Status BlockBasedTable::ReadRangeDelBlock(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    const InternalKeyComparator& internal_comparator,
    BlockCacheLookupContext* lookup_context) {
  Status s;
  bool found_range_del_block;
  BlockHandle range_del_handle;
  s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
  if (!s.ok()) {
    ROCKS_LOG_WARN(
        rep_->ioptions.info_log,
        "Error when seeking to range delete tombstones block from file: %s",
        s.ToString().c_str());
  } else if (found_range_del_block && !range_del_handle.IsNull()) {
    ReadOptions read_options;
    std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
        read_options, range_del_handle,
        /*input_iter=*/nullptr, BlockType::kRangeDeletion,
        /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
    assert(iter != nullptr);
    s = iter->status();
    if (!s.ok()) {
      ROCKS_LOG_WARN(
          rep_->ioptions.info_log,
          "Encountered error while reading data from range del block %s",
          s.ToString().c_str());
    } else {
      rep_->fragmented_range_dels =
          std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
                                                         internal_comparator);
    }
  }
  return s;
}

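// Resolve the filter and compression dictionary handles from the metaindex,
// create the index reader, and prefetch/pin the top level (or all partitions)
// of the index and filter as dictated by prefetch_all,
// cache_index_and_filter_blocks, pin_l0_filter_and_index_blocks_in_cache and
// pin_top_level_index_and_filter.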
Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
    FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
    BlockBasedTable* new_table, bool prefetch_all,
    const BlockBasedTableOptions& table_options, const int level,
    BlockCacheLookupContext* lookup_context) {
  Status s;

  // Find filter handle and filter type
  if (rep_->filter_policy) {
    for (auto filter_type :
         {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
          Rep::FilterType::kBlockFilter}) {
      std::string prefix;
      switch (filter_type) {
        case Rep::FilterType::kFullFilter:
          prefix = kFullFilterBlockPrefix;
          break;
        case Rep::FilterType::kPartitionedFilter:
          prefix = kPartitionedFilterBlockPrefix;
          break;
        case Rep::FilterType::kBlockFilter:
          prefix = kFilterBlockPrefix;
          break;
        default:
          assert(0);
      }
      std::string filter_block_key = prefix;
      filter_block_key.append(rep_->filter_policy->Name());
      if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
              .ok()) {
        rep_->filter_type = filter_type;
        break;
      }
    }
  }

  // Find compression dictionary handle
  bool found_compression_dict = false;
  s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
                                 &rep_->compression_dict_handle);
  if (!s.ok()) {
    return s;
  }

  BlockBasedTableOptions::IndexType index_type = rep_->index_type;

  const bool use_cache = table_options.cache_index_and_filter_blocks;

  // pin both index and filters, down to all partitions
  const bool pin_all =
      rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;

  // prefetch the first level of index
  const bool prefetch_index =
      prefetch_all ||
      (table_options.pin_top_level_index_and_filter &&
       index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
  // pin the first level of index
  const bool pin_index =
      pin_all || (table_options.pin_top_level_index_and_filter &&
                  index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);

  std::unique_ptr<IndexReader> index_reader;
  s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
                                   prefetch_index, pin_index, lookup_context,
                                   &index_reader);
  if (!s.ok()) {
    return s;
  }

  rep_->index_reader = std::move(index_reader);

  // The partitions of a partitioned index are always stored in the cache. They
  // hence follow the configuration for pin and prefetch regardless of the
  // value of cache_index_and_filter_blocks.
  if (prefetch_all) {
    rep_->index_reader->CacheDependencies(pin_all);
  }

  // prefetch the first level of filter
  const bool prefetch_filter =
      prefetch_all ||
      (table_options.pin_top_level_index_and_filter &&
       rep_->filter_type == Rep::FilterType::kPartitionedFilter);
  // Partition filters cannot be enabled without partition indexes
  assert(!prefetch_filter || prefetch_index);
  // pin the first level of filter
  const bool pin_filter =
      pin_all || (table_options.pin_top_level_index_and_filter &&
                  rep_->filter_type == Rep::FilterType::kPartitionedFilter);

  if (rep_->filter_policy) {
    auto filter = new_table->CreateFilterBlockReader(
        prefetch_buffer, use_cache, prefetch_filter, pin_filter,
        lookup_context);
    if (filter) {
      // Refer to the comment above about partitioned indexes always being
      // cached
      if (prefetch_all) {
        filter->CacheDependencies(pin_all);
      }

      rep_->filter = std::move(filter);
    }
  }

  if (!rep_->compression_dict_handle.IsNull()) {
    std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
    s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache,
                                        prefetch_all, pin_all, lookup_context,
                                        &uncompression_dict_reader);
    if (!s.ok()) {
      return s;
    }

    rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
  }

  assert(s.ok());
  return s;
}

void BlockBasedTable::SetupForCompaction() {
  switch (rep_->ioptions.access_hint_on_compaction_start) {
    case Options::NONE:
      break;
    case Options::NORMAL:
      rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
      break;
    case Options::SEQUENTIAL:
      rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
      break;
    case Options::WILLNEED:
      rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
      break;
    default:
      assert(false);
  }
}

std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
    const {
  return rep_->table_properties;
}

size_t BlockBasedTable::ApproximateMemoryUsage() const {
  size_t usage = 0;
  if (rep_->filter) {
    usage += rep_->filter->ApproximateMemoryUsage();
  }
  if (rep_->index_reader) {
    usage += rep_->index_reader->ApproximateMemoryUsage();
  }
  if (rep_->uncompression_dict_reader) {
    usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
  }
  return usage;
}

// Load the meta-index block from the file. On success, return the loaded
// metaindex block and its iterator.
Status BlockBasedTable::ReadMetaIndexBlock(
    FilePrefetchBuffer* prefetch_buffer,
    std::unique_ptr<Block>* metaindex_block,
    std::unique_ptr<InternalIterator>* iter) {
  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
  // it is an empty block.
  std::unique_ptr<Block> metaindex;
  Status s = ReadBlockFromFile(
      rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
      rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
      true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
      UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
      0 /* read_amp_bytes_per_bit */, GetMemoryAllocator(rep_->table_options),
      false /* for_compaction */, rep_->blocks_definitely_zstd_compressed,
      nullptr /* filter_policy */);

  if (!s.ok()) {
    ROCKS_LOG_ERROR(rep_->ioptions.info_log,
                    "Encountered error while reading data from the meta index"
                    " block %s",
                    s.ToString().c_str());
    return s;
  }

  *metaindex_block = std::move(metaindex);
  // meta block uses bytewise comparator.
  iter->reset(metaindex_block->get()->NewDataIterator(
      BytewiseComparator(), BytewiseComparator(),
      kDisableGlobalSequenceNumber));
  return Status::OK();
}

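// Try to retrieve an uncompressed block of type TBlocklike from the block
// cache and, failing that, from the compressed block cache (uncompressing it
// and, when read_options.fill_cache allows, inserting the result into the
// uncompressed cache). On a complete miss the returned status is OK and
// *block stays empty.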
template <typename TBlocklike>
Status BlockBasedTable::GetDataBlockFromCache(
    const Slice& block_cache_key, const Slice& compressed_block_cache_key,
    Cache* block_cache, Cache* block_cache_compressed,
    const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
    const UncompressionDict& uncompression_dict, BlockType block_type,
    GetContext* get_context) const {
  const size_t read_amp_bytes_per_bit =
      block_type == BlockType::kData
          ? rep_->table_options.read_amp_bytes_per_bit
          : 0;
  assert(block);
  assert(block->IsEmpty());

  Status s;
  BlockContents* compressed_block = nullptr;
  Cache::Handle* block_cache_compressed_handle = nullptr;

  // Lookup uncompressed cache first
  if (block_cache != nullptr) {
    auto cache_handle = GetEntryFromCache(block_cache, block_cache_key,
                                          block_type, get_context);
    if (cache_handle != nullptr) {
      block->SetCachedValue(
          reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)),
          block_cache, cache_handle);
      return s;
    }
  }

  // If not found, search from the compressed block cache.
  assert(block->IsEmpty());

  if (block_cache_compressed == nullptr) {
    return s;
  }

  assert(!compressed_block_cache_key.empty());
  block_cache_compressed_handle =
      block_cache_compressed->Lookup(compressed_block_cache_key);

  Statistics* statistics = rep_->ioptions.statistics;

  // if we found in the compressed cache, then uncompress and insert into
  // uncompressed cache
  if (block_cache_compressed_handle == nullptr) {
    RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
    return s;
  }

  // found compressed block
  RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
  compressed_block = reinterpret_cast<BlockContents*>(
      block_cache_compressed->Value(block_cache_compressed_handle));
  CompressionType compression_type = compressed_block->get_compression_type();
  assert(compression_type != kNoCompression);

  // Retrieve the uncompressed contents into a new buffer
  BlockContents contents;
  UncompressionContext context(compression_type);
  UncompressionInfo info(context, uncompression_dict, compression_type);
  s = UncompressBlockContents(
      info, compressed_block->data.data(), compressed_block->data.size(),
      &contents, rep_->table_options.format_version, rep_->ioptions,
      GetMemoryAllocator(rep_->table_options));

  // Insert uncompressed block into block cache
  if (s.ok()) {
    std::unique_ptr<TBlocklike> block_holder(
        BlocklikeTraits<TBlocklike>::Create(
            std::move(contents), read_amp_bytes_per_bit, statistics,
            rep_->blocks_definitely_zstd_compressed,
            rep_->table_options.filter_policy.get()));  // uncompressed block

    if (block_cache != nullptr && block_holder->own_bytes() &&
        read_options.fill_cache) {
      size_t charge = block_holder->ApproximateMemoryUsage();
      Cache::Handle* cache_handle = nullptr;
      s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
                              SimpleDeleter<TBlocklike>::GetInstance(),
                              &cache_handle);
      if (s.ok()) {
        assert(cache_handle != nullptr);
        block->SetCachedValue(block_holder.release(), block_cache,
                              cache_handle);

        UpdateCacheInsertionMetrics(block_type, get_context, charge);
      } else {
        RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
      }
    } else {
      block->SetOwnedValue(block_holder.release());
    }
  }

  // Release hold on compressed cache entry
  block_cache_compressed->Release(block_cache_compressed_handle);
  return s;
}

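// Insert a freshly read block into the caches: the still-compressed contents
// go into block_cache_compressed (when configured) and the uncompressed,
// parsed block goes into block_cache. On success *cached_block holds a
// cache-owned reference; when the block cannot be cached, *cached_block owns
// it outright.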
1190 template <typename TBlocklike>
PutDataBlockToCache(const Slice & block_cache_key,const Slice & compressed_block_cache_key,Cache * block_cache,Cache * block_cache_compressed,CachableEntry<TBlocklike> * cached_block,BlockContents * raw_block_contents,CompressionType raw_block_comp_type,const UncompressionDict & uncompression_dict,MemoryAllocator * memory_allocator,BlockType block_type,GetContext * get_context) const1191 Status BlockBasedTable::PutDataBlockToCache(
1192     const Slice& block_cache_key, const Slice& compressed_block_cache_key,
1193     Cache* block_cache, Cache* block_cache_compressed,
1194     CachableEntry<TBlocklike>* cached_block, BlockContents* raw_block_contents,
1195     CompressionType raw_block_comp_type,
1196     const UncompressionDict& uncompression_dict,
1197     MemoryAllocator* memory_allocator, BlockType block_type,
1198     GetContext* get_context) const {
1199   const ImmutableCFOptions& ioptions = rep_->ioptions;
1200   const uint32_t format_version = rep_->table_options.format_version;
1201   const size_t read_amp_bytes_per_bit =
1202       block_type == BlockType::kData
1203           ? rep_->table_options.read_amp_bytes_per_bit
1204           : 0;
1205   const Cache::Priority priority =
1206       rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
1207               (block_type == BlockType::kFilter ||
1208                block_type == BlockType::kCompressionDictionary ||
1209                block_type == BlockType::kIndex)
1210           ? Cache::Priority::HIGH
1211           : Cache::Priority::LOW;
1212   assert(cached_block);
1213   assert(cached_block->IsEmpty());
1214 
1215   Status s;
1216   Statistics* statistics = ioptions.statistics;
1217 
1218   std::unique_ptr<TBlocklike> block_holder;
1219   if (raw_block_comp_type != kNoCompression) {
1220     // Retrieve the uncompressed contents into a new buffer
1221     BlockContents uncompressed_block_contents;
1222     UncompressionContext context(raw_block_comp_type);
1223     UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
1224     s = UncompressBlockContents(info, raw_block_contents->data.data(),
1225                                 raw_block_contents->data.size(),
1226                                 &uncompressed_block_contents, format_version,
1227                                 ioptions, memory_allocator);
1228     if (!s.ok()) {
1229       return s;
1230     }
1231 
1232     block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1233         std::move(uncompressed_block_contents), read_amp_bytes_per_bit,
1234         statistics, rep_->blocks_definitely_zstd_compressed,
1235         rep_->table_options.filter_policy.get()));
1236   } else {
1237     block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
1238         std::move(*raw_block_contents), read_amp_bytes_per_bit, statistics,
1239         rep_->blocks_definitely_zstd_compressed,
1240         rep_->table_options.filter_policy.get()));
1241   }
1242 
1243   // Insert compressed block into compressed block cache.
1244   // Release the hold on the compressed cache entry immediately.
1245   if (block_cache_compressed != nullptr &&
1246       raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
1247       raw_block_contents->own_bytes()) {
1248 #ifndef NDEBUG
1249     assert(raw_block_contents->is_raw_block);
1250 #endif  // NDEBUG
1251 
1252     // We cannot directly put raw_block_contents because this could point to
1253     // an object in the stack.
1254     BlockContents* block_cont_for_comp_cache =
1255         new BlockContents(std::move(*raw_block_contents));
1256     s = block_cache_compressed->Insert(
1257         compressed_block_cache_key, block_cont_for_comp_cache,
1258         block_cont_for_comp_cache->ApproximateMemoryUsage(),
1259         SimpleDeleter<BlockContents>::GetInstance());
1260     if (s.ok()) {
1261       // The cache now owns the block; don't delete it below.
1262       RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
1263     } else {
1264       RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
1265       delete block_cont_for_comp_cache;
1266     }
1267   }
1268 
1269   // insert into uncompressed block cache
1270   if (block_cache != nullptr && block_holder->own_bytes()) {
1271     size_t charge = block_holder->ApproximateMemoryUsage();
1272     Cache::Handle* cache_handle = nullptr;
1273     s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
1274                             SimpleDeleter<TBlocklike>::GetInstance(),
1275                             &cache_handle, priority);
1276     if (s.ok()) {
1277       assert(cache_handle != nullptr);
1278       cached_block->SetCachedValue(block_holder.release(), block_cache,
1279                                    cache_handle);
1280 
1281       UpdateCacheInsertionMetrics(block_type, get_context, charge);
1282     } else {
1283       RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
1284     }
1285   } else {
1286     cached_block->SetOwnedValue(block_holder.release());
1287   }
1288 
1289   return s;
1290 }
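
// Illustrative sketch (comments only, not compiled): a caller-side view of the
// two-tier insertion performed by PutDataBlockToCache() above. Names such as
// MakeCacheKey() and ReadRawBlock() are hypothetical placeholders; the real key
// construction and block fetch live in MaybeReadBlockAndLoadToCache().
//
//   CachableEntry<Block> entry;
//   BlockContents raw = ReadRawBlock(handle);                  // hypothetical
//   Status s = PutDataBlockToCache(
//       MakeCacheKey(handle), MakeCompressedCacheKey(handle),  // hypothetical
//       block_cache, block_cache_compressed, &entry, &raw, raw_comp_type,
//       uncompression_dict, allocator, BlockType::kData,
//       /*get_context=*/nullptr);
//   // On success, `entry` either points at the uncompressed block cache entry
//   // (SetCachedValue) or owns the uncompressed block directly (SetOwnedValue).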
1291 
1292 std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
1293     FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
1294     bool pin, BlockCacheLookupContext* lookup_context) {
1295   auto& rep = rep_;
1296   auto filter_type = rep->filter_type;
1297   if (filter_type == Rep::FilterType::kNoFilter) {
1298     return std::unique_ptr<FilterBlockReader>();
1299   }
1300 
1301   assert(rep->filter_policy);
1302 
1303   switch (filter_type) {
1304     case Rep::FilterType::kPartitionedFilter:
1305       return PartitionedFilterBlockReader::Create(
1306           this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1307 
1308     case Rep::FilterType::kBlockFilter:
1309       return BlockBasedFilterBlockReader::Create(
1310           this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
1311 
1312     case Rep::FilterType::kFullFilter:
1313       return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache,
1314                                            prefetch, pin, lookup_context);
1315 
1316     default:
1317       // filter_type is either kNoFilter (exited the function at the first if),
1318       // or it must be covered in this switch block
1319       assert(false);
1320       return std::unique_ptr<FilterBlockReader>();
1321   }
1322 }
1323 
1324 // disable_prefix_seek should be set to true when prefix_extractor found in SST
1325 // differs from the one in mutable_cf_options and index type is HashBasedIndex
1326 InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
1327     const ReadOptions& read_options, bool disable_prefix_seek,
1328     IndexBlockIter* input_iter, GetContext* get_context,
1329     BlockCacheLookupContext* lookup_context) const {
1330   assert(rep_ != nullptr);
1331   assert(rep_->index_reader != nullptr);
1332 
1333   // We don't return pinned data from index blocks, so no need
1334   // to set `block_contents_pinned`.
1335   return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
1336                                          input_iter, get_context,
1337                                          lookup_context);
1338 }
1339 
1340 
1341 template <>
1342 DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
1343     const Rep* rep, Block* block, BlockType block_type,
1344     DataBlockIter* input_iter, bool block_contents_pinned) {
1345   return block->NewDataIterator(
1346       &rep->internal_comparator, rep->internal_comparator.user_comparator(),
1347       rep->get_global_seqno(block_type), input_iter, rep->ioptions.statistics,
1348       block_contents_pinned);
1349 }
1350 
1351 template <>
1352 IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
1353     const Rep* rep, Block* block, BlockType block_type,
1354     IndexBlockIter* input_iter, bool block_contents_pinned) {
1355   return block->NewIndexIterator(
1356       &rep->internal_comparator, rep->internal_comparator.user_comparator(),
1357       rep->get_global_seqno(block_type), input_iter, rep->ioptions.statistics,
1358       /* total_order_seek */ true, rep->index_has_first_key,
1359       rep->index_key_includes_seq, rep->index_value_is_full,
1360       block_contents_pinned);
1361 }
1362 
1363 
1364 // If contents is nullptr, this function looks up the block caches for the
1365 // data block referenced by handle, and read the block from disk if necessary.
1366 // If contents is non-null, it skips the cache lookup and disk read, since
1367 // the caller has already read it. In both cases, if ro.fill_cache is true,
1368 // it inserts the block into the block cache.
1369 template <typename TBlocklike>
1370 Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
1371     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1372     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1373     CachableEntry<TBlocklike>* block_entry, BlockType block_type,
1374     GetContext* get_context, BlockCacheLookupContext* lookup_context,
1375     BlockContents* contents) const {
1376   assert(block_entry != nullptr);
1377   const bool no_io = (ro.read_tier == kBlockCacheTier);
1378   Cache* block_cache = rep_->table_options.block_cache.get();
1379   // No point in caching compressed blocks if the table never goes away
1380   Cache* block_cache_compressed =
1381       rep_->immortal_table ? nullptr
1382                            : rep_->table_options.block_cache_compressed.get();
1383 
1384   // First, try to get the block from the cache
1385   //
1386   // If either block cache is enabled, we'll try to read from it.
1387   Status s;
1388   char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
1389   char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
1390   Slice key /* key to the block cache */;
1391   Slice ckey /* key to the compressed block cache */;
1392   bool is_cache_hit = false;
1393   if (block_cache != nullptr || block_cache_compressed != nullptr) {
1394     // create key for block cache
1395     if (block_cache != nullptr) {
1396       key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
1397                         handle, cache_key);
1398     }
1399 
1400     if (block_cache_compressed != nullptr) {
1401       ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
1402                          rep_->compressed_cache_key_prefix_size, handle,
1403                          compressed_cache_key);
1404     }
1405 
1406     if (!contents) {
1407       s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
1408                                 ro, block_entry, uncompression_dict, block_type,
1409                                 get_context);
1410       if (block_entry->GetValue()) {
1411         // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
1412         // compressed block cache.
1413         is_cache_hit = true;
1414       }
1415     }
1416 
1417     // Couldn't find the block in the cache. If I/O is allowed, read it
1418     // from the file.
1419     if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
1420       Statistics* statistics = rep_->ioptions.statistics;
1421       const bool maybe_compressed =
1422           block_type != BlockType::kFilter &&
1423           block_type != BlockType::kCompressionDictionary &&
1424           rep_->blocks_maybe_compressed;
1425       const bool do_uncompress = maybe_compressed && !block_cache_compressed;
1426       CompressionType raw_block_comp_type;
1427       BlockContents raw_block_contents;
1428       if (!contents) {
1429         StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
1430         BlockFetcher block_fetcher(
1431             rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
1432             &raw_block_contents, rep_->ioptions, do_uncompress,
1433             maybe_compressed, block_type, uncompression_dict,
1434             rep_->persistent_cache_options,
1435             GetMemoryAllocator(rep_->table_options),
1436             GetMemoryAllocatorForCompressedBlock(rep_->table_options));
1437         s = block_fetcher.ReadBlockContents();
1438         raw_block_comp_type = block_fetcher.get_compression_type();
1439         contents = &raw_block_contents;
1440       } else {
1441         raw_block_comp_type = contents->get_compression_type();
1442       }
1443 
1444       if (s.ok()) {
1445         // If filling cache is allowed and a cache is configured, try to put the
1446         // block to the cache.
1447         s = PutDataBlockToCache(
1448             key, ckey, block_cache, block_cache_compressed, block_entry,
1449             contents, raw_block_comp_type, uncompression_dict,
1450             GetMemoryAllocator(rep_->table_options), block_type, get_context);
1451       }
1452     }
1453   }
1454 
1455   // Fill lookup_context.
1456   if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
1457       lookup_context) {
1458     size_t usage = 0;
1459     uint64_t nkeys = 0;
1460     if (block_entry->GetValue()) {
1461       // Approximate the number of keys in the block using restarts.
1462       nkeys =
1463           rep_->table_options.block_restart_interval *
1464           BlocklikeTraits<TBlocklike>::GetNumRestarts(*block_entry->GetValue());
1465       usage = block_entry->GetValue()->ApproximateMemoryUsage();
1466     }
1467     TraceType trace_block_type = TraceType::kTraceMax;
1468     switch (block_type) {
1469       case BlockType::kData:
1470         trace_block_type = TraceType::kBlockTraceDataBlock;
1471         break;
1472       case BlockType::kFilter:
1473         trace_block_type = TraceType::kBlockTraceFilterBlock;
1474         break;
1475       case BlockType::kCompressionDictionary:
1476         trace_block_type = TraceType::kBlockTraceUncompressionDictBlock;
1477         break;
1478       case BlockType::kRangeDeletion:
1479         trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
1480         break;
1481       case BlockType::kIndex:
1482         trace_block_type = TraceType::kBlockTraceIndexBlock;
1483         break;
1484       default:
1485         // This cannot happen.
1486         assert(false);
1487         break;
1488     }
1489     bool no_insert = no_io || !ro.fill_cache;
1490     if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
1491             trace_block_type, lookup_context->caller)) {
1492       // Defer logging the access to Get() and MultiGet() to trace additional
1493       // information, e.g., referenced_key_exist_in_block.
1494 
1495       // Make a copy of the block key here since it will be logged later.
1496       lookup_context->FillLookupContext(
1497           is_cache_hit, no_insert, trace_block_type,
1498           /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
1499     } else {
1500       // Avoid making copy of block_key and cf_name when constructing the access
1501       // record.
1502       BlockCacheTraceRecord access_record(
1503           rep_->ioptions.env->NowMicros(),
1504           /*block_key=*/"", trace_block_type,
1505           /*block_size=*/usage, rep_->cf_id_for_tracing(),
1506           /*cf_name=*/"", rep_->level_for_tracing(),
1507           rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
1508           no_insert, lookup_context->get_id,
1509           lookup_context->get_from_user_specified_snapshot,
1510           /*referenced_key=*/"");
1511       block_cache_tracer_->WriteBlockAccess(access_record, key,
1512                                             rep_->cf_name_for_tracing(),
1513                                             lookup_context->referenced_key);
1514     }
1515   }
1516 
1517   assert(s.ok() || block_entry->GetValue() == nullptr);
1518   return s;
1519 }
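
// Rough control-flow sketch of MaybeReadBlockAndLoadToCache() above (comments
// only): a cache lookup, then an optional file read, then an insertion back
// into the cache(s), followed by optional tracing.
//
//   if (block_cache || block_cache_compressed) {
//     build cache key(s);
//     if (contents == nullptr) GetDataBlockFromCache(...);   // may hit
//     if (miss && !no_io && ro.fill_cache) {
//       if (contents == nullptr) BlockFetcher(...).ReadBlockContents();
//       PutDataBlockToCache(...);                            // pins *block_entry
//     }
//   }
//   if (block_cache_tracer_ is enabled) record the access;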
1520 
1521 // This function reads multiple data blocks from disk using Env::MultiRead()
1522 // and optionally inserts them into the block cache. It uses the scratch
1523 // buffer provided by the caller, which is contiguous. If scratch is a nullptr
1524 // it allocates a separate buffer for each block. Typically, if the blocks
1525 // need to be uncompressed and there is no compressed block cache, callers
1526 // can allocate a temporary scratch buffer in order to minimize memory
1527 // allocations.
1528 // If options.fill_cache is true, it inserts the blocks into cache. If it's
1529 // false and scratch is non-null and the blocks are uncompressed, it copies
1530 // the buffers to heap. In any case, the CachableEntry<Block> returned will
1531 // own the data bytes.
1532 // If compression is enabled and also there is no compressed block cache,
1533 // the adjacent blocks are read out in one IO (combined read)
1534 // batch - A MultiGetRange with only those keys with unique data blocks not
1535 //         found in cache
1536 // handles - A vector of block handles. Some of them may be NULL handles
1537 // scratch - An optional contiguous buffer to read compressed blocks into
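//
// Worked example (illustrative, made-up numbers): with a non-null scratch
// buffer and three handles where block B starts exactly where block A ends
// (including A's 5-byte trailer) but block C sits much later in the file, the
// loop below builds two FSReadRequests:
//   req[0] covers A and B in one combined read (req_offsets 0 and block_size(A))
//   req[1] covers C alone (req_offset 0)
// req_idx_for_block / req_offset_for_block then map each block back to the
// request, and the offset within that request, where its bytes landed.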
1538 void BlockBasedTable::RetrieveMultipleBlocks(
1539     const ReadOptions& options, const MultiGetRange* batch,
1540     const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
1541     autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
1542     autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results,
1543     char* scratch, const UncompressionDict& uncompression_dict) const {
1544   RandomAccessFileReader* file = rep_->file.get();
1545   const Footer& footer = rep_->footer;
1546   const ImmutableCFOptions& ioptions = rep_->ioptions;
1547   size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
1548   MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
1549 
1550   if (file->use_direct_io() || ioptions.allow_mmap_reads) {
1551     size_t idx_in_batch = 0;
1552     for (auto mget_iter = batch->begin(); mget_iter != batch->end();
1553          ++mget_iter, ++idx_in_batch) {
1554       BlockCacheLookupContext lookup_data_block_context(
1555           TableReaderCaller::kUserMultiGet);
1556       const BlockHandle& handle = (*handles)[idx_in_batch];
1557       if (handle.IsNull()) {
1558         continue;
1559       }
1560 
1561       (*statuses)[idx_in_batch] =
1562           RetrieveBlock(nullptr, options, handle, uncompression_dict,
1563                         &(*results)[idx_in_batch], BlockType::kData,
1564                         mget_iter->get_context, &lookup_data_block_context,
1565                         /* for_compaction */ false, /* use_cache */ true);
1566     }
1567     return;
1568   }
1569 
1570   autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs;
1571   size_t buf_offset = 0;
1572   size_t idx_in_batch = 0;
1573 
1574   uint64_t prev_offset = 0;
1575   size_t prev_len = 0;
1576   autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block;
1577   autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block;
1578   for (auto mget_iter = batch->begin(); mget_iter != batch->end();
1579        ++mget_iter, ++idx_in_batch) {
1580     const BlockHandle& handle = (*handles)[idx_in_batch];
1581     if (handle.IsNull()) {
1582       continue;
1583     }
1584 
1585     size_t prev_end = static_cast<size_t>(prev_offset) + prev_len;
1586 
1587     // If the current block is adjacent to the previous one and, at the same
1588     // time, compression is enabled and there is no compressed cache, we
1589     // combine the two block reads into one.
1590     if (scratch != nullptr && prev_end == handle.offset()) {
1591       req_offset_for_block.emplace_back(prev_len);
1592       prev_len += block_size(handle);
1593     } else {
1594       // No compression, or the current and previous blocks are not adjacent:
1595       // Step 1, create a new request for previous blocks
1596       if (prev_len != 0) {
1597         FSReadRequest req;
1598         req.offset = prev_offset;
1599         req.len = prev_len;
1600         if (scratch == nullptr) {
1601           req.scratch = new char[req.len];
1602         } else {
1603           req.scratch = scratch + buf_offset;
1604           buf_offset += req.len;
1605         }
1606         read_reqs.emplace_back(req);
1607       }
1608 
1609       // Step 2, remember the previous block info
1610       prev_offset = handle.offset();
1611       prev_len = block_size(handle);
1612       req_offset_for_block.emplace_back(0);
1613     }
1614     req_idx_for_block.emplace_back(read_reqs.size());
1615   }
1616   // Handle the last block and process the pending last request
1617   if (prev_len != 0) {
1618     FSReadRequest req;
1619     req.offset = prev_offset;
1620     req.len = prev_len;
1621     if (scratch == nullptr) {
1622       req.scratch = new char[req.len];
1623     } else {
1624       req.scratch = scratch + buf_offset;
1625     }
1626     read_reqs.emplace_back(req);
1627   }
1628 
1629   file->MultiRead(&read_reqs[0], read_reqs.size(), nullptr);
1630 
1631   idx_in_batch = 0;
1632   size_t valid_batch_idx = 0;
1633   for (auto mget_iter = batch->begin(); mget_iter != batch->end();
1634        ++mget_iter, ++idx_in_batch) {
1635     const BlockHandle& handle = (*handles)[idx_in_batch];
1636 
1637     if (handle.IsNull()) {
1638       continue;
1639     }
1640 
1641     assert(valid_batch_idx < req_idx_for_block.size());
1642     assert(valid_batch_idx < req_offset_for_block.size());
1643     assert(req_idx_for_block[valid_batch_idx] < read_reqs.size());
1644     size_t& req_idx = req_idx_for_block[valid_batch_idx];
1645     size_t& req_offset = req_offset_for_block[valid_batch_idx];
1646     valid_batch_idx++;
1647     FSReadRequest& req = read_reqs[req_idx];
1648     Status s = req.status;
1649     if (s.ok()) {
1650       if (req.result.size() != req.len) {
1651         s = Status::Corruption(
1652             "truncated block read from " + rep_->file->file_name() +
1653             " offset " + ToString(handle.offset()) + ", expected " +
1654             ToString(req.len) + " bytes, got " + ToString(req.result.size()));
1655       }
1656     }
1657 
1658     BlockContents raw_block_contents;
1659     size_t cur_read_end = req_offset + block_size(handle);
1660     if (cur_read_end > req.result.size()) {
1661       s = Status::Corruption(
1662           "truncated block read from " + rep_->file->file_name() + " offset " +
1663           ToString(handle.offset()) + ", expected " + ToString(req.len) +
1664           " bytes, got " + ToString(req.result.size()));
1665     }
1666 
1667     bool blocks_share_read_buffer = (req.result.size() != block_size(handle));
1668     if (s.ok()) {
1669       if (scratch == nullptr && !blocks_share_read_buffer) {
1670         // We allocated a buffer for this block. Give ownership of it to
1671         // BlockContents so it can free the memory
1672         assert(req.result.data() == req.scratch);
1673         std::unique_ptr<char[]> raw_block(req.scratch + req_offset);
1674         raw_block_contents = BlockContents(std::move(raw_block), handle.size());
1675       } else {
1676         // We used the scratch buffer, which is shared by the blocks.
1677         // raw_block_contents does not take ownership.
1678         raw_block_contents =
1679             BlockContents(Slice(req.scratch + req_offset, handle.size()));
1680       }
1681 
1682 #ifndef NDEBUG
1683       raw_block_contents.is_raw_block = true;
1684 #endif
1685       if (options.verify_checksums) {
1686         PERF_TIMER_GUARD(block_checksum_time);
1687         const char* data = req.result.data();
1688         uint32_t expected =
1689             DecodeFixed32(data + req_offset + handle.size() + 1);
1690         // Since the scratch buffer might be shared, the offset of the data
1691         // block in the buffer might not be 0. req.result.data() only points
1692         // to the start of each read request, so we need to add the block's
1693         // offset within that request. The checksum is stored in the block
1694         // trailer, starting at offset handle.size() + 1.
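        //
        // Illustrative layout of one block inside the (possibly shared)
        // request buffer, assuming the standard 5-byte trailer (1 compression
        // type byte + 4 checksum bytes):
        //
        //   data + req_offset
        //   |<---- handle.size() ---->|<- 1 ->|<---- 4 ---->|
        //   [      block payload      ][ type ][  checksum  ]
        //
        // The checksum covers handle.size() + 1 bytes and is decoded from
        // offset req_offset + handle.size() + 1.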
1695         s = ROCKSDB_NAMESPACE::VerifyChecksum(footer.checksum(),
1696                                               data + req_offset,
1697                                               handle.size() + 1, expected);
1698         TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
1699       }
1700     }
1701 
1702     if (s.ok()) {
1703       // This handles a rare case: compression is enabled and there is no
1704       // compressed cache (so combined reads are used and scratch != nullptr),
1705       // but some blocks are actually not compressed because their space
1706       // savings were below the threshold. In that case, if the block shares
1707       // the scratch memory, we need to copy it to the heap so that it can be
1708       // added to the regular block cache.
1709       CompressionType compression_type =
1710           raw_block_contents.get_compression_type();
1711       if (scratch != nullptr && compression_type == kNoCompression) {
1712         Slice raw = Slice(req.scratch + req_offset, block_size(handle));
1713         raw_block_contents = BlockContents(
1714             CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw),
1715             handle.size());
1716 #ifndef NDEBUG
1717         raw_block_contents.is_raw_block = true;
1718 #endif
1719       }
1720     }
1721 
1722     if (s.ok()) {
1723       if (options.fill_cache) {
1724         BlockCacheLookupContext lookup_data_block_context(
1725             TableReaderCaller::kUserMultiGet);
1726         CachableEntry<Block>* block_entry = &(*results)[idx_in_batch];
1727         // MaybeReadBlockAndLoadToCache will insert into the block caches if
1728         // necessary. Since we're passing the raw block contents, it will
1729         // avoid looking up the block cache
1730         s = MaybeReadBlockAndLoadToCache(
1731             nullptr, options, handle, uncompression_dict, block_entry,
1732             BlockType::kData, mget_iter->get_context,
1733             &lookup_data_block_context, &raw_block_contents);
1734 
1735         // block_entry value could be null if no block cache is present, i.e
1736         // BlockBasedTableOptions::no_block_cache is true and no compressed
1737         // block cache is configured. In that case, fall
1738         // through and set up the block explicitly
1739         if (block_entry->GetValue() != nullptr) {
1740           continue;
1741         }
1742       }
1743 
1744       CompressionType compression_type =
1745           raw_block_contents.get_compression_type();
1746       BlockContents contents;
1747       if (compression_type != kNoCompression) {
1748         UncompressionContext context(compression_type);
1749         UncompressionInfo info(context, uncompression_dict, compression_type);
1750         s = UncompressBlockContents(info, req.result.data() + req_offset,
1751                                     handle.size(), &contents, footer.version(),
1752                                     rep_->ioptions, memory_allocator);
1753       } else {
1754         // There are two cases here: 1) the caller uses the scratch buffer;
1755         // 2) we use the request buffer. If the scratch buffer is used, we
1756         // ensure that all raw blocks are copied to the heap as single blocks.
1757         // If the scratch buffer is not used, there is also no combined read,
1758         // so the raw block can be used directly.
1759         contents = std::move(raw_block_contents);
1760       }
1761       if (s.ok()) {
1762         (*results)[idx_in_batch].SetOwnedValue(new Block(
1763             std::move(contents), read_amp_bytes_per_bit, ioptions.statistics));
1764       }
1765     }
1766     (*statuses)[idx_in_batch] = s;
1767   }
1768 }
1769 
1770 template <typename TBlocklike>
1771 Status BlockBasedTable::RetrieveBlock(
1772     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1773     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1774     CachableEntry<TBlocklike>* block_entry, BlockType block_type,
1775     GetContext* get_context, BlockCacheLookupContext* lookup_context,
1776     bool for_compaction, bool use_cache) const {
1777   assert(block_entry);
1778   assert(block_entry->IsEmpty());
1779 
1780   Status s;
1781   if (use_cache) {
1782     s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
1783                                      uncompression_dict, block_entry,
1784                                      block_type, get_context, lookup_context,
1785                                      /*contents=*/nullptr);
1786 
1787     if (!s.ok()) {
1788       return s;
1789     }
1790 
1791     if (block_entry->GetValue() != nullptr) {
1792       assert(s.ok());
1793       return s;
1794     }
1795   }
1796 
1797   assert(block_entry->IsEmpty());
1798 
1799   const bool no_io = ro.read_tier == kBlockCacheTier;
1800   if (no_io) {
1801     return Status::Incomplete("no blocking io");
1802   }
1803 
1804   const bool maybe_compressed =
1805       block_type != BlockType::kFilter &&
1806       block_type != BlockType::kCompressionDictionary &&
1807       rep_->blocks_maybe_compressed;
1808   const bool do_uncompress = maybe_compressed;
1809   std::unique_ptr<TBlocklike> block;
1810 
1811   {
1812     StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
1813                  READ_BLOCK_GET_MICROS);
1814     s = ReadBlockFromFile(
1815         rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
1816         rep_->ioptions, do_uncompress, maybe_compressed, block_type,
1817         uncompression_dict, rep_->persistent_cache_options,
1818         block_type == BlockType::kData
1819             ? rep_->table_options.read_amp_bytes_per_bit
1820             : 0,
1821         GetMemoryAllocator(rep_->table_options), for_compaction,
1822         rep_->blocks_definitely_zstd_compressed,
1823         rep_->table_options.filter_policy.get());
1824   }
1825 
1826   if (!s.ok()) {
1827     return s;
1828   }
1829 
1830   block_entry->SetOwnedValue(block.release());
1831 
1832   assert(s.ok());
1833   return s;
1834 }
1835 
1836 // Explicitly instantiate templates for both "blocklike" types we use.
1837 // This makes it possible to keep the template definitions in the .cc file.
1838 template Status BlockBasedTable::RetrieveBlock<BlockContents>(
1839     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1840     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1841     CachableEntry<BlockContents>* block_entry, BlockType block_type,
1842     GetContext* get_context, BlockCacheLookupContext* lookup_context,
1843     bool for_compaction, bool use_cache) const;
1844 
1845 template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>(
1846     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1847     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1848     CachableEntry<ParsedFullFilterBlock>* block_entry, BlockType block_type,
1849     GetContext* get_context, BlockCacheLookupContext* lookup_context,
1850     bool for_compaction, bool use_cache) const;
1851 
1852 template Status BlockBasedTable::RetrieveBlock<Block>(
1853     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1854     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1855     CachableEntry<Block>* block_entry, BlockType block_type,
1856     GetContext* get_context, BlockCacheLookupContext* lookup_context,
1857     bool for_compaction, bool use_cache) const;
1858 
1859 template Status BlockBasedTable::RetrieveBlock<UncompressionDict>(
1860     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
1861     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
1862     CachableEntry<UncompressionDict>* block_entry, BlockType block_type,
1863     GetContext* get_context, BlockCacheLookupContext* lookup_context,
1864     bool for_compaction, bool use_cache) const;
1865 
1866 BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
1867     const BlockBasedTable* table,
1868     std::unordered_map<uint64_t, CachableEntry<Block>>* block_map)
1869     : table_(table), block_map_(block_map) {}
1870 
1871 InternalIteratorBase<IndexValue>*
1872 BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
1873     const BlockHandle& handle) {
1874   // Return a block iterator on the index partition
1875   auto block = block_map_->find(handle.offset());
1876   // The partition may be missing from the map, since the block cache might
1877   // not have had space for it; in that case we return an empty iterator.
1878   if (block != block_map_->end()) {
1879     const Rep* rep = table_->get_rep();
1880     assert(rep);
1881 
1882     Statistics* kNullStats = nullptr;
1883     // We don't return pinned data from index blocks, so no need
1884     // to set `block_contents_pinned`.
1885     return block->second.GetValue()->NewIndexIterator(
1886         &rep->internal_comparator, rep->internal_comparator.user_comparator(),
1887         rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
1888         rep->index_has_first_key, rep->index_key_includes_seq,
1889         rep->index_value_is_full);
1890   }
1891   // Create an empty iterator
1892   return new IndexBlockIter();
1893 }
1894 
1895 // This will be broken if the user specifies an unusual implementation
1896 // of Options.comparator, or if the user specifies an unusual
1897 // definition of prefixes in BlockBasedTableOptions.filter_policy.
1898 // In particular, we require the following three properties:
1899 //
1900 // 1) key.starts_with(prefix(key))
1901 // 2) Compare(prefix(key), key) <= 0.
1902 // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
1903 //
1904 // If read_options.read_tier == kBlockCacheTier, this method will do no I/O and
1905 // will return true if the filter block is not in memory and not found in block
1906 // cache.
1907 //
1908 // REQUIRES: this method shouldn't be called while the DB lock is held.
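//
// Example (illustrative only): a fixed-length prefix extractor that returns
// the first 4 bytes of the user key, under the default bytewise comparator,
// satisfies all three properties. For key = "user|42", prefix = "user":
//   1) "user|42".starts_with("user")                     -> true
//   2) Compare("user", "user|42") <= 0                   -> true
//   3) key1 <= key2 implies prefix(key1) <= prefix(key2) for fixed-length
//      bytewise prefixes.
// A transform that reorders bytes (e.g. one that reverses the key) would break
// 2) and 3) and make this prefix-based pruning unsafe.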
1909 bool BlockBasedTable::PrefixMayMatch(
1910     const Slice& internal_key, const ReadOptions& read_options,
1911     const SliceTransform* options_prefix_extractor,
1912     const bool need_upper_bound_check,
1913     BlockCacheLookupContext* lookup_context) const {
1914   if (!rep_->filter_policy) {
1915     return true;
1916   }
1917 
1918   const SliceTransform* prefix_extractor;
1919 
1920   if (rep_->table_prefix_extractor == nullptr) {
1921     if (need_upper_bound_check) {
1922       return true;
1923     }
1924     prefix_extractor = options_prefix_extractor;
1925   } else {
1926     prefix_extractor = rep_->table_prefix_extractor.get();
1927   }
1928   auto user_key = ExtractUserKey(internal_key);
1929   if (!prefix_extractor->InDomain(user_key)) {
1930     return true;
1931   }
1932 
1933   bool may_match = true;
1934   Status s;
1935 
1936   // First, try checking with the full filter
1937   FilterBlockReader* const filter = rep_->filter.get();
1938   bool filter_checked = true;
1939   if (filter != nullptr) {
1940     const bool no_io = read_options.read_tier == kBlockCacheTier;
1941 
1942     if (!filter->IsBlockBased()) {
1943       const Slice* const const_ikey_ptr = &internal_key;
1944       may_match = filter->RangeMayExist(
1945           read_options.iterate_upper_bound, user_key, prefix_extractor,
1946           rep_->internal_comparator.user_comparator(), const_ikey_ptr,
1947           &filter_checked, need_upper_bound_check, no_io, lookup_context);
1948     } else {
1949       // if prefix_extractor changed for block based filter, skip filter
1950       if (need_upper_bound_check) {
1951         return true;
1952       }
1953       auto prefix = prefix_extractor->Transform(user_key);
1954       InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
1955       auto internal_prefix = internal_key_prefix.Encode();
1956 
1957       // To prevent any io operation in this method, we set `read_tier` to make
1958       // sure we always read index or filter only when they have already been
1959       // loaded to memory.
1960       ReadOptions no_io_read_options;
1961       no_io_read_options.read_tier = kBlockCacheTier;
1962 
1963       // Then, try to find it within each block.
1964       // We already know prefix_extractor and prefix_extractor_name must match
1965       // because `CheckPrefixMayMatch` first checks `check_filter_ == true`.
1966       std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
1967           no_io_read_options,
1968           /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
1969           /*get_context=*/nullptr, lookup_context));
1970       iiter->Seek(internal_prefix);
1971 
1972       if (!iiter->Valid()) {
1973         // we're past end of file
1974         // if it's incomplete, it means that we avoided I/O
1975         // and we're not really sure that we're past the end
1976         // of the file
1977         may_match = iiter->status().IsIncomplete();
1978       } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key())
1979                                                : iiter->key())
1980                      .starts_with(ExtractUserKey(internal_prefix))) {
1981         // we need to check for this subtle case because our only
1982         // guarantee is that "the key is a string >= last key in that data
1983         // block" according to the doc/table_format.txt spec.
1984         //
1985         // Suppose iiter->key() starts with the desired prefix; it is not
1986         // necessarily the case that the corresponding data block will
1987         // contain the prefix, since iiter->key() need not be in the
1988         // block.  However, the next data block may contain the prefix, so
1989         // we return true to play it safe.
1990         may_match = true;
1991       } else if (filter->IsBlockBased()) {
1992         // iiter->key() does NOT start with the desired prefix.  Because
1993         // Seek() finds the first key that is >= the seek target, this
1994         // means that iiter->key() > prefix.  Thus, any data blocks coming
1995         // after the data block corresponding to iiter->key() cannot
1996         // possibly contain the key.  Thus, the corresponding data block
1997         // is the only one that could potentially contain the prefix.
1998         BlockHandle handle = iiter->value().handle;
1999         may_match = filter->PrefixMayMatch(
2000             prefix, prefix_extractor, handle.offset(), no_io,
2001             /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context);
2002       }
2003     }
2004   }
2005 
2006   if (filter_checked) {
2007     Statistics* statistics = rep_->ioptions.statistics;
2008     RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
2009     if (!may_match) {
2010       RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
2011     }
2012   }
2013 
2014   return may_match;
2015 }
2016 
2017 
2018 InternalIterator* BlockBasedTable::NewIterator(
2019     const ReadOptions& read_options, const SliceTransform* prefix_extractor,
2020     Arena* arena, bool skip_filters, TableReaderCaller caller,
2021     size_t compaction_readahead_size) {
2022   BlockCacheLookupContext lookup_context{caller};
2023   bool need_upper_bound_check =
2024       read_options.auto_prefix_mode ||
2025       PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
2026   std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(NewIndexIterator(
2027       read_options,
2028       need_upper_bound_check &&
2029           rep_->index_type == BlockBasedTableOptions::kHashSearch,
2030       /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context));
2031   if (arena == nullptr) {
2032     return new BlockBasedTableIterator(
2033         this, read_options, rep_->internal_comparator, std::move(index_iter),
2034         !skip_filters && !read_options.total_order_seek &&
2035             prefix_extractor != nullptr,
2036         need_upper_bound_check, prefix_extractor, caller,
2037         compaction_readahead_size);
2038   } else {
2039     auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
2040     return new (mem) BlockBasedTableIterator(
2041         this, read_options, rep_->internal_comparator, std::move(index_iter),
2042         !skip_filters && !read_options.total_order_seek &&
2043             prefix_extractor != nullptr,
2044         need_upper_bound_check, prefix_extractor, caller,
2045         compaction_readahead_size);
2046   }
2047 }
2048 
2049 FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
2050     const ReadOptions& read_options) {
2051   if (rep_->fragmented_range_dels == nullptr) {
2052     return nullptr;
2053   }
2054   SequenceNumber snapshot = kMaxSequenceNumber;
2055   if (read_options.snapshot != nullptr) {
2056     snapshot = read_options.snapshot->GetSequenceNumber();
2057   }
2058   return new FragmentedRangeTombstoneIterator(
2059       rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
2060 }
2061 
2062 bool BlockBasedTable::FullFilterKeyMayMatch(
2063     const ReadOptions& read_options, FilterBlockReader* filter,
2064     const Slice& internal_key, const bool no_io,
2065     const SliceTransform* prefix_extractor, GetContext* get_context,
2066     BlockCacheLookupContext* lookup_context) const {
2067   if (filter == nullptr || filter->IsBlockBased()) {
2068     return true;
2069   }
2070   Slice user_key = ExtractUserKey(internal_key);
2071   const Slice* const const_ikey_ptr = &internal_key;
2072   bool may_match = true;
2073   if (rep_->whole_key_filtering) {
2074     size_t ts_sz =
2075         rep_->internal_comparator.user_comparator()->timestamp_size();
2076     Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
2077     may_match =
2078         filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
2079                             no_io, const_ikey_ptr, get_context, lookup_context);
2080   } else if (!read_options.total_order_seek && prefix_extractor &&
2081              rep_->table_properties->prefix_extractor_name.compare(
2082                  prefix_extractor->Name()) == 0 &&
2083              prefix_extractor->InDomain(user_key) &&
2084              !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
2085                                      prefix_extractor, kNotValid, no_io,
2086                                      const_ikey_ptr, get_context,
2087                                      lookup_context)) {
2088     may_match = false;
2089   }
2090   if (may_match) {
2091     RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
2092     PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
2093   }
2094   return may_match;
2095 }
2096 
2097 void BlockBasedTable::FullFilterKeysMayMatch(
2098     const ReadOptions& read_options, FilterBlockReader* filter,
2099     MultiGetRange* range, const bool no_io,
2100     const SliceTransform* prefix_extractor,
2101     BlockCacheLookupContext* lookup_context) const {
2102   if (filter == nullptr || filter->IsBlockBased()) {
2103     return;
2104   }
2105   if (rep_->whole_key_filtering) {
2106     filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
2107                          lookup_context);
2108   } else if (!read_options.total_order_seek && prefix_extractor &&
2109              rep_->table_properties->prefix_extractor_name.compare(
2110                  prefix_extractor->Name()) == 0) {
2111     filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
2112                              lookup_context);
2113   }
2114 }
2115 
2116 Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
2117                             GetContext* get_context,
2118                             const SliceTransform* prefix_extractor,
2119                             bool skip_filters) {
2120   assert(key.size() >= 8);  // key must be internal key
2121   assert(get_context != nullptr);
2122   Status s;
2123   const bool no_io = read_options.read_tier == kBlockCacheTier;
2124 
2125   FilterBlockReader* const filter =
2126       !skip_filters ? rep_->filter.get() : nullptr;
2127 
2128   // First check the full filter.
2129   // If the full filter is not useful, then go into each block.
2130   uint64_t tracing_get_id = get_context->get_tracing_get_id();
2131   BlockCacheLookupContext lookup_context{
2132       TableReaderCaller::kUserGet, tracing_get_id,
2133       /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
2134   if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
2135     // Trace the key since it contains both user key and sequence number.
2136     lookup_context.referenced_key = key.ToString();
2137     lookup_context.get_from_user_specified_snapshot =
2138         read_options.snapshot != nullptr;
2139   }
2140   const bool may_match =
2141       FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor,
2142                             get_context, &lookup_context);
2143   if (!may_match) {
2144     RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
2145     PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
2146   } else {
2147     IndexBlockIter iiter_on_stack;
2148     // if prefix_extractor found in block differs from options, disable
2149     // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
2150     bool need_upper_bound_check = false;
2151     if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
2152       need_upper_bound_check = PrefixExtractorChanged(
2153           rep_->table_properties.get(), prefix_extractor);
2154     }
2155     auto iiter =
2156         NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
2157                          get_context, &lookup_context);
2158     std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2159     if (iiter != &iiter_on_stack) {
2160       iiter_unique_ptr.reset(iiter);
2161     }
2162 
2163     size_t ts_sz =
2164         rep_->internal_comparator.user_comparator()->timestamp_size();
2165     bool matched = false;  // if such user key matched a key in SST
2166     bool done = false;
2167     for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
2168       IndexValue v = iiter->value();
2169 
2170       bool not_exist_in_filter =
2171           filter != nullptr && filter->IsBlockBased() == true &&
2172           !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
2173                                prefix_extractor, v.handle.offset(), no_io,
2174                                /*const_ikey_ptr=*/nullptr, get_context,
2175                                &lookup_context);
2176 
2177       if (not_exist_in_filter) {
2178         // Not found
2179         // TODO: think about interaction with Merge. If a user key cannot
2180         // cross one data block, we should be fine.
2181         RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
2182         PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
2183         break;
2184       }
2185 
2186       if (!v.first_internal_key.empty() && !skip_filters &&
2187           UserComparatorWrapper(rep_->internal_comparator.user_comparator())
2188                   .Compare(ExtractUserKey(key),
2189                            ExtractUserKey(v.first_internal_key)) < 0) {
2190         // The requested key falls between highest key in previous block and
2191         // lowest key in current block.
2192         break;
2193       }
2194 
2195       BlockCacheLookupContext lookup_data_block_context{
2196           TableReaderCaller::kUserGet, tracing_get_id,
2197           /*get_from_user_specified_snapshot=*/read_options.snapshot !=
2198               nullptr};
2199       bool does_referenced_key_exist = false;
2200       DataBlockIter biter;
2201       uint64_t referenced_data_size = 0;
2202       NewDataBlockIterator<DataBlockIter>(
2203           read_options, v.handle, &biter, BlockType::kData, get_context,
2204           &lookup_data_block_context,
2205           /*s=*/Status(), /*prefetch_buffer*/ nullptr);
2206 
2207       if (no_io && biter.status().IsIncomplete()) {
2208         // couldn't get block from block_cache
2209         // Update Saver.state to Found because we are only looking for
2210         // whether we can guarantee the key is not there when "no_io" is set
2211         get_context->MarkKeyMayExist();
2212         break;
2213       }
2214       if (!biter.status().ok()) {
2215         s = biter.status();
2216         break;
2217       }
2218 
2219       bool may_exist = biter.SeekForGet(key);
2220       // If user-specified timestamp is supported, we cannot end the search
2221       // just because hash index lookup indicates the key+ts does not exist.
2222       if (!may_exist && ts_sz == 0) {
2223         // HashSeek cannot find the key in this block, and the iter is not at
2224         // the end of the block, i.e. the key cannot be in the following
2225         // blocks either. In this case, the seek_key cannot be found, so we
2226         // break from the top-level for-loop.
2227         done = true;
2228       } else {
2229         // Call the *saver function on each entry/block until it returns false
2230         for (; biter.Valid(); biter.Next()) {
2231           ParsedInternalKey parsed_key;
2232           if (!ParseInternalKey(biter.key(), &parsed_key)) {
2233             s = Status::Corruption(Slice());
2234           }
2235 
2236           if (!get_context->SaveValue(
2237                   parsed_key, biter.value(), &matched,
2238                   biter.IsValuePinned() ? &biter : nullptr)) {
2239             if (get_context->State() == GetContext::GetState::kFound) {
2240               does_referenced_key_exist = true;
2241               referenced_data_size = biter.key().size() + biter.value().size();
2242             }
2243             done = true;
2244             break;
2245           }
2246         }
2247         s = biter.status();
2248       }
2249       // Write the block cache access record.
2250       if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
2251         // Avoid making copy of block_key, cf_name, and referenced_key when
2252         // constructing the access record.
2253         Slice referenced_key;
2254         if (does_referenced_key_exist) {
2255           referenced_key = biter.key();
2256         } else {
2257           referenced_key = key;
2258         }
2259         BlockCacheTraceRecord access_record(
2260             rep_->ioptions.env->NowMicros(),
2261             /*block_key=*/"", lookup_data_block_context.block_type,
2262             lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
2263             /*cf_name=*/"", rep_->level_for_tracing(),
2264             rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
2265             lookup_data_block_context.is_cache_hit,
2266             lookup_data_block_context.no_insert,
2267             lookup_data_block_context.get_id,
2268             lookup_data_block_context.get_from_user_specified_snapshot,
2269             /*referenced_key=*/"", referenced_data_size,
2270             lookup_data_block_context.num_keys_in_block,
2271             does_referenced_key_exist);
2272         block_cache_tracer_->WriteBlockAccess(
2273             access_record, lookup_data_block_context.block_key,
2274             rep_->cf_name_for_tracing(), referenced_key);
2275       }
2276 
2277       if (done) {
2278         // Avoid the extra Next which is expensive in two-level indexes
2279         break;
2280       }
2281     }
2282     if (matched && filter != nullptr && !filter->IsBlockBased()) {
2283       RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
2284       PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
2285                                 rep_->level);
2286     }
2287     if (s.ok() && !iiter->status().IsNotFound()) {
2288       s = iiter->status();
2289     }
2290   }
2291 
2292   return s;
2293 }
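
// Rough sketch of the Get() flow above (comments only, ignoring user-defined
// timestamps): the full filter can short-circuit the lookup entirely;
// otherwise candidate data blocks are visited in index order until the key is
// resolved.
//
//   if (!FullFilterKeyMayMatch(...)) return;               // filter negative
//   for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
//     optional per-block filter check (block-based filters only);
//     NewDataBlockIterator(...);                           // cache or disk
//     if (biter.SeekForGet(key)) {
//       for each entry: get_context->SaveValue(...);       // stop when done
//     } else {
//       done = true;                     // hash index says the key is absent
//     }
//   }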
2294 
2295 using MultiGetRange = MultiGetContext::Range;
2296 void BlockBasedTable::MultiGet(const ReadOptions& read_options,
2297                                const MultiGetRange* mget_range,
2298                                const SliceTransform* prefix_extractor,
2299                                bool skip_filters) {
2300   FilterBlockReader* const filter =
2301       !skip_filters ? rep_->filter.get() : nullptr;
2302   MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
2303                                mget_range->end());
2304 
2305   // First check the full filter.
2306   // If the full filter is not useful, then go into each block.
2307   const bool no_io = read_options.read_tier == kBlockCacheTier;
2308   uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
2309   if (!sst_file_range.empty() && sst_file_range.begin()->get_context) {
2310     tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id();
2311   }
2312   BlockCacheLookupContext lookup_context{
2313       TableReaderCaller::kUserMultiGet, tracing_mget_id,
2314       /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
2315   FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
2316                          prefix_extractor, &lookup_context);
2317 
2318   if (skip_filters || !sst_file_range.empty()) {
2319     IndexBlockIter iiter_on_stack;
2320     // if prefix_extractor found in block differs from options, disable
2321     // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
2322     bool need_upper_bound_check = false;
2323     if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
2324       need_upper_bound_check = PrefixExtractorChanged(
2325           rep_->table_properties.get(), prefix_extractor);
2326     }
2327     auto iiter =
2328         NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
2329                          sst_file_range.begin()->get_context, &lookup_context);
2330     std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2331     if (iiter != &iiter_on_stack) {
2332       iiter_unique_ptr.reset(iiter);
2333     }
2334 
2335     uint64_t offset = std::numeric_limits<uint64_t>::max();
2336     autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles;
2337     autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results;
2338     autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses;
2339     char stack_buf[kMultiGetReadStackBufSize];
2340     std::unique_ptr<char[]> block_buf;
2341     {
2342       MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
2343                                      sst_file_range.end());
2344 
2345       CachableEntry<UncompressionDict> uncompression_dict;
2346       Status uncompression_dict_status;
2347       if (rep_->uncompression_dict_reader) {
2348         uncompression_dict_status =
2349             rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
2350                 nullptr /* prefetch_buffer */, no_io,
2351                 sst_file_range.begin()->get_context, &lookup_context,
2352                 &uncompression_dict);
2353       }
2354 
2355       const UncompressionDict& dict = uncompression_dict.GetValue()
2356                                           ? *uncompression_dict.GetValue()
2357                                           : UncompressionDict::GetEmptyDict();
2358 
2359       size_t total_len = 0;
2360       ReadOptions ro = read_options;
2361       ro.read_tier = kBlockCacheTier;
2362 
2363       for (auto miter = data_block_range.begin();
2364            miter != data_block_range.end(); ++miter) {
2365         const Slice& key = miter->ikey;
2366         iiter->Seek(miter->ikey);
2367 
2368         IndexValue v;
2369         if (iiter->Valid()) {
2370           v = iiter->value();
2371         }
2372         if (!iiter->Valid() ||
2373             (!v.first_internal_key.empty() && !skip_filters &&
2374              UserComparatorWrapper(rep_->internal_comparator.user_comparator())
2375                      .Compare(ExtractUserKey(key),
2376                               ExtractUserKey(v.first_internal_key)) < 0)) {
2377           // The requested key falls between highest key in previous block and
2378           // lowest key in current block.
2379           *(miter->s) = iiter->status();
2380           data_block_range.SkipKey(miter);
2381           sst_file_range.SkipKey(miter);
2382           continue;
2383         }
2384 
2385         if (!uncompression_dict_status.ok()) {
2386           *(miter->s) = uncompression_dict_status;
2387           data_block_range.SkipKey(miter);
2388           sst_file_range.SkipKey(miter);
2389           continue;
2390         }
2391 
2392         statuses.emplace_back();
2393         results.emplace_back();
2394         if (v.handle.offset() == offset) {
2395           // We're going to reuse the block for this key later on. No need to
2396           // look it up now; place a null handle as a placeholder.
2397           block_handles.emplace_back(BlockHandle::NullBlockHandle());
2398           continue;
2399         }
2400         // Look up the cache for the data block referenced by the index
2401         // iterator value (i.e. the BlockHandle). If it exists in the cache,
2402         // initialize the block to the contents of the data block.
2403         offset = v.handle.offset();
2404         BlockHandle handle = v.handle;
2405         BlockCacheLookupContext lookup_data_block_context(
2406             TableReaderCaller::kUserMultiGet);
2407         Status s = RetrieveBlock(
2408             nullptr, ro, handle, dict, &(results.back()), BlockType::kData,
2409             miter->get_context, &lookup_data_block_context,
2410             /* for_compaction */ false, /* use_cache */ true);
2411         if (s.IsIncomplete()) {
2412           s = Status::OK();
2413         }
2414         if (s.ok() && !results.back().IsEmpty()) {
2415           // Found it in the cache. Add a null handle to indicate there is
2416           // nothing to read from disk.
2417           block_handles.emplace_back(BlockHandle::NullBlockHandle());
2418         } else {
2419           block_handles.emplace_back(handle);
2420           total_len += block_size(handle);
2421         }
2422       }
2423 
2424       if (total_len) {
2425         char* scratch = nullptr;
2426         // If the blocks need to be uncompressed and we don't need the
2427         // compressed blocks, we can use one contiguous chunk of memory as
2428         // temporary storage for reading in all the blocks. Specifically:
2429         // 1. If blocks are compressed and a compressed block cache is
2430         //    configured, allocate heap buffers
2431         // 2. If blocks are uncompressed, allocate heap buffers
2432         // 3. If blocks are compressed and there is no compressed block
2433         //    cache, use the stack buffer (falling back to a heap buffer
2434         //    when the total size exceeds kMultiGetReadStackBufSize)
2435         if (rep_->table_options.block_cache_compressed == nullptr &&
2436             rep_->blocks_maybe_compressed) {
2437           if (total_len <= kMultiGetReadStackBufSize) {
2438             scratch = stack_buf;
2439           } else {
2440             scratch = new char[total_len];
2441             block_buf.reset(scratch);
2442           }
2443         }
2444         RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles,
2445                                &statuses, &results, scratch, dict);
2446       }
2447     }
2448 
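    // Second pass: block_handles, results and statuses are aligned one entry
    // per remaining key, in batch order. Each key is looked up in its data
    // block, reusing first_biter when consecutive keys map to the same block.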
2449     DataBlockIter first_biter;
2450     DataBlockIter next_biter;
2451     size_t idx_in_batch = 0;
2452     for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
2453          ++miter) {
2454       Status s;
2455       GetContext* get_context = miter->get_context;
2456       const Slice& key = miter->ikey;
2457       bool matched = false;  // if such user key matched a key in SST
2458       bool done = false;
2459       bool first_block = true;
2460       do {
2461         DataBlockIter* biter = nullptr;
2462         bool reusing_block = true;
2463         uint64_t referenced_data_size = 0;
2464         bool does_referenced_key_exist = false;
2465         BlockCacheLookupContext lookup_data_block_context(
2466             TableReaderCaller::kUserMultiGet, tracing_mget_id,
2467             /*get_from_user_specified_snapshot=*/read_options.snapshot !=
2468                 nullptr);
2469         if (first_block) {
2470           if (!block_handles[idx_in_batch].IsNull() ||
2471               !results[idx_in_batch].IsEmpty()) {
2472             first_biter.Invalidate(Status::OK());
2473             NewDataBlockIterator<DataBlockIter>(
2474                 read_options, results[idx_in_batch], &first_biter,
2475                 statuses[idx_in_batch]);
2476             reusing_block = false;
2477           }
2478           biter = &first_biter;
2479           idx_in_batch++;
2480         } else {
2481           IndexValue v = iiter->value();
2482           if (!v.first_internal_key.empty() && !skip_filters &&
2483               UserComparatorWrapper(rep_->internal_comparator.user_comparator())
2484                       .Compare(ExtractUserKey(key),
2485                                ExtractUserKey(v.first_internal_key)) < 0) {
2486             // The requested key falls between the highest key in the previous
2487             // block and the lowest key in the current block.
2488             break;
2489           }
2490 
2491           next_biter.Invalidate(Status::OK());
2492           NewDataBlockIterator<DataBlockIter>(
2493               read_options, iiter->value().handle, &next_biter,
2494               BlockType::kData, get_context, &lookup_data_block_context,
2495               Status(), nullptr);
2496           biter = &next_biter;
2497           reusing_block = false;
2498         }
2499 
2500         if (read_options.read_tier == kBlockCacheTier &&
2501             biter->status().IsIncomplete()) {
2502           // Couldn't get the block from the block cache. Mark the key as
2503           // possibly existing: with "no_io" set we can only report keys we
2504           // can prove are absent, so treat this one as a potential match.
2505           get_context->MarkKeyMayExist();
2506           break;
2507         }
2508         if (!biter->status().ok()) {
2509           s = biter->status();
2510           break;
2511         }
2512 
2513         bool may_exist = biter->SeekForGet(key);
2514         if (!may_exist) {
2515           // HashSeek cannot find the key in this block and the iter is not at
2516           // the end of the block, i.e. the key cannot be in the following
2517           // blocks either. In this case, the seek_key cannot be found, so we
2518           // break out of the loop.
2519           break;
2520         }
2521 
2522         // Call the *saver function on each entry/block until it returns false
2523         for (; biter->Valid(); biter->Next()) {
2524           ParsedInternalKey parsed_key;
2525           Cleanable dummy;
2526           Cleanable* value_pinner = nullptr;
2527           if (!ParseInternalKey(biter->key(), &parsed_key)) {
2528             s = Status::Corruption(Slice());
2529           }
2530           if (biter->IsValuePinned()) {
2531             if (reusing_block) {
2532               Cache* block_cache = rep_->table_options.block_cache.get();
2533               assert(biter->cache_handle() != nullptr);
2534               block_cache->Ref(biter->cache_handle());
2535               dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
2536                                     biter->cache_handle());
2537               value_pinner = &dummy;
2538             } else {
2539               value_pinner = biter;
2540             }
2541           }
2542           if (!get_context->SaveValue(parsed_key, biter->value(), &matched,
2543                                       value_pinner)) {
2544             if (get_context->State() == GetContext::GetState::kFound) {
2545               does_referenced_key_exist = true;
2546               referenced_data_size =
2547                   biter->key().size() + biter->value().size();
2548             }
2549             done = true;
2550             break;
2551           }
2552           s = biter->status();
2553         }
2554         // Write the block cache access.
2555         if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
2556           // Avoid making copies of block_key, cf_name, and referenced_key when
2557           // constructing the access record.
2558           Slice referenced_key;
2559           if (does_referenced_key_exist) {
2560             referenced_key = biter->key();
2561           } else {
2562             referenced_key = key;
2563           }
2564           BlockCacheTraceRecord access_record(
2565               rep_->ioptions.env->NowMicros(),
2566               /*block_key=*/"", lookup_data_block_context.block_type,
2567               lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
2568               /*cf_name=*/"", rep_->level_for_tracing(),
2569               rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
2570               lookup_data_block_context.is_cache_hit,
2571               lookup_data_block_context.no_insert,
2572               lookup_data_block_context.get_id,
2573               lookup_data_block_context.get_from_user_specified_snapshot,
2574               /*referenced_key=*/"", referenced_data_size,
2575               lookup_data_block_context.num_keys_in_block,
2576               does_referenced_key_exist);
2577           block_cache_tracer_->WriteBlockAccess(
2578               access_record, lookup_data_block_context.block_key,
2579               rep_->cf_name_for_tracing(), referenced_key);
2580         }
2581         s = biter->status();
2582         if (done) {
2583           // Avoid the extra Next which is expensive in two-level indexes
2584           break;
2585         }
2586         if (first_block) {
2587           iiter->Seek(key);
2588         }
2589         first_block = false;
2590         iiter->Next();
2591       } while (iiter->Valid());
2592 
2593       if (matched && filter != nullptr && !filter->IsBlockBased()) {
2594         RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
2595         PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
2596                                   rep_->level);
2597       }
2598       if (s.ok()) {
2599         s = iiter->status();
2600       }
2601       *(miter->s) = s;
2602     }
2603   }
2604 }
2605 
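// Prefetch walks the index over [begin, end] (either bound may be nullptr,
// meaning "from the first block" / "through the last block") and materializes
// each referenced data block via NewDataBlockIterator(), which loads the
// block into the block cache as a side effect.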
2606 Status BlockBasedTable::Prefetch(const Slice* const begin,
2607                                  const Slice* const end) {
2608   auto& comparator = rep_->internal_comparator;
2609   UserComparatorWrapper user_comparator(comparator.user_comparator());
2610   // pre-condition
2611   if (begin && end && comparator.Compare(*begin, *end) > 0) {
2612     return Status::InvalidArgument(*begin, *end);
2613   }
2614   BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
2615   IndexBlockIter iiter_on_stack;
2616   auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
2617                                 &iiter_on_stack, /*get_context=*/nullptr,
2618                                 &lookup_context);
2619   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2620   if (iiter != &iiter_on_stack) {
2621     iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
2622   }
2623 
2624   if (!iiter->status().ok()) {
2625     // error opening index iterator
2626     return iiter->status();
2627   }
2628 
2629   // Indicates if we are on the last page that needs to be pre-fetched.
2630   bool prefetching_boundary_page = false;
2631 
2632   for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
2633        iiter->Next()) {
2634     BlockHandle block_handle = iiter->value().handle;
2635     const bool is_user_key = !rep_->index_key_includes_seq;
2636     if (end &&
2637         ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
2638          (is_user_key &&
2639           user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
2640       if (prefetching_boundary_page) {
2641         break;
2642       }
2643 
2644       // The index entry represents the last key in the data block.
2645       // We should load this page into memory as well, but no more
2646       prefetching_boundary_page = true;
2647     }
2648 
2649     // Load the block specified by the block_handle into the block cache
2650     DataBlockIter biter;
2651 
2652     NewDataBlockIterator<DataBlockIter>(
2653         ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
2654         /*get_context=*/nullptr, &lookup_context, Status(),
2655         /*prefetch_buffer=*/nullptr);
2656 
2657     if (!biter.status().ok()) {
2658       // there was an unexpected error while pre-fetching
2659       return biter.status();
2660     }
2661   }
2662 
2663   return Status::OK();
2664 }
2665 
2666 Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
2667                                        TableReaderCaller caller) {
2668   Status s;
2669   // Check Meta blocks
2670   std::unique_ptr<Block> metaindex;
2671   std::unique_ptr<InternalIterator> metaindex_iter;
2672   s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex,
2673                          &metaindex_iter);
2674   if (s.ok()) {
2675     s = VerifyChecksumInMetaBlocks(metaindex_iter.get());
2676     if (!s.ok()) {
2677       return s;
2678     }
2679   } else {
2680     return s;
2681   }
2682   // Check Data blocks
2683   IndexBlockIter iiter_on_stack;
2684   BlockCacheLookupContext context{caller};
2685   InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
2686       read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
2687       /*get_context=*/nullptr, &context);
2688   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2689   if (iiter != &iiter_on_stack) {
2690     iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
2691   }
2692   if (!iiter->status().ok()) {
2693     // error opening index iterator
2694     return iiter->status();
2695   }
2696   s = VerifyChecksumInBlocks(read_options, iiter);
2697   return s;
2698 }
2699 
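// Verifies every data block referenced by the index. Blocks are fetched with
// decompression disabled; the checksum check itself happens inside
// BlockFetcher::ReadBlockContents(). A FilePrefetchBuffer provides readahead
// since the whole file is read sequentially.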
2700 Status BlockBasedTable::VerifyChecksumInBlocks(
2701     const ReadOptions& read_options,
2702     InternalIteratorBase<IndexValue>* index_iter) {
2703   Status s;
2704   // We are scanning the whole file, so there is no need to exponentially
2705   // increase the readahead buffer size.
2706   size_t readahead_size = (read_options.readahead_size != 0)
2707                               ? read_options.readahead_size
2708                               : kMaxAutoReadaheadSize;
2709   // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
2710   // needed there.
2711   FilePrefetchBuffer prefetch_buffer(
2712       rep_->file.get(), readahead_size /* readahead_size */,
2713       readahead_size /* max_readahead_size */,
2714       !rep_->ioptions.allow_mmap_reads /* enable */);
2715 
2716   for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
2717     s = index_iter->status();
2718     if (!s.ok()) {
2719       break;
2720     }
2721     BlockHandle handle = index_iter->value().handle;
2722     BlockContents contents;
2723     BlockFetcher block_fetcher(
2724         rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
2725         &contents, rep_->ioptions, false /* decompress */,
2726         false /*maybe_compressed*/, BlockType::kData,
2727         UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
2728     s = block_fetcher.ReadBlockContents();
2729     if (!s.ok()) {
2730       break;
2731     }
2732   }
2733   return s;
2734 }
2735 
2736 BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
2737     const Slice& meta_block_name) {
2738   if (meta_block_name.starts_with(kFilterBlockPrefix) ||
2739       meta_block_name.starts_with(kFullFilterBlockPrefix) ||
2740       meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
2741     return BlockType::kFilter;
2742   }
2743 
2744   if (meta_block_name == kPropertiesBlock) {
2745     return BlockType::kProperties;
2746   }
2747 
2748   if (meta_block_name == kCompressionDictBlock) {
2749     return BlockType::kCompressionDictionary;
2750   }
2751 
2752   if (meta_block_name == kRangeDelBlock) {
2753     return BlockType::kRangeDeletion;
2754   }
2755 
2756   if (meta_block_name == kHashIndexPrefixesBlock) {
2757     return BlockType::kHashIndexPrefixes;
2758   }
2759 
2760   if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
2761     return BlockType::kHashIndexMetadata;
2762   }
2763 
2764   assert(false);
2765   return BlockType::kInvalid;
2766 }
2767 
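// Verifies every meta block listed in the metaindex. The properties block gets
// special handling: a corruption there may be caused by a global sequence
// number having been written into an externally ingested file, so
// TryReadPropertiesWithGlobalSeqno() gets a chance to re-read and validate it
// before the error is reported.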
2768 Status BlockBasedTable::VerifyChecksumInMetaBlocks(
2769     InternalIteratorBase<Slice>* index_iter) {
2770   Status s;
2771   for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
2772     s = index_iter->status();
2773     if (!s.ok()) {
2774       break;
2775     }
2776     BlockHandle handle;
2777     Slice input = index_iter->value();
2778     s = handle.DecodeFrom(&input);
2779     BlockContents contents;
2780     const Slice meta_block_name = index_iter->key();
2781     BlockFetcher block_fetcher(
2782         rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
2783         ReadOptions(), handle, &contents, rep_->ioptions,
2784         false /* decompress */, false /*maybe_compressed*/,
2785         GetBlockTypeForMetaBlockByName(meta_block_name),
2786         UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
2787     s = block_fetcher.ReadBlockContents();
2788     if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
2789       TableProperties* table_properties;
2790       s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
2791                                            index_iter->value(),
2792                                            &table_properties);
2793       delete table_properties;
2794     }
2795     if (!s.ok()) {
2796       break;
2797     }
2798   }
2799   return s;
2800 }
2801 
2802 bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
2803   assert(rep_ != nullptr);
2804 
2805   Cache* const cache = rep_->table_options.block_cache.get();
2806   if (cache == nullptr) {
2807     return false;
2808   }
2809 
2810   char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
2811   Slice cache_key =
2812       GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
2813                   cache_key_storage);
2814 
2815   Cache::Handle* const cache_handle = cache->Lookup(cache_key);
2816   if (cache_handle == nullptr) {
2817     return false;
2818   }
2819 
2820   cache->Release(cache_handle);
2821 
2822   return true;
2823 }
2824 
2825 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
2826                                       const Slice& key) {
2827   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
2828       options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
2829       /*get_context=*/nullptr, /*lookup_context=*/nullptr));
2830   iiter->Seek(key);
2831   assert(iiter->Valid());
2832 
2833   return TEST_BlockInCache(iiter->value().handle);
2834 }
2835 
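// The index reader chosen below is driven by BlockBasedTableOptions::index_type,
// which callers configure when building the table factory. A minimal sketch
// (illustrative only, not taken from this file):
//
//   BlockBasedTableOptions table_options;
//   table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
//   Options options;
//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
//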
2836 // REQUIRES: The following fields of rep_ should have already been populated:
2837 //  1. file
2838 //  2. index_handle,
2839 //  3. options
2840 //  4. internal_comparator
2841 //  5. index_type
2842 Status BlockBasedTable::CreateIndexReader(
2843     FilePrefetchBuffer* prefetch_buffer,
2844     InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
2845     bool pin, BlockCacheLookupContext* lookup_context,
2846     std::unique_ptr<IndexReader>* index_reader) {
2847   // kHashSearch requires a non-empty prefix_extractor, but we bypass that
2848   // check here since we have no access to MutableCFOptions. Instead, the
2849   // need_upper_bound_check flag in BlockBasedTable::NewIndexIterator covers
2850   // it: if prefix_extractor does not match prefix_extractor_name from the
2851   // table properties, the hash index is turned off via total_order_seek.
2852 
2853   switch (rep_->index_type) {
2854     case BlockBasedTableOptions::kTwoLevelIndexSearch: {
2855       return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
2856                                           prefetch, pin, lookup_context,
2857                                           index_reader);
2858     }
2859     case BlockBasedTableOptions::kBinarySearch:
2860       FALLTHROUGH_INTENDED;
2861     case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
2862       return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
2863                                              prefetch, pin, lookup_context,
2864                                              index_reader);
2865     }
2866     case BlockBasedTableOptions::kHashSearch: {
2867       std::unique_ptr<Block> metaindex_guard;
2868       std::unique_ptr<InternalIterator> metaindex_iter_guard;
2869       auto meta_index_iter = preloaded_meta_index_iter;
2870       bool should_fallback = false;
2871       if (rep_->internal_prefix_transform.get() == nullptr) {
2872         ROCKS_LOG_WARN(rep_->ioptions.info_log,
2873                        "No prefix extractor passed in. Fall back to binary"
2874                        " search index.");
2875         should_fallback = true;
2876       } else if (meta_index_iter == nullptr) {
2877         auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard,
2878                                     &metaindex_iter_guard);
2879         if (!s.ok()) {
2880           // we simply fall back to binary search in case there is any
2881           // problem with prefix hash index loading.
2882           ROCKS_LOG_WARN(rep_->ioptions.info_log,
2883                          "Unable to read the metaindex block."
2884                          " Fall back to binary search index.");
2885           should_fallback = true;
2886         }
2887         meta_index_iter = metaindex_iter_guard.get();
2888       }
2889 
2890       if (should_fallback) {
2891         return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
2892                                                prefetch, pin, lookup_context,
2893                                                index_reader);
2894       } else {
2895         return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
2896                                        use_cache, prefetch, pin, lookup_context,
2897                                        index_reader);
2898       }
2899     }
2900     default: {
2901       std::string error_message =
2902           "Unrecognized index type: " + ToString(rep_->index_type);
2903       return Status::InvalidArgument(error_message.c_str());
2904     }
2905   }
2906 }
2907 
2908 uint64_t BlockBasedTable::ApproximateOffsetOf(
2909     const InternalIteratorBase<IndexValue>& index_iter) const {
2910   uint64_t result = 0;
2911   if (index_iter.Valid()) {
2912     BlockHandle handle = index_iter.value().handle;
2913     result = handle.offset();
2914   } else {
2915     // The iterator is past the last key in the file. If table_properties is not
2916     // available, approximate the offset by returning the offset of the
2917     // metaindex block (which is right near the end of the file).
2918     if (rep_->table_properties) {
2919       result = rep_->table_properties->data_size;
2920     }
2921     // table_properties is not present in the table.
2922     if (result == 0) {
2923       result = rep_->footer.metaindex_handle().offset();
2924     }
2925   }
2926 
2927   return result;
2928 }
2929 
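// The key-based functions below seek a total-order index iterator to map a key
// (or key range) to an approximate file offset; this is what backs
// DB::GetApproximateSizes() for block-based tables.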
2930 uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
2931                                               TableReaderCaller caller) {
2932   BlockCacheLookupContext context(caller);
2933   IndexBlockIter iiter_on_stack;
2934   ReadOptions ro;
2935   ro.total_order_seek = true;
2936   auto index_iter =
2937       NewIndexIterator(ro, /*disable_prefix_seek=*/true,
2938                        /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
2939                        /*lookup_context=*/&context);
2940   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2941   if (index_iter != &iiter_on_stack) {
2942     iiter_unique_ptr.reset(index_iter);
2943   }
2944 
2945   index_iter->Seek(key);
2946   return ApproximateOffsetOf(*index_iter);
2947 }
2948 
2949 uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
2950                                           TableReaderCaller caller) {
2951   assert(rep_->internal_comparator.Compare(start, end) <= 0);
2952 
2953   BlockCacheLookupContext context(caller);
2954   IndexBlockIter iiter_on_stack;
2955   ReadOptions ro;
2956   ro.total_order_seek = true;
2957   auto index_iter =
2958       NewIndexIterator(ro, /*disable_prefix_seek=*/true,
2959                        /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
2960                        /*lookup_context=*/&context);
2961   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
2962   if (index_iter != &iiter_on_stack) {
2963     iiter_unique_ptr.reset(index_iter);
2964   }
2965 
2966   index_iter->Seek(start);
2967   uint64_t start_offset = ApproximateOffsetOf(*index_iter);
2968   index_iter->Seek(end);
2969   uint64_t end_offset = ApproximateOffsetOf(*index_iter);
2970 
2971   assert(end_offset >= start_offset);
2972   return end_offset - start_offset;
2973 }
2974 
2975 bool BlockBasedTable::TEST_FilterBlockInCache() const {
2976   assert(rep_ != nullptr);
2977   return TEST_BlockInCache(rep_->filter_handle);
2978 }
2979 
2980 bool BlockBasedTable::TEST_IndexBlockInCache() const {
2981   assert(rep_ != nullptr);
2982 
2983   return TEST_BlockInCache(rep_->footer.index_handle());
2984 }
2985 
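// Walks the index and every data block in order, copying each key/value pair
// into the caller-provided vector (one KVPairBlock per data block). Blocks
// that fail to read are skipped rather than failing the whole scan.
// A hypothetical caller (names illustrative, not from this file) might consume
// the result like:
//
//   std::vector<KVPairBlock> blocks;
//   Status s = table_reader->GetKVPairsFromDataBlocks(&blocks);
//   for (const KVPairBlock& block : blocks) {
//     for (const auto& kv : block) {
//       // kv.first is the internal key, kv.second is the value
//     }
//   }
//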
2986 Status BlockBasedTable::GetKVPairsFromDataBlocks(
2987     std::vector<KVPairBlock>* kv_pair_blocks) {
2988   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
2989       NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
2990                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
2991                        /*lookup_context=*/nullptr));
2992 
2993   Status s = blockhandles_iter->status();
2994   if (!s.ok()) {
2995     // Cannot read Index Block
2996     return s;
2997   }
2998 
2999   for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
3000        blockhandles_iter->Next()) {
3001     s = blockhandles_iter->status();
3002 
3003     if (!s.ok()) {
3004       break;
3005     }
3006 
3007     std::unique_ptr<InternalIterator> datablock_iter;
3008     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
3009         ReadOptions(), blockhandles_iter->value().handle,
3010         /*input_iter=*/nullptr, /*type=*/BlockType::kData,
3011         /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
3012         /*prefetch_buffer=*/nullptr));
3013     s = datablock_iter->status();
3014 
3015     if (!s.ok()) {
3016       // Error reading the block - Skipped
3017       continue;
3018     }
3019 
3020     KVPairBlock kv_pair_block;
3021     for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
3022          datablock_iter->Next()) {
3023       s = datablock_iter->status();
3024       if (!s.ok()) {
3025         // Error reading the block - Skipped
3026         break;
3027       }
3028       const Slice& key = datablock_iter->key();
3029       const Slice& value = datablock_iter->value();
3030       std::string key_copy = std::string(key.data(), key.size());
3031       std::string value_copy = std::string(value.data(), value.size());
3032 
3033       kv_pair_block.push_back(
3034           std::make_pair(std::move(key_copy), std::move(value_copy)));
3035     }
3036     kv_pair_blocks->push_back(std::move(kv_pair_block));
3037   }
3038   return Status::OK();
3039 }
3040 
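// DumpTable writes a human-readable description of the file to out_file:
// footer, metaindex entries, table properties, filter details, the index
// block, the compression dictionary (if any), range deletions, and finally
// every data block. This is the output behind raw SST dumps (e.g. via the
// sst_dump tool).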
3041 Status BlockBasedTable::DumpTable(WritableFile* out_file) {
3042   // Output Footer
3043   out_file->Append(
3044       "Footer Details:\n"
3045       "--------------------------------------\n"
3046       "  ");
3047   out_file->Append(rep_->footer.ToString().c_str());
3048   out_file->Append("\n");
3049 
3050   // Output MetaIndex
3051   out_file->Append(
3052       "Metaindex Details:\n"
3053       "--------------------------------------\n");
3054   std::unique_ptr<Block> metaindex;
3055   std::unique_ptr<InternalIterator> metaindex_iter;
3056   Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex,
3057                                 &metaindex_iter);
3058   if (s.ok()) {
3059     for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
3060          metaindex_iter->Next()) {
3061       s = metaindex_iter->status();
3062       if (!s.ok()) {
3063         return s;
3064       }
3065       if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kPropertiesBlock) {
3066         out_file->Append("  Properties block handle: ");
3067         out_file->Append(metaindex_iter->value().ToString(true).c_str());
3068         out_file->Append("\n");
3069       } else if (metaindex_iter->key() ==
3070                  ROCKSDB_NAMESPACE::kCompressionDictBlock) {
3071         out_file->Append("  Compression dictionary block handle: ");
3072         out_file->Append(metaindex_iter->value().ToString(true).c_str());
3073         out_file->Append("\n");
3074       } else if (strstr(metaindex_iter->key().ToString().c_str(),
3075                         "filter.rocksdb.") != nullptr) {
3076         out_file->Append("  Filter block handle: ");
3077         out_file->Append(metaindex_iter->value().ToString(true).c_str());
3078         out_file->Append("\n");
3079       } else if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kRangeDelBlock) {
3080         out_file->Append("  Range deletion block handle: ");
3081         out_file->Append(metaindex_iter->value().ToString(true).c_str());
3082         out_file->Append("\n");
3083       }
3084     }
3085     out_file->Append("\n");
3086   } else {
3087     return s;
3088   }
3089 
3090   // Output TableProperties
3091   const ROCKSDB_NAMESPACE::TableProperties* table_properties;
3092   table_properties = rep_->table_properties.get();
3093 
3094   if (table_properties != nullptr) {
3095     out_file->Append(
3096         "Table Properties:\n"
3097         "--------------------------------------\n"
3098         "  ");
3099     out_file->Append(table_properties->ToString("\n  ", ": ").c_str());
3100     out_file->Append("\n");
3101   }
3102 
3103   if (rep_->filter) {
3104     out_file->Append(
3105         "Filter Details:\n"
3106         "--------------------------------------\n"
3107         "  ");
3108     out_file->Append(rep_->filter->ToString().c_str());
3109     out_file->Append("\n");
3110   }
3111 
3112   // Output Index block
3113   s = DumpIndexBlock(out_file);
3114   if (!s.ok()) {
3115     return s;
3116   }
3117 
3118   // Output compression dictionary
3119   if (rep_->uncompression_dict_reader) {
3120     CachableEntry<UncompressionDict> uncompression_dict;
3121     s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
3122         nullptr /* prefetch_buffer */, false /* no_io */,
3123         nullptr /* get_context */, nullptr /* lookup_context */,
3124         &uncompression_dict);
3125     if (!s.ok()) {
3126       return s;
3127     }
3128 
3129     assert(uncompression_dict.GetValue());
3130 
3131     const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict();
3132     out_file->Append(
3133         "Compression Dictionary:\n"
3134         "--------------------------------------\n");
3135     out_file->Append("  size (bytes): ");
3136     out_file->Append(ROCKSDB_NAMESPACE::ToString(raw_dict.size()));
3137     out_file->Append("\n\n");
3138     out_file->Append("  HEX    ");
3139     out_file->Append(raw_dict.ToString(true).c_str());
3140     out_file->Append("\n\n");
3141   }
3142 
3143   // Output range deletions block
3144   auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
3145   if (range_del_iter != nullptr) {
3146     range_del_iter->SeekToFirst();
3147     if (range_del_iter->Valid()) {
3148       out_file->Append(
3149           "Range deletions:\n"
3150           "--------------------------------------\n"
3151           "  ");
3152       for (; range_del_iter->Valid(); range_del_iter->Next()) {
3153         DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
3154       }
3155       out_file->Append("\n");
3156     }
3157     delete range_del_iter;
3158   }
3159   // Output Data blocks
3160   s = DumpDataBlocks(out_file);
3161 
3162   return s;
3163 }
3164 
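// Each index entry is printed twice: once as a hex dump of the user key with
// its data block handle, and once as a space-separated ASCII rendering of the
// same key.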
3165 Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
3166   out_file->Append(
3167       "Index Details:\n"
3168       "--------------------------------------\n");
3169   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
3170       NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
3171                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
3172                        /*lookup_context=*/nullptr));
3173   Status s = blockhandles_iter->status();
3174   if (!s.ok()) {
3175     out_file->Append("Can not read Index Block \n\n");
3176     return s;
3177   }
3178 
3179   out_file->Append("  Block key hex dump: Data block handle\n");
3180   out_file->Append("  Block key ascii\n\n");
3181   for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
3182        blockhandles_iter->Next()) {
3183     s = blockhandles_iter->status();
3184     if (!s.ok()) {
3185       break;
3186     }
3187     Slice key = blockhandles_iter->key();
3188     Slice user_key;
3189     InternalKey ikey;
3190     if (!rep_->index_key_includes_seq) {
3191       user_key = key;
3192     } else {
3193       ikey.DecodeFrom(key);
3194       user_key = ikey.user_key();
3195     }
3196 
3197     out_file->Append("  HEX    ");
3198     out_file->Append(user_key.ToString(true).c_str());
3199     out_file->Append(": ");
3200     out_file->Append(blockhandles_iter->value()
3201                          .ToString(true, rep_->index_has_first_key)
3202                          .c_str());
3203     out_file->Append("\n");
3204 
3205     std::string str_key = user_key.ToString();
3206     std::string res_key("");
3207     char cspace = ' ';
3208     for (size_t i = 0; i < str_key.size(); i++) {
3209       res_key.append(&str_key[i], 1);
3210       res_key.append(1, cspace);
3211     }
3212     out_file->Append("  ASCII  ");
3213     out_file->Append(res_key.c_str());
3214     out_file->Append("\n  ------\n");
3215   }
3216   out_file->Append("\n");
3217   return Status::OK();
3218 }
3219 
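// Dumps every data block in index order, then prints summary statistics
// (count, min/max/average block size) computed from the block handles.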
3220 Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
3221   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
3222       NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
3223                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
3224                        /*lookup_context=*/nullptr));
3225   Status s = blockhandles_iter->status();
3226   if (!s.ok()) {
3227     out_file->Append("Can not read Index Block \n\n");
3228     return s;
3229   }
3230 
3231   uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
3232   uint64_t datablock_size_max = 0;
3233   uint64_t datablock_size_sum = 0;
3234 
3235   size_t block_id = 1;
3236   for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
3237        block_id++, blockhandles_iter->Next()) {
3238     s = blockhandles_iter->status();
3239     if (!s.ok()) {
3240       break;
3241     }
3242 
3243     BlockHandle bh = blockhandles_iter->value().handle;
3244     uint64_t datablock_size = bh.size();
3245     datablock_size_min = std::min(datablock_size_min, datablock_size);
3246     datablock_size_max = std::max(datablock_size_max, datablock_size);
3247     datablock_size_sum += datablock_size;
3248 
3249     out_file->Append("Data Block # ");
3250     out_file->Append(ROCKSDB_NAMESPACE::ToString(block_id));
3251     out_file->Append(" @ ");
3252     out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str());
3253     out_file->Append("\n");
3254     out_file->Append("--------------------------------------\n");
3255 
3256     std::unique_ptr<InternalIterator> datablock_iter;
3257     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
3258         ReadOptions(), blockhandles_iter->value().handle,
3259         /*input_iter=*/nullptr, /*type=*/BlockType::kData,
3260         /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
3261         /*prefetch_buffer=*/nullptr));
3262     s = datablock_iter->status();
3263 
3264     if (!s.ok()) {
3265       out_file->Append("Error reading the block - Skipped \n\n");
3266       continue;
3267     }
3268 
3269     for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
3270          datablock_iter->Next()) {
3271       s = datablock_iter->status();
3272       if (!s.ok()) {
3273         out_file->Append("Error reading the block - Skipped \n");
3274         break;
3275       }
3276       DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
3277     }
3278     out_file->Append("\n");
3279   }
3280 
3281   uint64_t num_datablocks = block_id - 1;
3282   if (num_datablocks) {
3283     double datablock_size_avg =
3284         static_cast<double>(datablock_size_sum) / num_datablocks;
3285     out_file->Append("Data Block Summary:\n");
3286     out_file->Append("--------------------------------------");
3287     out_file->Append("\n  # data blocks: ");
3288     out_file->Append(ROCKSDB_NAMESPACE::ToString(num_datablocks));
3289     out_file->Append("\n  min data block size: ");
3290     out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_min));
3291     out_file->Append("\n  max data block size: ");
3292     out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_max));
3293     out_file->Append("\n  avg data block size: ");
3294     out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_avg));
3295     out_file->Append("\n");
3296   }
3297 
3298   return Status::OK();
3299 }
3300 
3301 void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
3302                                    WritableFile* out_file) {
3303   InternalKey ikey;
3304   ikey.DecodeFrom(key);
3305 
3306   out_file->Append("  HEX    ");
3307   out_file->Append(ikey.user_key().ToString(true).c_str());
3308   out_file->Append(": ");
3309   out_file->Append(value.ToString(true).c_str());
3310   out_file->Append("\n");
3311 
3312   std::string str_key = ikey.user_key().ToString();
3313   std::string str_value = value.ToString();
3314   std::string res_key(""), res_value("");
3315   char cspace = ' ';
3316   for (size_t i = 0; i < str_key.size(); i++) {
3317     if (str_key[i] == '\0') {
3318       res_key.append("\\0", 2);
3319     } else {
3320       res_key.append(&str_key[i], 1);
3321     }
3322     res_key.append(1, cspace);
3323   }
3324   for (size_t i = 0; i < str_value.size(); i++) {
3325     if (str_value[i] == '\0') {
3326       res_value.append("\\0", 2);
3327     } else {
3328       res_value.append(&str_value[i], 1);
3329     }
3330     res_value.append(1, cspace);
3331   }
3332 
3333   out_file->Append("  ASCII  ");
3334   out_file->Append(res_key.c_str());
3335   out_file->Append(": ");
3336   out_file->Append(res_value.c_str());
3337   out_file->Append("\n  ------\n");
3338 }
3339 
3340 }  // namespace ROCKSDB_NAMESPACE
3341