1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 #include "table/block_based/block_based_table_iterator.h"
10 
11 namespace ROCKSDB_NAMESPACE {
Seek(const Slice & target)12 void BlockBasedTableIterator::Seek(const Slice& target) { SeekImpl(&target); }
13 
SeekToFirst()14 void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr); }
15 
SeekImpl(const Slice * target)16 void BlockBasedTableIterator::SeekImpl(const Slice* target) {
17   is_out_of_bound_ = false;
18   is_at_first_key_from_index_ = false;
19   if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
20     ResetDataIter();
21     return;
22   }
23 
24   bool need_seek_index = true;
25   if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
26     // Reseek.
27     prev_block_offset_ = index_iter_->value().handle.offset();
28 
29     if (target) {
30       // We can avoid an index seek if:
31       // 1. The new seek key is larger than the current key
32       // 2. The new seek key is within the upper bound of the block
33       // Since we don't necessarily know the internal key for either
34       // the current key or the upper bound, we check user keys and
35       // exclude the equality case. Considering internal keys can
36       // improve for the boundary cases, but it would complicate the
37       // code.
38       if (user_comparator_.Compare(ExtractUserKey(*target),
39                                    block_iter_.user_key()) > 0 &&
40           user_comparator_.Compare(ExtractUserKey(*target),
41                                    index_iter_->user_key()) < 0) {
42         need_seek_index = false;
43       }
44     }
45   }
46 
47   if (need_seek_index) {
48     if (target) {
49       index_iter_->Seek(*target);
50     } else {
51       index_iter_->SeekToFirst();
52     }
53 
54     if (!index_iter_->Valid()) {
55       ResetDataIter();
56       return;
57     }
58   }
59 
60   IndexValue v = index_iter_->value();
61   const bool same_block = block_iter_points_to_real_block_ &&
62                           v.handle.offset() == prev_block_offset_;
63 
64   // TODO(kolmike): Remove the != kBlockCacheTier condition.
65   if (!v.first_internal_key.empty() && !same_block &&
66       (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
67       read_options_.read_tier != kBlockCacheTier) {
68     // Index contains the first key of the block, and it's >= target.
69     // We can defer reading the block.
70     is_at_first_key_from_index_ = true;
71     // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
72     // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
73     // as that will be done later when the data block is actually read.
74     ResetDataIter();
75   } else {
76     // Need to use the data block.
77     if (!same_block) {
78       InitDataBlock();
79     } else {
80       // When the user does a reseek, the iterate_upper_bound might have
81       // changed. CheckDataBlockWithinUpperBound() needs to be called
82       // explicitly if the reseek ends up in the same data block.
83       // If the reseek ends up in a different block, InitDataBlock() will do
84       // the iterator upper bound check.
85       CheckDataBlockWithinUpperBound();
86     }
87 
88     if (target) {
89       block_iter_.Seek(*target);
90     } else {
91       block_iter_.SeekToFirst();
92     }
93     FindKeyForward();
94   }
95 
96   CheckOutOfBound();
97 
98   if (target) {
99     assert(!Valid() || icomp_.Compare(*target, key()) <= 0);
100   }
101 }
102 
SeekForPrev(const Slice & target)103 void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
104   is_out_of_bound_ = false;
105   is_at_first_key_from_index_ = false;
106   // For now totally disable prefix seek in auto prefix mode because we don't
107   // have logic
108   if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
109     ResetDataIter();
110     return;
111   }
112 
113   SavePrevIndexValue();
114 
115   // Call Seek() rather than SeekForPrev() in the index block, because the
116   // target data block will likely to contain the position for `target`, the
117   // same as Seek(), rather than than before.
118   // For example, if we have three data blocks, each containing two keys:
119   //   [2, 4]  [6, 8] [10, 12]
120   //  (the keys in the index block would be [4, 8, 12])
121   // and the user calls SeekForPrev(7), we need to go to the second block,
122   // just like if they call Seek(7).
123   // The only case where the block is difference is when they seek to a position
124   // in the boundary. For example, if they SeekForPrev(5), we should go to the
125   // first block, rather than the second. However, we don't have the information
126   // to distinguish the two unless we read the second block. In this case, we'll
127   // end up with reading two blocks.
128   index_iter_->Seek(target);
129 
130   if (!index_iter_->Valid()) {
131     auto seek_status = index_iter_->status();
132     // Check for IO error
133     if (!seek_status.IsNotFound() && !seek_status.ok()) {
134       ResetDataIter();
135       return;
136     }
137 
138     // With prefix index, Seek() returns NotFound if the prefix doesn't exist
139     if (seek_status.IsNotFound()) {
140       // Any key less than the target is fine for prefix seek
141       ResetDataIter();
142       return;
143     } else {
144       index_iter_->SeekToLast();
145     }
146     // Check for IO error
147     if (!index_iter_->Valid()) {
148       ResetDataIter();
149       return;
150     }
151   }
152 
153   InitDataBlock();
154 
155   block_iter_.SeekForPrev(target);
156 
157   FindKeyBackward();
158   CheckDataBlockWithinUpperBound();
159   assert(!block_iter_.Valid() ||
160          icomp_.Compare(target, block_iter_.key()) >= 0);
161 }
162 
SeekToLast()163 void BlockBasedTableIterator::SeekToLast() {
164   is_out_of_bound_ = false;
165   is_at_first_key_from_index_ = false;
166   SavePrevIndexValue();
167   index_iter_->SeekToLast();
168   if (!index_iter_->Valid()) {
169     ResetDataIter();
170     return;
171   }
172   InitDataBlock();
173   block_iter_.SeekToLast();
174   FindKeyBackward();
175   CheckDataBlockWithinUpperBound();
176 }
177 
Next()178 void BlockBasedTableIterator::Next() {
179   if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
180     return;
181   }
182   assert(block_iter_points_to_real_block_);
183   block_iter_.Next();
184   FindKeyForward();
185   CheckOutOfBound();
186 }
187 
NextAndGetResult(IterateResult * result)188 bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) {
189   Next();
190   bool is_valid = Valid();
191   if (is_valid) {
192     result->key = key();
193     result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
194   }
195   return is_valid;
196 }
197 
Prev()198 void BlockBasedTableIterator::Prev() {
199   if (is_at_first_key_from_index_) {
200     is_at_first_key_from_index_ = false;
201 
202     index_iter_->Prev();
203     if (!index_iter_->Valid()) {
204       return;
205     }
206 
207     InitDataBlock();
208     block_iter_.SeekToLast();
209   } else {
210     assert(block_iter_points_to_real_block_);
211     block_iter_.Prev();
212   }
213 
214   FindKeyBackward();
215 }
216 
InitDataBlock()217 void BlockBasedTableIterator::InitDataBlock() {
218   BlockHandle data_block_handle = index_iter_->value().handle;
219   if (!block_iter_points_to_real_block_ ||
220       data_block_handle.offset() != prev_block_offset_ ||
221       // if previous attempt of reading the block missed cache, try again
222       block_iter_.status().IsIncomplete()) {
223     if (block_iter_points_to_real_block_) {
224       ResetDataIter();
225     }
226     auto* rep = table_->get_rep();
227 
228     bool is_for_compaction =
229         lookup_context_.caller == TableReaderCaller::kCompaction;
230     // Prefetch additional data for range scans (iterators).
231     // Implicit auto readahead:
232     //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
233     // Explicit user requested readahead:
234     //   Enabled from the very first IO when ReadOptions.readahead_size is set.
235     block_prefetcher_.PrefetchIfNeeded(rep, data_block_handle,
236                                        read_options_.readahead_size,
237                                        is_for_compaction);
238 
239     Status s;
240     table_->NewDataBlockIterator<DataBlockIter>(
241         read_options_, data_block_handle, &block_iter_, BlockType::kData,
242         /*get_context=*/nullptr, &lookup_context_, s,
243         block_prefetcher_.prefetch_buffer(),
244         /*for_compaction=*/is_for_compaction);
245     block_iter_points_to_real_block_ = true;
246     CheckDataBlockWithinUpperBound();
247   }
248 }
249 
MaterializeCurrentBlock()250 bool BlockBasedTableIterator::MaterializeCurrentBlock() {
251   assert(is_at_first_key_from_index_);
252   assert(!block_iter_points_to_real_block_);
253   assert(index_iter_->Valid());
254 
255   is_at_first_key_from_index_ = false;
256   InitDataBlock();
257   assert(block_iter_points_to_real_block_);
258   block_iter_.SeekToFirst();
259 
260   if (!block_iter_.Valid() ||
261       icomp_.Compare(block_iter_.key(),
262                      index_iter_->value().first_internal_key) != 0) {
263     // Uh oh.
264     block_iter_.Invalidate(Status::Corruption(
265         "first key in index doesn't match first key in block"));
266     return false;
267   }
268 
269   return true;
270 }
271 
FindKeyForward()272 void BlockBasedTableIterator::FindKeyForward() {
273   // This method's code is kept short to make it likely to be inlined.
274 
275   assert(!is_out_of_bound_);
276   assert(block_iter_points_to_real_block_);
277 
278   if (!block_iter_.Valid()) {
279     // This is the only call site of FindBlockForward(), but it's extracted into
280     // a separate method to keep FindKeyForward() short and likely to be
281     // inlined. When transitioning to a different block, we call
282     // FindBlockForward(), which is much longer and is probably not inlined.
283     FindBlockForward();
284   } else {
285     // This is the fast path that avoids a function call.
286   }
287 }
288 
FindBlockForward()289 void BlockBasedTableIterator::FindBlockForward() {
290   // TODO the while loop inherits from two-level-iterator. We don't know
291   // whether a block can be empty so it can be replaced by an "if".
292   do {
293     if (!block_iter_.status().ok()) {
294       return;
295     }
296     // Whether next data block is out of upper bound, if there is one.
297     const bool next_block_is_out_of_bound =
298         read_options_.iterate_upper_bound != nullptr &&
299         block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
300     assert(!next_block_is_out_of_bound ||
301            user_comparator_.CompareWithoutTimestamp(
302                *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
303                index_iter_->user_key(), /*b_has_ts=*/true) <= 0);
304     ResetDataIter();
305     index_iter_->Next();
306     if (next_block_is_out_of_bound) {
307       // The next block is out of bound. No need to read it.
308       TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
309       // We need to make sure this is not the last data block before setting
310       // is_out_of_bound_, since the index key for the last data block can be
311       // larger than smallest key of the next file on the same level.
312       if (index_iter_->Valid()) {
313         is_out_of_bound_ = true;
314       }
315       return;
316     }
317 
318     if (!index_iter_->Valid()) {
319       return;
320     }
321 
322     IndexValue v = index_iter_->value();
323 
324     // TODO(kolmike): Remove the != kBlockCacheTier condition.
325     if (!v.first_internal_key.empty() &&
326         read_options_.read_tier != kBlockCacheTier) {
327       // Index contains the first key of the block. Defer reading the block.
328       is_at_first_key_from_index_ = true;
329       return;
330     }
331 
332     InitDataBlock();
333     block_iter_.SeekToFirst();
334   } while (!block_iter_.Valid());
335 }
336 
FindKeyBackward()337 void BlockBasedTableIterator::FindKeyBackward() {
338   while (!block_iter_.Valid()) {
339     if (!block_iter_.status().ok()) {
340       return;
341     }
342 
343     ResetDataIter();
344     index_iter_->Prev();
345 
346     if (index_iter_->Valid()) {
347       InitDataBlock();
348       block_iter_.SeekToLast();
349     } else {
350       return;
351     }
352   }
353 
354   // We could have check lower bound here too, but we opt not to do it for
355   // code simplicity.
356 }
357 
CheckOutOfBound()358 void BlockBasedTableIterator::CheckOutOfBound() {
359   if (read_options_.iterate_upper_bound != nullptr && Valid()) {
360     is_out_of_bound_ =
361         user_comparator_.CompareWithoutTimestamp(
362             *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(),
363             /*b_has_ts=*/true) <= 0;
364   }
365 }
366 
CheckDataBlockWithinUpperBound()367 void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() {
368   if (read_options_.iterate_upper_bound != nullptr &&
369       block_iter_points_to_real_block_) {
370     data_block_within_upper_bound_ =
371         (user_comparator_.CompareWithoutTimestamp(
372              *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
373              index_iter_->user_key(),
374              /*b_has_ts=*/true) > 0);
375   }
376 }
377 }  // namespace ROCKSDB_NAMESPACE
378