1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 #include "table/block_based/block_based_table_iterator.h"
10
11 namespace ROCKSDB_NAMESPACE {
Seek(const Slice & target)12 void BlockBasedTableIterator::Seek(const Slice& target) { SeekImpl(&target); }
13
SeekToFirst()14 void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr); }
15
SeekImpl(const Slice * target)16 void BlockBasedTableIterator::SeekImpl(const Slice* target) {
17 is_out_of_bound_ = false;
18 is_at_first_key_from_index_ = false;
19 if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
20 ResetDataIter();
21 return;
22 }
23
24 bool need_seek_index = true;
25 if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
26 // Reseek.
27 prev_block_offset_ = index_iter_->value().handle.offset();
28
29 if (target) {
30 // We can avoid an index seek if:
31 // 1. The new seek key is larger than the current key
32 // 2. The new seek key is within the upper bound of the block
33 // Since we don't necessarily know the internal key for either
34 // the current key or the upper bound, we check user keys and
35 // exclude the equality case. Considering internal keys can
36 // improve for the boundary cases, but it would complicate the
37 // code.
38 if (user_comparator_.Compare(ExtractUserKey(*target),
39 block_iter_.user_key()) > 0 &&
40 user_comparator_.Compare(ExtractUserKey(*target),
41 index_iter_->user_key()) < 0) {
42 need_seek_index = false;
43 }
44 }
45 }
46
47 if (need_seek_index) {
48 if (target) {
49 index_iter_->Seek(*target);
50 } else {
51 index_iter_->SeekToFirst();
52 }
53
54 if (!index_iter_->Valid()) {
55 ResetDataIter();
56 return;
57 }
58 }
59
60 IndexValue v = index_iter_->value();
61 const bool same_block = block_iter_points_to_real_block_ &&
62 v.handle.offset() == prev_block_offset_;
63
64 // TODO(kolmike): Remove the != kBlockCacheTier condition.
65 if (!v.first_internal_key.empty() && !same_block &&
66 (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
67 read_options_.read_tier != kBlockCacheTier) {
68 // Index contains the first key of the block, and it's >= target.
69 // We can defer reading the block.
70 is_at_first_key_from_index_ = true;
71 // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
72 // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
73 // as that will be done later when the data block is actually read.
74 ResetDataIter();
75 } else {
76 // Need to use the data block.
77 if (!same_block) {
78 InitDataBlock();
79 } else {
80 // When the user does a reseek, the iterate_upper_bound might have
81 // changed. CheckDataBlockWithinUpperBound() needs to be called
82 // explicitly if the reseek ends up in the same data block.
83 // If the reseek ends up in a different block, InitDataBlock() will do
84 // the iterator upper bound check.
85 CheckDataBlockWithinUpperBound();
86 }
87
88 if (target) {
89 block_iter_.Seek(*target);
90 } else {
91 block_iter_.SeekToFirst();
92 }
93 FindKeyForward();
94 }
95
96 CheckOutOfBound();
97
98 if (target) {
99 assert(!Valid() || icomp_.Compare(*target, key()) <= 0);
100 }
101 }
102
SeekForPrev(const Slice & target)103 void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
104 is_out_of_bound_ = false;
105 is_at_first_key_from_index_ = false;
106 // For now totally disable prefix seek in auto prefix mode because we don't
107 // have logic
108 if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
109 ResetDataIter();
110 return;
111 }
112
113 SavePrevIndexValue();
114
115 // Call Seek() rather than SeekForPrev() in the index block, because the
116 // target data block will likely to contain the position for `target`, the
117 // same as Seek(), rather than than before.
118 // For example, if we have three data blocks, each containing two keys:
119 // [2, 4] [6, 8] [10, 12]
120 // (the keys in the index block would be [4, 8, 12])
121 // and the user calls SeekForPrev(7), we need to go to the second block,
122 // just like if they call Seek(7).
123 // The only case where the block is difference is when they seek to a position
124 // in the boundary. For example, if they SeekForPrev(5), we should go to the
125 // first block, rather than the second. However, we don't have the information
126 // to distinguish the two unless we read the second block. In this case, we'll
127 // end up with reading two blocks.
128 index_iter_->Seek(target);
129
130 if (!index_iter_->Valid()) {
131 auto seek_status = index_iter_->status();
132 // Check for IO error
133 if (!seek_status.IsNotFound() && !seek_status.ok()) {
134 ResetDataIter();
135 return;
136 }
137
138 // With prefix index, Seek() returns NotFound if the prefix doesn't exist
139 if (seek_status.IsNotFound()) {
140 // Any key less than the target is fine for prefix seek
141 ResetDataIter();
142 return;
143 } else {
144 index_iter_->SeekToLast();
145 }
146 // Check for IO error
147 if (!index_iter_->Valid()) {
148 ResetDataIter();
149 return;
150 }
151 }
152
153 InitDataBlock();
154
155 block_iter_.SeekForPrev(target);
156
157 FindKeyBackward();
158 CheckDataBlockWithinUpperBound();
159 assert(!block_iter_.Valid() ||
160 icomp_.Compare(target, block_iter_.key()) >= 0);
161 }
162
SeekToLast()163 void BlockBasedTableIterator::SeekToLast() {
164 is_out_of_bound_ = false;
165 is_at_first_key_from_index_ = false;
166 SavePrevIndexValue();
167 index_iter_->SeekToLast();
168 if (!index_iter_->Valid()) {
169 ResetDataIter();
170 return;
171 }
172 InitDataBlock();
173 block_iter_.SeekToLast();
174 FindKeyBackward();
175 CheckDataBlockWithinUpperBound();
176 }
177
Next()178 void BlockBasedTableIterator::Next() {
179 if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
180 return;
181 }
182 assert(block_iter_points_to_real_block_);
183 block_iter_.Next();
184 FindKeyForward();
185 CheckOutOfBound();
186 }
187
NextAndGetResult(IterateResult * result)188 bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) {
189 Next();
190 bool is_valid = Valid();
191 if (is_valid) {
192 result->key = key();
193 result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
194 }
195 return is_valid;
196 }
197
Prev()198 void BlockBasedTableIterator::Prev() {
199 if (is_at_first_key_from_index_) {
200 is_at_first_key_from_index_ = false;
201
202 index_iter_->Prev();
203 if (!index_iter_->Valid()) {
204 return;
205 }
206
207 InitDataBlock();
208 block_iter_.SeekToLast();
209 } else {
210 assert(block_iter_points_to_real_block_);
211 block_iter_.Prev();
212 }
213
214 FindKeyBackward();
215 }
216
InitDataBlock()217 void BlockBasedTableIterator::InitDataBlock() {
218 BlockHandle data_block_handle = index_iter_->value().handle;
219 if (!block_iter_points_to_real_block_ ||
220 data_block_handle.offset() != prev_block_offset_ ||
221 // if previous attempt of reading the block missed cache, try again
222 block_iter_.status().IsIncomplete()) {
223 if (block_iter_points_to_real_block_) {
224 ResetDataIter();
225 }
226 auto* rep = table_->get_rep();
227
228 bool is_for_compaction =
229 lookup_context_.caller == TableReaderCaller::kCompaction;
230 // Prefetch additional data for range scans (iterators).
231 // Implicit auto readahead:
232 // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
233 // Explicit user requested readahead:
234 // Enabled from the very first IO when ReadOptions.readahead_size is set.
235 block_prefetcher_.PrefetchIfNeeded(rep, data_block_handle,
236 read_options_.readahead_size,
237 is_for_compaction);
238
239 Status s;
240 table_->NewDataBlockIterator<DataBlockIter>(
241 read_options_, data_block_handle, &block_iter_, BlockType::kData,
242 /*get_context=*/nullptr, &lookup_context_, s,
243 block_prefetcher_.prefetch_buffer(),
244 /*for_compaction=*/is_for_compaction);
245 block_iter_points_to_real_block_ = true;
246 CheckDataBlockWithinUpperBound();
247 }
248 }
249
MaterializeCurrentBlock()250 bool BlockBasedTableIterator::MaterializeCurrentBlock() {
251 assert(is_at_first_key_from_index_);
252 assert(!block_iter_points_to_real_block_);
253 assert(index_iter_->Valid());
254
255 is_at_first_key_from_index_ = false;
256 InitDataBlock();
257 assert(block_iter_points_to_real_block_);
258 block_iter_.SeekToFirst();
259
260 if (!block_iter_.Valid() ||
261 icomp_.Compare(block_iter_.key(),
262 index_iter_->value().first_internal_key) != 0) {
263 // Uh oh.
264 block_iter_.Invalidate(Status::Corruption(
265 "first key in index doesn't match first key in block"));
266 return false;
267 }
268
269 return true;
270 }
271
FindKeyForward()272 void BlockBasedTableIterator::FindKeyForward() {
273 // This method's code is kept short to make it likely to be inlined.
274
275 assert(!is_out_of_bound_);
276 assert(block_iter_points_to_real_block_);
277
278 if (!block_iter_.Valid()) {
279 // This is the only call site of FindBlockForward(), but it's extracted into
280 // a separate method to keep FindKeyForward() short and likely to be
281 // inlined. When transitioning to a different block, we call
282 // FindBlockForward(), which is much longer and is probably not inlined.
283 FindBlockForward();
284 } else {
285 // This is the fast path that avoids a function call.
286 }
287 }
288
FindBlockForward()289 void BlockBasedTableIterator::FindBlockForward() {
290 // TODO the while loop inherits from two-level-iterator. We don't know
291 // whether a block can be empty so it can be replaced by an "if".
292 do {
293 if (!block_iter_.status().ok()) {
294 return;
295 }
296 // Whether next data block is out of upper bound, if there is one.
297 const bool next_block_is_out_of_bound =
298 read_options_.iterate_upper_bound != nullptr &&
299 block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
300 assert(!next_block_is_out_of_bound ||
301 user_comparator_.CompareWithoutTimestamp(
302 *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
303 index_iter_->user_key(), /*b_has_ts=*/true) <= 0);
304 ResetDataIter();
305 index_iter_->Next();
306 if (next_block_is_out_of_bound) {
307 // The next block is out of bound. No need to read it.
308 TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
309 // We need to make sure this is not the last data block before setting
310 // is_out_of_bound_, since the index key for the last data block can be
311 // larger than smallest key of the next file on the same level.
312 if (index_iter_->Valid()) {
313 is_out_of_bound_ = true;
314 }
315 return;
316 }
317
318 if (!index_iter_->Valid()) {
319 return;
320 }
321
322 IndexValue v = index_iter_->value();
323
324 // TODO(kolmike): Remove the != kBlockCacheTier condition.
325 if (!v.first_internal_key.empty() &&
326 read_options_.read_tier != kBlockCacheTier) {
327 // Index contains the first key of the block. Defer reading the block.
328 is_at_first_key_from_index_ = true;
329 return;
330 }
331
332 InitDataBlock();
333 block_iter_.SeekToFirst();
334 } while (!block_iter_.Valid());
335 }
336
FindKeyBackward()337 void BlockBasedTableIterator::FindKeyBackward() {
338 while (!block_iter_.Valid()) {
339 if (!block_iter_.status().ok()) {
340 return;
341 }
342
343 ResetDataIter();
344 index_iter_->Prev();
345
346 if (index_iter_->Valid()) {
347 InitDataBlock();
348 block_iter_.SeekToLast();
349 } else {
350 return;
351 }
352 }
353
354 // We could have check lower bound here too, but we opt not to do it for
355 // code simplicity.
356 }
357
CheckOutOfBound()358 void BlockBasedTableIterator::CheckOutOfBound() {
359 if (read_options_.iterate_upper_bound != nullptr && Valid()) {
360 is_out_of_bound_ =
361 user_comparator_.CompareWithoutTimestamp(
362 *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(),
363 /*b_has_ts=*/true) <= 0;
364 }
365 }
366
CheckDataBlockWithinUpperBound()367 void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() {
368 if (read_options_.iterate_upper_bound != nullptr &&
369 block_iter_points_to_real_block_) {
370 data_block_within_upper_bound_ =
371 (user_comparator_.CompareWithoutTimestamp(
372 *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
373 index_iter_->user_key(),
374 /*b_has_ts=*/true) > 0);
375 }
376 }
377 } // namespace ROCKSDB_NAMESPACE
378