1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 
6 #include "table/get_context.h"
7 #include "db/merge_helper.h"
8 #include "db/pinned_iterators_manager.h"
9 #include "db/read_callback.h"
10 #include "monitoring/file_read_sample.h"
11 #include "monitoring/perf_context_imp.h"
12 #include "monitoring/statistics.h"
13 #include "rocksdb/env.h"
14 #include "rocksdb/merge_operator.h"
15 #include "rocksdb/statistics.h"
16 
17 namespace ROCKSDB_NAMESPACE {
18 
19 namespace {
20 
appendToReplayLog(std::string * replay_log,ValueType type,Slice value)21 void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) {
22 #ifndef ROCKSDB_LITE
23   if (replay_log) {
24     if (replay_log->empty()) {
25       // Optimization: in the common case of only one operation in the
26       // log, we allocate the exact amount of space needed.
27       replay_log->reserve(1 + VarintLength(value.size()) + value.size());
28     }
29     replay_log->push_back(type);
30     PutLengthPrefixedSlice(replay_log, value);
31   }
32 #else
33   (void)replay_log;
34   (void)type;
35   (void)value;
36 #endif  // ROCKSDB_LITE
37 }
38 
39 }  // namespace
40 
GetContext(const Comparator * ucmp,const MergeOperator * merge_operator,Logger * logger,Statistics * statistics,GetState init_state,const Slice & user_key,PinnableSlice * pinnable_val,std::string * timestamp,bool * value_found,MergeContext * merge_context,bool do_merge,SequenceNumber * _max_covering_tombstone_seq,Env * env,SequenceNumber * seq,PinnedIteratorsManager * _pinned_iters_mgr,ReadCallback * callback,bool * is_blob_index,uint64_t tracing_get_id)41 GetContext::GetContext(
42     const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger,
43     Statistics* statistics, GetState init_state, const Slice& user_key,
44     PinnableSlice* pinnable_val, std::string* timestamp, bool* value_found,
45     MergeContext* merge_context, bool do_merge,
46     SequenceNumber* _max_covering_tombstone_seq, Env* env, SequenceNumber* seq,
47     PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback,
48     bool* is_blob_index, uint64_t tracing_get_id)
49     : ucmp_(ucmp),
50       merge_operator_(merge_operator),
51       logger_(logger),
52       statistics_(statistics),
53       state_(init_state),
54       user_key_(user_key),
55       pinnable_val_(pinnable_val),
56       timestamp_(timestamp),
57       value_found_(value_found),
58       merge_context_(merge_context),
59       max_covering_tombstone_seq_(_max_covering_tombstone_seq),
60       env_(env),
61       seq_(seq),
62       replay_log_(nullptr),
63       pinned_iters_mgr_(_pinned_iters_mgr),
64       callback_(callback),
65       do_merge_(do_merge),
66       is_blob_index_(is_blob_index),
67       tracing_get_id_(tracing_get_id) {
68   if (seq_) {
69     *seq_ = kMaxSequenceNumber;
70   }
71   sample_ = should_sample_file_read();
72 }
73 
GetContext(const Comparator * ucmp,const MergeOperator * merge_operator,Logger * logger,Statistics * statistics,GetState init_state,const Slice & user_key,PinnableSlice * pinnable_val,bool * value_found,MergeContext * merge_context,bool do_merge,SequenceNumber * _max_covering_tombstone_seq,Env * env,SequenceNumber * seq,PinnedIteratorsManager * _pinned_iters_mgr,ReadCallback * callback,bool * is_blob_index,uint64_t tracing_get_id)74 GetContext::GetContext(
75     const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger,
76     Statistics* statistics, GetState init_state, const Slice& user_key,
77     PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context,
78     bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env,
79     SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr,
80     ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id)
81     : GetContext(ucmp, merge_operator, logger, statistics, init_state, user_key,
82                  pinnable_val, nullptr, value_found, merge_context, do_merge,
83                  _max_covering_tombstone_seq, env, seq, _pinned_iters_mgr,
84                  callback, is_blob_index, tracing_get_id) {}
85 
// Called from TableCache::Get and Table::Get when the file/block in which the
// key may exist is not present in TableCache/BlockCache respectively. In this
// case we can't guarantee that the key does not exist, and we are not
// permitted to do IO to be certain. Set state_ = kFound and
// *value_found = false to let the caller know that the key may exist but is
// not in memory.
MarkKeyMayExist()91 void GetContext::MarkKeyMayExist() {
92   state_ = kFound;
93   if (value_found_ != nullptr) {
94     *value_found_ = false;
95   }
96 }
97 
SaveValue(const Slice & value,SequenceNumber)98 void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) {
99   assert(state_ == kNotFound);
100   appendToReplayLog(replay_log_, kTypeValue, value);
101 
102   state_ = kFound;
103   if (LIKELY(pinnable_val_ != nullptr)) {
104     pinnable_val_->PinSelf(value);
105   }
106 }
107 
ReportCounters()108 void GetContext::ReportCounters() {
109   if (get_context_stats_.num_cache_hit > 0) {
110     RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit);
111   }
112   if (get_context_stats_.num_cache_index_hit > 0) {
113     RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT,
114                get_context_stats_.num_cache_index_hit);
115   }
116   if (get_context_stats_.num_cache_data_hit > 0) {
117     RecordTick(statistics_, BLOCK_CACHE_DATA_HIT,
118                get_context_stats_.num_cache_data_hit);
119   }
120   if (get_context_stats_.num_cache_filter_hit > 0) {
121     RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT,
122                get_context_stats_.num_cache_filter_hit);
123   }
124   if (get_context_stats_.num_cache_compression_dict_hit > 0) {
125     RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_HIT,
126                get_context_stats_.num_cache_compression_dict_hit);
127   }
128   if (get_context_stats_.num_cache_index_miss > 0) {
129     RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS,
130                get_context_stats_.num_cache_index_miss);
131   }
132   if (get_context_stats_.num_cache_filter_miss > 0) {
133     RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS,
134                get_context_stats_.num_cache_filter_miss);
135   }
136   if (get_context_stats_.num_cache_data_miss > 0) {
137     RecordTick(statistics_, BLOCK_CACHE_DATA_MISS,
138                get_context_stats_.num_cache_data_miss);
139   }
140   if (get_context_stats_.num_cache_compression_dict_miss > 0) {
141     RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_MISS,
142                get_context_stats_.num_cache_compression_dict_miss);
143   }
144   if (get_context_stats_.num_cache_bytes_read > 0) {
145     RecordTick(statistics_, BLOCK_CACHE_BYTES_READ,
146                get_context_stats_.num_cache_bytes_read);
147   }
148   if (get_context_stats_.num_cache_miss > 0) {
149     RecordTick(statistics_, BLOCK_CACHE_MISS,
150                get_context_stats_.num_cache_miss);
151   }
152   if (get_context_stats_.num_cache_add > 0) {
153     RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add);
154   }
155   if (get_context_stats_.num_cache_bytes_write > 0) {
156     RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE,
157                get_context_stats_.num_cache_bytes_write);
158   }
159   if (get_context_stats_.num_cache_index_add > 0) {
160     RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD,
161                get_context_stats_.num_cache_index_add);
162   }
163   if (get_context_stats_.num_cache_index_bytes_insert > 0) {
164     RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT,
165                get_context_stats_.num_cache_index_bytes_insert);
166   }
167   if (get_context_stats_.num_cache_data_add > 0) {
168     RecordTick(statistics_, BLOCK_CACHE_DATA_ADD,
169                get_context_stats_.num_cache_data_add);
170   }
171   if (get_context_stats_.num_cache_data_bytes_insert > 0) {
172     RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT,
173                get_context_stats_.num_cache_data_bytes_insert);
174   }
175   if (get_context_stats_.num_cache_filter_add > 0) {
176     RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD,
177                get_context_stats_.num_cache_filter_add);
178   }
179   if (get_context_stats_.num_cache_filter_bytes_insert > 0) {
180     RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT,
181                get_context_stats_.num_cache_filter_bytes_insert);
182   }
183   if (get_context_stats_.num_cache_compression_dict_add > 0) {
184     RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD,
185                get_context_stats_.num_cache_compression_dict_add);
186   }
187   if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) {
188     RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
189                get_context_stats_.num_cache_compression_dict_bytes_insert);
190   }
191 }
192 
SaveValue(const ParsedInternalKey & parsed_key,const Slice & value,bool * matched,Cleanable * value_pinner)193 bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
194                            const Slice& value, bool* matched,
195                            Cleanable* value_pinner) {
196   assert(matched);
197   assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
198          merge_context_ != nullptr);
199   if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) {
200     *matched = true;
201     // If the value is not in the snapshot, skip it
202     if (!CheckCallback(parsed_key.sequence)) {
203       return true;  // to continue to the next seq
204     }
205 
206     appendToReplayLog(replay_log_, parsed_key.type, value);
207 
208     if (seq_ != nullptr) {
209       // Set the sequence number if it is uninitialized
210       if (*seq_ == kMaxSequenceNumber) {
211         *seq_ = parsed_key.sequence;
212       }
213     }
214 
215     auto type = parsed_key.type;
216     // Key matches. Process it
217     if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
218         max_covering_tombstone_seq_ != nullptr &&
219         *max_covering_tombstone_seq_ > parsed_key.sequence) {
220       type = kTypeRangeDeletion;
221     }
222     switch (type) {
223       case kTypeValue:
224       case kTypeBlobIndex:
225         assert(state_ == kNotFound || state_ == kMerge);
226         if (type == kTypeBlobIndex && is_blob_index_ == nullptr) {
227           // Blob value not supported. Stop.
228           state_ = kBlobIndex;
229           return false;
230         }
231         if (kNotFound == state_) {
232           state_ = kFound;
233           if (do_merge_) {
234             if (LIKELY(pinnable_val_ != nullptr)) {
235               if (LIKELY(value_pinner != nullptr)) {
236                 // If the backing resources for the value are provided, pin them
237                 pinnable_val_->PinSlice(value, value_pinner);
238               } else {
239                 TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf",
240                                          this);
241 
242                 // Otherwise copy the value
243                 pinnable_val_->PinSelf(value);
244               }
245             }
246           } else {
247             // It means this function is called as part of DB GetMergeOperands
248             // API and the current value should be part of
249             // merge_context_->operand_list
250             push_operand(value, value_pinner);
251           }
252         } else if (kMerge == state_) {
253           assert(merge_operator_ != nullptr);
254           state_ = kFound;
255           if (do_merge_) {
256             if (LIKELY(pinnable_val_ != nullptr)) {
257               Status merge_status = MergeHelper::TimedFullMerge(
258                   merge_operator_, user_key_, &value,
259                   merge_context_->GetOperands(), pinnable_val_->GetSelf(),
260                   logger_, statistics_, env_);
261               pinnable_val_->PinSelf();
262               if (!merge_status.ok()) {
263                 state_ = kCorrupt;
264               }
265             }
266           } else {
267             // It means this function is called as part of DB GetMergeOperands
268             // API and the current value should be part of
269             // merge_context_->operand_list
270             push_operand(value, value_pinner);
271           }
272         }
273         if (state_ == kFound) {
274           size_t ts_sz = ucmp_->timestamp_size();
275           if (ts_sz > 0 && timestamp_ != nullptr) {
276             Slice ts = ExtractTimestampFromUserKey(parsed_key.user_key, ts_sz);
277             timestamp_->assign(ts.data(), ts.size());
278           }
279         }
280         if (is_blob_index_ != nullptr) {
281           *is_blob_index_ = (type == kTypeBlobIndex);
282         }
283         return false;
284 
285       case kTypeDeletion:
286       case kTypeSingleDeletion:
287       case kTypeRangeDeletion:
288         // TODO(noetzli): Verify correctness once merge of single-deletes
289         // is supported
290         assert(state_ == kNotFound || state_ == kMerge);
291         if (kNotFound == state_) {
292           state_ = kDeleted;
293         } else if (kMerge == state_) {
294           state_ = kFound;
295           if (LIKELY(pinnable_val_ != nullptr)) {
296             if (do_merge_) {
297               Status merge_status = MergeHelper::TimedFullMerge(
298                   merge_operator_, user_key_, nullptr,
299                   merge_context_->GetOperands(), pinnable_val_->GetSelf(),
300                   logger_, statistics_, env_);
301               pinnable_val_->PinSelf();
302               if (!merge_status.ok()) {
303                 state_ = kCorrupt;
304               }
305             }
306             // If do_merge_ = false then the current value shouldn't be part of
307             // merge_context_->operand_list
308           }
309         }
310         return false;
311 
312       case kTypeMerge:
313         assert(state_ == kNotFound || state_ == kMerge);
314         state_ = kMerge;
315         // value_pinner is not set from plain_table_reader.cc for example.
316         push_operand(value, value_pinner);
317         if (do_merge_ && merge_operator_ != nullptr &&
318             merge_operator_->ShouldMerge(
319                 merge_context_->GetOperandsDirectionBackward())) {
320           state_ = kFound;
321           if (LIKELY(pinnable_val_ != nullptr)) {
322             // do_merge_ = true this is the case where this function is called
323             // as part of DB Get API hence merge operators should be merged.
324             if (do_merge_) {
325               Status merge_status = MergeHelper::TimedFullMerge(
326                   merge_operator_, user_key_, nullptr,
327                   merge_context_->GetOperands(), pinnable_val_->GetSelf(),
328                   logger_, statistics_, env_);
329               pinnable_val_->PinSelf();
330               if (!merge_status.ok()) {
331                 state_ = kCorrupt;
332               }
333             }
334           }
335           return false;
336         }
337         return true;
338 
339       default:
340         assert(false);
341         break;
342     }
343   }
344 
345   // state_ could be Corrupt, merge or notfound
346   return false;
347 }
348 
push_operand(const Slice & value,Cleanable * value_pinner)349 void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) {
350   if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() &&
351       value_pinner != nullptr) {
352     value_pinner->DelegateCleanupsTo(pinned_iters_mgr());
353     merge_context_->PushOperand(value, true /*value_pinned*/);
354   } else {
355     merge_context_->PushOperand(value, false);
356   }
357 }
358 
replayGetContextLog(const Slice & replay_log,const Slice & user_key,GetContext * get_context,Cleanable * value_pinner)359 void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
360                          GetContext* get_context, Cleanable* value_pinner) {
361 #ifndef ROCKSDB_LITE
362   Slice s = replay_log;
363   while (s.size()) {
364     auto type = static_cast<ValueType>(*s.data());
365     s.remove_prefix(1);
366     Slice value;
367     bool ret = GetLengthPrefixedSlice(&s, &value);
368     assert(ret);
369     (void)ret;
370 
371     bool dont_care __attribute__((__unused__));
372     // Since SequenceNumber is not stored and unknown, we will use
373     // kMaxSequenceNumber.
374     get_context->SaveValue(
375         ParsedInternalKey(user_key, kMaxSequenceNumber, type), value,
376         &dont_care, value_pinner);
377   }
378 #else   // ROCKSDB_LITE
379   (void)replay_log;
380   (void)user_key;
381   (void)get_context;
382   (void)value_pinner;
383   assert(false);
384 #endif  // ROCKSDB_LITE
385 }
386 
387 }  // namespace ROCKSDB_NAMESPACE
388