1 // Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 
6 #pragma once
7 
8 #include <stdint.h>
9 #include <map>
10 #include <string>
11 
12 #include "rocksdb/perf_level.h"
13 
14 namespace ROCKSDB_NAMESPACE {
15 
// A thread local context for gathering performance counters efficiently
// and transparently.
18 // Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
19 
20 // Break down performance counters by level and store per-level perf context in
21 // PerfContextByLevel
struct PerfContextByLevel {
  // Every counter below has a default member initializer, so a
  // default-constructed PerfContextByLevel always starts with all counters at
  // zero, regardless of the storage it is created in.

  // # of times bloom filter has avoided file reads, i.e., negatives.
  uint64_t bloom_filter_useful = 0;
  // # of times bloom FullFilter has not avoided the reads.
  uint64_t bloom_filter_full_positive = 0;
  // # of times bloom FullFilter has not avoided the reads and data actually
  // exist.
  uint64_t bloom_filter_full_true_positive = 0;

  // total number of user keys returned (only includes keys that are found;
  // does not include keys that are deleted or merged without a final put)
  uint64_t user_key_return_count = 0;

  // total nanos spent on reading data from SST files
  uint64_t get_from_table_nanos = 0;

  uint64_t block_cache_hit_count = 0;   // total number of block cache hits
  uint64_t block_cache_miss_count = 0;  // total number of block cache misses

  void Reset();  // reset all performance counters to zero
};
43 
// Set of performance counters, optionally broken down per level through
// level_to_perf_context.
//
// Every counter carries a default member initializer, so a freshly
// constructed PerfContext reads as all zeros without relying on Reset() or on
// the object living in zero-initialized storage.
struct PerfContext {
  // The destructor and the copy/move operations below are only declared in
  // this header; their definitions live elsewhere.
  ~PerfContext();

  // Counters start at zero through their default member initializers.
  PerfContext() {}

  PerfContext(const PerfContext&);
  PerfContext& operator=(const PerfContext&);
  PerfContext(PerfContext&&) noexcept;

  void Reset();  // reset all performance counters to zero

  // Renders the counters as a string. When exclude_zero_counters is true,
  // counters whose value is zero are presumably omitted (see the
  // implementation for the exact format).
  std::string ToString(bool exclude_zero_counters = false) const;

  // enable per level perf context and allocate storage for PerfContextByLevel
  void EnablePerLevelPerfContext();

  // temporarily disable per level perf context by setting the flag to false
  void DisablePerLevelPerfContext();

  // free the space for PerfContextByLevel, also disable per level perf context
  void ClearPerLevelPerfContext();

  // total number of user key comparisons
  uint64_t user_key_comparison_count = 0;
  // total number of block cache hits
  uint64_t block_cache_hit_count = 0;
  // total number of block reads (with IO)
  uint64_t block_read_count = 0;
  // total number of bytes from block reads
  uint64_t block_read_byte = 0;
  // total nanos spent on block reads
  uint64_t block_read_time = 0;
  // total number of index block hits
  uint64_t block_cache_index_hit_count = 0;
  // total number of index block reads
  uint64_t index_block_read_count = 0;
  // total number of filter block hits
  uint64_t block_cache_filter_hit_count = 0;
  // total number of filter block reads
  uint64_t filter_block_read_count = 0;
  // total number of compression dictionary block reads
  uint64_t compression_dict_block_read_count = 0;
  // total nanos spent on block checksum
  uint64_t block_checksum_time = 0;
  // total nanos spent on block decompression
  uint64_t block_decompress_time = 0;

  // bytes for vals returned by Get
  uint64_t get_read_bytes = 0;
  // bytes for vals returned by MultiGet
  uint64_t multiget_read_bytes = 0;
  // bytes for keys/vals decoded by iterator
  uint64_t iter_read_bytes = 0;

  // total number of internal keys skipped over during iteration.
  // There are several reasons for it:
  // 1. when calling Next(), the iterator is in the position of the previous
  //    key, so that we'll need to skip it. It means this counter will always
  //    be incremented in Next().
  // 2. when calling Next(), we need to skip internal entries for the previous
  //    keys that are overwritten.
  // 3. when calling Next(), Seek() or SeekToFirst(), after previous key
  //    before calling Next(), the seek key in Seek() or the beginning for
  //    SeekToFirst(), there may be one or more deleted keys before the next
  //    valid key that the operation should place the iterator to. We need
  //    to skip both of the tombstone and updates hidden by the tombstones. The
  //    tombstones are not included in this counter, while previous updates
  //    hidden by the tombstones will be included here.
  // 4. symmetric cases for Prev() and SeekToLast()
  // internal_recent_skipped_count is not included in this counter.
  //
  uint64_t internal_key_skipped_count = 0;
  // Total number of deletes and single deletes skipped over during iteration
  // When calling Next(), Seek() or SeekToFirst(), after previous position
  // before calling Next(), the seek key in Seek() or the beginning for
  // SeekToFirst(), there may be one or more deleted keys before the next valid
  // key. Every deleted key is counted once. We don't recount here if there are
  // still older updates invalidated by the tombstones.
  //
  uint64_t internal_delete_skipped_count = 0;
  // How many times iterators skipped over internal keys that are more recent
  // than the snapshot that iterator is using.
  //
  uint64_t internal_recent_skipped_count = 0;
  // How many values were fed into merge operator by iterators.
  //
  uint64_t internal_merge_count = 0;

  // total nanos spent on getting snapshot
  uint64_t get_snapshot_time = 0;
  // total nanos spent on querying memtables
  uint64_t get_from_memtable_time = 0;
  // number of mem tables queried
  uint64_t get_from_memtable_count = 0;
  // total nanos spent after Get() finds a key
  uint64_t get_post_process_time = 0;
  // total nanos reading from output files
  uint64_t get_from_output_files_time = 0;
  // total nanos spent on seeking memtable
  uint64_t seek_on_memtable_time = 0;
  // number of seeks issued on memtable
  // (including SeekForPrev but not SeekToFirst and SeekToLast)
  uint64_t seek_on_memtable_count = 0;
  // number of Next()s issued on memtable
  uint64_t next_on_memtable_count = 0;
  // number of Prev()s issued on memtable
  uint64_t prev_on_memtable_count = 0;
  // total nanos spent on seeking child iters
  uint64_t seek_child_seek_time = 0;
  // number of seek issued in child iterators
  uint64_t seek_child_seek_count = 0;
  // total nanos spent on the merge min heap
  uint64_t seek_min_heap_time = 0;
  // total nanos spent on the merge max heap
  uint64_t seek_max_heap_time = 0;
  // total nanos spent on seeking the internal entries
  uint64_t seek_internal_seek_time = 0;
  // total nanos spent on iterating internal entries to find the next user entry
  uint64_t find_next_user_entry_time = 0;

  // This group of stats provide a breakdown of time spent by Write().
  // May be inaccurate when 2PC, two_write_queues or enable_pipelined_write
  // are enabled.
  //
  // total nanos spent on writing to WAL
  uint64_t write_wal_time = 0;
  // total nanos spent on writing to mem tables
  uint64_t write_memtable_time = 0;
  // total nanos spent on delaying or throttling write
  uint64_t write_delay_time = 0;
  // total nanos spent on switching memtable/wal and scheduling
  // flushes/compactions.
  uint64_t write_scheduling_flushes_compactions_time = 0;
  // total nanos spent on writing a record, excluding the above four things
  uint64_t write_pre_and_post_process_time = 0;

  // time spent waiting for other threads of the batch group
  uint64_t write_thread_wait_nanos = 0;

  // time spent on acquiring DB mutex.
  uint64_t db_mutex_lock_nanos = 0;
  // Time spent on waiting with a condition variable created with DB mutex.
  uint64_t db_condition_wait_nanos = 0;
  // Time spent on merge operator.
  uint64_t merge_operator_time_nanos = 0;

  // Time spent on reading index block from block cache or SST file
  uint64_t read_index_block_nanos = 0;
  // Time spent on reading filter block from block cache or SST file
  uint64_t read_filter_block_nanos = 0;
  // Time spent on creating data block iterator
  uint64_t new_table_block_iter_nanos = 0;
  // Time spent on creating a iterator of an SST file.
  uint64_t new_table_iterator_nanos = 0;
  // Time spent on seeking a key in data/index blocks
  uint64_t block_seek_nanos = 0;
  // Time spent on finding or creating a table reader
  uint64_t find_table_nanos = 0;
  // total number of mem table bloom hits
  uint64_t bloom_memtable_hit_count = 0;
  // total number of mem table bloom misses
  uint64_t bloom_memtable_miss_count = 0;
  // total number of SST table bloom hits
  uint64_t bloom_sst_hit_count = 0;
  // total number of SST table bloom misses
  uint64_t bloom_sst_miss_count = 0;

  // Time spent waiting on key locks in transaction lock manager.
  uint64_t key_lock_wait_time = 0;
  // number of times acquiring a lock was blocked by another transaction.
  uint64_t key_lock_wait_count = 0;

  // Total time spent in Env filesystem operations. These are only populated
  // when TimedEnv is used.
  uint64_t env_new_sequential_file_nanos = 0;
  uint64_t env_new_random_access_file_nanos = 0;
  uint64_t env_new_writable_file_nanos = 0;
  uint64_t env_reuse_writable_file_nanos = 0;
  uint64_t env_new_random_rw_file_nanos = 0;
  uint64_t env_new_directory_nanos = 0;
  uint64_t env_file_exists_nanos = 0;
  uint64_t env_get_children_nanos = 0;
  uint64_t env_get_children_file_attributes_nanos = 0;
  uint64_t env_delete_file_nanos = 0;
  uint64_t env_create_dir_nanos = 0;
  uint64_t env_create_dir_if_missing_nanos = 0;
  uint64_t env_delete_dir_nanos = 0;
  uint64_t env_get_file_size_nanos = 0;
  uint64_t env_get_file_modification_time_nanos = 0;
  uint64_t env_rename_file_nanos = 0;
  uint64_t env_link_file_nanos = 0;
  uint64_t env_lock_file_nanos = 0;
  uint64_t env_unlock_file_nanos = 0;
  uint64_t env_new_logger_nanos = 0;

  // NOTE(review): presumably CPU time, in nanos, spent in Get() and in
  // iterator Next()/Prev()/Seek() -- confirm against the implementation.
  uint64_t get_cpu_nanos = 0;
  uint64_t iter_next_cpu_nanos = 0;
  uint64_t iter_prev_cpu_nanos = 0;
  uint64_t iter_seek_cpu_nanos = 0;

  // Per-level counters, keyed by level. Null until storage is allocated (see
  // EnablePerLevelPerfContext / ClearPerLevelPerfContext above).
  std::map<uint32_t, PerfContextByLevel>* level_to_perf_context = nullptr;
  // Whether per-level counters are currently being collected.
  bool per_level_perf_context_enabled = false;
};
227 
// Returns a pointer to the thread-local PerfContext object.
// If NPERF_CONTEXT is defined, the returned pointer is not thread-local.
PerfContext* get_perf_context();
231 
232 }  // namespace ROCKSDB_NAMESPACE
233