1 //  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 #pragma once
7 
8 #ifndef ROCKSDB_LITE
9 
10 #include <limits>
11 #include <list>
12 #include <map>
13 #include <string>
14 #include <vector>
15 
16 #include "monitoring/histogram.h"
17 #include "rocksdb/env.h"
18 #include "rocksdb/persistent_cache.h"
19 #include "rocksdb/status.h"
20 
21 // Persistent Cache
22 //
23 // Persistent cache is tiered key-value cache that can use persistent medium. It
24 // is a generic design and can leverage any storage medium -- disk/SSD/NVM/RAM.
25 // The code has been kept generic but significant benchmark/design/development
26 // time has been spent to make sure the cache performs appropriately for
27 // respective storage medium.
28 // The file defines
29 // PersistentCacheTier    : Implementation that handles individual cache tier
// PersistentTieredCache  : Implementation that handles all tiers as a logical
31 //                          unit
32 //
33 // PersistentTieredCache architecture:
34 // +--------------------------+ PersistentCacheTier that handles multiple tiers
35 // | +----------------+       |
36 // | | RAM            | PersistentCacheTier that handles RAM (VolatileCacheImpl)
37 // | +----------------+       |
38 // |   | next                 |
39 // |   v                      |
40 // | +----------------+       |
41 // | | NVM            | PersistentCacheTier implementation that handles NVM
42 // | +----------------+ (BlockCacheImpl)
43 // |   | next                 |
44 // |   V                      |
45 // | +----------------+       |
46 // | | LE-SSD         | PersistentCacheTier implementation that handles LE-SSD
47 // | +----------------+ (BlockCacheImpl)
48 // |   |                      |
49 // |   V                      |
50 // |  null                    |
51 // +--------------------------+
52 //               |
53 //               V
54 //              null
55 namespace ROCKSDB_NAMESPACE {
56 
57 // Persistent Cache Config
58 //
59 // This struct captures all the options that are used to configure persistent
60 // cache. Some of the terminologies used in naming the options are
61 //
62 // dispatch size :
63 // This is the size in which IO is dispatched to the device
64 //
65 // write buffer size :
// This is the size of an individual write buffer. Write buffers are
// grouped to form a buffered file.
68 //
69 // cache size :
70 // This is the logical maximum for the cache size
71 //
72 // qdepth :
// This is the max number of IOs that can be issued to the device in parallel
74 //
// pipelining :
76 // The writer code path follows pipelined architecture, which means the
77 // operations are handed off from one stage to another
78 //
79 // pipelining backlog size :
80 // With the pipelined architecture, there can always be backlogging of ops in
81 // pipeline queues. This is the maximum backlog size after which ops are dropped
82 // from queue
83 struct PersistentCacheConfig {
84   explicit PersistentCacheConfig(
85       Env* const _env, const std::string& _path, const uint64_t _cache_size,
86       const std::shared_ptr<Logger>& _log,
87       const uint32_t _write_buffer_size = 1 * 1024 * 1024 /*1MB*/) {
88     env = _env;
89     path = _path;
90     log = _log;
91     cache_size = _cache_size;
92     writer_dispatch_size = write_buffer_size = _write_buffer_size;
93   }
94 
95   //
96   // Validate the settings. Our intentions are to catch erroneous settings ahead
97   // of time instead going violating invariants or causing dead locks.
98   //
ValidateSettingsPersistentCacheConfig99   Status ValidateSettings() const {
100     // (1) check pre-conditions for variables
101     if (!env || path.empty()) {
102       return Status::InvalidArgument("empty or null args");
103     }
104 
105     // (2) assert size related invariants
106     // - cache size cannot be less than cache file size
107     // - individual write buffer size cannot be greater than cache file size
108     // - total write buffer size cannot be less than 2X cache file size
109     if (cache_size < cache_file_size || write_buffer_size >= cache_file_size ||
110         write_buffer_size * write_buffer_count() < 2 * cache_file_size) {
111       return Status::InvalidArgument("invalid cache size");
112     }
113 
114     // (2) check writer settings
115     // - Queue depth cannot be 0
116     // - writer_dispatch_size cannot be greater than writer_buffer_size
117     // - dispatch size and buffer size need to be aligned
118     if (!writer_qdepth || writer_dispatch_size > write_buffer_size ||
119         write_buffer_size % writer_dispatch_size) {
120       return Status::InvalidArgument("invalid writer settings");
121     }
122 
123     return Status::OK();
124   }
125 
126   //
127   // Env abstraction to use for systmer level operations
128   //
129   Env* env;
130 
131   //
132   // Path for the block cache where blocks are persisted
133   //
134   std::string path;
135 
136   //
137   // Log handle for logging messages
138   //
139   std::shared_ptr<Logger> log;
140 
141   //
142   // Enable direct IO for reading
143   //
144   bool enable_direct_reads = true;
145 
146   //
147   // Enable direct IO for writing
148   //
149   bool enable_direct_writes = false;
150 
151   //
152   // Logical cache size
153   //
154   uint64_t cache_size = std::numeric_limits<uint64_t>::max();
155 
156   // cache-file-size
157   //
158   // Cache consists of multiples of small files. This parameter defines the
159   // size of an individual cache file
160   //
161   // default: 1M
162   uint32_t cache_file_size = 100ULL * 1024 * 1024;
163 
164   // writer-qdepth
165   //
166   // The writers can issues IO to the devices in parallel. This parameter
167   // controls the max number if IOs that can issues in parallel to the block
168   // device
169   //
170   // default :1
171   uint32_t writer_qdepth = 1;
172 
173   // pipeline-writes
174   //
175   // The write optionally follow pipelined architecture. This helps
176   // avoid regression in the eviction code path of the primary tier. This
177   // parameter defines if pipelining is enabled or disabled
178   //
179   // default: true
180   bool pipeline_writes = true;
181 
182   // max-write-pipeline-backlog-size
183   //
184   // Max pipeline buffer size. This is the maximum backlog we can accumulate
185   // while waiting for writes. After the limit, new ops will be dropped.
186   //
187   // Default: 1GiB
188   uint64_t max_write_pipeline_backlog_size = 1ULL * 1024 * 1024 * 1024;
189 
190   // write-buffer-size
191   //
192   // This is the size in which buffer slabs are allocated.
193   //
194   // Default: 1M
195   uint32_t write_buffer_size = 1ULL * 1024 * 1024;
196 
197   // write-buffer-count
198   //
199   // This is the total number of buffer slabs. This is calculated as a factor of
200   // file size in order to avoid dead lock.
write_buffer_countPersistentCacheConfig201   size_t write_buffer_count() const {
202     assert(write_buffer_size);
203     return static_cast<size_t>((writer_qdepth + 1.2) * cache_file_size /
204                                write_buffer_size);
205   }
206 
207   // writer-dispatch-size
208   //
209   // The writer thread will dispatch the IO at the specified IO size
210   //
211   // default: 1M
212   uint64_t writer_dispatch_size = 1ULL * 1024 * 1024;
213 
214   // is_compressed
215   //
216   // This option determines if the cache will run in compressed mode or
217   // uncompressed mode
218   bool is_compressed = true;
219 
220   PersistentCacheConfig MakePersistentCacheConfig(
221       const std::string& path, const uint64_t size,
222       const std::shared_ptr<Logger>& log);
223 
224   std::string ToString() const;
225 };
226 
227 // Persistent Cache Tier
228 //
// This is a logical abstraction that defines a tier of the persistent cache.
// Tiers can be stacked over one another. PersistentCache provides the basic
// definition for accessing/storing in the cache. PersistentCacheTier extends
// the interface to enable management and stacking of tiers.
233 class PersistentCacheTier : public PersistentCache {
234  public:
235   typedef std::shared_ptr<PersistentCacheTier> Tier;
236 
~PersistentCacheTier()237   virtual ~PersistentCacheTier() {}
238 
239   // Open the persistent cache tier
240   virtual Status Open();
241 
242   // Close the persistent cache tier
243   virtual Status Close();
244 
245   // Reserve space up to 'size' bytes
246   virtual bool Reserve(const size_t size);
247 
248   // Erase a key from the cache
249   virtual bool Erase(const Slice& key);
250 
251   // Print stats to string recursively
252   virtual std::string PrintStats();
253 
254   virtual PersistentCache::StatsType Stats() override;
255 
256   // Insert to page cache
257   virtual Status Insert(const Slice& page_key, const char* data,
258                         const size_t size) override = 0;
259 
260   // Lookup page cache by page identifier
261   virtual Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
262                         size_t* size) override = 0;
263 
264   // Does it store compressed data ?
265   virtual bool IsCompressed() override = 0;
266 
267   virtual std::string GetPrintableOptions() const override = 0;
268 
269   // Return a reference to next tier
next_tier()270   virtual Tier& next_tier() { return next_tier_; }
271 
272   // Set the value for next tier
set_next_tier(const Tier & tier)273   virtual void set_next_tier(const Tier& tier) {
274     assert(!next_tier_);
275     next_tier_ = tier;
276   }
277 
TEST_Flush()278   virtual void TEST_Flush() {
279     if (next_tier_) {
280       next_tier_->TEST_Flush();
281     }
282   }
283 
284  private:
285   Tier next_tier_;  // next tier
286 };
287 
288 // PersistentTieredCache
289 //
// Abstraction that helps you construct tiers of persistent caches as a
// unified cache. The tier(s) of cache will act as a single tier for management
// ease and support PersistentCache methods for accessing data.
293 class PersistentTieredCache : public PersistentCacheTier {
294  public:
295   virtual ~PersistentTieredCache();
296 
297   Status Open() override;
298   Status Close() override;
299   bool Erase(const Slice& key) override;
300   std::string PrintStats() override;
301   PersistentCache::StatsType Stats() override;
302   Status Insert(const Slice& page_key, const char* data,
303                 const size_t size) override;
304   Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
305                 size_t* size) override;
306   bool IsCompressed() override;
307 
GetPrintableOptions()308   std::string GetPrintableOptions() const override {
309     return "PersistentTieredCache";
310   }
311 
312   void AddTier(const Tier& tier);
313 
next_tier()314   Tier& next_tier() override {
315     auto it = tiers_.end();
316     return (*it)->next_tier();
317   }
318 
set_next_tier(const Tier & tier)319   void set_next_tier(const Tier& tier) override {
320     auto it = tiers_.end();
321     (*it)->set_next_tier(tier);
322   }
323 
TEST_Flush()324   void TEST_Flush() override {
325     assert(!tiers_.empty());
326     tiers_.front()->TEST_Flush();
327     PersistentCacheTier::TEST_Flush();
328   }
329 
330  protected:
331   std::list<Tier> tiers_;  // list of tiers top-down
332 };
333 
334 }  // namespace ROCKSDB_NAMESPACE
335 
336 #endif
337