//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "cache/clock_cache.h"

#ifndef SUPPORT_CLOCK_CACHE

namespace ROCKSDB_NAMESPACE {

std::shared_ptr<Cache> NewClockCache(
    size_t /*capacity*/, int /*num_shard_bits*/, bool /*strict_capacity_limit*/,
    CacheMetadataChargePolicy /*metadata_charge_policy*/) {
  // Clock cache not supported.
  return nullptr;
}

}  // namespace ROCKSDB_NAMESPACE

#else

#include <assert.h>
#include <atomic>
#include <deque>

// "tbb/concurrent_hash_map.h" requires RTTI if exceptions are enabled.
// Disable it so users can choose to disable RTTI.
#ifndef ROCKSDB_USE_RTTI
#define TBB_USE_EXCEPTIONS 0
#endif
#include "tbb/concurrent_hash_map.h"

#include "cache/sharded_cache.h"
#include "port/malloc.h"
#include "port/port.h"
#include "util/autovector.h"
#include "util/mutexlock.h"

namespace ROCKSDB_NAMESPACE {

namespace {

// An implementation of the Cache interface based on the CLOCK algorithm, with
// better concurrent performance than LRUCache. The idea of the CLOCK algorithm
// is to maintain all cache entries in a circular list, and an iterator
// (the "head") pointing to the last examined entry. Eviction starts from the
// current head. Each entry is given a second chance before eviction if it
// has been accessed since it was last examined. In contrast to LRU, no
// modification of the internal data structure (except for flipping the usage
// bit) needs to be done upon lookup. This gives us the opportunity to
// implement a cache with better concurrency.
//
// Each cache entry is represented by a cache handle, and all the handles
// are arranged in a circular list, as described above. Upon erase of an entry,
// we never remove the handle. Instead, the handle is put into a recycle bin
// to be re-used. This is to avoid memory deallocation, which is hard to deal
// with in a concurrent environment.
//
// The cache also maintains a concurrent hash map for lookup. Any concurrent
// hash map implementation should do the job. We currently use
// tbb::concurrent_hash_map because it supports concurrent erase.
//
// Each cache handle has the following flags and counters, which are squeezed
// into an atomic integer, to make sure the handle is always in a consistent
// state:
//
//   * In-cache bit: whether the entry is referenced by the cache itself. If
//     an entry is in cache, its key is also available in the hash map.
//   * Usage bit: whether the entry has been accessed by a user since it was
//     last examined for eviction. Can be reset by eviction.
//   * Reference count: number of outstanding user references.
//
// An entry can be referenced only when it is in cache. An entry can be evicted
// only when it is in cache, has not been used since it was last examined, and
// its reference count is zero.
//
// The following figure shows a possible layout of the cache. Boxes represent
// cache handles, and the numbers in each box are the in-cache bit, usage bit
// and reference count, respectively.
//
//    hash map:
//      +-------+--------+
//      |  key  | handle |
//      +-------+--------+
//      | "foo" |    5   |-------------------------------------+
//      +-------+--------+                                     |
//      | "bar" |    2   |--+                                  |
//      +-------+--------+  |                                  |
//                          |                                  |
//                     head |                                  |
//                       |  |                                  |
//    circular list:     |  |                                  |
//         +-------+   +-------+   +-------+   +-------+   +-------+   +-------
//         |(0,0,0)|---|(1,1,0)|---|(0,0,0)|---|(0,1,3)|---|(1,0,0)|---|  ...
//         +-------+   +-------+   +-------+   +-------+   +-------+   +-------
//             |                       |
//             +-------+   +-----------+
//                     |   |
//                   +---+---+
//    recycle bin:   | 1 | 3 |
//                   +---+---+
//
// Suppose we try to insert "baz" into the cache at this point and the cache is
// full. The cache first looks for entries to evict, starting from where the
// head points to (the second entry). It resets the usage bit of the second
// entry, skips the third and fourth entries since they are not in cache, and
// finally evicts the fifth entry ("foo"). It then looks in the recycle bin for
// an available handle, grabs handle 3, and inserts the key into that handle.
// The following figure shows the resulting layout.
//
//    hash map:
//      +-------+--------+
//      |  key  | handle |
//      +-------+--------+
//      | "baz" |    3   |-------------+
//      +-------+--------+             |
//      | "bar" |    2   |--+          |
//      +-------+--------+  |          |
//                          |          |
//                          |          |                                 head
//                          |          |                                   |
//    circular list:        |          |                                   |
//         +-------+   +-------+   +-------+   +-------+   +-------+   +-------
//         |(0,0,0)|---|(1,0,0)|---|(1,0,0)|---|(0,1,3)|---|(0,0,0)|---|  ...
//         +-------+   +-------+   +-------+   +-------+   +-------+   +-------
//             |                                               |
//             +-------+   +-----------------------------------+
//                     |   |
//                   +---+---+
//    recycle bin:   | 1 | 5 |
//                   +---+---+
//
// A global mutex guards the circular list, the head, and the recycle bin.
// We additionally require that modifying the hash map holds the mutex.
// As such, modifying the cache (e.g. Insert() and Erase()) requires holding
// the mutex. Lookup() only accesses the hash map and the flags associated
// with each handle, and doesn't require explicit locking. Release() has to
// acquire the mutex only when it releases the last reference to the entry and
// the entry has been erased from the cache explicitly. A future improvement
// could be to remove the mutex completely.
//
// Benchmark:
// We run readrandom db_bench on a test DB of size 13GB, with the size of each
// level:
//
//    Level    Files   Size(MB)
//    -------------------------
//      L0        1       0.01
//      L1       18      17.32
//      L2      230     182.94
//      L3     1186    1833.63
//      L4     4602    8140.30
//
// We test with both 32 and 16 read threads, with 2GB cache size (the whole DB
// doesn't fit in) and 64GB cache size (the whole DB can fit in cache), and
// with and without putting index and filter blocks in the block cache. The
// benchmark runs with RocksDB 4.10. We got the following results:
//
// Threads Cache     Cache               ClockCache               LRUCache
//         Size  Index/Filter Throughput(MB/s)   Hit Throughput(MB/s)    Hit
//     32   2GB       yes               466.7  85.9%           433.7   86.5%
//     32   2GB       no                529.9  72.7%           532.7   73.9%
//     32  64GB       yes               649.9  99.9%           507.9   99.9%
//     32  64GB       no                740.4  99.9%           662.8   99.9%
//     16   2GB       yes               278.4  85.9%           283.4   86.5%
//     16   2GB       no                318.6  72.7%           335.8   73.9%
//     16  64GB       yes               391.9  99.9%           353.3   99.9%
//     16  64GB       no                433.8  99.8%           419.4   99.8%
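//
// Usage sketch (illustrative only, not compiled as part of this file): a
// clock cache is created through the NewClockCache() factory defined at the
// bottom of this file and is typically installed as the block cache of a
// BlockBasedTableOptions. NewClockCache() returns nullptr when RocksDB is
// built without SUPPORT_CLOCK_CACHE, so callers should be prepared to fall
// back to, e.g., NewLRUCache():
//
//   std::shared_ptr<Cache> cache =
//       NewClockCache(2ULL << 30 /* 2GB capacity */, 6 /* num_shard_bits */,
//                     false /* strict_capacity_limit */,
//                     kFullChargeCacheMetadata);
//   if (cache == nullptr) {
//     cache = NewLRUCache(2ULL << 30);  // clock cache support not compiled in
//   }
//   BlockBasedTableOptions table_options;
//   table_options.block_cache = cache;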

// Cache entry metadata.
struct CacheHandle {
  using Deleter = Cache::Deleter;

  Slice key;
  uint32_t hash;
  void* value;
  size_t charge;
  Deleter* deleter;

  // Flags and counters associated with the cache handle:
  //   lowest bit: in-cache bit
  //   second lowest bit: usage bit
  //   the rest of the bits: reference count
  // The handle is unused when flags equals 0. The thread that decreases the
  // count to 0 is responsible for putting the handle back into recycle_ and
  // cleaning up memory.
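  //
  // For example (using the constants defined in ClockCacheShard below and the
  // (in-cache, usage, refs) notation from the figure above), a handle shown
  // as (1,0,3) has
  //   flags == kInCacheBit | (3 << kRefsOffset)  // == 0b1101 == 13
  // i.e. it is in the cache, has not been accessed since the last clock
  // sweep, and currently has three user references.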
  std::atomic<uint32_t> flags;

  CacheHandle() = default;

  CacheHandle(const CacheHandle& a) { *this = a; }

  CacheHandle(const Slice& k, void* v, Deleter* del)
      : key(k), value(v), deleter(del) {}

  CacheHandle& operator=(const CacheHandle& a) {
    // Only copy members needed for deletion.
    key = a.key;
    value = a.value;
    deleter = a.deleter;
    return *this;
  }

  inline static size_t CalcTotalCharge(
      Slice key, size_t charge,
      CacheMetadataChargePolicy metadata_charge_policy) {
    size_t meta_charge = 0;
    if (metadata_charge_policy == kFullChargeCacheMetadata) {
      meta_charge += sizeof(CacheHandle);
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
      meta_charge +=
          malloc_usable_size(static_cast<void*>(const_cast<char*>(key.data())));
#else
      meta_charge += key.size();
#endif
    }
    return charge + meta_charge;
  }

  inline size_t CalcTotalCharge(
      CacheMetadataChargePolicy metadata_charge_policy) {
    return CalcTotalCharge(key, charge, metadata_charge_policy);
  }
};

// Key of hash map. We store hash value with the key for convenience.
struct CacheKey {
  Slice key;
  uint32_t hash_value;

  CacheKey() = default;

  CacheKey(const Slice& k, uint32_t h) {
    key = k;
    hash_value = h;
  }

  static bool equal(const CacheKey& a, const CacheKey& b) {
    return a.hash_value == b.hash_value && a.key == b.key;
  }

  static size_t hash(const CacheKey& a) {
    return static_cast<size_t>(a.hash_value);
  }
};
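
// Note: CacheKey also serves as the HashCompare parameter of the
// tbb::concurrent_hash_map used below (see the HashTable typedef in
// ClockCacheShard), which is why it provides the static hash() and equal()
// helpers above.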

struct CleanupContext {
  // List of values to be deleted, along with the key and deleter.
  autovector<CacheHandle> to_delete_value;

  // List of keys to be deleted.
  autovector<const char*> to_delete_key;
};

// A cache shard which maintains its own CLOCK cache.
class ClockCacheShard final : public CacheShard {
 public:
  // Hash map type.
  typedef tbb::concurrent_hash_map<CacheKey, CacheHandle*, CacheKey> HashTable;

  ClockCacheShard();
  ~ClockCacheShard() override;

  // Interfaces
  void SetCapacity(size_t capacity) override;
  void SetStrictCapacityLimit(bool strict_capacity_limit) override;
  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
                Deleter* deleter, Cache::Handle** handle,
                Cache::Priority priority) override;
  Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
  // If the entry is in cache, increase its reference count and return true.
  // Return false otherwise.
  //
  // Not necessary to hold mutex_ before being called.
  bool Ref(Cache::Handle* handle) override;
  bool Release(Cache::Handle* handle, bool force_erase = false) override;
  void Erase(const Slice& key, uint32_t hash) override;
  bool EraseAndConfirm(const Slice& key, uint32_t hash,
                       CleanupContext* context);
  size_t GetUsage() const override;
  size_t GetPinnedUsage() const override;
  void EraseUnRefEntries() override;
  void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                              bool thread_safe) override;

 private:
  static const uint32_t kInCacheBit = 1;
  static const uint32_t kUsageBit = 2;
  static const uint32_t kRefsOffset = 2;
  static const uint32_t kOneRef = 1 << kRefsOffset;

  // Helper functions to extract cache handle flags and counters.
  static bool InCache(uint32_t flags) { return flags & kInCacheBit; }
  static bool HasUsage(uint32_t flags) { return flags & kUsageBit; }
  static uint32_t CountRefs(uint32_t flags) { return flags >> kRefsOffset; }

  // Decrease the reference count of the entry. If this decreases the count
  // to 0, recycle the entry. If set_usage is true, also set the usage bit.
  //
  // Returns true if a value is erased.
  //
  // Not necessary to hold mutex_ before being called.
  bool Unref(CacheHandle* handle, bool set_usage, CleanupContext* context);

  // Unset the in-cache bit of the entry. Recycle the handle if necessary.
  //
  // Returns true if a value is erased.
  //
  // Has to hold mutex_ before being called.
  bool UnsetInCache(CacheHandle* handle, CleanupContext* context);

  // Put the handle back into the recycle_ list, and put the value associated
  // with it into the to-be-deleted list. It doesn't clean up the key, as it
  // might be reused by another handle.
  //
  // Has to hold mutex_ before being called.
  void RecycleHandle(CacheHandle* handle, CleanupContext* context);

  // Delete keys and values in the to-be-deleted list. Call the method without
  // holding the mutex, as destructors can be expensive.
  void Cleanup(const CleanupContext& context);

  // Examine the handle for eviction. If the handle is in cache, its usage bit
  // is not set, and its reference count is 0, evict it from the cache.
  // Otherwise unset the usage bit.
  //
  // Has to hold mutex_ before being called.
  bool TryEvict(CacheHandle* value, CleanupContext* context);

  // Scan through the circular list, evicting entries until we get enough
  // capacity for a new cache entry of the given size. Return true on success,
  // false otherwise.
  //
  // Has to hold mutex_ before being called.
  bool EvictFromCache(size_t charge, CleanupContext* context);

  CacheHandle* Insert(const Slice& key, uint32_t hash, void* value,
                      size_t charge, Deleter* deleter, bool hold_reference,
                      CleanupContext* context);

  // Guards list_, head_, and recycle_. In addition, updating table_ also has
  // to hold the mutex, to avoid the cache being in an inconsistent state.
  mutable port::Mutex mutex_;

  // The circular list of cache handles. Initially the list is empty. Once a
  // handle is needed by an insertion, and no more handles are available in
  // the recycle bin, one more handle is appended to the end.
  //
  // We use std::deque for the circular list because we want to make sure
  // pointers to handles are valid throughout the lifecycle of the cache
  // (in contrast to std::vector), and to be able to grow the list (in
  // contrast to statically allocated arrays).
  std::deque<CacheHandle> list_;

  // Index of the next handle in the circular list to be examined for
  // eviction.
  size_t head_;

  // Recycle bin of cache handles.
  autovector<CacheHandle*> recycle_;

  // Maximum cache size.
  std::atomic<size_t> capacity_;

  // Current total size of the cache.
  std::atomic<size_t> usage_;

  // Total un-released cache size.
  std::atomic<size_t> pinned_usage_;

  // Whether to reject inserts when the cache is full (strict capacity limit).
  std::atomic<bool> strict_capacity_limit_;

  // Hash table (tbb::concurrent_hash_map) for lookup.
  HashTable table_;
};

ClockCacheShard::ClockCacheShard()
    : head_(0), usage_(0), pinned_usage_(0), strict_capacity_limit_(false) {}

ClockCacheShard::~ClockCacheShard() {
  for (auto& handle : list_) {
    uint32_t flags = handle.flags.load(std::memory_order_relaxed);
    if (InCache(flags) || CountRefs(flags) > 0) {
      if (handle.deleter != nullptr) {
        (*handle.deleter)(handle.key, handle.value);
      }
      delete[] handle.key.data();
    }
  }
}

size_t ClockCacheShard::GetUsage() const {
  return usage_.load(std::memory_order_relaxed);
}

size_t ClockCacheShard::GetPinnedUsage() const {
  return pinned_usage_.load(std::memory_order_relaxed);
}

void ClockCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                                             bool thread_safe) {
  if (thread_safe) {
    mutex_.Lock();
  }
  for (auto& handle : list_) {
    // Use relaxed semantics instead of acquire semantics since we are either
    // holding the mutex or have no thread-safety requirement.
    uint32_t flags = handle.flags.load(std::memory_order_relaxed);
    if (InCache(flags)) {
      callback(handle.value, handle.charge);
    }
  }
  if (thread_safe) {
    mutex_.Unlock();
  }
}

void ClockCacheShard::RecycleHandle(CacheHandle* handle,
                                    CleanupContext* context) {
  mutex_.AssertHeld();
  assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0);
  context->to_delete_key.push_back(handle->key.data());
  context->to_delete_value.emplace_back(*handle);
  size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
  handle->key.clear();
  handle->value = nullptr;
  handle->deleter = nullptr;
  recycle_.push_back(handle);
  usage_.fetch_sub(total_charge, std::memory_order_relaxed);
}

void ClockCacheShard::Cleanup(const CleanupContext& context) {
  for (const CacheHandle& handle : context.to_delete_value) {
    if (handle.deleter) {
      (*handle.deleter)(handle.key, handle.value);
    }
  }
  for (const char* key : context.to_delete_key) {
    delete[] key;
  }
}

bool ClockCacheShard::Ref(Cache::Handle* h) {
  auto handle = reinterpret_cast<CacheHandle*>(h);
  // CAS loop to increase the reference count.
  uint32_t flags = handle->flags.load(std::memory_order_relaxed);
  while (InCache(flags)) {
    // Use acquire semantics on success, as further operations on the cache
    // entry have to be ordered after the reference count is increased.
    if (handle->flags.compare_exchange_weak(flags, flags + kOneRef,
                                            std::memory_order_acquire,
                                            std::memory_order_relaxed)) {
      if (CountRefs(flags) == 0) {
        // No references were held before this operation.
        size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
        pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed);
      }
      return true;
    }
  }
  return false;
}

bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage,
                            CleanupContext* context) {
  if (set_usage) {
    handle->flags.fetch_or(kUsageBit, std::memory_order_relaxed);
  }
  // Use acquire-release semantics, as previous operations on the cache entry
  // have to be ordered before the reference count is decreased, and potential
  // cleanup of the entry has to be ordered after.
  uint32_t flags = handle->flags.fetch_sub(kOneRef, std::memory_order_acq_rel);
  assert(CountRefs(flags) > 0);
  if (CountRefs(flags) == 1) {
    // This is the last reference.
    size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_);
    pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed);
    // Clean up if it is the last reference.
    if (!InCache(flags)) {
      MutexLock l(&mutex_);
      RecycleHandle(handle, context);
    }
  }
  return context->to_delete_value.size();
}

bool ClockCacheShard::UnsetInCache(CacheHandle* handle,
                                   CleanupContext* context) {
  mutex_.AssertHeld();
  // Use acquire-release semantics, as previous operations on the cache entry
  // have to be ordered before the in-cache bit is cleared, and potential
  // cleanup of the entry has to be ordered after.
  uint32_t flags =
      handle->flags.fetch_and(~kInCacheBit, std::memory_order_acq_rel);
  // Clean up if it is the last reference.
  if (InCache(flags) && CountRefs(flags) == 0) {
    RecycleHandle(handle, context);
  }
  return context->to_delete_value.size();
}

bool ClockCacheShard::TryEvict(CacheHandle* handle, CleanupContext* context) {
  mutex_.AssertHeld();
  uint32_t flags = kInCacheBit;
  if (handle->flags.compare_exchange_strong(flags, 0, std::memory_order_acquire,
                                            std::memory_order_relaxed)) {
    bool erased __attribute__((__unused__)) =
        table_.erase(CacheKey(handle->key, handle->hash));
    assert(erased);
    RecycleHandle(handle, context);
    return true;
  }
  handle->flags.fetch_and(~kUsageBit, std::memory_order_relaxed);
  return false;
}

bool ClockCacheShard::EvictFromCache(size_t charge, CleanupContext* context) {
  size_t usage = usage_.load(std::memory_order_relaxed);
  size_t capacity = capacity_.load(std::memory_order_relaxed);
  if (usage == 0) {
    return charge <= capacity;
  }
  size_t new_head = head_;
  bool second_iteration = false;
  while (usage + charge > capacity) {
    assert(new_head < list_.size());
    if (TryEvict(&list_[new_head], context)) {
      usage = usage_.load(std::memory_order_relaxed);
    }
    new_head = (new_head + 1 >= list_.size()) ? 0 : new_head + 1;
    if (new_head == head_) {
      if (second_iteration) {
        return false;
      } else {
        second_iteration = true;
      }
    }
  }
  head_ = new_head;
  return true;
}

void ClockCacheShard::SetCapacity(size_t capacity) {
  CleanupContext context;
  {
    MutexLock l(&mutex_);
    capacity_.store(capacity, std::memory_order_relaxed);
    EvictFromCache(0, &context);
  }
  Cleanup(context);
}

void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
  strict_capacity_limit_.store(strict_capacity_limit,
                               std::memory_order_relaxed);
}

CacheHandle* ClockCacheShard::Insert(const Slice& key, uint32_t hash,
                                     void* value, size_t charge,
                                     Deleter* deleter, bool hold_reference,
                                     CleanupContext* context) {
  size_t total_charge =
      CacheHandle::CalcTotalCharge(key, charge, metadata_charge_policy_);
  MutexLock l(&mutex_);
  bool success = EvictFromCache(total_charge, context);
  bool strict = strict_capacity_limit_.load(std::memory_order_relaxed);
  if (!success && (strict || !hold_reference)) {
    context->to_delete_key.push_back(key.data());
    if (!hold_reference) {
      context->to_delete_value.emplace_back(key, value, deleter);
    }
    return nullptr;
  }
  // Grab an available handle from the recycle bin. If the recycle bin is
  // empty, create and append a new handle to the end of the circular list.
  CacheHandle* handle = nullptr;
  if (!recycle_.empty()) {
    handle = recycle_.back();
    recycle_.pop_back();
  } else {
    list_.emplace_back();
    handle = &list_.back();
  }
  // Fill handle.
  handle->key = key;
  handle->hash = hash;
  handle->value = value;
  handle->charge = charge;
  handle->deleter = deleter;
  uint32_t flags = hold_reference ? kInCacheBit + kOneRef : kInCacheBit;
  handle->flags.store(flags, std::memory_order_relaxed);
  HashTable::accessor accessor;
  if (table_.find(accessor, CacheKey(key, hash))) {
    CacheHandle* existing_handle = accessor->second;
    table_.erase(accessor);
    UnsetInCache(existing_handle, context);
  }
  table_.insert(HashTable::value_type(CacheKey(key, hash), handle));
  if (hold_reference) {
    pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed);
  }
  usage_.fetch_add(total_charge, std::memory_order_relaxed);
  return handle;
}

Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
                               size_t charge, Deleter* deleter,
                               Cache::Handle** out_handle,
                               Cache::Priority /*priority*/) {
  CleanupContext context;
  HashTable::accessor accessor;
  char* key_data = new char[key.size()];
  memcpy(key_data, key.data(), key.size());
  Slice key_copy(key_data, key.size());
  CacheHandle* handle = Insert(key_copy, hash, value, charge, deleter,
                               out_handle != nullptr, &context);
  Status s;
  if (out_handle != nullptr) {
    if (handle == nullptr) {
      s = Status::Incomplete("Insert failed due to LRU cache being full.");
    } else {
      *out_handle = reinterpret_cast<Cache::Handle*>(handle);
    }
  }
  Cleanup(context);
  return s;
}

Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) {
  HashTable::const_accessor accessor;
  if (!table_.find(accessor, CacheKey(key, hash))) {
    return nullptr;
  }
  CacheHandle* handle = accessor->second;
  accessor.release();
  // Ref() could fail if another thread sneaks in and evicts/erases the cache
  // entry before we are able to hold a reference.
  if (!Ref(reinterpret_cast<Cache::Handle*>(handle))) {
    return nullptr;
  }
  // Double-check the key, since the handle may now represent another key if
  // another thread sneaked in, evicted/erased the entry and re-used the
  // handle for another cache entry.
  if (hash != handle->hash || key != handle->key) {
    CleanupContext context;
    Unref(handle, false, &context);
    // It is possible that Unref() deleted the entry, so we need to clean up.
    Cleanup(context);
    return nullptr;
  }
  return reinterpret_cast<Cache::Handle*>(handle);
}

bool ClockCacheShard::Release(Cache::Handle* h, bool force_erase) {
  CleanupContext context;
  CacheHandle* handle = reinterpret_cast<CacheHandle*>(h);
  bool erased = Unref(handle, true, &context);
  if (force_erase && !erased) {
    erased = EraseAndConfirm(handle->key, handle->hash, &context);
  }
  Cleanup(context);
  return erased;
}

void ClockCacheShard::Erase(const Slice& key, uint32_t hash) {
  CleanupContext context;
  EraseAndConfirm(key, hash, &context);
  Cleanup(context);
}

bool ClockCacheShard::EraseAndConfirm(const Slice& key, uint32_t hash,
                                      CleanupContext* context) {
  MutexLock l(&mutex_);
  HashTable::accessor accessor;
  bool erased = false;
  if (table_.find(accessor, CacheKey(key, hash))) {
    CacheHandle* handle = accessor->second;
    table_.erase(accessor);
    erased = UnsetInCache(handle, context);
  }
  return erased;
}

void ClockCacheShard::EraseUnRefEntries() {
  CleanupContext context;
  {
    MutexLock l(&mutex_);
    table_.clear();
    for (auto& handle : list_) {
      UnsetInCache(&handle, &context);
    }
  }
  Cleanup(context);
}

class ClockCache final : public ShardedCache {
 public:
  ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
             CacheMetadataChargePolicy metadata_charge_policy)
      : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) {
    int num_shards = 1 << num_shard_bits;
    shards_ = new ClockCacheShard[num_shards];
    for (int i = 0; i < num_shards; i++) {
      shards_[i].set_metadata_charge_policy(metadata_charge_policy);
    }
    SetCapacity(capacity);
    SetStrictCapacityLimit(strict_capacity_limit);
  }

  ~ClockCache() override { delete[] shards_; }

  const char* Name() const override { return "ClockCache"; }

  CacheShard* GetShard(int shard) override {
    return reinterpret_cast<CacheShard*>(&shards_[shard]);
  }

  const CacheShard* GetShard(int shard) const override {
    return reinterpret_cast<CacheShard*>(&shards_[shard]);
  }

  void* Value(Handle* handle) override {
    return reinterpret_cast<const CacheHandle*>(handle)->value;
  }

  size_t GetCharge(Handle* handle) const override {
    return reinterpret_cast<const CacheHandle*>(handle)->charge;
  }

  uint32_t GetHash(Handle* handle) const override {
    return reinterpret_cast<const CacheHandle*>(handle)->hash;
  }

  void DisownData() override { shards_ = nullptr; }

 private:
  ClockCacheShard* shards_;
};

}  // end anonymous namespace

std::shared_ptr<Cache> NewClockCache(
    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
    CacheMetadataChargePolicy metadata_charge_policy) {
  if (num_shard_bits < 0) {
    num_shard_bits = GetDefaultCacheShardBits(capacity);
  }
  return std::make_shared<ClockCache>(
      capacity, num_shard_bits, strict_capacity_limit, metadata_charge_policy);
}

}  // namespace ROCKSDB_NAMESPACE

#endif  // SUPPORT_CLOCK_CACHE