1 //===-- memprof_allocator.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file is a part of MemProfiler, a memory profiler.
10 //
11 // Implementation of MemProf's memory allocator, which uses the allocator
12 // from sanitizer_common.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "memprof_allocator.h"
17 #include "memprof_mapping.h"
18 #include "memprof_stack.h"
19 #include "memprof_thread.h"
20 #include "sanitizer_common/sanitizer_allocator_checks.h"
21 #include "sanitizer_common/sanitizer_allocator_interface.h"
22 #include "sanitizer_common/sanitizer_allocator_report.h"
23 #include "sanitizer_common/sanitizer_errno.h"
24 #include "sanitizer_common/sanitizer_file.h"
25 #include "sanitizer_common/sanitizer_flags.h"
26 #include "sanitizer_common/sanitizer_internal_defs.h"
27 #include "sanitizer_common/sanitizer_list.h"
28 #include "sanitizer_common/sanitizer_stackdepot.h"
29 
30 #include <sched.h>
31 #include <stdlib.h>
32 #include <time.h>
33 
34 namespace __memprof {
35 
36 static int GetCpuId(void) {
37   // _memprof_preinit is called via the preinit_array, which subsequently calls
  // malloc. Since this is before _dl_init calls VDSO_SETUP, sched_getcpu
  // will segfault because the address of __vdso_getcpu will be null.
40   if (!memprof_init_done)
41     return -1;
42   return sched_getcpu();
43 }
44 
45 // Compute the timestamp in ms.
46 static int GetTimestamp(void) {
  // clock_gettime will segfault if called from dl_init, before the VDSO is
  // set up.
48   if (!memprof_timestamp_inited) {
49     // By returning 0, this will be effectively treated as being
50     // timestamped at memprof init time (when memprof_init_timestamp_s
51     // is initialized).
52     return 0;
53   }
54   timespec ts;
55   clock_gettime(CLOCK_REALTIME, &ts);
56   return (ts.tv_sec - memprof_init_timestamp_s) * 1000 + ts.tv_nsec / 1000000;
57 }
58 
59 static MemprofAllocator &get_allocator();
60 
61 // The memory chunk allocated from the underlying allocator looks like this:
62 // H H U U U U U U
63 //   H -- ChunkHeader (32 bytes)
64 //   U -- user memory.
65 
66 // If there is left padding before the ChunkHeader (due to use of memalign),
67 // we store a magic value in the first uptr word of the memory block and
68 // store the address of ChunkHeader in the next uptr.
69 // M B L L L L L L L L L  H H U U U U U U
70 //   |                    ^
71 //   ---------------------|
72 //   M -- magic value kAllocBegMagic
73 //   B -- address of ChunkHeader pointing to the first 'H'
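//
// For example, a memalign request with a large alignment may leave the user
// pointer (and thus the ChunkHeader just before it) well past the start of
// the underlying block; the kAllocBegMagic/LargeChunkHeader pair written at
// the block start lets GetMemprofChunk recover the ChunkHeader from the
// block begin address.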
74 
75 constexpr uptr kMaxAllowedMallocBits = 40;
76 
// Should be no more than 32 bytes.
78 struct ChunkHeader {
  // 1st 4 bytes.
  u32 alloc_context_id;
  // 2nd 4 bytes.
  u32 cpu_id;
  // 3rd 4 bytes.
  u32 timestamp_ms;
  // 4th 4 bytes.
  // Note only 1 bit is needed for this flag if we need space in the future for
  // more fields.
  u32 from_memalign;
  // 5th and 6th 4 bytes.
  // The max size of an allocation is 2^40 (kMaxAllowedMallocSize), so this
  // could be shrunk to kMaxAllowedMallocBits if we need space in the future for
  // more fields.
  atomic_uint64_t user_requested_size;
  // 23 bits available (a size of up to 2^40 needs 41 bits).
  // 7th and 8th 4 bytes.
96   u64 data_type_id; // TODO: hash of type name
97 };
98 
99 static const uptr kChunkHeaderSize = sizeof(ChunkHeader);
100 COMPILER_CHECK(kChunkHeaderSize == 32);
101 
102 struct MemprofChunk : ChunkHeader {
103   uptr Beg() { return reinterpret_cast<uptr>(this) + kChunkHeaderSize; }
104   uptr UsedSize() {
105     return atomic_load(&user_requested_size, memory_order_relaxed);
106   }
107   void *AllocBeg() {
108     if (from_memalign)
109       return get_allocator().GetBlockBegin(reinterpret_cast<void *>(this));
110     return reinterpret_cast<void *>(this);
111   }
112 };
113 
114 class LargeChunkHeader {
115   static constexpr uptr kAllocBegMagic =
116       FIRST_32_SECOND_64(0xCC6E96B9, 0xCC6E96B9CC6E96B9ULL);
117   atomic_uintptr_t magic;
118   MemprofChunk *chunk_header;
119 
120 public:
121   MemprofChunk *Get() const {
122     return atomic_load(&magic, memory_order_acquire) == kAllocBegMagic
123                ? chunk_header
124                : nullptr;
125   }
126 
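  // Publishing a non-null chunk stores chunk_header first and the magic last
  // with release ordering, so a Get() that observes the magic (acquire) also
  // sees a valid chunk_header. Passing nullptr clears the magic; the CAS
  // additionally checks that the magic was actually set.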
127   void Set(MemprofChunk *p) {
128     if (p) {
129       chunk_header = p;
130       atomic_store(&magic, kAllocBegMagic, memory_order_release);
131       return;
132     }
133 
134     uptr old = kAllocBegMagic;
135     if (!atomic_compare_exchange_strong(&magic, &old, 0,
136                                         memory_order_release)) {
137       CHECK_EQ(old, kAllocBegMagic);
138     }
139   }
140 };
141 
142 void FlushUnneededMemProfShadowMemory(uptr p, uptr size) {
  // Since memprof's mapping is compacting, the shadow chunk may not be
  // page-aligned, so we only flush the page-aligned portion.
145   ReleaseMemoryPagesToOS(MemToShadow(p), MemToShadow(p + size));
146 }
147 
148 void MemprofMapUnmapCallback::OnMap(uptr p, uptr size) const {
149   // Statistics.
150   MemprofStats &thread_stats = GetCurrentThreadStats();
151   thread_stats.mmaps++;
152   thread_stats.mmaped += size;
153 }
154 void MemprofMapUnmapCallback::OnUnmap(uptr p, uptr size) const {
155   // We are about to unmap a chunk of user memory.
156   // Mark the corresponding shadow memory as not needed.
157   FlushUnneededMemProfShadowMemory(p, size);
158   // Statistics.
159   MemprofStats &thread_stats = GetCurrentThreadStats();
160   thread_stats.munmaps++;
161   thread_stats.munmaped += size;
162 }
163 
164 AllocatorCache *GetAllocatorCache(MemprofThreadLocalMallocStorage *ms) {
165   CHECK(ms);
166   return &ms->allocator_cache;
167 }
168 
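// Aggregate profile data for all allocations sharing the same allocation
// stack. One MemInfoBlock is built per deallocation (or at exit for chunks
// still live) and merged into the MemInfoBlockCache keyed by stack id.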
169 struct MemInfoBlock {
170   u32 alloc_count;
171   u64 total_access_count, min_access_count, max_access_count;
172   u64 total_size;
173   u32 min_size, max_size;
174   u32 alloc_timestamp, dealloc_timestamp;
175   u64 total_lifetime;
176   u32 min_lifetime, max_lifetime;
177   u32 alloc_cpu_id, dealloc_cpu_id;
178   u32 num_migrated_cpu;
179 
  // Currently only compared to the previously deallocated object.
181   u32 num_lifetime_overlaps;
182   u32 num_same_alloc_cpu;
183   u32 num_same_dealloc_cpu;
184 
185   u64 data_type_id; // TODO: hash of type name
186 
187   MemInfoBlock() : alloc_count(0) {}
188 
189   MemInfoBlock(u32 size, u64 access_count, u32 alloc_timestamp,
190                u32 dealloc_timestamp, u32 alloc_cpu, u32 dealloc_cpu)
191       : alloc_count(1), total_access_count(access_count),
192         min_access_count(access_count), max_access_count(access_count),
193         total_size(size), min_size(size), max_size(size),
194         alloc_timestamp(alloc_timestamp), dealloc_timestamp(dealloc_timestamp),
195         total_lifetime(dealloc_timestamp - alloc_timestamp),
196         min_lifetime(total_lifetime), max_lifetime(total_lifetime),
197         alloc_cpu_id(alloc_cpu), dealloc_cpu_id(dealloc_cpu),
198         num_lifetime_overlaps(0), num_same_alloc_cpu(0),
199         num_same_dealloc_cpu(0) {
200     num_migrated_cpu = alloc_cpu_id != dealloc_cpu_id;
201   }
202 
203   void Print(u64 id) {
204     u64 p;
205     if (flags()->print_terse) {
206       p = total_size * 100 / alloc_count;
      Printf("MIB:%llu/%u/%llu.%02llu/%u/%u/", id, alloc_count, p / 100,
             p % 100, min_size, max_size);
209       p = total_access_count * 100 / alloc_count;
210       Printf("%llu.%02llu/%llu/%llu/", p / 100, p % 100, min_access_count,
211              max_access_count);
212       p = total_lifetime * 100 / alloc_count;
      Printf("%llu.%02llu/%u/%u/", p / 100, p % 100, min_lifetime,
             max_lifetime);
214       Printf("%u/%u/%u/%u\n", num_migrated_cpu, num_lifetime_overlaps,
215              num_same_alloc_cpu, num_same_dealloc_cpu);
216     } else {
217       p = total_size * 100 / alloc_count;
218       Printf("Memory allocation stack id = %llu\n", id);
219       Printf("\talloc_count %u, size (ave/min/max) %llu.%02llu / %u / %u\n",
220              alloc_count, p / 100, p % 100, min_size, max_size);
221       p = total_access_count * 100 / alloc_count;
      Printf("\taccess_count (ave/min/max): %llu.%02llu / %llu / %llu\n",
             p / 100, p % 100, min_access_count, max_access_count);
224       p = total_lifetime * 100 / alloc_count;
      Printf("\tlifetime (ave/min/max): %llu.%02llu / %u / %u\n", p / 100,
             p % 100, min_lifetime, max_lifetime);
227       Printf("\tnum migrated: %u, num lifetime overlaps: %u, num same alloc "
228              "cpu: %u, num same dealloc_cpu: %u\n",
229              num_migrated_cpu, num_lifetime_overlaps, num_same_alloc_cpu,
230              num_same_dealloc_cpu);
231     }
232   }
233 
234   static void printHeader() {
235     CHECK(flags()->print_terse);
236     Printf("MIB:StackID/AllocCount/AveSize/MinSize/MaxSize/AveAccessCount/"
237            "MinAccessCount/MaxAccessCount/AveLifetime/MinLifetime/MaxLifetime/"
238            "NumMigratedCpu/NumLifetimeOverlaps/NumSameAllocCpu/"
239            "NumSameDeallocCpu\n");
240   }
241 
242   void Merge(MemInfoBlock &newMIB) {
243     alloc_count += newMIB.alloc_count;
244 
245     total_access_count += newMIB.total_access_count;
246     min_access_count = Min(min_access_count, newMIB.min_access_count);
247     max_access_count = Max(max_access_count, newMIB.max_access_count);
248 
249     total_size += newMIB.total_size;
250     min_size = Min(min_size, newMIB.min_size);
251     max_size = Max(max_size, newMIB.max_size);
252 
253     total_lifetime += newMIB.total_lifetime;
254     min_lifetime = Min(min_lifetime, newMIB.min_lifetime);
255     max_lifetime = Max(max_lifetime, newMIB.max_lifetime);
256 
    // We know newMIB was deallocated later, so we just need to check whether
    // it was allocated before the last one was deallocated.
259     num_lifetime_overlaps += newMIB.alloc_timestamp < dealloc_timestamp;
260     alloc_timestamp = newMIB.alloc_timestamp;
261     dealloc_timestamp = newMIB.dealloc_timestamp;
262 
263     num_same_alloc_cpu += alloc_cpu_id == newMIB.alloc_cpu_id;
264     num_same_dealloc_cpu += dealloc_cpu_id == newMIB.dealloc_cpu_id;
265     alloc_cpu_id = newMIB.alloc_cpu_id;
266     dealloc_cpu_id = newMIB.dealloc_cpu_id;
267   }
268 };
269 
270 struct SetEntry {
271   SetEntry() : id(0), MIB() {}
272   bool Empty() { return id == 0; }
273   void Print() {
274     CHECK(!Empty());
275     MIB.Print(id);
276   }
277   // The stack id
278   u64 id;
279   MemInfoBlock MIB;
280 };
281 
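// One set of the set-associative MIB cache: up to kSetSize entries keyed by
// stack id. Hits (and newly filled slots) are moved to the front assuming
// some id locality; on a miss with a full set, the entry with the lowest
// alloc count is printed and evicted.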
282 struct CacheSet {
283   enum { kSetSize = 4 };
284 
285   void PrintAll() {
286     for (int i = 0; i < kSetSize; i++) {
287       if (Entries[i].Empty())
288         continue;
289       Entries[i].Print();
290     }
291   }
292   void insertOrMerge(u64 new_id, MemInfoBlock &newMIB) {
293     SpinMutexLock l(&SetMutex);
294     AccessCount++;
295 
296     for (int i = 0; i < kSetSize; i++) {
297       auto id = Entries[i].id;
      // Check if this is a hit or an empty entry. Since we always move any
      // filled locations to the front of the array (see below), we don't need
      // to look further after finding the first empty entry.
301       if (id == new_id || !id) {
302         if (id == 0) {
303           Entries[i].id = new_id;
304           Entries[i].MIB = newMIB;
305         } else {
306           Entries[i].MIB.Merge(newMIB);
307         }
308         // Assuming some id locality, we try to swap the matching entry
309         // into the first set position.
310         if (i != 0) {
311           auto tmp = Entries[0];
312           Entries[0] = Entries[i];
313           Entries[i] = tmp;
314         }
315         return;
316       }
317     }
318 
319     // Miss
320     MissCount++;
321 
    // Find the entry with the lowest alloc count to evict:
323     int min_idx = 0;
324     u64 min_count = Entries[0].MIB.alloc_count;
325     for (int i = 1; i < kSetSize; i++) {
326       CHECK(!Entries[i].Empty());
327       if (Entries[i].MIB.alloc_count < min_count) {
328         min_idx = i;
329         min_count = Entries[i].MIB.alloc_count;
330       }
331     }
332 
333     // Print the evicted entry profile information
334     if (!flags()->print_terse)
335       Printf("Evicted:\n");
336     Entries[min_idx].Print();
337 
338     // Similar to the hit case, put new MIB in first set position.
339     if (min_idx != 0)
340       Entries[min_idx] = Entries[0];
341     Entries[0].id = new_id;
342     Entries[0].MIB = newMIB;
343   }
344 
345   void PrintMissRate(int i) {
346     u64 p = AccessCount ? MissCount * 10000ULL / AccessCount : 0;
347     Printf("Set %d miss rate: %d / %d = %5llu.%02llu%%\n", i, MissCount,
348            AccessCount, p / 100, p % 100);
349   }
350 
351   SetEntry Entries[kSetSize];
352   u32 AccessCount = 0;
353   u32 MissCount = 0;
354   SpinMutex SetMutex;
355 };
356 
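// Table of CacheSets indexed by the stack id modulo mem_info_cache_entries.
// Entries are printed as they are evicted; anything still cached is printed
// by PrintAll() at exit.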
357 struct MemInfoBlockCache {
358   MemInfoBlockCache() {
359     if (common_flags()->print_module_map)
360       DumpProcessMap();
361     if (flags()->print_terse)
362       MemInfoBlock::printHeader();
363     Sets =
364         (CacheSet *)malloc(sizeof(CacheSet) * flags()->mem_info_cache_entries);
365     Constructed = true;
366   }
367 
368   ~MemInfoBlockCache() { free(Sets); }
369 
370   void insertOrMerge(u64 new_id, MemInfoBlock &newMIB) {
371     u64 hv = new_id;
372 
    // Use the mod method; the number of entries should be a prime close to a
    // power of 2.
375     hv %= flags()->mem_info_cache_entries;
376 
377     return Sets[hv].insertOrMerge(new_id, newMIB);
378   }
379 
380   void PrintAll() {
381     for (int i = 0; i < flags()->mem_info_cache_entries; i++) {
382       Sets[i].PrintAll();
383     }
384   }
385 
386   void PrintMissRate() {
387     if (!flags()->print_mem_info_cache_miss_rate)
388       return;
389     u64 MissCountSum = 0;
390     u64 AccessCountSum = 0;
391     for (int i = 0; i < flags()->mem_info_cache_entries; i++) {
392       MissCountSum += Sets[i].MissCount;
393       AccessCountSum += Sets[i].AccessCount;
394     }
395     u64 p = AccessCountSum ? MissCountSum * 10000ULL / AccessCountSum : 0;
396     Printf("Overall miss rate: %llu / %llu = %5llu.%02llu%%\n", MissCountSum,
397            AccessCountSum, p / 100, p % 100);
398     if (flags()->print_mem_info_cache_miss_rate_details)
399       for (int i = 0; i < flags()->mem_info_cache_entries; i++)
400         Sets[i].PrintMissRate(i);
401   }
402 
403   CacheSet *Sets;
  // Flag set when the Sets have been allocated, in case a deallocation is
  // called very early, before the static initialization of the Allocator (and
  // therefore of this table) has completed.
407   bool Constructed = false;
408 };
409 
410 // Accumulates the access count from the shadow for the given pointer and size.
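// Each u64 shadow cell holds the access count for one granule of user
// memory; the walk is inclusive of the cell that p + size maps to.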
411 u64 GetShadowCount(uptr p, u32 size) {
412   u64 *shadow = (u64 *)MEM_TO_SHADOW(p);
413   u64 *shadow_end = (u64 *)MEM_TO_SHADOW(p + size);
414   u64 count = 0;
415   for (; shadow <= shadow_end; shadow++)
416     count += *shadow;
417   return count;
418 }
419 
420 // Clears the shadow counters (when memory is allocated).
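// Small ranges are simply memset to zero. For large ranges, the page-aligned
// middle of the shadow is re-reserved as zeroed pages and only the unaligned
// head and tail are memset.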
421 void ClearShadow(uptr addr, uptr size) {
422   CHECK(AddrIsAlignedByGranularity(addr));
423   CHECK(AddrIsInMem(addr));
424   CHECK(AddrIsAlignedByGranularity(addr + size));
425   CHECK(AddrIsInMem(addr + size - SHADOW_GRANULARITY));
426   CHECK(REAL(memset));
427   uptr shadow_beg = MEM_TO_SHADOW(addr);
428   uptr shadow_end = MEM_TO_SHADOW(addr + size - SHADOW_GRANULARITY) + 1;
429   if (shadow_end - shadow_beg < common_flags()->clear_shadow_mmap_threshold) {
430     REAL(memset)((void *)shadow_beg, 0, shadow_end - shadow_beg);
431   } else {
432     uptr page_size = GetPageSizeCached();
433     uptr page_beg = RoundUpTo(shadow_beg, page_size);
434     uptr page_end = RoundDownTo(shadow_end, page_size);
435 
436     if (page_beg >= page_end) {
437       REAL(memset)((void *)shadow_beg, 0, shadow_end - shadow_beg);
438     } else {
439       if (page_beg != shadow_beg) {
440         REAL(memset)((void *)shadow_beg, 0, page_beg - shadow_beg);
441       }
442       if (page_end != shadow_end) {
443         REAL(memset)((void *)page_end, 0, shadow_end - page_end);
444       }
445       ReserveShadowMemoryRange(page_beg, page_end - 1, nullptr);
446     }
447   }
448 }
449 
450 struct Allocator {
451   static const uptr kMaxAllowedMallocSize = 1ULL << kMaxAllowedMallocBits;
452 
453   MemprofAllocator allocator;
454   StaticSpinMutex fallback_mutex;
455   AllocatorCache fallback_allocator_cache;
456 
457   uptr max_user_defined_malloc_size;
458   atomic_uint8_t rss_limit_exceeded;
459 
460   MemInfoBlockCache MemInfoBlockTable;
461   bool destructing;
462 
463   // ------------------- Initialization ------------------------
464   explicit Allocator(LinkerInitialized) : destructing(false) {}
465 
466   ~Allocator() { FinishAndPrint(); }
467 
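  // Walk all chunks still live at exit, fold their shadow access counts into
  // the MIB table as if they were deallocated now, then print the table and
  // the referenced stack traces.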
468   void FinishAndPrint() {
469     if (!flags()->print_terse)
470       Printf("Live on exit:\n");
471     allocator.ForceLock();
472     allocator.ForEachChunk(
473         [](uptr chunk, void *alloc) {
474           u64 user_requested_size;
475           MemprofChunk *m =
476               ((Allocator *)alloc)
477                   ->GetMemprofChunk((void *)chunk, user_requested_size);
478           if (!m)
479             return;
480           uptr user_beg = ((uptr)m) + kChunkHeaderSize;
481           u64 c = GetShadowCount(user_beg, user_requested_size);
482           long curtime = GetTimestamp();
483           MemInfoBlock newMIB(user_requested_size, c, m->timestamp_ms, curtime,
484                               m->cpu_id, GetCpuId());
485           ((Allocator *)alloc)
486               ->MemInfoBlockTable.insertOrMerge(m->alloc_context_id, newMIB);
487         },
488         this);
489     allocator.ForceUnlock();
490 
491     destructing = true;
492     MemInfoBlockTable.PrintMissRate();
493     MemInfoBlockTable.PrintAll();
494     StackDepotPrintAll();
495   }
496 
497   void InitLinkerInitialized() {
498     SetAllocatorMayReturnNull(common_flags()->allocator_may_return_null);
499     allocator.InitLinkerInitialized(
500         common_flags()->allocator_release_to_os_interval_ms);
501     max_user_defined_malloc_size = common_flags()->max_allocation_size_mb
502                                        ? common_flags()->max_allocation_size_mb
503                                              << 20
504                                        : kMaxAllowedMallocSize;
505   }
506 
507   bool RssLimitExceeded() {
508     return atomic_load(&rss_limit_exceeded, memory_order_relaxed);
509   }
510 
511   void SetRssLimitExceeded(bool limit_exceeded) {
512     atomic_store(&rss_limit_exceeded, limit_exceeded, memory_order_relaxed);
513   }
514 
515   // -------------------- Allocation/Deallocation routines ---------------
516   void *Allocate(uptr size, uptr alignment, BufferedStackTrace *stack,
517                  AllocType alloc_type) {
518     if (UNLIKELY(!memprof_inited))
519       MemprofInitFromRtl();
520     if (RssLimitExceeded()) {
521       if (AllocatorMayReturnNull())
522         return nullptr;
523       ReportRssLimitExceeded(stack);
524     }
525     CHECK(stack);
526     const uptr min_alignment = MEMPROF_ALIGNMENT;
527     if (alignment < min_alignment)
528       alignment = min_alignment;
529     if (size == 0) {
530       // We'd be happy to avoid allocating memory for zero-size requests, but
531       // some programs/tests depend on this behavior and assume that malloc
532       // would not return NULL even for zero-size allocations. Moreover, it
533       // looks like operator new should never return NULL, and results of
534       // consecutive "new" calls must be different even if the allocated size
535       // is zero.
536       size = 1;
537     }
538     CHECK(IsPowerOfTwo(alignment));
539     uptr rounded_size = RoundUpTo(size, alignment);
540     uptr needed_size = rounded_size + kChunkHeaderSize;
541     if (alignment > min_alignment)
542       needed_size += alignment;
543     CHECK(IsAligned(needed_size, min_alignment));
544     if (size > kMaxAllowedMallocSize || needed_size > kMaxAllowedMallocSize ||
545         size > max_user_defined_malloc_size) {
546       if (AllocatorMayReturnNull()) {
547         Report("WARNING: MemProfiler failed to allocate 0x%zx bytes\n", size);
548         return nullptr;
549       }
550       uptr malloc_limit =
551           Min(kMaxAllowedMallocSize, max_user_defined_malloc_size);
552       ReportAllocationSizeTooBig(size, malloc_limit, stack);
553     }
554 
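    // Use the current thread's allocator cache when available; threads
    // without one (e.g., very early allocations) fall back to a global cache
    // guarded by a mutex.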
555     MemprofThread *t = GetCurrentThread();
556     void *allocated;
557     if (t) {
558       AllocatorCache *cache = GetAllocatorCache(&t->malloc_storage());
559       allocated = allocator.Allocate(cache, needed_size, 8);
560     } else {
561       SpinMutexLock l(&fallback_mutex);
562       AllocatorCache *cache = &fallback_allocator_cache;
563       allocated = allocator.Allocate(cache, needed_size, 8);
564     }
565     if (UNLIKELY(!allocated)) {
566       SetAllocatorOutOfMemory();
567       if (AllocatorMayReturnNull())
568         return nullptr;
569       ReportOutOfMemory(size, stack);
570     }
571 
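    // Carve the chunk out of the block: the ChunkHeader sits immediately
    // before the (possibly alignment-adjusted) user pointer, and any gap
    // before it is recorded via from_memalign / LargeChunkHeader below.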
572     uptr alloc_beg = reinterpret_cast<uptr>(allocated);
573     uptr alloc_end = alloc_beg + needed_size;
574     uptr beg_plus_header = alloc_beg + kChunkHeaderSize;
575     uptr user_beg = beg_plus_header;
576     if (!IsAligned(user_beg, alignment))
577       user_beg = RoundUpTo(user_beg, alignment);
578     uptr user_end = user_beg + size;
579     CHECK_LE(user_end, alloc_end);
580     uptr chunk_beg = user_beg - kChunkHeaderSize;
581     MemprofChunk *m = reinterpret_cast<MemprofChunk *>(chunk_beg);
582     m->from_memalign = alloc_beg != chunk_beg;
583     CHECK(size);
584 
585     m->cpu_id = GetCpuId();
586     m->timestamp_ms = GetTimestamp();
587     m->alloc_context_id = StackDepotPut(*stack);
588 
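    // Reset the shadow access counters covering this allocation. Only whole
    // granules are cleared; a trailing partial granule keeps its counter.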
589     uptr size_rounded_down_to_granularity =
590         RoundDownTo(size, SHADOW_GRANULARITY);
591     if (size_rounded_down_to_granularity)
592       ClearShadow(user_beg, size_rounded_down_to_granularity);
593 
594     MemprofStats &thread_stats = GetCurrentThreadStats();
595     thread_stats.mallocs++;
596     thread_stats.malloced += size;
597     thread_stats.malloced_overhead += needed_size - size;
598     if (needed_size > SizeClassMap::kMaxSize)
599       thread_stats.malloc_large++;
600     else
601       thread_stats.malloced_by_size[SizeClassMap::ClassID(needed_size)]++;
602 
603     void *res = reinterpret_cast<void *>(user_beg);
604     atomic_store(&m->user_requested_size, size, memory_order_release);
605     if (alloc_beg != chunk_beg) {
606       CHECK_LE(alloc_beg + sizeof(LargeChunkHeader), chunk_beg);
607       reinterpret_cast<LargeChunkHeader *>(alloc_beg)->Set(m);
608     }
609     MEMPROF_MALLOC_HOOK(res, size);
610     return res;
611   }
612 
613   void Deallocate(void *ptr, uptr delete_size, uptr delete_alignment,
614                   BufferedStackTrace *stack, AllocType alloc_type) {
615     uptr p = reinterpret_cast<uptr>(ptr);
616     if (p == 0)
617       return;
618 
619     MEMPROF_FREE_HOOK(ptr);
620 
621     uptr chunk_beg = p - kChunkHeaderSize;
622     MemprofChunk *m = reinterpret_cast<MemprofChunk *>(chunk_beg);
623 
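    // Claim the chunk by swapping its recorded size to 0; a zero size also
    // makes GetMemprofChunk treat the chunk as freed from this point on.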
624     u64 user_requested_size =
625         atomic_exchange(&m->user_requested_size, 0, memory_order_acquire);
626     if (memprof_inited && memprof_init_done && !destructing &&
627         MemInfoBlockTable.Constructed) {
628       u64 c = GetShadowCount(p, user_requested_size);
629       long curtime = GetTimestamp();
630 
631       MemInfoBlock newMIB(user_requested_size, c, m->timestamp_ms, curtime,
632                           m->cpu_id, GetCpuId());
      MemInfoBlockTable.insertOrMerge(m->alloc_context_id, newMIB);
634     }
635 
636     MemprofStats &thread_stats = GetCurrentThreadStats();
637     thread_stats.frees++;
638     thread_stats.freed += user_requested_size;
639 
640     void *alloc_beg = m->AllocBeg();
641     if (alloc_beg != m) {
      // Clear the magic value, as allocator internals may overwrite the
      // contents of the deallocated chunk, confusing GetMemprofChunk lookup.
644       reinterpret_cast<LargeChunkHeader *>(alloc_beg)->Set(nullptr);
645     }
646 
647     MemprofThread *t = GetCurrentThread();
648     if (t) {
649       AllocatorCache *cache = GetAllocatorCache(&t->malloc_storage());
650       allocator.Deallocate(cache, alloc_beg);
651     } else {
652       SpinMutexLock l(&fallback_mutex);
653       AllocatorCache *cache = &fallback_allocator_cache;
654       allocator.Deallocate(cache, alloc_beg);
655     }
656   }
657 
658   void *Reallocate(void *old_ptr, uptr new_size, BufferedStackTrace *stack) {
659     CHECK(old_ptr && new_size);
660     uptr p = reinterpret_cast<uptr>(old_ptr);
661     uptr chunk_beg = p - kChunkHeaderSize;
662     MemprofChunk *m = reinterpret_cast<MemprofChunk *>(chunk_beg);
663 
664     MemprofStats &thread_stats = GetCurrentThreadStats();
665     thread_stats.reallocs++;
666     thread_stats.realloced += new_size;
667 
668     void *new_ptr = Allocate(new_size, 8, stack, FROM_MALLOC);
669     if (new_ptr) {
670       CHECK_NE(REAL(memcpy), nullptr);
671       uptr memcpy_size = Min(new_size, m->UsedSize());
672       REAL(memcpy)(new_ptr, old_ptr, memcpy_size);
673       Deallocate(old_ptr, 0, 0, stack, FROM_MALLOC);
674     }
675     return new_ptr;
676   }
677 
678   void *Calloc(uptr nmemb, uptr size, BufferedStackTrace *stack) {
679     if (UNLIKELY(CheckForCallocOverflow(size, nmemb))) {
680       if (AllocatorMayReturnNull())
681         return nullptr;
682       ReportCallocOverflow(nmemb, size, stack);
683     }
684     void *ptr = Allocate(nmemb * size, 8, stack, FROM_MALLOC);
    // If the memory comes from the secondary allocator, there is no need to
    // clear it as it comes directly from mmap.
687     if (ptr && allocator.FromPrimary(ptr))
688       REAL(memset)(ptr, 0, nmemb * size);
689     return ptr;
690   }
691 
692   void CommitBack(MemprofThreadLocalMallocStorage *ms,
693                   BufferedStackTrace *stack) {
694     AllocatorCache *ac = GetAllocatorCache(ms);
695     allocator.SwallowCache(ac);
696   }
697 
698   // -------------------------- Chunk lookup ----------------------
699 
700   // Assumes alloc_beg == allocator.GetBlockBegin(alloc_beg).
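  // If the block starts with the kAllocBegMagic marker, the ChunkHeader is
  // located through the LargeChunkHeader; otherwise, for primary-allocator
  // blocks, the ChunkHeader is at the start of the block itself.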
701   MemprofChunk *GetMemprofChunk(void *alloc_beg, u64 &user_requested_size) {
702     if (!alloc_beg)
703       return nullptr;
704     MemprofChunk *p = reinterpret_cast<LargeChunkHeader *>(alloc_beg)->Get();
705     if (!p) {
706       if (!allocator.FromPrimary(alloc_beg))
707         return nullptr;
708       p = reinterpret_cast<MemprofChunk *>(alloc_beg);
709     }
710     // The size is reset to 0 on deallocation (and a min of 1 on
711     // allocation).
712     user_requested_size =
713         atomic_load(&p->user_requested_size, memory_order_acquire);
714     if (user_requested_size)
715       return p;
716     return nullptr;
717   }
718 
719   MemprofChunk *GetMemprofChunkByAddr(uptr p, u64 &user_requested_size) {
720     void *alloc_beg = allocator.GetBlockBegin(reinterpret_cast<void *>(p));
721     return GetMemprofChunk(alloc_beg, user_requested_size);
722   }
723 
724   uptr AllocationSize(uptr p) {
725     u64 user_requested_size;
726     MemprofChunk *m = GetMemprofChunkByAddr(p, user_requested_size);
727     if (!m)
728       return 0;
729     if (m->Beg() != p)
730       return 0;
731     return user_requested_size;
732   }
733 
734   void Purge(BufferedStackTrace *stack) { allocator.ForceReleaseToOS(); }
735 
736   void PrintStats() { allocator.PrintStats(); }
737 
738   void ForceLock() NO_THREAD_SAFETY_ANALYSIS {
739     allocator.ForceLock();
740     fallback_mutex.Lock();
741   }
742 
743   void ForceUnlock() NO_THREAD_SAFETY_ANALYSIS {
744     fallback_mutex.Unlock();
745     allocator.ForceUnlock();
746   }
747 };
748 
749 static Allocator instance(LINKER_INITIALIZED);
750 
751 static MemprofAllocator &get_allocator() { return instance.allocator; }
752 
753 void InitializeAllocator() { instance.InitLinkerInitialized(); }
754 
755 void MemprofThreadLocalMallocStorage::CommitBack() {
756   GET_STACK_TRACE_MALLOC;
757   instance.CommitBack(this, &stack);
758 }
759 
760 void PrintInternalAllocatorStats() { instance.PrintStats(); }
761 
762 void memprof_free(void *ptr, BufferedStackTrace *stack, AllocType alloc_type) {
763   instance.Deallocate(ptr, 0, 0, stack, alloc_type);
764 }
765 
766 void memprof_delete(void *ptr, uptr size, uptr alignment,
767                     BufferedStackTrace *stack, AllocType alloc_type) {
768   instance.Deallocate(ptr, size, alignment, stack, alloc_type);
769 }
770 
771 void *memprof_malloc(uptr size, BufferedStackTrace *stack) {
772   return SetErrnoOnNull(instance.Allocate(size, 8, stack, FROM_MALLOC));
773 }
774 
775 void *memprof_calloc(uptr nmemb, uptr size, BufferedStackTrace *stack) {
776   return SetErrnoOnNull(instance.Calloc(nmemb, size, stack));
777 }
778 
779 void *memprof_reallocarray(void *p, uptr nmemb, uptr size,
780                            BufferedStackTrace *stack) {
781   if (UNLIKELY(CheckForCallocOverflow(size, nmemb))) {
782     errno = errno_ENOMEM;
783     if (AllocatorMayReturnNull())
784       return nullptr;
785     ReportReallocArrayOverflow(nmemb, size, stack);
786   }
787   return memprof_realloc(p, nmemb * size, stack);
788 }
789 
790 void *memprof_realloc(void *p, uptr size, BufferedStackTrace *stack) {
791   if (!p)
792     return SetErrnoOnNull(instance.Allocate(size, 8, stack, FROM_MALLOC));
793   if (size == 0) {
794     if (flags()->allocator_frees_and_returns_null_on_realloc_zero) {
795       instance.Deallocate(p, 0, 0, stack, FROM_MALLOC);
796       return nullptr;
797     }
    // Allocate a size of 1 if we shouldn't free() on a realloc to 0.
799     size = 1;
800   }
801   return SetErrnoOnNull(instance.Reallocate(p, size, stack));
802 }
803 
804 void *memprof_valloc(uptr size, BufferedStackTrace *stack) {
805   return SetErrnoOnNull(
806       instance.Allocate(size, GetPageSizeCached(), stack, FROM_MALLOC));
807 }
808 
809 void *memprof_pvalloc(uptr size, BufferedStackTrace *stack) {
810   uptr PageSize = GetPageSizeCached();
811   if (UNLIKELY(CheckForPvallocOverflow(size, PageSize))) {
812     errno = errno_ENOMEM;
813     if (AllocatorMayReturnNull())
814       return nullptr;
815     ReportPvallocOverflow(size, stack);
816   }
817   // pvalloc(0) should allocate one page.
818   size = size ? RoundUpTo(size, PageSize) : PageSize;
819   return SetErrnoOnNull(instance.Allocate(size, PageSize, stack, FROM_MALLOC));
820 }
821 
822 void *memprof_memalign(uptr alignment, uptr size, BufferedStackTrace *stack,
823                        AllocType alloc_type) {
824   if (UNLIKELY(!IsPowerOfTwo(alignment))) {
825     errno = errno_EINVAL;
826     if (AllocatorMayReturnNull())
827       return nullptr;
828     ReportInvalidAllocationAlignment(alignment, stack);
829   }
830   return SetErrnoOnNull(instance.Allocate(size, alignment, stack, alloc_type));
831 }
832 
833 void *memprof_aligned_alloc(uptr alignment, uptr size,
834                             BufferedStackTrace *stack) {
835   if (UNLIKELY(!CheckAlignedAllocAlignmentAndSize(alignment, size))) {
836     errno = errno_EINVAL;
837     if (AllocatorMayReturnNull())
838       return nullptr;
839     ReportInvalidAlignedAllocAlignment(size, alignment, stack);
840   }
841   return SetErrnoOnNull(instance.Allocate(size, alignment, stack, FROM_MALLOC));
842 }
843 
844 int memprof_posix_memalign(void **memptr, uptr alignment, uptr size,
845                            BufferedStackTrace *stack) {
846   if (UNLIKELY(!CheckPosixMemalignAlignment(alignment))) {
847     if (AllocatorMayReturnNull())
848       return errno_EINVAL;
849     ReportInvalidPosixMemalignAlignment(alignment, stack);
850   }
851   void *ptr = instance.Allocate(size, alignment, stack, FROM_MALLOC);
852   if (UNLIKELY(!ptr))
853     // OOM error is already taken care of by Allocate.
854     return errno_ENOMEM;
855   CHECK(IsAligned((uptr)ptr, alignment));
856   *memptr = ptr;
857   return 0;
858 }
859 
860 uptr memprof_malloc_usable_size(const void *ptr, uptr pc, uptr bp) {
861   if (!ptr)
862     return 0;
863   uptr usable_size = instance.AllocationSize(reinterpret_cast<uptr>(ptr));
864   return usable_size;
865 }
866 
867 void MemprofSoftRssLimitExceededCallback(bool limit_exceeded) {
868   instance.SetRssLimitExceeded(limit_exceeded);
869 }
870 
871 } // namespace __memprof
872 
873 // ---------------------- Interface ---------------- {{{1
874 using namespace __memprof;
875 
876 #if !SANITIZER_SUPPORTS_WEAK_HOOKS
877 // Provide default (no-op) implementation of malloc hooks.
878 SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_malloc_hook, void *ptr,
879                              uptr size) {
880   (void)ptr;
881   (void)size;
882 }
883 
884 SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_free_hook, void *ptr) {
885   (void)ptr;
886 }
887 #endif
888 
889 uptr __sanitizer_get_estimated_allocated_size(uptr size) { return size; }
890 
891 int __sanitizer_get_ownership(const void *p) {
892   return memprof_malloc_usable_size(p, 0, 0) != 0;
893 }
894 
895 uptr __sanitizer_get_allocated_size(const void *p) {
896   return memprof_malloc_usable_size(p, 0, 0);
897 }
898 
899 int __memprof_profile_dump() {
900   instance.FinishAndPrint();
901   // In the future we may want to return non-zero if there are any errors
902   // detected during the dumping process.
903   return 0;
904 }
905