xref: /oneTBB/src/tbbmalloc/frontend.cpp (revision b2474bfc)
1 /*
2     Copyright (c) 2005-2023 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include "tbbmalloc_internal.h"
18 #include <errno.h>
19 #include <new>        /* for placement new */
20 #include <string.h>   /* for memset */
21 
22 #include "oneapi/tbb/version.h"
23 #include "../tbb/environment.h"
24 #include "../tbb/itt_notify.h" // for __TBB_load_ittnotify()
25 
26 #if USE_PTHREAD
27     #define TlsSetValue_func pthread_setspecific
28     #define TlsGetValue_func pthread_getspecific
29     #define GetMyTID() pthread_self()
30     #include <sched.h>
31     extern "C" { static void mallocThreadShutdownNotification(void*); }
32     #if __sun || __SUNPRO_CC
33     #define __asm__ asm
34     #endif
35     #include <unistd.h> // sysconf(_SC_PAGESIZE)
36 #elif USE_WINTHREAD
37     #define GetMyTID() GetCurrentThreadId()
38 #if __TBB_WIN8UI_SUPPORT
39     #include<thread>
40     #define TlsSetValue_func FlsSetValue
41     #define TlsGetValue_func FlsGetValue
42     #define TlsAlloc() FlsAlloc(nullptr)
43     #define TLS_ALLOC_FAILURE FLS_OUT_OF_INDEXES
44     #define TlsFree FlsFree
45 #else
46     #define TlsSetValue_func TlsSetValue
47     #define TlsGetValue_func TlsGetValue
48     #define TLS_ALLOC_FAILURE TLS_OUT_OF_INDEXES
49 #endif
50 #else
51     #error Must define USE_PTHREAD or USE_WINTHREAD
52 #endif
53 
54 #define FREELIST_NONBLOCKING 1
55 
56 namespace rml {
57 class MemoryPool;
58 namespace internal {
59 
60 class Block;
61 class MemoryPool;
62 
63 #if MALLOC_CHECK_RECURSION
64 
65 inline bool isMallocInitialized();
66 
67 #endif // MALLOC_CHECK_RECURSION
68 
69 /** Support for handling the special UNUSABLE pointer state **/
70 const intptr_t UNUSABLE = 0x1;
71 inline bool isSolidPtr( void* ptr ) {
72     return (UNUSABLE|(intptr_t)ptr)!=UNUSABLE;
73 }
74 inline bool isNotForUse( void* ptr ) {
75     return (intptr_t)ptr==UNUSABLE;
76 }
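// Taken together, pointers such as publicFreeList and nextPrivatizable can be in one of
// three states: nullptr (empty/reset), UNUSABLE (0x1, marking a block that is detached
// from its bin or mailbox, e.g. an orphaned one), or a "solid" pointer, i.e. a real
// object address. isSolidPtr() is true only for the last state; isNotForUse() detects
// the UNUSABLE marker.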
77 
78 /*
79  * Block::objectSize value used to mark blocks allocated by startupAlloc
80  */
81 const uint16_t startupAllocObjSizeMark = ~(uint16_t)0;
82 
83 /*
84  * The following constant is used to define the size of struct Block, the block header.
85  * The intent is to have the size of a Block multiple of the cache line size, this allows us to
86  * get good alignment at the cost of some overhead equal to the amount of padding included in the Block.
87  */
88 const int blockHeaderAlignment = estimatedCacheLineSize;
89 
90 /********* The data structures and global objects        **************/
91 
92 /*
93  * The malloc routines themselves occasionally need to allocate some space
94  * in order to set up the thread-local structures. This class provides that
95  * functionality.
96  */
97 class BootStrapBlocks {
98     MallocMutex bootStrapLock;
99     Block      *bootStrapBlock;
100     Block      *bootStrapBlockUsed;
101     FreeObject *bootStrapObjectList;
102 public:
103     void *allocate(MemoryPool *memPool, size_t size);
104     void free(void* ptr);
105     void reset();
106 };
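// BootStrapBlocks carves fixed-size objects (TLSData instances) out of dedicated slab
// blocks with a bump pointer and recycles freed objects through bootStrapObjectList;
// see BootStrapBlocks::allocate()/free() below.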
107 
108 #if USE_INTERNAL_TID
109 class ThreadId {
110     static tls_key_t Tid_key;
111     static std::atomic<intptr_t> ThreadCount;
112 
113     unsigned int id;
114 
115     static unsigned int tlsNumber() {
116         unsigned int result = reinterpret_cast<intptr_t>(TlsGetValue_func(Tid_key));
117         if( !result ) {
118             RecursiveMallocCallProtector scoped;
119             // Thread-local value is zero -> first call from this thread,
120             // need to initialize with next ID value (IDs start from 1)
121             result = ++ThreadCount; // the pre-increment returns the new value
122             TlsSetValue_func( Tid_key, reinterpret_cast<void*>(result) );
123         }
124         return result;
125     }
126 public:
127     static bool init() {
128 #if USE_WINTHREAD
129         Tid_key = TlsAlloc();
130         if (Tid_key == TLS_ALLOC_FAILURE)
131             return false;
132 #else
133         int status = pthread_key_create( &Tid_key, nullptr );
134         if ( status ) {
135             fprintf (stderr, "The memory manager cannot create tls key during initialization\n");
136             return false;
137         }
138 #endif /* USE_WINTHREAD */
139         return true;
140     }
141 #if __TBB_SOURCE_DIRECTLY_INCLUDED
142     static void destroy() {
143         if( Tid_key ) {
144 #if USE_WINTHREAD
145             BOOL status = !(TlsFree( Tid_key ));  // TlsFree returns zero on failure
146 #else
147             int status = pthread_key_delete( Tid_key );
148 #endif /* USE_WINTHREAD */
149             if ( status )
150                 fprintf (stderr, "The memory manager cannot delete tls key\n");
151             Tid_key = 0;
152         }
153     }
154 #endif
155 
156     ThreadId() : id(ThreadId::tlsNumber()) {}
157     bool isCurrentThreadId() const { return id == ThreadId::tlsNumber(); }
158 
159 #if COLLECT_STATISTICS || MALLOC_TRACE
160     friend unsigned int getThreadId() { return ThreadId::tlsNumber(); }
161 #endif
162 #if COLLECT_STATISTICS
163     static unsigned getMaxThreadId() { return ThreadCount.load(std::memory_order_relaxed); }
164 
165     friend int STAT_increment(ThreadId tid, int bin, int ctr);
166 #endif
167 };
168 
169 tls_key_t ThreadId::Tid_key;
170 std::atomic<intptr_t> ThreadId::ThreadCount;
171 
172 #if COLLECT_STATISTICS
173 int STAT_increment(ThreadId tid, int bin, int ctr)
174 {
175     return ::STAT_increment(tid.id, bin, ctr);
176 }
177 #endif
178 
179 #else // USE_INTERNAL_TID
180 
181 class ThreadId {
182 #if USE_PTHREAD
183     std::atomic<pthread_t> tid;
184 #else
185     std::atomic<DWORD>     tid;
186 #endif
187 public:
188     ThreadId() : tid(GetMyTID()) {}
189     ThreadId(ThreadId &other) = delete;
190     ~ThreadId() = default;
191 
192 #if USE_PTHREAD
193     bool isCurrentThreadId() const { return pthread_equal(pthread_self(), tid.load(std::memory_order_relaxed)); }
194 #else
195     bool isCurrentThreadId() const { return GetCurrentThreadId() == tid.load(std::memory_order_relaxed); }
196 #endif
197     ThreadId& operator=(const ThreadId& other) {
198         tid.store(other.tid.load(std::memory_order_relaxed), std::memory_order_relaxed);
199         return *this;
200     }
201     static bool init() { return true; }
202 #if __TBB_SOURCE_DIRECTLY_INCLUDED
203     static void destroy() {}
204 #endif
205 };
206 
207 #endif // USE_INTERNAL_TID
208 
209 /*********** Code to provide thread ID and a thread-local void pointer **********/
210 
211 bool TLSKey::init()
212 {
213 #if USE_WINTHREAD
214     TLS_pointer_key = TlsAlloc();
215     if (TLS_pointer_key == TLS_ALLOC_FAILURE)
216         return false;
217 #else
218     int status = pthread_key_create( &TLS_pointer_key, mallocThreadShutdownNotification );
219     if ( status )
220         return false;
221 #endif /* USE_WINTHREAD */
222     return true;
223 }
224 
225 bool TLSKey::destroy()
226 {
227 #if USE_WINTHREAD
228     BOOL status1 = !(TlsFree(TLS_pointer_key)); // TlsFree returns zero on failure
229 #else
230     int status1 = pthread_key_delete(TLS_pointer_key);
231 #endif /* USE_WINTHREAD */
232     MALLOC_ASSERT(!status1, "The memory manager cannot delete tls key.");
233     return status1==0;
234 }
235 
236 inline TLSData* TLSKey::getThreadMallocTLS() const
237 {
238     return (TLSData *)TlsGetValue_func( TLS_pointer_key );
239 }
240 
241 inline void TLSKey::setThreadMallocTLS( TLSData * newvalue ) {
242     RecursiveMallocCallProtector scoped;
243     TlsSetValue_func( TLS_pointer_key, newvalue );
244 }
245 
246 /* The 'next' field in the block header has to maintain some invariants:
247  *   it needs to be on a 16K boundary and the first field in the block.
248  *   Any value stored there needs to have the lower 14 bits set to 0
249  *   so that various assert work. This means that if you want to smash this memory
250  *   for debugging purposes you will need to obey this invariant.
251  * The total size of the header needs to be a power of 2 to simplify
252  * the alignment requirements. For now it is a 128 byte structure.
253  * To avoid false sharing, the fields changed only locally are separated
254  * from the fields changed by foreign threads.
255  * Changing the size of the block header would require changing
256  * some bin allocation sizes, in particular the "fitting" sizes (see below).
257  */
258 class Bin;
259 class StartupBlock;
260 
261 class MemoryPool {
262     // if no explicit granularity is given, expect to see malloc in the user's pAlloc
263     // and set a reasonably low granularity
264     static const size_t defaultGranularity = estimatedCacheLineSize;
265 
266     MemoryPool() = delete;                  // deny
267 public:
268     static MallocMutex  memPoolListLock;
269 
270     // list of all active pools is used to release
271     // all TLS data on thread termination or library unload
272     MemoryPool    *next,
273                   *prev;
274     ExtMemoryPool  extMemPool;
275     BootStrapBlocks bootStrapBlocks;
276 
277     static void initDefaultPool();
278 
279     bool init(intptr_t poolId, const MemPoolPolicy* memPoolPolicy);
280     bool reset();
281     bool destroy();
282     void onThreadShutdown(TLSData *tlsData);
283 
284     inline TLSData *getTLS(bool create);
285     void clearTLS() { extMemPool.tlsPointerKey.setThreadMallocTLS(nullptr); }
286 
287     Block *getEmptyBlock(size_t size);
288     void returnEmptyBlock(Block *block, bool poolTheBlock);
289 
290     // get/put large object to/from local large object cache
291     void *getFromLLOCache(TLSData *tls, size_t size, size_t alignment);
292     void putToLLOCache(TLSData *tls, void *object);
293 };
294 
295 static intptr_t defaultMemPool_space[sizeof(MemoryPool)/sizeof(intptr_t) +
296                                      (sizeof(MemoryPool)%sizeof(intptr_t)? 1 : 0)];
297 static MemoryPool *defaultMemPool = (MemoryPool*)defaultMemPool_space;
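// The two definitions above reserve zero-initialized, intptr_t-aligned static storage
// big enough to hold a MemoryPool; the default pool is constructed in place in this
// storage (presumably by initDefaultPool()), which avoids a static constructor at load time.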
298 const size_t MemoryPool::defaultGranularity;
299 // zero-initialized
300 MallocMutex  MemoryPool::memPoolListLock;
301 // TODO: move huge page status to the default pool, because that's its state
302 HugePagesStatus hugePages;
303 static bool usedBySrcIncluded = false;
304 
305 // Padding helpers
306 template<size_t padd>
307 struct PaddingImpl {
308     size_t       __padding[padd];
309 };
310 
311 template<>
312 struct PaddingImpl<0> {};
313 
314 template<int N>
315 struct Padding : PaddingImpl<N/sizeof(size_t)> {};
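// Padding<N> turns a byte count into an array of size_t elements; the PaddingImpl<0>
// specialization avoids declaring a zero-length array when no padding is required.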
316 
317 // Slab block is 16KB-aligned. To prevent false sharing, separate locally-accessed
318 // fields and fields commonly accessed by non-owner threads.
319 class GlobalBlockFields : public BlockI {
320 protected:
321     std::atomic<FreeObject*> publicFreeList;
322     std::atomic<Block*> nextPrivatizable;
323     MemoryPool  *poolPtr;
324 };
325 
326 class LocalBlockFields : public GlobalBlockFields, Padding<blockHeaderAlignment - sizeof(GlobalBlockFields)>  {
327 protected:
328     Block       *next;
329     Block       *previous;        /* Use double linked list to speed up removal */
330     FreeObject  *bumpPtr;         /* Bump pointer moves from the end to the beginning of a block */
331     FreeObject  *freeList;
332     /* Pointer to local data for the owner thread. Used for fast finding tls
333        when releasing object from a block that current thread owned.
334        nullptr for orphaned blocks. */
335     std::atomic<TLSData*> tlsPtr;
336     ThreadId     ownerTid;        /* the ID of the thread that owns or last owned the block */
337     BackRefIdx   backRefIdx;
338     uint16_t     allocatedCount;  /* Number of objects allocated (obviously by the owning thread) */
339     uint16_t     objectSize;
340     bool         isFull;
341 
342     friend class FreeBlockPool;
343     friend class StartupBlock;
344     friend class LifoList;
345     friend void *BootStrapBlocks::allocate(MemoryPool *, size_t);
346     friend bool OrphanedBlocks::cleanup(Backend*);
347     friend Block *MemoryPool::getEmptyBlock(size_t);
348 };
349 
350 // Use inheritance to guarantee that user data starts on the next cache line.
351 // A member cannot be used for this, because when LocalBlockFields already ends on a
352 // cache line boundary, there must be no additional memory consumption with any compiler.
353 class Block : public LocalBlockFields,
354               Padding<2*blockHeaderAlignment - sizeof(LocalBlockFields)> {
355 public:
356     bool empty() const {
357         if (allocatedCount > 0) return false;
358         MALLOC_ASSERT(!isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT);
359         return true;
360     }
361     inline FreeObject* allocate();
362     inline FreeObject *allocateFromFreeList();
363 
364     inline bool adjustFullness();
365     void adjustPositionInBin(Bin* bin = nullptr);
366 #if MALLOC_DEBUG
367     bool freeListNonNull() { return freeList; }
368 #endif
369     void freePublicObject(FreeObject *objectToFree);
370     inline void freeOwnObject(void *object);
371     void reset();
372     void privatizePublicFreeList( bool reset = true );
373     void restoreBumpPtr();
374     void privatizeOrphaned(TLSData *tls, unsigned index);
375     bool readyToShare();
376     void shareOrphaned(intptr_t binTag, unsigned index);
377     unsigned int getSize() const {
378         MALLOC_ASSERT(isStartupAllocObject() || objectSize<minLargeObjectSize,
379                       "Invalid object size");
380         return isStartupAllocObject()? 0 : objectSize;
381     }
382     const BackRefIdx *getBackRefIdx() const { return &backRefIdx; }
383     inline bool isOwnedByCurrentThread() const;
384     bool isStartupAllocObject() const { return objectSize == startupAllocObjSizeMark; }
385     inline FreeObject *findObjectToFree(const void *object) const;
386     void checkFreePrecond(const void *object) const {
387 #if MALLOC_DEBUG
388         const char *msg = "Possible double free or heap corruption.";
389         // small objects are always at least sizeof(size_t) bytes aligned;
390         // check this before any dereference, as for invalid objects
391         // the memory may be unreadable
392         MALLOC_ASSERT(isAligned(object, sizeof(size_t)), "Try to free invalid small object");
393 #if !__TBB_USE_THREAD_SANITIZER
394         // releasing to free slab
395         MALLOC_ASSERT(allocatedCount>0, msg);
396 #endif
397         // must not point to slab's header
398         MALLOC_ASSERT((uintptr_t)object - (uintptr_t)this >= sizeof(Block), msg);
399         if (startupAllocObjSizeMark == objectSize) // startup block
400             MALLOC_ASSERT(object<=bumpPtr, msg);
401         else {
402             // non-startup objects are 8-byte aligned
403             MALLOC_ASSERT(isAligned(object, 8), "Try to free invalid small object");
404             FreeObject *toFree = findObjectToFree(object);
405 #if !__TBB_USE_THREAD_SANITIZER
406             MALLOC_ASSERT(allocatedCount <= (slabSize-sizeof(Block))/objectSize
407                           && (!bumpPtr || object>bumpPtr), msg);
408             // check against head of freeList, as this is mostly
409             // expected after double free
410             MALLOC_ASSERT(toFree != freeList, msg);
411 #endif
412             // check against head of publicFreeList, to detect double free
413             // involving foreign thread
414             MALLOC_ASSERT(toFree != publicFreeList.load(std::memory_order_relaxed), msg);
415         }
416 #else
417         suppress_unused_warning(object);
418 #endif
419     }
420     void initEmptyBlock(TLSData *tls, size_t size);
421     size_t findObjectSize(void *object) const;
422     MemoryPool *getMemPool() const { return poolPtr; } // do not use on the hot path!
423 
424 protected:
425     void cleanBlockHeader();
426 
427 private:
428     static const float emptyEnoughRatio; /* Threshold on free space needed to "reactivate" a block */
429 
430     inline FreeObject *allocateFromBumpPtr();
431     inline FreeObject *findAllocatedObject(const void *address) const;
432 #if MALLOC_DEBUG
433     inline bool isProperlyPlaced(const void *object) const;
434 #endif
435     inline void markOwned(TLSData *tls) {
436         MALLOC_ASSERT(!tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
437         ownerTid = ThreadId(); /* save the ID of the current thread */
438         tlsPtr.store(tls, std::memory_order_relaxed);
439     }
440     inline void markOrphaned() {
441         MALLOC_ASSERT(tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
442         tlsPtr.store(nullptr, std::memory_order_relaxed);
443     }
444 
445     friend class Bin;
446     friend class TLSData;
447     friend bool MemoryPool::destroy();
448 };
449 
450 const float Block::emptyEnoughRatio = 1.0 / 4.0;
451 
452 static_assert(sizeof(Block) <= 2*estimatedCacheLineSize,
453     "The class Block does not fit into 2 cache lines on this platform. "
454     "Defining USE_INTERNAL_TID may help to fix it.");
455 
456 class Bin {
457 private:
458 public:
459     Block *activeBlk;
460     std::atomic<Block*> mailbox;
461     MallocMutex mailLock;
462 
463 public:
464     inline Block* getActiveBlock() const { return activeBlk; }
465     void resetActiveBlock() { activeBlk = nullptr; }
466     inline void setActiveBlock(Block *block);
467     inline Block* setPreviousBlockActive();
468     Block* getPrivatizedFreeListBlock();
469     void moveBlockToFront(Block *block);
470     bool cleanPublicFreeLists();
471     void processEmptyBlock(Block *block, bool poolTheBlock);
472     void addPublicFreeListBlock(Block* block);
473 
474     void outofTLSBin(Block* block);
475     void verifyTLSBin(size_t size) const;
476     void pushTLSBin(Block* block);
477 
478 #if MALLOC_DEBUG
479     void verifyInitState() const {
480         MALLOC_ASSERT( !activeBlk, ASSERT_TEXT );
481         MALLOC_ASSERT( !mailbox.load(std::memory_order_relaxed), ASSERT_TEXT );
482     }
483 #endif
484 
485     friend void Block::freePublicObject (FreeObject *objectToFree);
486 };
487 
488 /********* End of the data structures                    **************/
489 
490 /*
491  * There are bins for all 8-byte aligned objects of size up to maxSmallObjectSize; 8 bins in total
492  */
493 const uint32_t minSmallObjectIndex = 0;
494 const uint32_t numSmallObjectBins = 8;
495 const uint32_t maxSmallObjectSize = 64;
496 
497 /*
498  * There are 4 bins between each pair of powers of 2 [64-128-256-...]
499  * from maxSmallObjectSize up to maxSegregatedObjectSize; 16 bins in total
500  */
501 const uint32_t minSegregatedObjectIndex = minSmallObjectIndex+numSmallObjectBins;
502 const uint32_t numSegregatedObjectBins = 16;
503 const uint32_t maxSegregatedObjectSize = 1024;
504 
505 /*
506  * And there are 5 bins with allocation sizes that are multiples of estimatedCacheLineSize
507  * and selected to fit 9, 6, 4, 3, and 2 allocations in a block.
508  */
509 const uint32_t minFittingIndex = minSegregatedObjectIndex+numSegregatedObjectBins;
510 const uint32_t numFittingBins = 5;
511 
512 const uint32_t fittingAlignment = estimatedCacheLineSize;
513 
514 #define SET_FITTING_SIZE(N) ( (slabSize-sizeof(Block))/N ) & ~(fittingAlignment-1)
515 // For slabSize=16*1024, sizeof(Block)=2*estimatedCacheLineSize and fittingAlignment=estimatedCacheLineSize,
516 // the comments show the fitting sizes and the amounts left unused for estimatedCacheLineSize=64/128:
517 const uint32_t fittingSize1 = SET_FITTING_SIZE(9); // 1792/1792 128/000
518 const uint32_t fittingSize2 = SET_FITTING_SIZE(6); // 2688/2688 128/000
519 const uint32_t fittingSize3 = SET_FITTING_SIZE(4); // 4032/3968 128/256
520 const uint32_t fittingSize4 = SET_FITTING_SIZE(3); // 5376/5376 128/000
521 const uint32_t fittingSize5 = SET_FITTING_SIZE(2); // 8128/8064 000/000
522 #undef SET_FITTING_SIZE
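// Worked example for estimatedCacheLineSize==64 (so sizeof(Block)==128 and fittingAlignment==64):
// SET_FITTING_SIZE(9) = ((16384-128)/9) & ~63 = 1806 & ~63 = 1792, matching fittingSize1 above;
// similarly SET_FITTING_SIZE(2) = (16256/2) & ~63 = 8128.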
523 
524 /*
525  * The total number of thread-specific Block-based bins
526  */
527 const uint32_t numBlockBins = minFittingIndex+numFittingBins;
528 
529 /*
530  * Objects of this size and larger are considered large objects.
531  */
532 const uint32_t minLargeObjectSize = fittingSize5 + 1;
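// For example, with estimatedCacheLineSize==64 this makes minLargeObjectSize equal to 8129
// (8065 when the cache line is 128 bytes), consistent with the fitting-size comments above.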
533 
534 /*
535  * Per-thread pool of slab blocks. The idea behind it is to avoid sharing with other
536  * threads memory that is likely to be in the local cache(s) of our CPU.
537  */
538 class FreeBlockPool {
539 private:
540     std::atomic<Block*> head;
541     int         size;
542     Backend    *backend;
543 public:
544     static const int POOL_HIGH_MARK = 32;
545     static const int POOL_LOW_MARK  = 8;
546 
547     class ResOfGet {
548         ResOfGet() = delete;
549     public:
550         Block* block;
551         bool   lastAccMiss;
552         ResOfGet(Block *b, bool lastMiss) : block(b), lastAccMiss(lastMiss) {}
553     };
554 
555     // allocated in zero-initialized memory
556     FreeBlockPool(Backend *bknd) : backend(bknd) {}
557     ResOfGet getBlock();
558     void returnBlock(Block *block);
559     bool externalCleanup(); // can be called by another thread
560 };
561 
562 template<int LOW_MARK, int HIGH_MARK>
563 class LocalLOCImpl {
564 private:
565     static const size_t MAX_TOTAL_SIZE = 4*1024*1024;
566     // TODO: can single-linked list be faster here?
567     LargeMemoryBlock *tail; // needed when releasing on overflow
568     std::atomic<LargeMemoryBlock*> head;
569     size_t            totalSize;
570     int               numOfBlocks;
571 public:
572     bool put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool);
573     LargeMemoryBlock *get(size_t size);
574     bool externalCleanup(ExtMemoryPool *extMemPool);
575 #if __TBB_MALLOC_WHITEBOX_TEST
576     LocalLOCImpl() : tail(nullptr), head(nullptr), totalSize(0), numOfBlocks(0) {}
577     static size_t getMaxSize() { return MAX_TOTAL_SIZE; }
578     static const int LOC_HIGH_MARK = HIGH_MARK;
579 #else
580     // no ctor, object must be created in zero-initialized memory
581 #endif
582 };
583 
584 typedef LocalLOCImpl<8,32> LocalLOC; // set production code parameters
585 
586 class TLSData : public TLSRemote {
587     MemoryPool   *memPool;
588 public:
589     Bin           bin[numBlockBinLimit];
590     FreeBlockPool freeSlabBlocks;
591     LocalLOC      lloc;
592     unsigned      currCacheIdx;
593 private:
594     std::atomic<bool> unused;
595 public:
596     TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd), currCacheIdx(0) {}
597     MemoryPool *getMemPool() const { return memPool; }
598     Bin* getAllocationBin(size_t size);
599     void release();
600     bool externalCleanup(bool cleanOnlyUnused, bool cleanBins) {
601         if (!unused.load(std::memory_order_relaxed) && cleanOnlyUnused) return false;
602         // Heavy operation in terms of synchronization complexity,
603         // should be called only for the current thread
604         bool released = cleanBins ? cleanupBlockBins() : false;
605         // both cleanups to be called, and the order is not important
606         bool lloc_cleaned = lloc.externalCleanup(&memPool->extMemPool);
607         bool free_slab_blocks_cleaned = freeSlabBlocks.externalCleanup();
608         return released || lloc_cleaned || free_slab_blocks_cleaned;
609     }
610     bool cleanupBlockBins();
611     void markUsed() { unused.store(false, std::memory_order_relaxed); } // called by owner when TLS touched
612     void markUnused() { unused.store(true, std::memory_order_relaxed); } // can be called by not owner thread
613 };
614 
615 TLSData *TLSKey::createTLS(MemoryPool *memPool, Backend *backend)
616 {
617     MALLOC_ASSERT( sizeof(TLSData) >= sizeof(Bin) * numBlockBins + sizeof(FreeBlockPool), ASSERT_TEXT );
618     TLSData* tls = (TLSData*) memPool->bootStrapBlocks.allocate(memPool, sizeof(TLSData));
619     if ( !tls )
620         return nullptr;
621     new(tls) TLSData(memPool, backend);
622     /* the block contains zeroes after bootStrapMalloc, so bins are initialized */
623 #if MALLOC_DEBUG
624     for (uint32_t i = 0; i < numBlockBinLimit; i++)
625         tls->bin[i].verifyInitState();
626 #endif
627     setThreadMallocTLS(tls);
628     memPool->extMemPool.allLocalCaches.registerThread(tls);
629     return tls;
630 }
631 
632 bool TLSData::cleanupBlockBins()
633 {
634     bool released = false;
635     for (uint32_t i = 0; i < numBlockBinLimit; i++) {
636         released |= bin[i].cleanPublicFreeLists();
637         // After cleaning public free lists, only the active block might be empty.
638         // Do not use processEmptyBlock because it will just restore bumpPtr.
639         Block *block = bin[i].getActiveBlock();
640         if (block && block->empty()) {
641             bin[i].outofTLSBin(block);
642             memPool->returnEmptyBlock(block, /*poolTheBlock=*/false);
643             released = true;
644         }
645     }
646     return released;
647 }
648 
649 bool ExtMemoryPool::releaseAllLocalCaches()
650 {
651     // Iterate all registered TLS data and clean LLOC and Slab pools
652     bool released = allLocalCaches.cleanup(/*cleanOnlyUnused=*/false);
653 
654     // Bins privatization is done only for the current thread
655     if (TLSData *tlsData = tlsPointerKey.getThreadMallocTLS())
656         released |= tlsData->cleanupBlockBins();
657 
658     return released;
659 }
660 
661 void AllLocalCaches::registerThread(TLSRemote *tls)
662 {
663     tls->prev = nullptr;
664     MallocMutex::scoped_lock lock(listLock);
665     MALLOC_ASSERT(head!=tls, ASSERT_TEXT);
666     tls->next = head;
667     if (head)
668         head->prev = tls;
669     head = tls;
670     MALLOC_ASSERT(head->next!=head, ASSERT_TEXT);
671 }
672 
673 void AllLocalCaches::unregisterThread(TLSRemote *tls)
674 {
675     MallocMutex::scoped_lock lock(listLock);
676     MALLOC_ASSERT(head, "Can't unregister thread: no threads are registered.");
677     if (head == tls)
678         head = tls->next;
679     if (tls->next)
680         tls->next->prev = tls->prev;
681     if (tls->prev)
682         tls->prev->next = tls->next;
683     MALLOC_ASSERT(!tls->next || tls->next->next!=tls->next, ASSERT_TEXT);
684 }
685 
686 bool AllLocalCaches::cleanup(bool cleanOnlyUnused)
687 {
688     bool released = false;
689     {
690         MallocMutex::scoped_lock lock(listLock);
691         for (TLSRemote *curr=head; curr; curr=curr->next)
692             released |= static_cast<TLSData*>(curr)->externalCleanup(cleanOnlyUnused, /*cleanBins=*/false);
693     }
694     return released;
695 }
696 
697 void AllLocalCaches::markUnused()
698 {
699     bool locked = false;
700     MallocMutex::scoped_lock lock(listLock, /*block=*/false, &locked);
701     if (!locked) // do not wait to mark if another thread is working with the list
702         return;
703 
704     for (TLSRemote *curr=head; curr; curr=curr->next)
705         static_cast<TLSData*>(curr)->markUnused();
706 }
707 
708 #if MALLOC_CHECK_RECURSION
709 MallocMutex RecursiveMallocCallProtector::rmc_mutex;
710 std::atomic<pthread_t> RecursiveMallocCallProtector::owner_thread;
711 std::atomic<void*> RecursiveMallocCallProtector::autoObjPtr;
712 bool        RecursiveMallocCallProtector::mallocRecursionDetected;
713 #if __FreeBSD__
714 bool        RecursiveMallocCallProtector::canUsePthread;
715 #endif
716 
717 #endif
718 
719 /*********** End code to provide thread ID and a TLS pointer **********/
720 
721 // Parameter for isLargeObject, encodes our expectation about the memory's origin.
722 // Assertions must use unknownMem to reliably report object invalidity.
723 enum MemoryOrigin {
724     ourMem,    // allocated by TBB allocator
725     unknownMem // can be allocated by system allocator or TBB allocator
726 };
727 
728 template<MemoryOrigin>
729 #if __TBB_USE_THREAD_SANITIZER
730 // We have a real race when accessing the large object header for
731 // non large objects (e.g. small or foreign objects).
732 // Therefore, we need to hide this access from the thread sanitizer
733 __attribute__((no_sanitize("thread")))
734 #endif
735 bool isLargeObject(void *object);
736 static void *internalMalloc(size_t size);
737 static void internalFree(void *object);
738 static void *internalPoolMalloc(MemoryPool* mPool, size_t size);
739 static bool internalPoolFree(MemoryPool *mPool, void *object, size_t size);
740 
741 #if !MALLOC_DEBUG
742 #if __INTEL_COMPILER || _MSC_VER
743 #define NOINLINE(decl) __declspec(noinline) decl
744 #define ALWAYSINLINE(decl) __forceinline decl
745 #elif __GNUC__
746 #define NOINLINE(decl) decl __attribute__ ((noinline))
747 #define ALWAYSINLINE(decl) decl __attribute__ ((always_inline))
748 #else
749 #define NOINLINE(decl) decl
750 #define ALWAYSINLINE(decl) decl
751 #endif
752 
753 static NOINLINE( bool doInitialization() );
754 ALWAYSINLINE( bool isMallocInitialized() );
755 
756 #undef ALWAYSINLINE
757 #undef NOINLINE
758 #endif /* !MALLOC_DEBUG */
759 
760 
761 /********* Now some rough utility code to deal with indexing the size bins. **************/
762 
763 /*
764  * Given a number, return the position of its highest non-zero bit. It is intended to work with 32-bit values only.
765  * Moreover, on some platforms, for the sake of simplicity and performance, it is narrowed to serve only values from 64 to 1023.
766  * This is enough for the current algorithm of distributing sizes among bins.
767  * __TBB_Log2 is not used here to minimize dependencies on TBB-specific sources.
768  */
769 #if _WIN64 && _MSC_VER>=1400 && !__INTEL_COMPILER
770 extern "C" unsigned char _BitScanReverse( unsigned long* i, unsigned long w );
771 #pragma intrinsic(_BitScanReverse)
772 #endif
773 static inline unsigned int highestBitPos(unsigned int n)
774 {
775     MALLOC_ASSERT( n>=64 && n<1024, ASSERT_TEXT ); // only needed for bsr array lookup, but always true
776     unsigned int pos;
777 #if __ARCH_x86_32||__ARCH_x86_64
778 
779 # if __unix__||__APPLE__||__MINGW32__
780     __asm__ ("bsr %1,%0" : "=r"(pos) : "r"(n));
781 # elif (_WIN32 && (!_WIN64 || __INTEL_COMPILER))
782     __asm
783     {
784         bsr eax, n
785         mov pos, eax
786     }
787 # elif _WIN64 && _MSC_VER>=1400
788     _BitScanReverse((unsigned long*)&pos, (unsigned long)n);
789 # else
790 #   error highestBitPos() not implemented for this platform
791 # endif
792 #elif __arm__
793     __asm__ __volatile__
794     (
795        "clz %0, %1\n"
796        "rsb %0, %0, %2\n"
797        :"=r" (pos) :"r" (n), "I" (31)
798     );
799 #else
800     static unsigned int bsr[16] = {0/*N/A*/,6,7,7,8,8,8,8,9,9,9,9,9,9,9,9};
801     pos = bsr[ n>>6 ];
802 #endif /* __ARCH_* */
803     return pos;
804 }
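// For example: highestBitPos(64)==6, highestBitPos(127)==6, highestBitPos(128)==7,
// highestBitPos(1023)==9, which is exactly the [64;1023] range the assertion allows.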
805 
806 unsigned int getSmallObjectIndex(unsigned int size)
807 {
808     unsigned int result = (size-1)>>3;
809     constexpr bool is_64bit = (8 == sizeof(void*));
810     if (is_64bit) {
811         // For 64-bit malloc, 16 byte alignment is needed except for bin 0.
812         if (result) result |= 1; // 0,1,3,5,7; bins 2,4,6 are not aligned to 16 bytes
813     }
814     return result;
815 }
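// Examples on a 64-bit platform: sizes 1..8 map to index 0 (8-byte objects),
// 9..16 to index 1 (16 bytes), 17..32 to index 3 (32 bytes), 33..48 to index 5 (48 bytes),
// 49..64 to index 7 (64 bytes); indices 2, 4 and 6 are skipped so that every object
// larger than 8 bytes keeps 16-byte alignment.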
816 
817 /*
818  * Depending on indexRequest, for a given size return either the index into the bin
819  * for objects of this size, or the actual size of objects in this bin.
820  */
821 template<bool indexRequest>
822 static unsigned int getIndexOrObjectSize (unsigned int size)
823 {
824     if (size <= maxSmallObjectSize) { // selection from 8/16/24/32/40/48/56/64
825         unsigned int index = getSmallObjectIndex( size );
826          /* Bin 0 is for 8 bytes, bin 1 is for 16, and so forth */
827         return indexRequest ? index : (index+1)<<3;
828     }
829     else if (size <= maxSegregatedObjectSize ) { // 80/96/112/128 / 160/192/224/256 / 320/384/448/512 / 640/768/896/1024
830         unsigned int order = highestBitPos(size-1); // which group of bin sizes?
831         MALLOC_ASSERT( 6<=order && order<=9, ASSERT_TEXT );
832         if (indexRequest)
833             return minSegregatedObjectIndex - (4*6) - 4 + (4*order) + ((size-1)>>(order-2));
834         else {
835             unsigned int alignment = 128 >> (9-order); // alignment in the group
836             MALLOC_ASSERT( alignment==16 || alignment==32 || alignment==64 || alignment==128, ASSERT_TEXT );
837             return alignUp(size,alignment);
838         }
839     }
840     else {
841         if( size <= fittingSize3 ) {
842             if( size <= fittingSize2 ) {
843                 if( size <= fittingSize1 )
844                     return indexRequest ? minFittingIndex : fittingSize1;
845                 else
846                     return indexRequest ? minFittingIndex+1 : fittingSize2;
847             } else
848                 return indexRequest ? minFittingIndex+2 : fittingSize3;
849         } else {
850             if( size <= fittingSize5 ) {
851                 if( size <= fittingSize4 )
852                     return indexRequest ? minFittingIndex+3 : fittingSize4;
853                 else
854                     return indexRequest ? minFittingIndex+4 : fittingSize5;
855             } else {
856                 MALLOC_ASSERT( 0,ASSERT_TEXT ); // this should not happen
857                 return ~0U;
858             }
859         }
860     }
861 }
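// Worked example for the segregated range: for size==100, highestBitPos(99) gives order 6,
// so the index is 8 - 24 - 4 + 24 + (99>>4) = 10 and the object size is alignUp(100,16) = 112,
// i.e. the third bin of the 80/96/112/128 group; for size==1000 the result is index 23 and
// object size 1024.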
862 
863 static unsigned int getIndex (unsigned int size)
864 {
865     return getIndexOrObjectSize</*indexRequest=*/true>(size);
866 }
867 
868 static unsigned int getObjectSize (unsigned int size)
869 {
870     return getIndexOrObjectSize</*indexRequest=*/false>(size);
871 }
872 
873 
874 void *BootStrapBlocks::allocate(MemoryPool *memPool, size_t size)
875 {
876     FreeObject *result;
877 
878     MALLOC_ASSERT( size == sizeof(TLSData), ASSERT_TEXT );
879 
880     { // Lock with acquire
881         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
882 
883         if( bootStrapObjectList) {
884             result = bootStrapObjectList;
885             bootStrapObjectList = bootStrapObjectList->next;
886         } else {
887             if (!bootStrapBlock) {
888                 bootStrapBlock = memPool->getEmptyBlock(size);
889                 if (!bootStrapBlock) return nullptr;
890             }
891             result = bootStrapBlock->bumpPtr;
892             bootStrapBlock->bumpPtr = (FreeObject *)((uintptr_t)bootStrapBlock->bumpPtr - bootStrapBlock->objectSize);
893             if ((uintptr_t)bootStrapBlock->bumpPtr < (uintptr_t)bootStrapBlock+sizeof(Block)) {
894                 bootStrapBlock->bumpPtr = nullptr;
895                 bootStrapBlock->next = bootStrapBlockUsed;
896                 bootStrapBlockUsed = bootStrapBlock;
897                 bootStrapBlock = nullptr;
898             }
899         }
900     } // Unlock with release
901     memset (result, 0, size);
902     return (void*)result;
903 }
904 
905 void BootStrapBlocks::free(void* ptr)
906 {
907     MALLOC_ASSERT( ptr, ASSERT_TEXT );
908     { // Lock with acquire
909         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
910         ((FreeObject*)ptr)->next = bootStrapObjectList;
911         bootStrapObjectList = (FreeObject*)ptr;
912     } // Unlock with release
913 }
914 
915 void BootStrapBlocks::reset()
916 {
917     bootStrapBlock = bootStrapBlockUsed = nullptr;
918     bootStrapObjectList = nullptr;
919 }
920 
921 #if !(FREELIST_NONBLOCKING)
922 static MallocMutex publicFreeListLock; // lock for changes of publicFreeList
923 #endif
924 
925 /********* End rough utility code  **************/
926 
927 /* LifoList assumes zero initialization so a vector of it can be created
928  * by just allocating some space with no call to constructor.
929  * On Linux, it seems to be necessary to avoid linking with C++ libraries.
930  *
931  * By usage convention there is no race on the initialization. */
932 LifoList::LifoList( ) : top(nullptr)
933 {
934     // MallocMutex assumes zero initialization
935     memset(static_cast<void*>(&lock), 0, sizeof(MallocMutex));
936 }
937 
938 void LifoList::push(Block *block)
939 {
940     MallocMutex::scoped_lock scoped_cs(lock);
941     block->next = top.load(std::memory_order_relaxed);
942     top.store(block, std::memory_order_relaxed);
943 }
944 
945 Block *LifoList::pop()
946 {
947     Block* block = nullptr;
948     if (top.load(std::memory_order_relaxed)) {
949         MallocMutex::scoped_lock scoped_cs(lock);
950         block = top.load(std::memory_order_relaxed);
951         if (block) {
952             top.store(block->next, std::memory_order_relaxed);
953         }
954     }
955     return block;
956 }
957 
958 Block *LifoList::grab()
959 {
960     Block *block = nullptr;
961     if (top.load(std::memory_order_relaxed)) {
962         MallocMutex::scoped_lock scoped_cs(lock);
963         block = top.load(std::memory_order_relaxed);
964         top.store(nullptr, std::memory_order_relaxed);
965     }
966     return block;
967 }
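// Note the difference between the two removal operations: pop() detaches a single block
// from the top of the list, while grab() detaches the entire chain at once (leaving the
// list empty) so the caller can walk it via Block::next without further locking.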
968 
969 /********* Thread and block related code      *************/
970 
971 template<bool poolDestroy> void AllLargeBlocksList::releaseAll(Backend *backend) {
972      LargeMemoryBlock *next, *lmb = loHead;
973      loHead = nullptr;
974 
975      for (; lmb; lmb = next) {
976          next = lmb->gNext;
977          if (poolDestroy) {
978              // as it's pool destruction, there is no need to return the object to the backend;
979              // only remove backrefs, as they are global
980              removeBackRef(lmb->backRefIdx);
981          } else {
982              // clean g(Next|Prev) to prevent removing lmb
983              // from AllLargeBlocksList inside returnLargeObject
984              lmb->gNext = lmb->gPrev = nullptr;
985              backend->returnLargeObject(lmb);
986          }
987      }
988 }
989 
990 TLSData* MemoryPool::getTLS(bool create)
991 {
992     TLSData* tls = extMemPool.tlsPointerKey.getThreadMallocTLS();
993     if (create && !tls)
994         tls = extMemPool.tlsPointerKey.createTLS(this, &extMemPool.backend);
995     return tls;
996 }
997 
998 /*
999  * Return the bin for the given size.
1000  */
1001 inline Bin* TLSData::getAllocationBin(size_t size)
1002 {
1003     return bin + getIndex(size);
1004 }
1005 
1006 /* Return an empty uninitialized block in a non-blocking fashion. */
1007 Block *MemoryPool::getEmptyBlock(size_t size)
1008 {
1009     TLSData* tls = getTLS(/*create=*/false);
1010     // try to use per-thread cache, if TLS available
1011     FreeBlockPool::ResOfGet resOfGet = tls?
1012         tls->freeSlabBlocks.getBlock() : FreeBlockPool::ResOfGet(nullptr, false);
1013     Block *result = resOfGet.block;
1014 
1015     if (!result) { // not found in the local cache, ask the backend for slabs
1016         int num = resOfGet.lastAccMiss? Backend::numOfSlabAllocOnMiss : 1;
1017         BackRefIdx backRefIdx[Backend::numOfSlabAllocOnMiss];
1018 
1019         result = static_cast<Block*>(extMemPool.backend.getSlabBlock(num));
1020         if (!result) return nullptr;
1021 
1022         if (!extMemPool.userPool())
1023             for (int i=0; i<num; i++) {
1024                 backRefIdx[i] = BackRefIdx::newBackRef(/*largeObj=*/false);
1025                 if (backRefIdx[i].isInvalid()) {
1026                     // roll back resource allocation
1027                     for (int j=0; j<i; j++)
1028                         removeBackRef(backRefIdx[j]);
1029                     Block *b = result;
1030                     for (int j=0; j<num; b=(Block*)((uintptr_t)b+slabSize), j++)
1031                         extMemPool.backend.putSlabBlock(b);
1032                     return nullptr;
1033                 }
1034             }
1035         // resources were allocated, register blocks
1036         Block *b = result;
1037         for (int i=0; i<num; b=(Block*)((uintptr_t)b+slabSize), i++) {
1038             // slab block in user's pool must have invalid backRefIdx
1039             if (extMemPool.userPool()) {
1040                 new (&b->backRefIdx) BackRefIdx();
1041             } else {
1042                 setBackRef(backRefIdx[i], b);
1043                 b->backRefIdx = backRefIdx[i];
1044             }
1045             b->tlsPtr.store(tls, std::memory_order_relaxed);
1046             b->poolPtr = this;
1047             // all but the first one go to the per-thread pool
1048             if (i > 0) {
1049                 MALLOC_ASSERT(tls, ASSERT_TEXT);
1050                 tls->freeSlabBlocks.returnBlock(b);
1051             }
1052         }
1053     }
1054     MALLOC_ASSERT(result, ASSERT_TEXT);
1055     result->initEmptyBlock(tls, size);
1056     STAT_increment(getThreadId(), getIndex(result->objectSize), allocBlockNew);
1057     return result;
1058 }
1059 
1060 void MemoryPool::returnEmptyBlock(Block *block, bool poolTheBlock)
1061 {
1062     block->reset();
1063     if (poolTheBlock) {
1064         getTLS(/*create=*/false)->freeSlabBlocks.returnBlock(block);
1065     } else {
1066         // slab blocks in user's pools do not have valid backRefIdx
1067         if (!extMemPool.userPool())
1068             removeBackRef(*(block->getBackRefIdx()));
1069         extMemPool.backend.putSlabBlock(block);
1070     }
1071 }
1072 
1073 bool ExtMemoryPool::init(intptr_t poolId, rawAllocType rawAlloc,
1074                          rawFreeType rawFree, size_t granularity,
1075                          bool keepAllMemory, bool fixedPool)
1076 {
1077     this->poolId = poolId;
1078     this->rawAlloc = rawAlloc;
1079     this->rawFree = rawFree;
1080     this->granularity = granularity;
1081     this->keepAllMemory = keepAllMemory;
1082     this->fixedPool = fixedPool;
1083     this->delayRegsReleasing = false;
1084     if (!initTLS())
1085         return false;
1086     loc.init(this);
1087     backend.init(this);
1088     MALLOC_ASSERT(isPoolValid(), nullptr);
1089     return true;
1090 }
1091 
1092 bool ExtMemoryPool::initTLS() { return tlsPointerKey.init(); }
1093 
1094 bool MemoryPool::init(intptr_t poolId, const MemPoolPolicy *policy)
1095 {
1096     if (!extMemPool.init(poolId, policy->pAlloc, policy->pFree,
1097                policy->granularity? policy->granularity : defaultGranularity,
1098                policy->keepAllMemory, policy->fixedPool))
1099         return false;
1100     {
1101         MallocMutex::scoped_lock lock(memPoolListLock);
1102         next = defaultMemPool->next;
1103         defaultMemPool->next = this;
1104         prev = defaultMemPool;
1105         if (next)
1106             next->prev = this;
1107     }
1108     return true;
1109 }
1110 
1111 bool MemoryPool::reset()
1112 {
1113     MALLOC_ASSERT(extMemPool.userPool(), "No reset for the system pool.");
1114     // memory is not released during pool reset
1115     // TODO: mark unused regions to be released on the next reset()
1116     extMemPool.delayRegionsReleasing(true);
1117 
1118     bootStrapBlocks.reset();
1119     extMemPool.lmbList.releaseAll</*poolDestroy=*/false>(&extMemPool.backend);
1120     if (!extMemPool.reset())
1121         return false;
1122 
1123     if (!extMemPool.initTLS())
1124         return false;
1125     extMemPool.delayRegionsReleasing(false);
1126     return true;
1127 }
1128 
1129 bool MemoryPool::destroy()
1130 {
1131 #if __TBB_MALLOC_LOCACHE_STAT
1132     extMemPool.loc.reportStat(stdout);
1133 #endif
1134 #if __TBB_MALLOC_BACKEND_STAT
1135     extMemPool.backend.reportStat(stdout);
1136 #endif
1137     {
1138         MallocMutex::scoped_lock lock(memPoolListLock);
1139         // remove itself from global pool list
1140         if (prev)
1141             prev->next = next;
1142         if (next)
1143             next->prev = prev;
1144     }
1145     // slab blocks in non-default pool do not have backreferences,
1146     // only large objects do
1147     if (extMemPool.userPool())
1148         extMemPool.lmbList.releaseAll</*poolDestroy=*/true>(&extMemPool.backend);
1149     else {
1150         // only one non-userPool() is supported now
1151         MALLOC_ASSERT(this==defaultMemPool, nullptr);
1152         // Here and below in extMemPool.destroy(), the initial state is not restored
1153         // for a user pool, because it is just about to be released. For the system pool
1154         // it is restored, because we cannot rely on it being zeroed again on a subsequent reload.
1155         bootStrapBlocks.reset();
1156         extMemPool.orphanedBlocks.reset();
1157     }
1158     return extMemPool.destroy();
1159 }
1160 
1161 void MemoryPool::onThreadShutdown(TLSData *tlsData)
1162 {
1163     if (tlsData) { // might be called for "empty" TLS
1164         tlsData->release();
1165         bootStrapBlocks.free(tlsData);
1166         clearTLS();
1167     }
1168 }
1169 
1170 #if MALLOC_DEBUG
1171 void Bin::verifyTLSBin (size_t size) const
1172 {
1173 /* The debug version verifies the TLSBin as needed */
1174     uint32_t objSize = getObjectSize(size);
1175 
1176     if (activeBlk) {
1177         MALLOC_ASSERT( activeBlk->isOwnedByCurrentThread(), ASSERT_TEXT );
1178         MALLOC_ASSERT( activeBlk->objectSize == objSize, ASSERT_TEXT );
1179 #if MALLOC_DEBUG>1
1180         for (Block* temp = activeBlk->next; temp; temp=temp->next) {
1181             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1182             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1183             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1184             MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1185             if (temp->next) {
1186                 MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1187             }
1188         }
1189         for (Block* temp = activeBlk->previous; temp; temp=temp->previous) {
1190             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1191             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1192             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1193             MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1194             if (temp->previous) {
1195                 MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1196             }
1197         }
1198 #endif /* MALLOC_DEBUG>1 */
1199     }
1200 }
1201 #else /* MALLOC_DEBUG */
1202 inline void Bin::verifyTLSBin (size_t) const { }
1203 #endif /* MALLOC_DEBUG */
1204 
1205 /*
1206  * Add a block to the start of this tls bin list.
1207  */
1208 void Bin::pushTLSBin(Block* block)
1209 {
1210     /* The objectSize is taken from the block rather than passed as a parameter
1211        because the function is applied to partially filled blocks as well */
1212     unsigned int size = block->objectSize;
1213 
1214     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1215     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1216     MALLOC_ASSERT( block->next == nullptr, ASSERT_TEXT );
1217     MALLOC_ASSERT( block->previous == nullptr, ASSERT_TEXT );
1218 
1219     MALLOC_ASSERT( this, ASSERT_TEXT );
1220     verifyTLSBin(size);
1221 
1222     block->next = activeBlk;
1223     if( activeBlk ) {
1224         block->previous = activeBlk->previous;
1225         activeBlk->previous = block;
1226         if( block->previous )
1227             block->previous->next = block;
1228     } else {
1229         activeBlk = block;
1230     }
1231 
1232     verifyTLSBin(size);
1233 }
1234 
1235 /*
1236  * Take a block out of its tls bin (e.g. before removal).
1237  */
1238 void Bin::outofTLSBin(Block* block)
1239 {
1240     unsigned int size = block->objectSize;
1241 
1242     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1243     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1244 
1245     MALLOC_ASSERT( this, ASSERT_TEXT );
1246     verifyTLSBin(size);
1247 
1248     if (block == activeBlk) {
1249         activeBlk = block->previous? block->previous : block->next;
1250     }
1251     /* Unlink the block */
1252     if (block->previous) {
1253         MALLOC_ASSERT( block->previous->next == block, ASSERT_TEXT );
1254         block->previous->next = block->next;
1255     }
1256     if (block->next) {
1257         MALLOC_ASSERT( block->next->previous == block, ASSERT_TEXT );
1258         block->next->previous = block->previous;
1259     }
1260     block->next = nullptr;
1261     block->previous = nullptr;
1262 
1263     verifyTLSBin(size);
1264 }
1265 
1266 Block* Bin::getPrivatizedFreeListBlock()
1267 {
1268     Block* block;
1269     MALLOC_ASSERT( this, ASSERT_TEXT );
1270     // if this method is called, active block usage must be unsuccessful
1271     MALLOC_ASSERT( (!activeBlk && !mailbox.load(std::memory_order_relaxed)) || (activeBlk && activeBlk->isFull), ASSERT_TEXT );
1272 
1273 // the counter should be changed    STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1274     if (!mailbox.load(std::memory_order_acquire)) // hotpath is empty mailbox
1275         return nullptr;
1276     else { // mailbox is not empty, take lock and inspect it
1277         MallocMutex::scoped_lock scoped_cs(mailLock);
1278         block = mailbox.load(std::memory_order_relaxed);
1279         if( block ) {
1280             MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1281             MALLOC_ASSERT( !isNotForUse(block->nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1282             mailbox.store(block->nextPrivatizable.load(std::memory_order_relaxed), std::memory_order_relaxed);
1283             block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1284         }
1285     }
1286     if( block ) {
1287         MALLOC_ASSERT( isSolidPtr(block->publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1288         block->privatizePublicFreeList();
1289         block->adjustPositionInBin(this);
1290     }
1291     return block;
1292 }
1293 
1294 void Bin::addPublicFreeListBlock(Block* block)
1295 {
1296     MallocMutex::scoped_lock scoped_cs(mailLock);
1297     block->nextPrivatizable.store(mailbox.load(std::memory_order_relaxed), std::memory_order_relaxed);
1298     mailbox.store(block, std::memory_order_relaxed);
1299 }
1300 
1301 // Process publicly freed objects in all blocks and return empty blocks
1302 // to the backend in order to reduce overall footprint.
1303 bool Bin::cleanPublicFreeLists()
1304 {
1305     Block* block;
1306     if (!mailbox.load(std::memory_order_acquire))
1307         return false;
1308     else {
1309         // Grab all the blocks in the mailbox
1310         MallocMutex::scoped_lock scoped_cs(mailLock);
1311         block = mailbox.load(std::memory_order_relaxed);
1312         mailbox.store(nullptr, std::memory_order_relaxed);
1313     }
1314     bool released = false;
1315     while (block) {
1316         MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1317         Block* tmp = block->nextPrivatizable.load(std::memory_order_relaxed);
1318         block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1319         block->privatizePublicFreeList();
1320         if (block->empty()) {
1321             processEmptyBlock(block, /*poolTheBlock=*/false);
1322             released = true;
1323         } else
1324             block->adjustPositionInBin(this);
1325         block = tmp;
1326     }
1327     return released;
1328 }
1329 
1330 bool Block::adjustFullness()
1331 {
1332     if (bumpPtr) {
1333         /* If we are still using a bump ptr for this block it is empty enough to use. */
1334         STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1335         isFull = false;
1336     } else {
1337         const float threshold = (slabSize - sizeof(Block)) * (1 - emptyEnoughRatio);
1338         /* allocatedCount shows how many objects in the block are in use; however, it still counts
1339          * objects freed by other threads; so a prior call to privatizePublicFreeList() is recommended */
1340         isFull = (allocatedCount*objectSize > threshold);
1341 #if COLLECT_STATISTICS
1342         if (isFull)
1343             STAT_increment(getThreadId(), getIndex(objectSize), examineNotEmpty);
1344         else
1345             STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1346 #endif
1347     }
1348     return isFull;
1349 }
1350 
1351 // This method resides in class Block, and not in class Bin, in order to avoid
1352 // calling getAllocationBin on a reasonably hot path in Block::freeOwnObject
1353 void Block::adjustPositionInBin(Bin* bin/*=nullptr*/)
1354 {
1355     // If the block was full but became empty enough to use,
1356     // move it to the front of the list
1357     if (isFull && !adjustFullness()) {
1358         if (!bin)
1359             bin = tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize);
1360         bin->moveBlockToFront(this);
1361     }
1362 }
1363 
1364 /* Restore the bump pointer for an empty block that is planned to be used */
1365 void Block::restoreBumpPtr()
1366 {
1367     MALLOC_ASSERT( allocatedCount == 0, ASSERT_TEXT );
1368     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1369     STAT_increment(getThreadId(), getIndex(objectSize), freeRestoreBumpPtr);
1370     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1371     freeList = nullptr;
1372     isFull = false;
1373 }
1374 
1375 void Block::freeOwnObject(void *object)
1376 {
1377     tlsPtr.load(std::memory_order_relaxed)->markUsed();
1378     allocatedCount--;
1379     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1380 #if COLLECT_STATISTICS
1381     // Note that getAllocationBin is not called on the hottest path with statistics off.
1382     if (tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->getActiveBlock() != this)
1383         STAT_increment(getThreadId(), getIndex(objectSize), freeToInactiveBlock);
1384     else
1385         STAT_increment(getThreadId(), getIndex(objectSize), freeToActiveBlock);
1386 #endif
1387     if (empty()) {
1388         // If the last object of a slab is freed, the slab cannot be marked full
1389         MALLOC_ASSERT(!isFull, ASSERT_TEXT);
1390         tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->processEmptyBlock(this, /*poolTheBlock=*/true);
1391     } else { // hot path
1392         FreeObject *objectToFree = findObjectToFree(object);
1393         objectToFree->next = freeList;
1394         freeList = objectToFree;
1395         adjustPositionInBin();
1396     }
1397 }
1398 
1399 void Block::freePublicObject (FreeObject *objectToFree)
1400 {
1401     FreeObject* localPublicFreeList{};
1402 
1403     MALLOC_ITT_SYNC_RELEASING(&publicFreeList);
1404 #if FREELIST_NONBLOCKING
1405     // TBB_REVAMP_TODO: make it non atomic in non-blocking scenario
1406     localPublicFreeList = publicFreeList.load(std::memory_order_relaxed);
1407     do {
1408         objectToFree->next = localPublicFreeList;
1409         // no backoff necessary because we are trying to make a change, not waiting for a change
1410     } while( !publicFreeList.compare_exchange_strong(localPublicFreeList, objectToFree) );
1411 #else
1412     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1413     {
1414         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1415         localPublicFreeList = objectToFree->next = publicFreeList;
1416         publicFreeList = objectToFree;
1417     }
1418 #endif
1419 
1420     if( localPublicFreeList==nullptr ) {
1421         // if the block is abandoned, its nextPrivatizable pointer should be UNUSABLE
1422         // otherwise, it should point to the bin the block belongs to.
1423         // reading nextPrivatizable is thread-safe below, because:
1424         // 1) the executing thread atomically got publicFreeList==nullptr and changed it to non-nullptr;
1425         // 2) only the owning thread can change it back to nullptr,
1426         // 3) but that cannot be done until the block is put into the mailbox
1427         // So the executing thread is now the only one that can change nextPrivatizable
1428         Block* next = nextPrivatizable.load(std::memory_order_acquire);
1429         if( !isNotForUse(next) ) {
1430             MALLOC_ASSERT( next!=nullptr, ASSERT_TEXT );
1431             Bin* theBin = (Bin*) next;
1432 #if MALLOC_DEBUG && TBB_REVAMP_TODO
1433             // FIXME: The thread that returns the block is not the block's owner.
1434             // The below assertion compares 'theBin' against the caller's local bin, thus, it always fails.
1435             // Need to find a way to get the correct remote bin for comparison.
1436             { // check that nextPrivatizable points to the bin the block belongs to
1437                 uint32_t index = getIndex( objectSize );
1438                 TLSData* tls = getThreadMallocTLS();
1439                 MALLOC_ASSERT( theBin==tls->bin+index, ASSERT_TEXT );
1440             }
1441 #endif // MALLOC_DEBUG
1442             theBin->addPublicFreeListBlock(this);
1443         }
1444     }
1445     STAT_increment(getThreadId(), ThreadCommonCounters, freeToOtherThread);
1446     STAT_increment(ownerTid.load(std::memory_order_relaxed), getIndex(objectSize), freeByOtherThread);
1447 }
1448 
1449 // Make objects freed by other threads available for use again
1450 void Block::privatizePublicFreeList( bool reset )
1451 {
1452     FreeObject *localPublicFreeList;
1453     // If reset is false, publicFreeList should not be zeroed but set to UNUSABLE
1454     // to properly synchronize with other threads freeing objects to this slab.
1455     const intptr_t endMarker = reset ? 0 : UNUSABLE;
1456 
1457     // Only the owner thread may reset the pointer to nullptr
1458     MALLOC_ASSERT( isOwnedByCurrentThread() || !reset, ASSERT_TEXT );
1459 #if FREELIST_NONBLOCKING
1460     localPublicFreeList = publicFreeList.exchange((FreeObject*)endMarker);
1461 #else
1462     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1463     {
1464         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1465         localPublicFreeList = publicFreeList;
1466         publicFreeList = endMarker;
1467     }
1468 #endif
1469     MALLOC_ITT_SYNC_ACQUIRED(&publicFreeList);
1470     MALLOC_ASSERT( !(reset && isNotForUse(publicFreeList)), ASSERT_TEXT );
1471 
1472     // publicFreeList must have been UNUSABLE or valid, but not nullptr
1473     MALLOC_ASSERT( localPublicFreeList!=nullptr, ASSERT_TEXT );
1474     if( isSolidPtr(localPublicFreeList) ) {
1475         MALLOC_ASSERT( allocatedCount <= (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1476         /* other threads do not change the counter when freeing our objects; decrement it here */
1477         allocatedCount--;
1478         FreeObject *temp = localPublicFreeList;
1479         while( isSolidPtr(temp->next) ){ // the list will end with either nullptr or UNUSABLE
1480             temp = temp->next;
1481             allocatedCount--;
1482             MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1483         }
1484         /* merge with local freeList */
1485         temp->next = freeList;
1486         freeList = localPublicFreeList;
1487         STAT_increment(getThreadId(), getIndex(objectSize), allocPrivatized);
1488     }
1489 }
1490 
1491 void Block::privatizeOrphaned(TLSData *tls, unsigned index)
1492 {
1493     Bin* bin = tls->bin + index;
1494     STAT_increment(getThreadId(), index, allocBlockPublic);
1495     next = nullptr;
1496     previous = nullptr;
1497     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) != nullptr, ASSERT_TEXT );
1498     /* There is no race here since no other thread owns this block */
1499     markOwned(tls);
1500     // It is safe to change nextPrivatizable, as publicFreeList is not null
1501     MALLOC_ASSERT( isNotForUse(nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1502     nextPrivatizable.store((Block*)bin, std::memory_order_relaxed);
1503     // the next call is required to change publicFreeList to 0
1504     privatizePublicFreeList();
1505     if( empty() ) {
1506         restoreBumpPtr();
1507     } else {
1508         adjustFullness(); // check the block fullness and set isFull
1509     }
1510     MALLOC_ASSERT( !isNotForUse(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1511 }
1512 
1513 
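// Attempt to claim an empty publicFreeList by replacing nullptr with UNUSABLE.
// Returns true if this thread made the replacement, i.e. no object has been freed
// to this block by another thread in the meantime.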
1514 bool Block::readyToShare()
1515 {
1516     FreeObject* oldVal = nullptr;
1517 #if FREELIST_NONBLOCKING
1518     publicFreeList.compare_exchange_strong(oldVal, (FreeObject*)UNUSABLE);
1519 #else
1520     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1521     {
1522         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1523         if ( (oldVal=publicFreeList)==nullptr )
1524             (intptr_t&)(publicFreeList) = UNUSABLE;
1525     }
1526 #endif
1527     return oldVal==nullptr;
1528 }
1529 
1530 void Block::shareOrphaned(intptr_t binTag, unsigned index)
1531 {
1532     MALLOC_ASSERT( binTag, ASSERT_TEXT );
1533     // suppress the unreferenced formal parameter warning
1534     tbb::detail::suppress_unused_warning(index);
1535     STAT_increment(getThreadId(), index, freeBlockPublic);
1536     markOrphaned();
1537     if ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1538         // First check passed: the block is not in the mailbox yet.
1539         // Need to set publicFreeList to non-zero, so that other threads
1540         // will not change nextPrivatizable and it can then be zeroed.
1541         if ( !readyToShare() ) {
1542             // another thread freed an object; we need to wait until it finishes.
1543             // There is no need for exponential backoff, as the wait here is not for a lock;
1544             // but we need to yield, so that the thread we wait for has a chance to run.
1545             // TODO: add a pause to also be friendly to hyperthreads
1546             int count = 256;
1547             while ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1548                 if (--count==0) {
1549                     do_yield();
1550                     count = 256;
1551                 }
1552             }
1553         }
1554     }
1555     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) !=nullptr, ASSERT_TEXT );
1556     // now it is safe to change our data
1557     previous = nullptr;
1558     // it is the caller's responsibility to ensure that the list of blocks
1559     // formed by nextPrivatizable pointers is kept consistent if required;
1560     // if this is only called from thread shutdown code, it does not matter.
1561     nextPrivatizable.store((Block*)UNUSABLE, std::memory_order_relaxed);
1562 }
1563 
1564 void Block::cleanBlockHeader()
1565 {
1566     next = nullptr;
1567     previous = nullptr;
1568     freeList = nullptr;
1569     allocatedCount = 0;
1570     isFull = false;
1571     tlsPtr.store(nullptr, std::memory_order_relaxed);
1572 
1573     publicFreeList.store(nullptr, std::memory_order_relaxed);
1574 }
1575 
1576 void Block::initEmptyBlock(TLSData *tls, size_t size)
1577 {
1578     // Having getIndex and getObjectSize called next to each other
1579     // allows better compiler optimization as they basically share the code.
1580     unsigned int index = getIndex(size);
1581     unsigned int objSz = getObjectSize(size);
1582 
1583     cleanBlockHeader();
1584     objectSize = objSz;
1585     markOwned(tls);
1586     // bump pointer should be prepared for the first allocation - thus move it down by objectSize
1587     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1588 
1589     // each block should have the address where the head of the list of "privatizable" blocks is kept
1590     // the only exception is a bootstrap block, which is initialized while TLS is still nullptr
1591     nextPrivatizable.store( tls? (Block*)(tls->bin + index) : nullptr, std::memory_order_relaxed);
1592     TRACEF(( "[ScalableMalloc trace] Empty block %p is initialized, owner is %ld, objectSize is %d, bumpPtr is %p\n",
1593              this, tlsPtr.load(std::memory_order_relaxed) ? getThreadId() : -1, objectSize, bumpPtr ));
1594 }
1595 
1596 Block *OrphanedBlocks::get(TLSData *tls, unsigned int size)
1597 {
1598     // TODO: try to use index from getAllocationBin
1599     unsigned int index = getIndex(size);
1600     Block *block = bins[index].pop();
1601     if (block) {
1602         MALLOC_ITT_SYNC_ACQUIRED(bins+index);
1603         block->privatizeOrphaned(tls, index);
1604     }
1605     return block;
1606 }
1607 
1608 void OrphanedBlocks::put(intptr_t binTag, Block *block)
1609 {
1610     unsigned int index = getIndex(block->getSize());
1611     block->shareOrphaned(binTag, index);
1612     MALLOC_ITT_SYNC_RELEASING(bins+index);
1613     bins[index].push(block);
1614 }
1615 
1616 void OrphanedBlocks::reset()
1617 {
1618     for (uint32_t i=0; i<numBlockBinLimit; i++)
1619         new (bins+i) LifoList();
1620 }
1621 
1622 bool OrphanedBlocks::cleanup(Backend* backend)
1623 {
1624     bool released = false;
1625     for (uint32_t i=0; i<numBlockBinLimit; i++) {
1626         Block* block = bins[i].grab();
1627         MALLOC_ITT_SYNC_ACQUIRED(bins+i);
1628         while (block) {
1629             Block* next = block->next;
1630             block->privatizePublicFreeList( /*reset=*/false ); // do not set publicFreeList to nullptr
1631             if (block->empty()) {
1632                 block->reset();
1633                 // slab blocks in user's pools do not have valid backRefIdx
1634                 if (!backend->inUserPool())
1635                     removeBackRef(*(block->getBackRefIdx()));
1636                 backend->putSlabBlock(block);
1637                 released = true;
1638             } else {
1639                 MALLOC_ITT_SYNC_RELEASING(bins+i);
1640                 bins[i].push(block);
1641             }
1642             block = next;
1643         }
1644     }
1645     return released;
1646 }
1647 
1648 FreeBlockPool::ResOfGet FreeBlockPool::getBlock()
1649 {
1650     Block *b = head.exchange(nullptr);
1651     bool lastAccessMiss;
1652 
1653     if (b) {
1654         size--;
1655         Block *newHead = b->next;
1656         lastAccessMiss = false;
1657         head.store(newHead, std::memory_order_release);
1658     } else {
1659         lastAccessMiss = true;
1660     }
1661     return ResOfGet(b, lastAccessMiss);
1662 }
1663 
1664 void FreeBlockPool::returnBlock(Block *block)
1665 {
1666     MALLOC_ASSERT( size <= POOL_HIGH_MARK, ASSERT_TEXT );
1667     Block *localHead = head.exchange(nullptr);
1668 
1669     if (!localHead) {
1670         size = 0; // head was stolen by externalCleanup, correct size accordingly
1671     } else if (size == POOL_HIGH_MARK) {
1672         // release the cold blocks and add the hot one:
1673         // keep POOL_LOW_MARK-1 blocks and add the new block to the head
1674         Block *headToFree = localHead, *helper;
1675         for (int i=0; i<POOL_LOW_MARK-2; i++)
1676             headToFree = headToFree->next;
1677         Block *last = headToFree;
1678         headToFree = headToFree->next;
1679         last->next = nullptr;
1680         size = POOL_LOW_MARK-1;
1681         for (Block *currBl = headToFree; currBl; currBl = helper) {
1682             helper = currBl->next;
1683             // slab blocks in user's pools do not have valid backRefIdx
1684             if (!backend->inUserPool())
1685                 removeBackRef(currBl->backRefIdx);
1686             backend->putSlabBlock(currBl);
1687         }
1688     }
1689     size++;
1690     block->next = localHead;
1691     head.store(block, std::memory_order_release);
1692 }
1693 
1694 bool FreeBlockPool::externalCleanup()
1695 {
1696     Block *helper;
1697     bool released = false;
1698 
1699     for (Block *currBl=head.exchange(nullptr); currBl; currBl=helper) {
1700         helper = currBl->next;
1701         // slab blocks in user's pools do not have valid backRefIdx
1702         if (!backend->inUserPool())
1703             removeBackRef(currBl->backRefIdx);
1704         backend->putSlabBlock(currBl);
1705         released = true;
1706     }
1707     return released;
1708 }
1709 
1710 /* Prepare the block for returning to FreeBlockPool */
1711 void Block::reset()
1712 {
1713     // it is the caller's responsibility to ensure no data is lost before calling this
1714     MALLOC_ASSERT( allocatedCount==0, ASSERT_TEXT );
1715     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1716     if (!isStartupAllocObject())
1717         STAT_increment(getThreadId(), getIndex(objectSize), freeBlockBack);
1718 
1719     cleanBlockHeader();
1720 
1721     nextPrivatizable.store(nullptr, std::memory_order_relaxed);
1722 
1723     objectSize = 0;
1724     // for an empty block, bump pointer should point right after the end of the block
1725     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize);
1726 }
1727 
1728 inline void Bin::setActiveBlock (Block *block)
1729 {
1730 //    MALLOC_ASSERT( bin, ASSERT_TEXT );
1731     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1732     // it is the caller's responsibility to keep the bin consistent (i.e. ensure this block is in the bin's list)
1733     activeBlk = block;
1734 }
1735 
1736 inline Block* Bin::setPreviousBlockActive()
1737 {
1738     MALLOC_ASSERT( activeBlk, ASSERT_TEXT );
1739     Block* temp = activeBlk->previous;
1740     if( temp ) {
1741         MALLOC_ASSERT( !(temp->isFull), ASSERT_TEXT );
1742         activeBlk = temp;
1743     }
1744     return temp;
1745 }
1746 
1747 inline bool Block::isOwnedByCurrentThread() const {
1748     return tlsPtr.load(std::memory_order_relaxed) && ownerTid.isCurrentThreadId();
1749 }
1750 
1751 FreeObject *Block::findObjectToFree(const void *object) const
1752 {
1753     FreeObject *objectToFree;
1754     // Due to aligned allocations, a pointer passed to scalable_free
1755     // might differ from the address of the internally allocated object.
1756     // Small objects, however, are always fine.
1757     if (objectSize <= maxSegregatedObjectSize)
1758         objectToFree = (FreeObject*)object;
1759     // "Fitting size" allocations are suspicious if aligned higher than naturally
1760     else {
1761         if ( ! isAligned(object,2*fittingAlignment) )
1762             // TODO: the above check is questionable - it gives false negatives in ~50% cases,
1763             //       so it might even be slower on average than unconditional use of findAllocatedObject.
1764             // here it should be a "real" object
1765             objectToFree = (FreeObject*)object;
1766         else
1767             // here the object can be an aligned address, so apply additional checks
1768             objectToFree = findAllocatedObject(object);
1769         MALLOC_ASSERT( isAligned(objectToFree,fittingAlignment), ASSERT_TEXT );
1770     }
1771     MALLOC_ASSERT( isProperlyPlaced(objectToFree), ASSERT_TEXT );
1772 
1773     return objectToFree;
1774 }
1775 
1776 void TLSData::release()
1777 {
1778     memPool->extMemPool.allLocalCaches.unregisterThread(this);
1779     externalCleanup(/*cleanOnlyUnused=*/false, /*cleanBins=*/false);
1780 
1781     for (unsigned index = 0; index < numBlockBins; index++) {
1782         Block *activeBlk = bin[index].getActiveBlock();
1783         if (!activeBlk)
1784             continue;
1785         Block *threadlessBlock = activeBlk->previous;
1786         bool syncOnMailbox = false;
1787         while (threadlessBlock) {
1788             Block *threadBlock = threadlessBlock->previous;
1789             if (threadlessBlock->empty()) {
1790                 /* the thread is being destroyed, so do not use its block pool */
1791                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1792             } else {
1793                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1794                 syncOnMailbox = true;
1795             }
1796             threadlessBlock = threadBlock;
1797         }
1798         threadlessBlock = activeBlk;
1799         while (threadlessBlock) {
1800             Block *threadBlock = threadlessBlock->next;
1801             if (threadlessBlock->empty()) {
1802                 /* the thread is being destroyed, so do not use its block pool */
1803                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1804             } else {
1805                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1806                 syncOnMailbox = true;
1807             }
1808             threadlessBlock = threadBlock;
1809         }
1810         bin[index].resetActiveBlock();
1811 
1812         if (syncOnMailbox) {
1813             // Although we synchronized on nextPrivatizable inside a block, we still need to
1814             // synchronize on the bin lifetime because the thread releasing an object into the public
1815             // free list is touching the bin (mailbox and mailLock)
1816             MallocMutex::scoped_lock scoped_cs(bin[index].mailLock);
1817         }
1818     }
1819 }
1820 
1821 
1822 #if MALLOC_CHECK_RECURSION
1823 // TODO: Use dedicated heap for this
1824 
1825 /*
1826  * It's a special kind of allocation that can be used when malloc is
1827  * not available (either during startup or when malloc was already called and
1828  * we are, say, inside pthread_setspecific's call).
1829  * A block can contain objects of different sizes.
1830  * Allocation is performed by moving the bump pointer and increasing the object counter;
1831  * releasing is done via the counter of objects allocated in the block,
1832  * or by moving the bump pointer if the released object is at the boundary.
1833  * TODO: make the bump pointer grow backward, in the same direction as all the others.
1834  */
1835 
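// Each object handed out by StartupBlock::allocate() is laid out as a size_t holding the
// requested size immediately followed by the user payload; msize() reads that size back
// from the negative offset.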
1836 class StartupBlock : public Block {
1837     size_t availableSize() const {
1838         return slabSize - ((uintptr_t)bumpPtr - (uintptr_t)this);
1839     }
1840     static StartupBlock *getBlock();
1841 public:
1842     static FreeObject *allocate(size_t size);
1843     static size_t msize(void *ptr) { return *((size_t*)ptr - 1); }
1844     void free(void *ptr);
1845 };
1846 
1847 static MallocMutex startupMallocLock;
1848 static StartupBlock *firstStartupBlock;
1849 
1850 StartupBlock *StartupBlock::getBlock()
1851 {
1852     BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/false);
1853     if (backRefIdx.isInvalid()) return nullptr;
1854 
1855     StartupBlock *block = static_cast<StartupBlock*>(
1856         defaultMemPool->extMemPool.backend.getSlabBlock(1));
1857     if (!block) return nullptr;
1858 
1859     block->cleanBlockHeader();
1860     setBackRef(backRefIdx, block);
1861     block->backRefIdx = backRefIdx;
1862     // use startupAllocObjSizeMark to mark objects from startup block marker
1863     // use startupAllocObjSizeMark to mark objects coming from a startup block
1864     block->bumpPtr = (FreeObject *)((uintptr_t)block + sizeof(StartupBlock));
1865     return block;
1866 }
1867 
1868 FreeObject *StartupBlock::allocate(size_t size)
1869 {
1870     FreeObject *result;
1871     StartupBlock *newBlock = nullptr;
1872 
1873     /* Objects must be aligned on their natural bounds,
1874        and objects bigger than a word on a word boundary. */
1875     size = alignUp(size, sizeof(size_t));
1876     // We need the size of an object to implement msize.
1877     size_t reqSize = size + sizeof(size_t);
1878     {
1879         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1880         // Re-check whether we need a new block (conditions might have changed)
1881         if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) {
1882             if (!newBlock) {
1883                 newBlock = StartupBlock::getBlock();
1884                 if (!newBlock) return nullptr;
1885             }
1886             newBlock->next = (Block*)firstStartupBlock;
1887             if (firstStartupBlock)
1888                 firstStartupBlock->previous = (Block*)newBlock;
1889             firstStartupBlock = newBlock;
1890         }
1891         result = firstStartupBlock->bumpPtr;
1892         firstStartupBlock->allocatedCount++;
1893         firstStartupBlock->bumpPtr =
1894             (FreeObject *)((uintptr_t)firstStartupBlock->bumpPtr + reqSize);
1895     }
1896 
1897     // keep object size at the negative offset
1898     *((size_t*)result) = size;
1899     return (FreeObject*)((size_t*)result+1);
1900 }
1901 
1902 void StartupBlock::free(void *ptr)
1903 {
1904     Block* blockToRelease = nullptr;
1905     {
1906         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1907 
1908         MALLOC_ASSERT(firstStartupBlock, ASSERT_TEXT);
1909         MALLOC_ASSERT(startupAllocObjSizeMark==objectSize
1910                       && allocatedCount>0, ASSERT_TEXT);
1911         MALLOC_ASSERT((uintptr_t)ptr>=(uintptr_t)this+sizeof(StartupBlock)
1912                       && (uintptr_t)ptr+StartupBlock::msize(ptr)<=(uintptr_t)this+slabSize,
1913                       ASSERT_TEXT);
1914         if (0 == --allocatedCount) {
1915             if (this == firstStartupBlock)
1916                 firstStartupBlock = (StartupBlock*)firstStartupBlock->next;
1917             if (previous)
1918                 previous->next = next;
1919             if (next)
1920                 next->previous = previous;
1921             blockToRelease = this;
1922         } else if ((uintptr_t)ptr + StartupBlock::msize(ptr) == (uintptr_t)bumpPtr) {
1923             // last object in the block released
1924             FreeObject *newBump = (FreeObject*)((size_t*)ptr - 1);
1925             MALLOC_ASSERT((uintptr_t)newBump>(uintptr_t)this+sizeof(StartupBlock),
1926                           ASSERT_TEXT);
1927             bumpPtr = newBump;
1928         }
1929     }
1930     if (blockToRelease) {
1931         blockToRelease->previous = blockToRelease->next = nullptr;
1932         defaultMemPool->returnEmptyBlock(blockToRelease, /*poolTheBlock=*/false);
1933     }
1934 }
1935 
1936 #endif /* MALLOC_CHECK_RECURSION */
1937 
1938 /********* End thread related code  *************/
1939 
1940 /********* Library initialization *************/
1941 
1942 //! Value indicating the state of initialization.
1943 /* 0 = initialization not started.
1944  * 1 = initialization started but not finished.
1945  * 2 = initialization finished.
1946  * In theory, we only need values 0 and 2. But value 1 is nonetheless
1947  * useful for detecting errors in the double-check pattern.
1948  */
1949 static std::atomic<intptr_t> mallocInitialized{0};   // implicitly initialized to 0
1950 static MallocMutex initMutex;
1951 
1952 /** The leading "\0" is here so that applying "strings" to the binary
1953     delivers a clean result. */
1954 static char VersionString[] = "\0" TBBMALLOC_VERSION_STRINGS;
1955 
1956 #if USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED
1957 
1958 /* Decrease the race interval between dynamic library unloading and the pthread key
1959    destructor. Protect only Pthreads with supported unloading. */
1960 class ShutdownSync {
1961 /* flag is the number of threads in pthread key dtor body
1962    (i.e., between threadDtorStart() and threadDtorDone())
1963    or the signal to skip dtor, if flag < 0 */
1964     std::atomic<intptr_t> flag;
1965     static const intptr_t skipDtor = INTPTR_MIN/2;
1966 public:
1967     void init() { flag.store(0, std::memory_order_release); }
1968 /* Suppose that 2*abs(skipDtor) or more threads never call threadDtorStart()
1969    simultaneously, so flag never becomes negative because of that. */
1970     bool threadDtorStart() {
1971         if (flag.load(std::memory_order_acquire) < 0)
1972             return false;
1973         if (++flag <= 0) { // note that the new value is returned
1974             flag.fetch_sub(1); // flag is spoiled by us, restore it
1975             return false;
1976         }
1977         return true;
1978     }
1979     void threadDtorDone() {
1980         flag.fetch_sub(1);
1981     }
1982     void processExit() {
1983         if (flag.fetch_add(skipDtor) != 0) {
1984             SpinWaitUntilEq(flag, skipDtor);
1985         }
1986     }
1987 };
1988 
1989 #else
1990 
1991 class ShutdownSync {
1992 public:
1993     void init() { }
1994     bool threadDtorStart() { return true; }
1995     void threadDtorDone() { }
1996     void processExit() { }
1997 };
1998 
1999 #endif // USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED
2000 
2001 static ShutdownSync shutdownSync;
2002 
2003 inline bool isMallocInitialized() {
2004     // Load must have acquire fence; otherwise a thread taking the "initialized" path
2005     // might perform textually later loads *before* mallocInitialized becomes 2.
2006     return 2 == mallocInitialized.load(std::memory_order_acquire);
2007 }
2008 
2009 /* Caller is responsible for ensuring this routine is called exactly once. */
2010 extern "C" void MallocInitializeITT() {
2011 #if __TBB_USE_ITT_NOTIFY
2012     if (!usedBySrcIncluded)
2013         tbb::detail::r1::__TBB_load_ittnotify();
2014 #endif
2015 }
2016 
2017 void MemoryPool::initDefaultPool() {
2018     hugePages.init();
2019 }
2020 
2021 /*
2022  * Allocator initialization routine;
2023  * it is called lazily on the very first scalable_malloc call.
2024  */
2025 static bool initMemoryManager()
2026 {
2027     TRACEF(( "[ScalableMalloc trace] sizeof(Block) is %d (expected 128); sizeof(uintptr_t) is %d\n",
2028              sizeof(Block), sizeof(uintptr_t) ));
2029     MALLOC_ASSERT( 2*blockHeaderAlignment == sizeof(Block), ASSERT_TEXT );
2030     MALLOC_ASSERT( sizeof(FreeObject) == sizeof(void*), ASSERT_TEXT );
2031     MALLOC_ASSERT( isAligned(defaultMemPool, sizeof(intptr_t)),
2032                    "Memory pool must be void*-aligned for atomic to work over aligned arguments.");
2033 
2034 #if USE_WINTHREAD
2035     const size_t granularity = 64*1024; // granularity of VirtualAlloc
2036 #else
2037     // POSIX.1-2001-compliant way to get page size
2038     const size_t granularity = sysconf(_SC_PAGESIZE);
2039 #endif
2040     if (!defaultMemPool) {
2041         // Do not rely on static constructors; do the assignment explicitly, in case
2042         // the library's static data section has not been initialized yet at this call.
2043         defaultMemPool = (MemoryPool*)defaultMemPool_space;
2044     }
2045     bool initOk = defaultMemPool->
2046         extMemPool.init(0, nullptr, nullptr, granularity,
2047                         /*keepAllMemory=*/false, /*fixedPool=*/false);
2048 // TODO: make extMemPool.init() not allocate memory
2049     if (!initOk || !initBackRefMain(&defaultMemPool->extMemPool.backend) || !ThreadId::init())
2050         return false;
2051     MemoryPool::initDefaultPool();
2052     // init() is required iff initMemoryManager() is called
2053     // after mallocProcessShutdownNotification()
2054     shutdownSync.init();
2055 #if COLLECT_STATISTICS
2056     initStatisticsCollection();
2057 #endif
2058     return true;
2059 }
2060 
2061 static bool GetBoolEnvironmentVariable(const char* name) {
2062     return tbb::detail::r1::GetBoolEnvironmentVariable(name);
2063 }
2064 
2065 //! Ensures that initMemoryManager() is called once and only once.
2066 /** Does not return until initMemoryManager() has been completed by a thread.
2067     There is no need to call this routine if mallocInitialized==2. */
2068 static bool doInitialization()
2069 {
2070     MallocMutex::scoped_lock lock( initMutex );
2071     if (mallocInitialized.load(std::memory_order_relaxed)!=2) {
2072         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==0, ASSERT_TEXT );
2073         mallocInitialized.store(1, std::memory_order_relaxed);
2074         RecursiveMallocCallProtector scoped;
2075         if (!initMemoryManager()) {
2076             mallocInitialized.store(0, std::memory_order_relaxed); // restore the flag and bail out
2077             return false;
2078         }
2079 #ifdef  MALLOC_EXTRA_INITIALIZATION
2080         MALLOC_EXTRA_INITIALIZATION;
2081 #endif
2082 #if MALLOC_CHECK_RECURSION
2083         RecursiveMallocCallProtector::detectNaiveOverload();
2084 #endif
2085         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==1, ASSERT_TEXT );
2086         // Store must have release fence, otherwise mallocInitialized==2
2087         // might become remotely visible before side effects of
2088         // initMemoryManager() become remotely visible.
2089         mallocInitialized.store(2, std::memory_order_release);
2090         if( GetBoolEnvironmentVariable("TBB_VERSION") ) {
2091             fputs(VersionString+1,stderr);
2092             hugePages.printStatus();
2093         }
2094     }
2095     /* It can't be 0 or I would have initialized it */
2096     MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==2, ASSERT_TEXT );
2097     return true;
2098 }
2099 
2100 /********* End library initialization *************/
2101 
2102 /********* The malloc show begins     *************/
2103 
2104 
2105 FreeObject *Block::allocateFromFreeList()
2106 {
2107     FreeObject *result;
2108 
2109     if (!freeList) return nullptr;
2110 
2111     result = freeList;
2112     MALLOC_ASSERT( result, ASSERT_TEXT );
2113 
2114     freeList = result->next;
2115     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2116     allocatedCount++;
2117     STAT_increment(getThreadId(), getIndex(objectSize), allocFreeListUsed);
2118 
2119     return result;
2120 }
2121 
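// The bump pointer grows downward: it starts near the end of the slab and moves toward the
// Block header; once it would cross into the header area, bump allocation is exhausted.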
2122 FreeObject *Block::allocateFromBumpPtr()
2123 {
2124     FreeObject *result = bumpPtr;
2125     if (result) {
2126         bumpPtr = (FreeObject *) ((uintptr_t) bumpPtr - objectSize);
2127         if ( (uintptr_t)bumpPtr < (uintptr_t)this+sizeof(Block) ) {
2128             bumpPtr = nullptr;
2129         }
2130         MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2131         allocatedCount++;
2132         STAT_increment(getThreadId(), getIndex(objectSize), allocBumpPtrUsed);
2133     }
2134     return result;
2135 }
2136 
2137 inline FreeObject* Block::allocate()
2138 {
2139     MALLOC_ASSERT( isOwnedByCurrentThread(), ASSERT_TEXT );
2140 
2141     /* for better cache locality, look in the free list first. */
2142     if ( FreeObject *result = allocateFromFreeList() ) {
2143         return result;
2144     }
2145     MALLOC_ASSERT( !freeList, ASSERT_TEXT );
2146 
2147     /* if the free list is empty, try thread-local bump-pointer allocation. */
2148     if ( FreeObject *result = allocateFromBumpPtr() ) {
2149         return result;
2150     }
2151     MALLOC_ASSERT( !bumpPtr, ASSERT_TEXT );
2152 
2153     /* the block is considered full. */
2154     isFull = true;
2155     return nullptr;
2156 }
2157 
2158 size_t Block::findObjectSize(void *object) const
2159 {
2160     size_t blSize = getSize();
2161 #if MALLOC_CHECK_RECURSION
2162     // Currently, there are no aligned allocations from startup blocks,
2163     // so we can return just StartupBlock::msize().
2164     // TODO: This must be extended if we add aligned allocation from startup blocks.
2165     if (!blSize)
2166         return StartupBlock::msize(object);
2167 #endif
2168     // the object can be aligned, so its real size can be less than the block's object size
2169     size_t size =
2170         blSize - ((uintptr_t)object - (uintptr_t)findObjectToFree(object));
2171     MALLOC_ASSERT(size>0 && size<minLargeObjectSize, ASSERT_TEXT);
2172     return size;
2173 }
2174 
2175 void Bin::moveBlockToFront(Block *block)
2176 {
2177     /* move the block to the front of the bin */
2178     if (block == activeBlk) return;
2179     outofTLSBin(block);
2180     pushTLSBin(block);
2181 }
2182 
2183 void Bin::processEmptyBlock(Block *block, bool poolTheBlock)
2184 {
2185     if (block != activeBlk) {
2186         /* We are not using this block; return it to the pool */
2187         outofTLSBin(block);
2188         block->getMemPool()->returnEmptyBlock(block, poolTheBlock);
2189     } else {
2190         /* all objects are free - let's restore the bump pointer */
2191         block->restoreBumpPtr();
2192     }
2193 }
2194 
2195 template<int LOW_MARK, int HIGH_MARK>
2196 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool)
2197 {
2198     const size_t size = object->unalignedSize;
2199     // do not spoil the cache with too large an object, as that can cause its total cleanup
2200     if (size > MAX_TOTAL_SIZE)
2201         return false;
2202     LargeMemoryBlock *localHead = head.exchange(nullptr);
2203 
2204     object->prev = nullptr;
2205     object->next = localHead;
2206     if (localHead)
2207         localHead->prev = object;
2208     else {
2209         // these might not have been reset when the local cache was stolen; correct them
2210         totalSize = 0;
2211         numOfBlocks = 0;
2212         tail = object;
2213     }
2214     localHead = object;
2215     totalSize += size;
2216     numOfBlocks++;
2217     // must meet constraints on both the size and the number of cached objects
2218     if (totalSize > MAX_TOTAL_SIZE || numOfBlocks >= HIGH_MARK) {
2219         // scan from the tail until the conditions are met
2220         while (totalSize > MAX_TOTAL_SIZE || numOfBlocks > LOW_MARK) {
2221             totalSize -= tail->unalignedSize;
2222             numOfBlocks--;
2223             tail = tail->prev;
2224         }
2225         LargeMemoryBlock *headToRelease = tail->next;
2226         tail->next = nullptr;
2227 
2228         extMemPool->freeLargeObjectList(headToRelease);
2229     }
2230 
2231     head.store(localHead, std::memory_order_release);
2232     return true;
2233 }
2234 
2235 template<int LOW_MARK, int HIGH_MARK>
2236 LargeMemoryBlock *LocalLOCImpl<LOW_MARK, HIGH_MARK>::get(size_t size)
2237 {
2238     LargeMemoryBlock *localHead, *res = nullptr;
2239 
2240     if (size > MAX_TOTAL_SIZE)
2241         return nullptr;
2242 
2243     // TBB_REVAMP_TODO: review this line
2244     if (!head.load(std::memory_order_acquire) || (localHead = head.exchange(nullptr)) == nullptr) {
2245         // do not restore totalSize, numOfBlocks and tail at this point,
2246         // as they are used only in put(), where they must be restored
2247         return nullptr;
2248     }
2249 
2250     for (LargeMemoryBlock *curr = localHead; curr; curr=curr->next) {
2251         if (curr->unalignedSize == size) {
2252             res = curr;
2253             if (curr->next)
2254                 curr->next->prev = curr->prev;
2255             else
2256                 tail = curr->prev;
2257             if (curr != localHead)
2258                 curr->prev->next = curr->next;
2259             else
2260                 localHead = curr->next;
2261             totalSize -= size;
2262             numOfBlocks--;
2263             break;
2264         }
2265     }
2266 
2267     head.store(localHead, std::memory_order_release);
2268     return res;
2269 }
2270 
2271 template<int LOW_MARK, int HIGH_MARK>
2272 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::externalCleanup(ExtMemoryPool *extMemPool)
2273 {
2274     if (LargeMemoryBlock *localHead = head.exchange(nullptr)) {
2275         extMemPool->freeLargeObjectList(localHead);
2276         return true;
2277     }
2278     return false;
2279 }
2280 
2281 void *MemoryPool::getFromLLOCache(TLSData* tls, size_t size, size_t alignment)
2282 {
2283     LargeMemoryBlock *lmb = nullptr;
2284 
2285     size_t headersSize = sizeof(LargeMemoryBlock)+sizeof(LargeObjectHdr);
2286     size_t allocationSize = LargeObjectCache::alignToBin(size+headersSize+alignment);
2287     if (allocationSize < size) // allocationSize is wrapped around after alignToBin
2288         return nullptr;
2289     MALLOC_ASSERT(allocationSize >= alignment, "Overflow must be checked before.");
2290 
2291     if (tls) {
2292         tls->markUsed();
2293         lmb = tls->lloc.get(allocationSize);
2294     }
2295     if (!lmb)
2296         lmb = extMemPool.mallocLargeObject(this, allocationSize);
2297 
2298     if (lmb) {
2299         // when shuffling, we assume that the alignment offset guarantees
2300         // that different cache lines are in use
2301         MALLOC_ASSERT(alignment >= estimatedCacheLineSize, ASSERT_TEXT);
2302 
2303         void *alignedArea = (void*)alignUp((uintptr_t)lmb+headersSize, alignment);
2304         uintptr_t alignedRight =
2305             alignDown((uintptr_t)lmb+lmb->unalignedSize - size, alignment);
2306         // Is there room to shuffle the object between cache lines?
2307         // Note that alignedRight and alignedArea are aligned at alignment.
2308         unsigned ptrDelta = alignedRight - (uintptr_t)alignedArea;
2309         if (ptrDelta && tls) { // !tls is cold path
2310             // for the hot path of alignment==estimatedCacheLineSize,
2311             // allow compilers to use shift for division
2312             // (since estimatedCacheLineSize is a power-of-2 constant)
2313             unsigned numOfPossibleOffsets = alignment == estimatedCacheLineSize?
2314                   ptrDelta / estimatedCacheLineSize :
2315                   ptrDelta / alignment;
2316             unsigned myCacheIdx = ++tls->currCacheIdx;
2317             unsigned offset = myCacheIdx % numOfPossibleOffsets;
2318 
2319             // Move object to a cache line with an offset that is different from
2320             // the previous allocation's. This supposedly allows us to use cache
2321             // associativity more efficiently.
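            // E.g., if ptrDelta equals 4*alignment, numOfPossibleOffsets is 4, so consecutive
            // allocations from this thread rotate among four distinct aligned offsets.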
2322             alignedArea = (void*)((uintptr_t)alignedArea + offset*alignment);
2323         }
2324         MALLOC_ASSERT((uintptr_t)lmb+lmb->unalignedSize >=
2325                       (uintptr_t)alignedArea+size, "Object doesn't fit the block.");
2326         LargeObjectHdr *header = (LargeObjectHdr*)alignedArea-1;
2327         header->memoryBlock = lmb;
2328         header->backRefIdx = lmb->backRefIdx;
2329         setBackRef(header->backRefIdx, header);
2330 
2331         lmb->objectSize = size;
2332 
2333         MALLOC_ASSERT( isLargeObject<unknownMem>(alignedArea), ASSERT_TEXT );
2334         MALLOC_ASSERT( isAligned(alignedArea, alignment), ASSERT_TEXT );
2335 
2336         return alignedArea;
2337     }
2338     return nullptr;
2339 }
2340 
2341 void MemoryPool::putToLLOCache(TLSData *tls, void *object)
2342 {
2343     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2344     // overwrite backRefIdx to simplify double free detection
2345     header->backRefIdx = BackRefIdx();
2346 
2347     if (tls) {
2348         tls->markUsed();
2349         if (tls->lloc.put(header->memoryBlock, &extMemPool))
2350             return;
2351     }
2352     extMemPool.freeLargeObject(header->memoryBlock);
2353 }
2354 
2355 /*
2356  * All aligned allocations fall into one of the following categories:
2357  * 1. if both the request size and the alignment are <= maxSegregatedObjectSize,
2358  *       we just align the size up and request this amount, because for every size
2359  *       aligned to some power of 2, the allocated object is at least that aligned.
2360  * 2. for size < minLargeObjectSize, check whether the already guaranteed fittingAlignment is enough.
2361  * 3. if size+alignment < minLargeObjectSize, we take an object of fittingSizeN and align
2362  *       its address up; given such a pointer, scalable_free can find the real object.
2363  *       Wrapping of size+alignment is impossible because the maximal allowed
2364  *       alignment plus minLargeObjectSize can't lead to wrapping.
2365  * 4. otherwise, an aligned large object is allocated.
2366  */
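// For example: a request below minLargeObjectSize but above maxSegregatedObjectSize with an
// alignment not exceeding fittingAlignment falls into category 2 and is served by a plain
// internalPoolMalloc; the same request with a larger alignment (while size+alignment still fits
// under minLargeObjectSize) falls into category 3, over-allocating by `alignment` bytes and
// aligning the returned pointer up.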
2367 static void *allocateAligned(MemoryPool *memPool, size_t size, size_t alignment)
2368 {
2369     MALLOC_ASSERT( isPowerOfTwo(alignment), ASSERT_TEXT );
2370 
2371     if (!isMallocInitialized())
2372         if (!doInitialization())
2373             return nullptr;
2374 
2375     void *result;
2376     if (size<=maxSegregatedObjectSize && alignment<=maxSegregatedObjectSize)
2377         result = internalPoolMalloc(memPool, alignUp(size? size: sizeof(size_t), alignment));
2378     else if (size<minLargeObjectSize) {
2379         if (alignment<=fittingAlignment)
2380             result = internalPoolMalloc(memPool, size);
2381         else if (size+alignment < minLargeObjectSize) {
2382             void *unaligned = internalPoolMalloc(memPool, size+alignment);
2383             if (!unaligned) return nullptr;
2384             result = alignUp(unaligned, alignment);
2385         } else
2386             goto LargeObjAlloc;
2387     } else {
2388     LargeObjAlloc:
2389         TLSData *tls = memPool->getTLS(/*create=*/true);
2390         // take into account only alignments that are higher than the natural one
2391         result =
2392             memPool->getFromLLOCache(tls, size, largeObjectAlignment>alignment?
2393                                                largeObjectAlignment: alignment);
2394     }
2395 
2396     MALLOC_ASSERT( isAligned(result, alignment), ASSERT_TEXT );
2397     return result;
2398 }
2399 
2400 static void *reallocAligned(MemoryPool *memPool, void *ptr,
2401                             size_t newSize, size_t alignment = 0)
2402 {
2403     void *result;
2404     size_t copySize;
2405 
2406     if (isLargeObject<ourMem>(ptr)) {
2407         LargeMemoryBlock* lmb = ((LargeObjectHdr *)ptr - 1)->memoryBlock;
2408         copySize = lmb->unalignedSize-((uintptr_t)ptr-(uintptr_t)lmb);
2409 
2410         // Apply different strategies if size decreases
2411         if (newSize <= copySize && (0 == alignment || isAligned(ptr, alignment))) {
2412 
2413             // For huge objects (that do not fit in the backend cache), keep the same space unless
2414             // the new size is less than half of the current one
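            // E.g., with a ~1 GB huge block, shrinking to 600 MB keeps the block in place,
            // while shrinking it to 512 MB or less triggers a real reallocation.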
2415             bool isMemoryBlockHuge = copySize > memPool->extMemPool.backend.getMaxBinnedSize();
2416             size_t threshold = isMemoryBlockHuge ? copySize / 2 : 0;
2417             if (newSize > threshold) {
2418                 lmb->objectSize = newSize;
2419                 return ptr;
2420             }
2421             // TODO: For large objects suitable for the backend cache,
2422             // split off the excess part and return it to the backend.
2423         }
2424         // Reallocate for real
2425         copySize = lmb->objectSize;
2426 #if BACKEND_HAS_MREMAP
2427         if (void *r = memPool->extMemPool.remap(ptr, copySize, newSize,
2428                           alignment < largeObjectAlignment ? largeObjectAlignment : alignment))
2429             return r;
2430 #endif
2431         result = alignment ? allocateAligned(memPool, newSize, alignment) :
2432             internalPoolMalloc(memPool, newSize);
2433 
2434     } else {
2435         Block* block = (Block *)alignDown(ptr, slabSize);
2436         copySize = block->findObjectSize(ptr);
2437 
2438         // TODO: Move object to another bin if size decreases and the current bin is "empty enough".
2439         // Currently, if the size decreases, the old pointer is returned
2440         if (newSize <= copySize && (0==alignment || isAligned(ptr, alignment))) {
2441             return ptr;
2442         } else {
2443             result = alignment ? allocateAligned(memPool, newSize, alignment) :
2444                 internalPoolMalloc(memPool, newSize);
2445         }
2446     }
2447     if (result) {
2448         memcpy(result, ptr, copySize < newSize ? copySize : newSize);
2449         internalPoolFree(memPool, ptr, 0);
2450     }
2451     return result;
2452 }
2453 
2454 #if MALLOC_DEBUG
2455 /* A predicate that checks whether an object is properly placed inside its block */
2456 inline bool Block::isProperlyPlaced(const void *object) const
2457 {
2458     return 0 == ((uintptr_t)this + slabSize - (uintptr_t)object) % objectSize;
2459 }
2460 #endif
2461 
2462 /* Finds the real object inside the block */
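// For example, assuming objectSize == 128: if `address` points 64 bytes past the start of the
// object, then offset % objectSize == 64 and the address is moved back by 128-64 == 64 bytes.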
2463 FreeObject *Block::findAllocatedObject(const void *address) const
2464 {
2465     // calculate offset from the end of the block space
2466     uint16_t offset = (uintptr_t)this + slabSize - (uintptr_t)address;
2467     MALLOC_ASSERT( offset<=slabSize-sizeof(Block), ASSERT_TEXT );
2468     // find offset difference from a multiple of allocation size
2469     offset %= objectSize;
2470     // and move the address down to where the real object starts.
2471     return (FreeObject*)((uintptr_t)address - (offset? objectSize-offset: 0));
2472 }
2473 
2474 /*
2475  * A bad dereference caused by a foreign pointer is possible only here, not earlier in the call chain.
2476  * A separate function isolates the SEH code, as it has a bad influence on compiler optimization.
2477  */
2478 static inline BackRefIdx safer_dereference (const BackRefIdx *ptr)
2479 {
2480     BackRefIdx id;
2481 #if _MSC_VER
2482     __try {
2483 #endif
2484         id = dereference(ptr);
2485 #if _MSC_VER
2486     } __except( GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION?
2487                 EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH ) {
2488         id = BackRefIdx();
2489     }
2490 #endif
2491     return id;
2492 }
2493 
2494 template<MemoryOrigin memOrigin>
2495 bool isLargeObject(void *object)
2496 {
2497     if (!isAligned(object, largeObjectAlignment))
2498         return false;
2499     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2500     BackRefIdx idx = (memOrigin == unknownMem) ?
2501         safer_dereference(&header->backRefIdx) : dereference(&header->backRefIdx);
2502 
2503     return idx.isLargeObject()
2504         // in a valid LargeObjectHdr, memoryBlock is not nullptr
2505         && header->memoryBlock
2506         // in a valid LargeObjectHdr, memoryBlock points somewhere before the header
2507         // TODO: more strict check
2508         && (uintptr_t)header->memoryBlock < (uintptr_t)header
2509         && getBackRef(idx) == header;
2510 }
2511 
2512 static inline bool isSmallObject (void *ptr)
2513 {
2514     Block* expectedBlock = (Block*)alignDown(ptr, slabSize);
2515     const BackRefIdx* idx = expectedBlock->getBackRefIdx();
2516 
2517     bool isSmall = expectedBlock == getBackRef(safer_dereference(idx));
2518     if (isSmall)
2519         expectedBlock->checkFreePrecond(ptr);
2520     return isSmall;
2521 }
2522 
2523 /**** Check if an object was allocated by scalable_malloc ****/
2524 static inline bool isRecognized (void* ptr)
2525 {
2526     return defaultMemPool->extMemPool.backend.ptrCanBeValid(ptr) &&
2527         (isLargeObject<unknownMem>(ptr) || isSmallObject(ptr));
2528 }
2529 
2530 static inline void freeSmallObject(void *object)
2531 {
2532     /* mask low bits to get the block */
2533     Block *block = (Block *)alignDown(object, slabSize);
2534     block->checkFreePrecond(object);
2535 
2536 #if MALLOC_CHECK_RECURSION
2537     if (block->isStartupAllocObject()) {
2538         ((StartupBlock *)block)->free(object);
2539         return;
2540     }
2541 #endif
2542     if (block->isOwnedByCurrentThread()) {
2543         block->freeOwnObject(object);
2544     } else { /* Slower path to add to the shared list; the allocatedCount is updated by the owner thread in malloc. */
2545         FreeObject *objectToFree = block->findObjectToFree(object);
2546         block->freePublicObject(objectToFree);
2547     }
2548 }
2549 
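// Small-object allocation falls back through several sources, in order: the bin's active
// blocks, a block whose public free list can be privatized, an orphaned block discarded by
// another thread, and finally a fresh empty block from the backend. Requests of
// minLargeObjectSize and above go straight to the large-object cache.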
2550 static void *internalPoolMalloc(MemoryPool* memPool, size_t size)
2551 {
2552     Bin* bin;
2553     Block * mallocBlock;
2554 
2555     if (!memPool) return nullptr;
2556 
2557     if (!size) size = sizeof(size_t);
2558 
2559     TLSData *tls = memPool->getTLS(/*create=*/true);
2560 
2561     /* Allocate a large object */
2562     if (size >= minLargeObjectSize)
2563         return memPool->getFromLLOCache(tls, size, largeObjectAlignment);
2564 
2565     if (!tls) return nullptr;
2566 
2567     tls->markUsed();
2568     /*
2569      * Get an element in thread-local array corresponding to the given size;
2570      * it keeps a pointer to the active block for allocations of this size
2571      */
2572     bin = tls->getAllocationBin(size);
2573     if ( !bin ) return nullptr;
2574 
2575     /* Get a block to try to allocate in. */
2576     for( mallocBlock = bin->getActiveBlock(); mallocBlock;
2577          mallocBlock = bin->setPreviousBlockActive() ) // the previous block should be empty enough
2578     {
2579         if( FreeObject *result = mallocBlock->allocate() )
2580             return result;
2581     }
2582 
2583     /*
2584      * else privatize publicly freed objects in some block and allocate from it
2585      */
2586     mallocBlock = bin->getPrivatizedFreeListBlock();
2587     if (mallocBlock) {
2588         MALLOC_ASSERT( mallocBlock->freeListNonNull(), ASSERT_TEXT );
2589         if ( FreeObject *result = mallocBlock->allocateFromFreeList() )
2590             return result;
2591         /* Else something strange happened, need to retry from the beginning; */
2592         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in public free list; reentering.\n" ));
2593         return internalPoolMalloc(memPool, size);
2594     }
2595 
2596     /*
2597      * no suitable own blocks, try to get a partial block that some other thread has discarded.
2598      */
2599     mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2600     while (mallocBlock) {
2601         bin->pushTLSBin(mallocBlock);
2602         bin->setActiveBlock(mallocBlock); // TODO: move under the below condition?
2603         if( FreeObject *result = mallocBlock->allocate() )
2604             return result;
2605         mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2606     }
2607 
2608     /*
2609      * else try to get a new empty block
2610      */
2611     mallocBlock = memPool->getEmptyBlock(size);
2612     if (mallocBlock) {
2613         bin->pushTLSBin(mallocBlock);
2614         bin->setActiveBlock(mallocBlock);
2615         if( FreeObject *result = mallocBlock->allocate() )
2616             return result;
2617         /* Else something strange happened, need to retry from the beginning; */
2618         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in empty block; reentering.\n" ));
2619         return internalPoolMalloc(memPool, size);
2620     }
2621     /*
2622      * else nothing works so return nullptr
2623      */
2624     TRACEF(( "[ScalableMalloc trace] No memory found, returning nullptr.\n" ));
2625     return nullptr;
2626 }
2627 
2628 // When size==0 (i.e. unknown), detect here whether the object is large.
2629 // When the size is known and < minLargeObjectSize, we still need to check
2630 // if the actual object is large, because large objects might be used
2631 // for aligned small allocations.
2632 static bool internalPoolFree(MemoryPool *memPool, void *object, size_t size)
2633 {
2634     if (!memPool || !object) return false;
2635 
2636     // The library is initialized by an allocation call, so a release while
2637     // not initialized means a foreign object is being released.
2638     MALLOC_ASSERT(isMallocInitialized(), ASSERT_TEXT);
2639     MALLOC_ASSERT(memPool->extMemPool.userPool() || isRecognized(object),
2640                   "Invalid pointer during object releasing is detected.");
2641 
2642     if (size >= minLargeObjectSize || isLargeObject<ourMem>(object))
2643         memPool->putToLLOCache(memPool->getTLS(/*create=*/false), object);
2644     else
2645         freeSmallObject(object);
2646     return true;
2647 }
2648 
2649 static void *internalMalloc(size_t size)
2650 {
2651     if (!size) size = sizeof(size_t);
2652 
2653 #if MALLOC_CHECK_RECURSION
2654     if (RecursiveMallocCallProtector::sameThreadActive())
2655         return size<minLargeObjectSize? StartupBlock::allocate(size) :
2656             // nested allocation, so skip tls
2657             (FreeObject*)defaultMemPool->getFromLLOCache(nullptr, size, slabSize);
2658 #endif
2659 
2660     if (!isMallocInitialized())
2661         if (!doInitialization())
2662             return nullptr;
2663     return internalPoolMalloc(defaultMemPool, size);
2664 }
2665 
2666 static void internalFree(void *object)
2667 {
2668     internalPoolFree(defaultMemPool, object, 0);
2669 }
2670 
2671 static size_t internalMsize(void* ptr)
2672 {
2673     MALLOC_ASSERT(ptr, "Invalid pointer passed to internalMsize");
2674     if (isLargeObject<ourMem>(ptr)) {
2675         // TODO: return the maximum memory size that can be written to this object
2676         LargeMemoryBlock* lmb = ((LargeObjectHdr*)ptr - 1)->memoryBlock;
2677         return lmb->objectSize;
2678     } else {
2679         Block *block = (Block*)alignDown(ptr, slabSize);
2680         return block->findObjectSize(ptr);
2681     }
2682 }
2683 
2684 } // namespace internal
2685 
2686 using namespace rml::internal;
2687 
2688 // legacy entry point saved for compatibility with binaries compiled
2689 // with pre-6003 versions of TBB
2690 TBBMALLOC_EXPORT rml::MemoryPool *pool_create(intptr_t pool_id, const MemPoolPolicy *policy)
2691 {
2692     rml::MemoryPool *pool;
2693     MemPoolPolicy pol(policy->pAlloc, policy->pFree, policy->granularity);
2694 
2695     pool_create_v1(pool_id, &pol, &pool);
2696     return pool;
2697 }
2698 
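// A minimal usage sketch (the rawAlloc/rawFree callbacks below are hypothetical user-provided
// memory routines matching MemPoolPolicy's expectations):
//     MemPoolPolicy policy(rawAlloc, rawFree, /*granularity=*/0);
//     rml::MemoryPool *pool;
//     if (pool_create_v1(/*pool_id=*/0, &policy, &pool) == POOL_OK)
//         { /* pool_malloc(pool, n) / pool_free(pool, p) may now be used */ }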
2699 rml::MemPoolError pool_create_v1(intptr_t pool_id, const MemPoolPolicy *policy,
2700                                  rml::MemoryPool **pool)
2701 {
2702     if ( !policy->pAlloc || policy->version<MemPoolPolicy::TBBMALLOC_POOL_VERSION
2703          // empty pFree allowed only for fixed pools
2704          || !(policy->fixedPool || policy->pFree)) {
2705         *pool = nullptr;
2706         return INVALID_POLICY;
2707     }
2708     if ( policy->version>MemPoolPolicy::TBBMALLOC_POOL_VERSION // future versions are not supported
2709          // new flags can be added in place of reserved, but default
2710          // behaviour must be supported by this version
2711          || policy->reserved ) {
2712         *pool = nullptr;
2713         return UNSUPPORTED_POLICY;
2714     }
2715     if (!isMallocInitialized())
2716         if (!doInitialization()) {
2717             *pool = nullptr;
2718             return NO_MEMORY;
2719         }
2720     rml::internal::MemoryPool *memPool =
2721         (rml::internal::MemoryPool*)internalMalloc((sizeof(rml::internal::MemoryPool)));
2722     if (!memPool) {
2723         *pool = nullptr;
2724         return NO_MEMORY;
2725     }
2726     memset(static_cast<void*>(memPool), 0, sizeof(rml::internal::MemoryPool));
2727     if (!memPool->init(pool_id, policy)) {
2728         internalFree(memPool);
2729         *pool = nullptr;
2730         return NO_MEMORY;
2731     }
2732 
2733     *pool = (rml::MemoryPool*)memPool;
2734     return POOL_OK;
2735 }
2736 
2737 bool pool_destroy(rml::MemoryPool* memPool)
2738 {
2739     if (!memPool) return false;
2740     bool ret = ((rml::internal::MemoryPool*)memPool)->destroy();
2741     internalFree(memPool);
2742 
2743     return ret;
2744 }
2745 
2746 bool pool_reset(rml::MemoryPool* memPool)
2747 {
2748     if (!memPool) return false;
2749 
2750     return ((rml::internal::MemoryPool*)memPool)->reset();
2751 }
2752 
2753 void *pool_malloc(rml::MemoryPool* mPool, size_t size)
2754 {
2755     return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2756 }
2757 
2758 void *pool_realloc(rml::MemoryPool* mPool, void *object, size_t size)
2759 {
2760     if (!object)
2761         return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2762     if (!size) {
2763         internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2764         return nullptr;
2765     }
2766     return reallocAligned((rml::internal::MemoryPool*)mPool, object, size, 0);
2767 }
2768 
2769 void *pool_aligned_malloc(rml::MemoryPool* mPool, size_t size, size_t alignment)
2770 {
2771     if (!isPowerOfTwo(alignment) || 0==size)
2772         return nullptr;
2773 
2774     return allocateAligned((rml::internal::MemoryPool*)mPool, size, alignment);
2775 }
2776 
2777 void *pool_aligned_realloc(rml::MemoryPool* memPool, void *ptr, size_t size, size_t alignment)
2778 {
2779     if (!isPowerOfTwo(alignment))
2780         return nullptr;
2781     rml::internal::MemoryPool *mPool = (rml::internal::MemoryPool*)memPool;
2782     void *tmp;
2783 
2784     if (!ptr)
2785         tmp = allocateAligned(mPool, size, alignment);
2786     else if (!size) {
2787         internalPoolFree(mPool, ptr, 0);
2788         return nullptr;
2789     } else
2790         tmp = reallocAligned(mPool, ptr, size, alignment);
2791 
2792     return tmp;
2793 }
2794 
2795 bool pool_free(rml::MemoryPool *mPool, void *object)
2796 {
2797     return internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2798 }
2799 
2800 rml::MemoryPool *pool_identify(void *object)
2801 {
2802     rml::internal::MemoryPool *pool;
2803     if (isLargeObject<ourMem>(object)) {
2804         LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2805         pool = header->memoryBlock->pool;
2806     } else {
2807         Block *block = (Block*)alignDown(object, slabSize);
2808         pool = block->getMemPool();
2809     }
2810     // do not return defaultMemPool, as it can't be used in pool_free() etc
2811     __TBB_ASSERT_RELEASE(pool!=defaultMemPool,
2812         "rml::pool_identify() can't be used for scalable_malloc() etc results.");
2813     return (rml::MemoryPool*)pool;
2814 }
2815 
2816 size_t pool_msize(rml::MemoryPool *mPool, void* object)
2817 {
2818     if (object) {
2819         // No assert for object recognition, because objects allocated from a non-default
2820         // memory pool do not participate in range checking and do not have valid backreferences for
2821         // small objects. Instead, check that the object belongs to the specified memory pool.
2822         MALLOC_ASSERT_EX(mPool == pool_identify(object), "Object does not belong to the specified pool");
2823         return internalMsize(object);
2824     }
2825     errno = EINVAL;
2826     // Unlike _msize, return 0 in case of a parameter error;
2827     // returning size_t(-1) would be more likely to cause trouble.
2828     return 0;
2829 }
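
/* Usage sketch for the pool API above (illustrative only, not compiled here).
   It assumes a pool already created through the pool creation entry point
   (pool_create_v1 in current sources) with a caller-supplied MemPoolPolicy:

     void usePool(rml::MemoryPool *pool) {
         void *p = rml::pool_malloc(pool, 64);                 // small object from the pool
         void *a = rml::pool_aligned_malloc(pool, 256, 128);   // 128-byte aligned object
         if (p) {
             // pool_identify() maps an object back to its owning pool; per the
             // assertion above it must not be used for scalable_malloc() results.
             MALLOC_ASSERT(rml::pool_identify(p) == pool, "unexpected owner pool");
             rml::pool_free(pool, p);
         }
         if (a) rml::pool_free(pool, a);
         rml::pool_reset(pool);    // drop all objects but keep the pool usable
         rml::pool_destroy(pool);  // release all memory and the pool itself
     }
*/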
2830 
2831 } // namespace rml
2832 
2833 using namespace rml::internal;
2834 
2835 #if MALLOC_TRACE
2836 static unsigned int threadGoingDownCount = 0;
2837 #endif
2838 
2839 /*
2840  * When a thread is shutting down, this routine should be called to remove all the thread ids
2841  * from the malloc blocks and replace them with a nullptr thread id.
2842  *
2843  * For pthreads, the function is set as a callback in pthread_key_create for the TLS bin.
2844  * It will be automatically called at thread exit with the key value as the argument,
2845  * unless that value is nullptr.
2846  * For Windows, it is called from DllMain( DLL_THREAD_DETACH ).
2847  *
2848  * However, neither of the above is called for the main process thread, so the routine
2849  * also needs to be called during process shutdown.
2850  *
2851 */
2852 // TODO: Consider making this function part of class MemoryPool.
2853 void doThreadShutdownNotification(TLSData* tls, bool main_thread)
2854 {
2855     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return start %d\n",
2856              getThreadId(),  threadGoingDownCount++ ));
2857 
2858 #if USE_PTHREAD
2859     if (tls) {
2860         if (!shutdownSync.threadDtorStart()) return;
2861         tls->getMemPool()->onThreadShutdown(tls);
2862         shutdownSync.threadDtorDone();
2863     } else
2864 #endif
2865     {
2866         suppress_unused_warning(tls); // not used on Windows
2867         // The default pool is safe to use at this point:
2868         //   on Linux, only the main thread can go here before destroying defaultMemPool;
2869         //   on Windows, shutdown is synchronized via loader lock and isMallocInitialized().
2870         // See also __TBB_mallocProcessShutdownNotification()
2871         defaultMemPool->onThreadShutdown(defaultMemPool->getTLS(/*create=*/false));
2872         // Take the lock to walk through the other pools, but waiting might be dangerous at this point
2873         // (e.g., on Windows the main thread might deadlock)
2874         bool locked = false;
2875         MallocMutex::scoped_lock lock(MemoryPool::memPoolListLock, /*wait=*/!main_thread, &locked);
2876         if (locked) { // the list is safe to process
2877             for (MemoryPool *memPool = defaultMemPool->next; memPool; memPool = memPool->next)
2878                 memPool->onThreadShutdown(memPool->getTLS(/*create=*/false));
2879         }
2880     }
2881 
2882     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return end\n", getThreadId() ));
2883 }
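
/* Minimal sketch of the pthread mechanism described in the comment above
   (illustrative; in tbbmalloc the key is created elsewhere during TLS setup,
   this only shows the shape of the registration):

     #include <pthread.h>

     static pthread_key_t tlsKey;

     static void createKey() {
         // mallocThreadShutdownNotification() runs at thread exit with the
         // stored TLS value as its argument, unless that value is nullptr.
         pthread_key_create(&tlsKey, mallocThreadShutdownNotification);
     }

     static void onThreadStart(void *tlsData) {
         pthread_setspecific(tlsKey, tlsData); // a non-null value arms the destructor
     }
*/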
2884 
2885 #if USE_PTHREAD
2886 void mallocThreadShutdownNotification(void* arg)
2887 {
2888     // The routine is called for each pool (as TLS dtor) on each thread, except for the main thread
2889     if (!isMallocInitialized()) return;
2890     doThreadShutdownNotification((TLSData*)arg, false);
2891 }
2892 #else
2893 extern "C" void __TBB_mallocThreadShutdownNotification()
2894 {
2895     // The routine is called once per thread on Windows
2896     if (!isMallocInitialized()) return;
2897     doThreadShutdownNotification(nullptr, false);
2898 }
2899 #endif
2900 
2901 extern "C" void __TBB_mallocProcessShutdownNotification(bool windows_process_dying)
2902 {
2903     if (!isMallocInitialized()) return;
2904 
2905     // Don't clean allocator internals if the entire process is exiting
2906     if (!windows_process_dying) {
2907         doThreadShutdownNotification(nullptr, /*main_thread=*/true);
2908     }
2909 #if  __TBB_MALLOC_LOCACHE_STAT
2910     printf("cache hit ratio %f, size hit %f\n",
2911            1.*cacheHits/mallocCalls, 1.*memHitKB/memAllocKB);
2912     defaultMemPool->extMemPool.loc.reportStat(stdout);
2913 #endif
2914 
2915     shutdownSync.processExit();
2916 #if __TBB_SOURCE_DIRECTLY_INCLUDED
2917 /* Pthread keys must be deleted as soon as possible, so that the key dtor is not called
2918    on thread termination when the tbbmalloc code may already be unloaded.
2919 */
2920     defaultMemPool->destroy();
2921     destroyBackRefMain(&defaultMemPool->extMemPool.backend);
2922     ThreadId::destroy();      // Delete key for thread id
2923     hugePages.reset();
2924     // new total malloc initialization is possible after this point
2925     // a completely new malloc initialization is possible after this point
2926 #endif // __TBB_SOURCE_DIRECTLY_INCLUDED
2927 
2928 #if COLLECT_STATISTICS
2929     unsigned nThreads = ThreadId::getMaxThreadId();
2930     for( int i=1; i<=nThreads && i<MAX_THREADS; ++i )
2931         STAT_print(i);
2932 #endif
2933     if (!usedBySrcIncluded) {
2934         MALLOC_ITT_FINI_ITTLIB();
2935         MALLOC_ITT_RELEASE_RESOURCES();
2936     }
2937 }
2938 
2939 extern "C" void * scalable_malloc(size_t size)
2940 {
2941     void *ptr = internalMalloc(size);
2942     if (!ptr) errno = ENOMEM;
2943     return ptr;
2944 }
2945 
2946 extern "C" void scalable_free(void *object)
2947 {
2948     internalFree(object);
2949 }
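
/* Usage sketch (illustrative; assumes the public header declaring these entry
   points, e.g. oneapi/tbb/scalable_allocator.h, is included by the caller):

     void *buf = scalable_malloc(1024);
     if (!buf) {
         // like malloc(), scalable_malloc() sets errno to ENOMEM on failure
         // handle the failure here
     } else {
         // ... use buf ...
         scalable_free(buf);
     }
*/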
2950 
2951 #if MALLOC_ZONE_OVERLOAD_ENABLED
2952 extern "C" void __TBB_malloc_free_definite_size(void *object, size_t size)
2953 {
2954     internalPoolFree(defaultMemPool, object, size);
2955 }
2956 #endif
2957 
2958 /*
2959  * A variant that provides additional memory safety, by checking whether the given address
2960  * was obtained with this allocator, and if not redirecting to the provided alternative call.
2961  */
2962 extern "C" TBBMALLOC_EXPORT void __TBB_malloc_safer_free(void *object, void (*original_free)(void*))
2963 {
2964     if (!object)
2965         return;
2966 
2967     // the object can have been allocated by tbbmalloc only if tbbmalloc has been initialized
2968     if (mallocInitialized.load(std::memory_order_acquire) && defaultMemPool->extMemPool.backend.ptrCanBeValid(object)) {
2969         if (isLargeObject<unknownMem>(object)) {
2970             // must check for a large object first, because the small-object check touches 4 pages
2971             // to the left, which can be inaccessible
2972             TLSData *tls = defaultMemPool->getTLS(/*create=*/false);
2973 
2974             defaultMemPool->putToLLOCache(tls, object);
2975             return;
2976         } else if (isSmallObject(object)) {
2977             freeSmallObject(object);
2978             return;
2979         }
2980     }
2981     if (original_free)
2982         original_free(object);
2983 }
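
/* Sketch of how an interposition layer can use the "safer" variant (illustrative;
   the real wiring lives in the tbbmalloc proxy, this only shows the contract, and
   the dlsym-based lookup is an assumption of the sketch):

     static void (*next_free)(void*);   // e.g. resolved via dlsym(RTLD_NEXT, "free")

     void my_free(void *p) {
         // objects owned by tbbmalloc are freed in place; anything else is
         // forwarded to the original allocator's free
         __TBB_malloc_safer_free(p, next_free);
     }
*/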
2984 
2985 /********* End the free code        *************/
2986 
2987 /********* Code for scalable_realloc       ***********/
2988 
2989 /*
2990  * From K&R
2991  * "realloc changes the size of the object pointed to by p to size. The contents will
2992  * be unchanged up to the minimum of the old and the new sizes. If the new size is larger,
2993  * the new space is uninitialized. realloc returns a pointer to the new space, or
2994  * nullptr if the request cannot be satisfied, in which case *p is unchanged."
2995  *
2996  */
2997 extern "C" void* scalable_realloc(void* ptr, size_t size)
2998 {
2999     void *tmp;
3000 
3001     if (!ptr)
3002         tmp = internalMalloc(size);
3003     else if (!size) {
3004         internalFree(ptr);
3005         return nullptr;
3006     } else
3007         tmp = reallocAligned(defaultMemPool, ptr, size, 0);
3008 
3009     if (!tmp) errno = ENOMEM;
3010     return tmp;
3011 }
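
/* Usage sketch following the semantics quoted above: on failure the original
   block is left unchanged, so keep the old pointer until the call succeeds
   (illustrative only):

     void *p = scalable_malloc(100);
     void *bigger = scalable_realloc(p, 1000);
     if (bigger)
         p = bigger;          // success: contents preserved up to the old size
     else
         scalable_free(p);    // failure: errno == ENOMEM and p is still valid
*/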
3012 
3013 /*
3014  * A variant that provides additional memory safety, by checking whether the given address
3015  * was obtained with this allocator, and if not redirecting to the provided alternative call.
3016  */
3017 extern "C" TBBMALLOC_EXPORT void* __TBB_malloc_safer_realloc(void* ptr, size_t sz, void* original_realloc)
3018 {
3019     void *tmp = nullptr; // initialized to avoid "maybe uninitialized" warnings
3020 
3021     if (!ptr) {
3022         tmp = internalMalloc(sz);
3023     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3024         if (!sz) {
3025             internalFree(ptr);
3026             return nullptr;
3027         } else {
3028             tmp = reallocAligned(defaultMemPool, ptr, sz, 0);
3029         }
3030     }
3031 #if USE_WINTHREAD
3032     else if (original_realloc && sz) {
3033         orig_ptrs *original_ptrs = static_cast<orig_ptrs*>(original_realloc);
3034         if ( original_ptrs->msize ){
3035             size_t oldSize = original_ptrs->msize(ptr);
3036             tmp = internalMalloc(sz);
3037             if (tmp) {
3038                 memcpy(tmp, ptr, sz<oldSize? sz : oldSize);
3039                 if ( original_ptrs->free ){
3040                     original_ptrs->free( ptr );
3041                 }
3042             }
3043         } else
3044             tmp = nullptr;
3045     }
3046 #else
3047     else if (original_realloc) {
3048         typedef void* (*realloc_ptr_t)(void*,size_t);
3049         realloc_ptr_t original_realloc_ptr;
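        // original_realloc is passed in as an opaque void*; recover its function-pointer type before calling it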
3050         (void *&)original_realloc_ptr = original_realloc;
3051         tmp = original_realloc_ptr(ptr,sz);
3052     }
3053 #endif
3054     else tmp = nullptr;
3055 
3056     if (!tmp) errno = ENOMEM;
3057     return tmp;
3058 }
3059 
3060 /********* End code for scalable_realloc   ***********/
3061 
3062 /********* Code for scalable_calloc   ***********/
3063 
3064 /*
3065  * From K&R
3066  * calloc returns a pointer to space for an array of nobj objects,
3067  * each of size size, or nullptr if the request cannot be satisfied.
3068  * The space is initialized to zero bytes.
3069  *
3070  */
3071 
3072 extern "C" void * scalable_calloc(size_t nobj, size_t size)
3073 {
3074     // the square root of the maximal size_t value: if both factors are below it, the product cannot overflow
3075     const size_t mult_not_overflow = size_t(1) << (sizeof(size_t)*CHAR_BIT/2);
3076     const size_t arraySize = nobj * size;
3077 
3078     // check for overflow during multiplication:
3079     if (nobj>=mult_not_overflow || size>=mult_not_overflow) // 1) heuristic check
3080         if (nobj && arraySize / nobj != size) {             // 2) exact check
3081             errno = ENOMEM;
3082             return nullptr;
3083         }
3084     void* result = internalMalloc(arraySize);
3085     if (result)
3086         memset(result, 0, arraySize);
3087     else
3088         errno = ENOMEM;
3089     return result;
3090 }
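
/* Worked example of the overflow check above for 64-bit size_t (illustrative):
   mult_not_overflow == 2^32, so two factors both below 2^32 cannot overflow and
   the exact division check is skipped. For scalable_calloc(size_t(1) << 33,
   size_t(1) << 32) the heuristic triggers, the product 2^65 wraps to 0, and
   0 / (2^33) != 2^32 exposes the overflow, so nullptr is returned with
   errno == ENOMEM instead of an undersized zeroed buffer:

     void *ok  = scalable_calloc(1000, 8);                           // 8000 zeroed bytes
     void *bad = scalable_calloc(size_t(1) << 33, size_t(1) << 32);  // nullptr, ENOMEM
     scalable_free(ok);
*/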
3091 
3092 /********* End code for scalable_calloc   ***********/
3093 
3094 /********* Code for aligned allocation API **********/
3095 
3096 extern "C" int scalable_posix_memalign(void **memptr, size_t alignment, size_t size)
3097 {
3098     if ( !isPowerOfTwoAtLeast(alignment, sizeof(void*)) )
3099         return EINVAL;
3100     void *result = allocateAligned(defaultMemPool, size, alignment);
3101     if (!result)
3102         return ENOMEM;
3103     *memptr = result;
3104     return 0;
3105 }
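
/* Usage sketch (illustrative): as with posix_memalign(), the alignment must be a
   power of two and at least sizeof(void*); the result is reported through memptr
   and the return code rather than errno:

     void *p = nullptr;
     int rc = scalable_posix_memalign(&p, 64, 1000);  // 1000 bytes, 64-byte aligned
     if (rc == 0) {
         // ... use p ...
         scalable_free(p);
     } else {
         // rc is EINVAL (bad alignment) or ENOMEM (allocation failure)
     }
*/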
3106 
3107 extern "C" void * scalable_aligned_malloc(size_t size, size_t alignment)
3108 {
3109     if (!isPowerOfTwo(alignment) || 0==size) {
3110         errno = EINVAL;
3111         return nullptr;
3112     }
3113     void *tmp = allocateAligned(defaultMemPool, size, alignment);
3114     if (!tmp) errno = ENOMEM;
3115     return tmp;
3116 }
3117 
3118 extern "C" void * scalable_aligned_realloc(void *ptr, size_t size, size_t alignment)
3119 {
3120     if (!isPowerOfTwo(alignment)) {
3121         errno = EINVAL;
3122         return nullptr;
3123     }
3124     void *tmp;
3125 
3126     if (!ptr)
3127         tmp = allocateAligned(defaultMemPool, size, alignment);
3128     else if (!size) {
3129         scalable_free(ptr);
3130         return nullptr;
3131     } else
3132         tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3133 
3134     if (!tmp) errno = ENOMEM;
3135     return tmp;
3136 }
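
/* Usage sketch for the aligned API (illustrative): a non-power-of-two alignment
   yields nullptr with errno == EINVAL, an allocation failure sets ENOMEM, and on
   realloc failure the original block stays valid:

     void *p = scalable_aligned_malloc(256, 64);          // 256 bytes, 64-byte aligned
     if (p) {
         void *q = scalable_aligned_realloc(p, 512, 64);  // grow, keeping the alignment
         if (q) p = q;
         scalable_aligned_free(p);
     }
*/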
3137 
3138 extern "C" TBBMALLOC_EXPORT void * __TBB_malloc_safer_aligned_realloc(void *ptr, size_t size, size_t alignment, void* orig_function)
3139 {
3140     /* corner cases left out of reallocAligned to not deal with errno there */
3141     if (!isPowerOfTwo(alignment)) {
3142         errno = EINVAL;
3143         return nullptr;
3144     }
3145     void *tmp = nullptr;
3146 
3147     if (!ptr) {
3148         tmp = allocateAligned(defaultMemPool, size, alignment);
3149     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3150         if (!size) {
3151             internalFree(ptr);
3152             return nullptr;
3153         } else {
3154             tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3155         }
3156     }
3157 #if USE_WINTHREAD
3158     else {
3159         orig_aligned_ptrs *original_ptrs = static_cast<orig_aligned_ptrs*>(orig_function);
3160         if (size) {
3161             // Without the original aligned_msize we cannot determine the old size,
3162             // so the old pointer is left untouched.
3163             if ( original_ptrs->aligned_msize ){
3164                 // set alignment and offset to have possibly correct oldSize
3165                 size_t oldSize = original_ptrs->aligned_msize(ptr, sizeof(void*), 0);
3166                 tmp = allocateAligned(defaultMemPool, size, alignment);
3167                 if (tmp) {
3168                     memcpy(tmp, ptr, size<oldSize? size : oldSize);
3169                     if ( original_ptrs->aligned_free ){
3170                         original_ptrs->aligned_free( ptr );
3171                     }
3172                 }
3173             }
3174         } else {
3175             if ( original_ptrs->aligned_free ){
3176                 original_ptrs->aligned_free( ptr );
3177             }
3178             return nullptr;
3179         }
3180     }
3181 #else
3182     // As the original realloc cannot align the result, and there is no way to find
3183     // the size of the object being reallocated, we give up.
3184     suppress_unused_warning(orig_function);
3185 #endif
3186     if (!tmp) errno = ENOMEM;
3187     return tmp;
3188 }
3189 
3190 extern "C" void scalable_aligned_free(void *ptr)
3191 {
3192     internalFree(ptr);
3193 }
3194 
3195 /********* end code for aligned allocation API **********/
3196 
3197 /********* Code for scalable_msize       ***********/
3198 
3199 /*
3200  * Returns the size of a memory block allocated in the heap.
3201  */
3202 extern "C" size_t scalable_msize(void* ptr)
3203 {
3204     if (ptr) {
3205         MALLOC_ASSERT(isRecognized(ptr), "Invalid pointer in scalable_msize detected.");
3206         return internalMsize(ptr);
3207     }
3208     errno = EINVAL;
3209     // Unlike _msize, return 0 in case of a parameter error;
3210     // returning size_t(-1) would be more likely to cause trouble.
3211     return 0;
3212 }
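
/* Usage sketch (illustrative): scalable_msize() reports the usable size of a block
   owned by this allocator, which may be larger than the requested size:

     void *p = scalable_malloc(100);
     if (p) {
         size_t usable = scalable_msize(p);   // at least 100 for a live tbbmalloc object
         (void)usable;
         scalable_free(p);
     }
*/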
3213 
3214 /*
3215  * A variant that provides additional memory safety, by checking whether the given address
3216  * was obtained with this allocator, and if not redirecting to the provided alternative call.
3217  */
3218 extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_msize(void *object, size_t (*original_msize)(void*))
3219 {
3220     if (object) {
3221         // Check if the memory was allocated by scalable_malloc
3222         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3223             return internalMsize(object);
3224         else if (original_msize)
3225             return original_msize(object);
3226     }
3227     // object is nullptr or unknown, or foreign and no original_msize
3228 #if USE_WINTHREAD
3229     errno = EINVAL; // errno expected to be set only on this platform
3230 #endif
3231     return 0;
3232 }
3233 
3234 /*
3235  * The same as above but for _aligned_msize case
3236  */
3237 extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_aligned_msize(void *object, size_t alignment, size_t offset, size_t (*orig_aligned_msize)(void*,size_t,size_t))
3238 {
3239     if (object) {
3240         // Check if the memory was allocated by scalable_malloc
3241         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3242             return internalMsize(object);
3243         else if (orig_aligned_msize)
3244             return orig_aligned_msize(object,alignment,offset);
3245     }
3246     // object is nullptr or unknown
3247     errno = EINVAL;
3248     return 0;
3249 }
3250 
3251 /********* End code for scalable_msize   ***********/
3252 
3253 extern "C" int scalable_allocation_mode(int param, intptr_t value)
3254 {
3255     if (param == TBBMALLOC_SET_SOFT_HEAP_LIMIT) {
3256         defaultMemPool->extMemPool.backend.setRecommendedMaxSize((size_t)value);
3257         return TBBMALLOC_OK;
3258     } else if (param == USE_HUGE_PAGES) {
3259 #if __unix__
3260         switch (value) {
3261         case 0:
3262         case 1:
3263             hugePages.setMode(value);
3264             return TBBMALLOC_OK;
3265         default:
3266             return TBBMALLOC_INVALID_PARAM;
3267         }
3268 #else
3269         return TBBMALLOC_NO_EFFECT;
3270 #endif
3271 #if __TBB_SOURCE_DIRECTLY_INCLUDED
3272     } else if (param == TBBMALLOC_INTERNAL_SOURCE_INCLUDED) {
3273         switch (value) {
3274         case 0: // used by dynamic library
3275         case 1: // used by static library or directly included sources
3276             usedBySrcIncluded = value;
3277             return TBBMALLOC_OK;
3278         default:
3279             return TBBMALLOC_INVALID_PARAM;
3280         }
3281 #endif
3282     } else if (param == TBBMALLOC_SET_HUGE_SIZE_THRESHOLD) {
3283         defaultMemPool->extMemPool.loc.setHugeSizeThreshold((size_t)value);
3284         return TBBMALLOC_OK;
3285     }
3286     return TBBMALLOC_INVALID_PARAM;
3287 }
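
/* Usage sketch (illustrative; the parameter constants come from the public
   scalable allocator header):

     // advise the backend to keep at most ~1 GB of memory cached (soft limit)
     scalable_allocation_mode(TBBMALLOC_SET_SOFT_HEAP_LIMIT, 1024*1024*1024);

     // request huge pages where supported; returns TBBMALLOC_NO_EFFECT elsewhere
     int res = scalable_allocation_mode(USE_HUGE_PAGES, 1);

     // treat objects of at least 4 MB as "huge" in the large object cache
     scalable_allocation_mode(TBBMALLOC_SET_HUGE_SIZE_THRESHOLD, 4*1024*1024);
*/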
3288 
3289 extern "C" int scalable_allocation_command(int cmd, void *param)
3290 {
3291     if (param)
3292         return TBBMALLOC_INVALID_PARAM;
3293 
3294     bool released = false;
3295     switch(cmd) {
3296     case TBBMALLOC_CLEAN_THREAD_BUFFERS:
3297         if (TLSData *tls = defaultMemPool->getTLS(/*create=*/false))
3298             released = tls->externalCleanup(/*cleanOnlyUnused*/false, /*cleanBins=*/true);
3299         break;
3300     case TBBMALLOC_CLEAN_ALL_BUFFERS:
3301         released = defaultMemPool->extMemPool.hardCachesCleanup(true);
3302         break;
3303     default:
3304         return TBBMALLOC_INVALID_PARAM;
3305     }
3306     return released ? TBBMALLOC_OK : TBBMALLOC_NO_EFFECT;
3307 }
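
/* Usage sketch (illustrative): the second argument is reserved and must be nullptr:

     // return this thread's cached blocks to the common pool
     int rc1 = scalable_allocation_command(TBBMALLOC_CLEAN_THREAD_BUFFERS, nullptr);

     // aggressively clean all internal caches of the default pool
     int rc2 = scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, nullptr);

     // each call returns TBBMALLOC_OK if anything was released, TBBMALLOC_NO_EFFECT otherwise
*/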
3308