xref: /oneTBB/src/tbbmalloc/frontend.cpp (revision 71e1bb8e)
1 /*
2     Copyright (c) 2005-2023 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include "tbbmalloc_internal.h"
18 #include <errno.h>
19 #include <new>        /* for placement new */
20 #include <string.h>   /* for memset */
21 
22 #include "oneapi/tbb/version.h"
23 #include "../tbb/environment.h"
24 #include "../tbb/itt_notify.h" // for __TBB_load_ittnotify()
25 
26 #if USE_PTHREAD
27     #define TlsSetValue_func pthread_setspecific
28     #define TlsGetValue_func pthread_getspecific
29     #define GetMyTID() pthread_self()
30     #include <sched.h>
31     extern "C" { static void mallocThreadShutdownNotification(void*); }
32     #if __sun || __SUNPRO_CC
33     #define __asm__ asm
34     #endif
35     #include <unistd.h> // sysconf(_SC_PAGESIZE)
36 #elif USE_WINTHREAD
37     #define GetMyTID() GetCurrentThreadId()
38 #if __TBB_WIN8UI_SUPPORT
39     #include<thread>
40     #define TlsSetValue_func FlsSetValue
41     #define TlsGetValue_func FlsGetValue
42     #define TlsAlloc() FlsAlloc(nullptr)
43     #define TLS_ALLOC_FAILURE FLS_OUT_OF_INDEXES
44     #define TlsFree FlsFree
45 #else
46     #define TlsSetValue_func TlsSetValue
47     #define TlsGetValue_func TlsGetValue
48     #define TLS_ALLOC_FAILURE TLS_OUT_OF_INDEXES
49 #endif
50 #else
51     #error Must define USE_PTHREAD or USE_WINTHREAD
52 #endif
53 
54 #define FREELIST_NONBLOCKING 1
55 
56 namespace rml {
57 class MemoryPool;
58 namespace internal {
59 
60 class Block;
61 class MemoryPool;
62 
63 #if MALLOC_CHECK_RECURSION
64 
65 inline bool isMallocInitialized();
66 
67 #endif // MALLOC_CHECK_RECURSION
68 
69 /** Support for handling the special UNUSABLE pointer state **/
70 const intptr_t UNUSABLE = 0x1;
71 inline bool isSolidPtr( void* ptr ) {
72     return (UNUSABLE|(intptr_t)ptr)!=UNUSABLE;
73 }
74 inline bool isNotForUse( void* ptr ) {
75     return (intptr_t)ptr==UNUSABLE;
76 }
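/* Illustrative note (not part of the allocator logic): fields that follow this
 * convention, such as the public free list pointers later in this file, can hold
 * one of three states, which the helpers above classify as:
 *     isSolidPtr(nullptr)          == false   // empty / not set
 *     isNotForUse((void*)UNUSABLE) == true    // marked as not for use
 *     isSolidPtr((void*)UNUSABLE)  == false
 *     isSolidPtr(someObjectPtr)    == true    // an ordinary ("solid") pointer
 * i.e. only a pointer other than nullptr and UNUSABLE is considered solid. */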
77 
78 /*
79  * Block::objectSize value used to mark blocks allocated by startupAlloc
80  */
81 const uint16_t startupAllocObjSizeMark = ~(uint16_t)0;
82 
83 /*
84  * The following constant is used to define the size of struct Block, the block header.
 85  * The intent is to have the size of a Block be a multiple of the cache line size; this allows us to
 86  * get good alignment at the cost of some overhead equal to the amount of padding included in the Block.
87  */
88 const int blockHeaderAlignment = estimatedCacheLineSize;
89 
90 /********* The data structures and global objects        **************/
91 
92 /*
 93  * The malloc routines themselves occasionally need to allocate some space in order
 94  * to set up the thread-local structures. This class provides that bootstrap
 95  * allocation.
96  */
97 class BootStrapBlocks {
98     MallocMutex bootStrapLock;
99     Block      *bootStrapBlock;
100     Block      *bootStrapBlockUsed;
101     FreeObject *bootStrapObjectList;
102 public:
103     void *allocate(MemoryPool *memPool, size_t size);
104     void free(void* ptr);
105     void reset();
106 };
107 
108 #if USE_INTERNAL_TID
109 class ThreadId {
110     static tls_key_t Tid_key;
111     static std::atomic<intptr_t> ThreadCount;
112 
113     unsigned int id;
114 
115     static unsigned int tlsNumber() {
116         unsigned int result = reinterpret_cast<intptr_t>(TlsGetValue_func(Tid_key));
117         if( !result ) {
118             RecursiveMallocCallProtector scoped;
119             // Thread-local value is zero -> first call from this thread,
120             // need to initialize with next ID value (IDs start from 1)
121             result = ++ThreadCount; // returned new value!
122             TlsSetValue_func( Tid_key, reinterpret_cast<void*>(result) );
123         }
124         return result;
125     }
126 public:
127     static bool init() {
128 #if USE_WINTHREAD
129         Tid_key = TlsAlloc();
130         if (Tid_key == TLS_ALLOC_FAILURE)
131             return false;
132 #else
133         int status = pthread_key_create( &Tid_key, nullptr );
134         if ( status ) {
135             fprintf (stderr, "The memory manager cannot create tls key during initialization\n");
136             return false;
137         }
138 #endif /* USE_WINTHREAD */
139         return true;
140     }
141 #if __TBB_SOURCE_DIRECTLY_INCLUDED
142     static void destroy() {
143         if( Tid_key ) {
144 #if USE_WINTHREAD
145             BOOL status = !(TlsFree( Tid_key ));  // TlsFree returns zero on failure
146 #else
147             int status = pthread_key_delete( Tid_key );
148 #endif /* USE_WINTHREAD */
149             if ( status )
150                 fprintf (stderr, "The memory manager cannot delete tls key\n");
151             Tid_key = 0;
152         }
153     }
154 #endif
155 
156     ThreadId() : id(ThreadId::tlsNumber()) {}
157     bool isCurrentThreadId() const { return id == ThreadId::tlsNumber(); }
158 
159 #if COLLECT_STATISTICS || MALLOC_TRACE
160     friend unsigned int getThreadId() { return ThreadId::tlsNumber(); }
161 #endif
162 #if COLLECT_STATISTICS
163     static unsigned getMaxThreadId() { return ThreadCount.load(std::memory_order_relaxed); }
164 
165     friend int STAT_increment(ThreadId tid, int bin, int ctr);
166 #endif
167 };
168 
169 tls_key_t ThreadId::Tid_key;
170 std::atomic<intptr_t> ThreadId::ThreadCount;
171 
172 #if COLLECT_STATISTICS
173 int STAT_increment(ThreadId tid, int bin, int ctr)
174 {
175     return ::STAT_increment(tid.id, bin, ctr);
176 }
177 #endif
178 
179 #else // USE_INTERNAL_TID
180 
181 class ThreadId {
182 #if USE_PTHREAD
183     std::atomic<pthread_t> tid;
184 #else
185     std::atomic<DWORD>     tid;
186 #endif
187 public:
188     ThreadId() : tid(GetMyTID()) {}
189 #if USE_PTHREAD
190     bool isCurrentThreadId() const { return pthread_equal(pthread_self(), tid.load(std::memory_order_relaxed)); }
191 #else
192     bool isCurrentThreadId() const { return GetCurrentThreadId() == tid.load(std::memory_order_relaxed); }
193 #endif
194     ThreadId& operator=(const ThreadId& other) {
195         tid.store(other.tid.load(std::memory_order_relaxed), std::memory_order_relaxed);
196         return *this;
197     }
198     static bool init() { return true; }
199 #if __TBB_SOURCE_DIRECTLY_INCLUDED
200     static void destroy() {}
201 #endif
202 };
203 
204 #endif // USE_INTERNAL_TID
205 
206 /*********** Code to provide thread ID and a thread-local void pointer **********/
207 
208 bool TLSKey::init()
209 {
210 #if USE_WINTHREAD
211     TLS_pointer_key = TlsAlloc();
212     if (TLS_pointer_key == TLS_ALLOC_FAILURE)
213         return false;
214 #else
215     int status = pthread_key_create( &TLS_pointer_key, mallocThreadShutdownNotification );
216     if ( status )
217         return false;
218 #endif /* USE_WINTHREAD */
219     return true;
220 }
221 
222 bool TLSKey::destroy()
223 {
224 #if USE_WINTHREAD
225     BOOL status1 = !(TlsFree(TLS_pointer_key)); // TlsFree returns zero on failure
226 #else
227     int status1 = pthread_key_delete(TLS_pointer_key);
228 #endif /* USE_WINTHREAD */
229     MALLOC_ASSERT(!status1, "The memory manager cannot delete tls key.");
230     return status1==0;
231 }
232 
233 inline TLSData* TLSKey::getThreadMallocTLS() const
234 {
235     return (TLSData *)TlsGetValue_func( TLS_pointer_key );
236 }
237 
238 inline void TLSKey::setThreadMallocTLS( TLSData * newvalue ) {
239     RecursiveMallocCallProtector scoped;
240     TlsSetValue_func( TLS_pointer_key, newvalue );
241 }
242 
243 /* The 'next' field in the block header has to maintain some invariants:
244  *   it needs to be on a 16K boundary and the first field in the block.
245  *   Any value stored there needs to have the lower 14 bits set to 0
246  * so that various asserts work. This means that if you want to smash this memory
247  *   for debugging purposes you will need to obey this invariant.
248  * The total size of the header needs to be a power of 2 to simplify
249  * the alignment requirements. For now it is a 128 byte structure.
250  * To avoid false sharing, the fields changed only locally are separated
251  * from the fields changed by foreign threads.
252  * Changing the size of the block header would require changing
253  * some bin allocation sizes, in particular the "fitting" sizes (see below).
254  */
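/* A minimal sketch of that invariant, assuming slabSize == 16*1024 (so the low
 * 14 bits of a slab address are zero): the owning slab of a small object can be
 * recovered by masking, roughly
 *     Block *slab = (Block*)((uintptr_t)object & ~(uintptr_t)(slabSize - 1));
 * and any value written into 'next' must itself be 16K-aligned (or nullptr) so
 * that such alignment-based asserts continue to hold. */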
255 class Bin;
256 class StartupBlock;
257 
258 class MemoryPool {
259     // if no explicit grain size is given, expect to see malloc in the user's pAlloc
260     // and set a reasonably low granularity
261     static const size_t defaultGranularity = estimatedCacheLineSize;
262 
263     MemoryPool() = delete;                  // deny
264 public:
265     static MallocMutex  memPoolListLock;
266 
267     // list of all active pools is used to release
268     // all TLS data on thread termination or library unload
269     MemoryPool    *next,
270                   *prev;
271     ExtMemoryPool  extMemPool;
272     BootStrapBlocks bootStrapBlocks;
273 
274     static void initDefaultPool();
275 
276     bool init(intptr_t poolId, const MemPoolPolicy* memPoolPolicy);
277     bool reset();
278     bool destroy();
279     void onThreadShutdown(TLSData *tlsData);
280 
281     inline TLSData *getTLS(bool create);
282     void clearTLS() { extMemPool.tlsPointerKey.setThreadMallocTLS(nullptr); }
283 
284     Block *getEmptyBlock(size_t size);
285     void returnEmptyBlock(Block *block, bool poolTheBlock);
286 
287     // get/put large object to/from local large object cache
288     void *getFromLLOCache(TLSData *tls, size_t size, size_t alignment);
289     void putToLLOCache(TLSData *tls, void *object);
290 };
291 
292 static intptr_t defaultMemPool_space[sizeof(MemoryPool)/sizeof(intptr_t) +
293                                      (sizeof(MemoryPool)%sizeof(intptr_t)? 1 : 0)];
294 static MemoryPool *defaultMemPool = (MemoryPool*)defaultMemPool_space;
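// Note: defaultMemPool_space is raw, statically zero-initialized storage, sized in
// intptr_t units and rounded up to cover sizeof(MemoryPool). Treating it as the
// default pool avoids running a constructor during static initialization and relies
// on the zero-initialization convention used throughout this file.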
295 const size_t MemoryPool::defaultGranularity;
296 // zero-initialized
297 MallocMutex  MemoryPool::memPoolListLock;
298 // TODO: move huge page status to the default pool, because it is that pool's state
299 HugePagesStatus hugePages;
300 static bool usedBySrcIncluded = false;
301 
302 // Padding helpers
303 template<size_t padd>
304 struct PaddingImpl {
305     size_t       __padding[padd];
306 };
307 
308 template<>
309 struct PaddingImpl<0> {};
310 
311 template<int N>
312 struct Padding : PaddingImpl<N/sizeof(size_t)> {};
313 
314 // A slab block is 16KB-aligned. To prevent false sharing, separate the locally-accessed
315 // fields from the fields commonly accessed by non-owner threads.
316 class GlobalBlockFields : public BlockI {
317 protected:
318     std::atomic<FreeObject*> publicFreeList;
319     std::atomic<Block*> nextPrivatizable;
320     MemoryPool  *poolPtr;
321 };
322 
323 class LocalBlockFields : public GlobalBlockFields, Padding<blockHeaderAlignment - sizeof(GlobalBlockFields)>  {
324 protected:
325     Block       *next;
326     Block       *previous;        /* Use double linked list to speed up removal */
327     FreeObject  *bumpPtr;         /* Bump pointer moves from the end to the beginning of a block */
328     FreeObject  *freeList;
329     /* Pointer to local data for the owner thread. Used to quickly find the TLS
330        when releasing an object from a block that the current thread owns.
331        nullptr for orphaned blocks. */
332     std::atomic<TLSData*> tlsPtr;
333     ThreadId     ownerTid;        /* the ID of the thread that owns or last owned the block */
334     BackRefIdx   backRefIdx;
335     uint16_t     allocatedCount;  /* Number of objects allocated (obviously by the owning thread) */
336     uint16_t     objectSize;
337     bool         isFull;
338 
339     friend class FreeBlockPool;
340     friend class StartupBlock;
341     friend class LifoList;
342     friend void *BootStrapBlocks::allocate(MemoryPool *, size_t);
343     friend bool OrphanedBlocks::cleanup(Backend*);
344     friend Block *MemoryPool::getEmptyBlock(size_t);
345 };
346 
347 // Use inheritance to guarantee that user data starts on the next cache line.
348 // A member can't be used for this because, when LocalBlockFields already ends on a cache
349 // line boundary, we must have no additional memory consumption with any compiler.
350 class Block : public LocalBlockFields,
351               Padding<2*blockHeaderAlignment - sizeof(LocalBlockFields)> {
352 public:
353     bool empty() const {
354         if (allocatedCount > 0) return false;
355         MALLOC_ASSERT(!isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT);
356         return true;
357     }
358     inline FreeObject* allocate();
359     inline FreeObject *allocateFromFreeList();
360 
361     inline bool adjustFullness();
362     void adjustPositionInBin(Bin* bin = nullptr);
363 #if MALLOC_DEBUG
364     bool freeListNonNull() { return freeList; }
365 #endif
366     void freePublicObject(FreeObject *objectToFree);
367     inline void freeOwnObject(void *object);
368     void reset();
369     void privatizePublicFreeList( bool reset = true );
370     void restoreBumpPtr();
371     void privatizeOrphaned(TLSData *tls, unsigned index);
372     bool readyToShare();
373     void shareOrphaned(intptr_t binTag, unsigned index);
374     unsigned int getSize() const {
375         MALLOC_ASSERT(isStartupAllocObject() || objectSize<minLargeObjectSize,
376                       "Invalid object size");
377         return isStartupAllocObject()? 0 : objectSize;
378     }
379     const BackRefIdx *getBackRefIdx() const { return &backRefIdx; }
380     inline bool isOwnedByCurrentThread() const;
381     bool isStartupAllocObject() const { return objectSize == startupAllocObjSizeMark; }
382     inline FreeObject *findObjectToFree(const void *object) const;
383     void checkFreePrecond(const void *object) const {
384 #if MALLOC_DEBUG
385         const char *msg = "Possible double free or heap corruption.";
386         // small objects are always at least sizeof(size_t)-byte aligned;
387         // check this before the dereference below, as for invalid objects
388         // the memory may be unreadable
389         MALLOC_ASSERT(isAligned(object, sizeof(size_t)), "Try to free invalid small object");
390 #if !__TBB_USE_THREAD_SANITIZER
391         // releasing to free slab
392         MALLOC_ASSERT(allocatedCount>0, msg);
393 #endif
394         // must not point to slab's header
395         MALLOC_ASSERT((uintptr_t)object - (uintptr_t)this >= sizeof(Block), msg);
396         if (startupAllocObjSizeMark == objectSize) // startup block
397             MALLOC_ASSERT(object<=bumpPtr, msg);
398         else {
399             // non-startup objects are 8-byte aligned
400             MALLOC_ASSERT(isAligned(object, 8), "Try to free invalid small object");
401             FreeObject *toFree = findObjectToFree(object);
402 #if !__TBB_USE_THREAD_SANITIZER
403             MALLOC_ASSERT(allocatedCount <= (slabSize-sizeof(Block))/objectSize
404                           && (!bumpPtr || object>bumpPtr), msg);
405             // check against head of freeList, as this is mostly
406             // expected after double free
407             MALLOC_ASSERT(toFree != freeList, msg);
408 #endif
409             // check against head of publicFreeList, to detect double free
410             // involving foreign thread
411             MALLOC_ASSERT(toFree != publicFreeList.load(std::memory_order_relaxed), msg);
412         }
413 #else
414         suppress_unused_warning(object);
415 #endif
416     }
417     void initEmptyBlock(TLSData *tls, size_t size);
418     size_t findObjectSize(void *object) const;
419     MemoryPool *getMemPool() const { return poolPtr; } // do not use on the hot path!
420 
421 protected:
422     void cleanBlockHeader();
423 
424 private:
425     static const float emptyEnoughRatio; /* Threshold on free space needed to "reactivate" a block */
426 
427     inline FreeObject *allocateFromBumpPtr();
428     inline FreeObject *findAllocatedObject(const void *address) const;
429 #if MALLOC_DEBUG
430     inline bool isProperlyPlaced(const void *object) const;
431 #endif
432     inline void markOwned(TLSData *tls) {
433         MALLOC_ASSERT(!tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
434         ownerTid = ThreadId(); /* save the ID of the current thread */
435         tlsPtr.store(tls, std::memory_order_relaxed);
436     }
437     inline void markOrphaned() {
438         MALLOC_ASSERT(tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
439         tlsPtr.store(nullptr, std::memory_order_relaxed);
440     }
441 
442     friend class Bin;
443     friend class TLSData;
444     friend bool MemoryPool::destroy();
445 };
446 
447 const float Block::emptyEnoughRatio = 1.0 / 4.0;
448 
449 static_assert(sizeof(Block) <= 2*estimatedCacheLineSize,
450     "The class Block does not fit into 2 cache lines on this platform. "
451     "Defining USE_INTERNAL_TID may help to fix it.");
452 
453 class Bin {
454 private:
455 public:
456     Block *activeBlk;
457     std::atomic<Block*> mailbox;
458     MallocMutex mailLock;
459 
460 public:
461     inline Block* getActiveBlock() const { return activeBlk; }
462     void resetActiveBlock() { activeBlk = nullptr; }
463     inline void setActiveBlock(Block *block);
464     inline Block* setPreviousBlockActive();
465     Block* getPrivatizedFreeListBlock();
466     void moveBlockToFront(Block *block);
467     bool cleanPublicFreeLists();
468     void processEmptyBlock(Block *block, bool poolTheBlock);
469     void addPublicFreeListBlock(Block* block);
470 
471     void outofTLSBin(Block* block);
472     void verifyTLSBin(size_t size) const;
473     void pushTLSBin(Block* block);
474 
475 #if MALLOC_DEBUG
476     void verifyInitState() const {
477         MALLOC_ASSERT( !activeBlk, ASSERT_TEXT );
478         MALLOC_ASSERT( !mailbox.load(std::memory_order_relaxed), ASSERT_TEXT );
479     }
480 #endif
481 
482     friend void Block::freePublicObject (FreeObject *objectToFree);
483 };
484 
485 /********* End of the data structures                    **************/
486 
487 /*
488  * There are bins for all 8-byte aligned sizes up to maxSmallObjectSize; 8 bins in total
489  */
490 const uint32_t minSmallObjectIndex = 0;
491 const uint32_t numSmallObjectBins = 8;
492 const uint32_t maxSmallObjectSize = 64;
493 
494 /*
495  * There are 4 bins between each pair of powers of 2 [64-128-256-...]
496  * from maxSmallObjectSize till this size; 16 bins in total
497  */
498 const uint32_t minSegregatedObjectIndex = minSmallObjectIndex+numSmallObjectBins;
499 const uint32_t numSegregatedObjectBins = 16;
500 const uint32_t maxSegregatedObjectSize = 1024;
501 
502 /*
503  * And there are 5 bins with allocation sizes that are multiples of estimatedCacheLineSize
504  * and selected to fit 9, 6, 4, 3, and 2 allocations in a block.
505  */
506 const uint32_t minFittingIndex = minSegregatedObjectIndex+numSegregatedObjectBins;
507 const uint32_t numFittingBins = 5;
508 
509 const uint32_t fittingAlignment = estimatedCacheLineSize;
510 
511 #define SET_FITTING_SIZE(N) ( (slabSize-sizeof(Block))/N ) & ~(fittingAlignment-1)
512 // For blockSize=16*1024, sizeof(Block)=2*estimatedCacheLineSize and fittingAlignment=estimatedCacheLineSize,
513 // the comments show the fitting sizes and the amounts left unused for estimatedCacheLineSize=64/128:
514 const uint32_t fittingSize1 = SET_FITTING_SIZE(9); // 1792/1792 128/000
515 const uint32_t fittingSize2 = SET_FITTING_SIZE(6); // 2688/2688 128/000
516 const uint32_t fittingSize3 = SET_FITTING_SIZE(4); // 4032/3968 128/256
517 const uint32_t fittingSize4 = SET_FITTING_SIZE(3); // 5376/5376 128/000
518 const uint32_t fittingSize5 = SET_FITTING_SIZE(2); // 8128/8064 000/000
519 #undef SET_FITTING_SIZE
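// Worked example for the table above, assuming slabSize == 16*1024,
// estimatedCacheLineSize == 64 and therefore sizeof(Block) == 128:
//     SET_FITTING_SIZE(9) == ((16384 - 128) / 9) & ~63 == 1806 & ~63 == 1792,
// which matches fittingSize1; nine such objects occupy 9*1792 == 16128 bytes,
// leaving 16384 - 128 - 16128 == 128 bytes of the slab unused.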
520 
521 /*
522  * The total number of thread-specific Block-based bins
523  */
524 const uint32_t numBlockBins = minFittingIndex+numFittingBins;
525 
526 /*
527  * Objects of this size and larger are considered large objects.
528  */
529 const uint32_t minLargeObjectSize = fittingSize5 + 1;
530 
531 /*
532  * Per-thread pool of slab blocks. The idea is to avoid sharing with other
533  * threads memory that is likely to be in the local cache(s) of our CPU.
534  */
535 class FreeBlockPool {
536 private:
537     std::atomic<Block*> head;
538     int         size;
539     Backend    *backend;
540 public:
541     static const int POOL_HIGH_MARK = 32;
542     static const int POOL_LOW_MARK  = 8;
543 
544     class ResOfGet {
545         ResOfGet() = delete;
546     public:
547         Block* block;
548         bool   lastAccMiss;
549         ResOfGet(Block *b, bool lastMiss) : block(b), lastAccMiss(lastMiss) {}
550     };
551 
552     // allocated in zero-initialized memory
553     FreeBlockPool(Backend *bknd) : backend(bknd) {}
554     ResOfGet getBlock();
555     void returnBlock(Block *block);
556     bool externalCleanup(); // can be called by another thread
557 };
558 
559 template<int LOW_MARK, int HIGH_MARK>
560 class LocalLOCImpl {
561 private:
562     static const size_t MAX_TOTAL_SIZE = 4*1024*1024;
563     // TODO: can a singly-linked list be faster here?
564     LargeMemoryBlock *tail; // needed when releasing on overflow
565     std::atomic<LargeMemoryBlock*> head;
566     size_t            totalSize;
567     int               numOfBlocks;
568 public:
569     bool put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool);
570     LargeMemoryBlock *get(size_t size);
571     bool externalCleanup(ExtMemoryPool *extMemPool);
572 #if __TBB_MALLOC_WHITEBOX_TEST
573     LocalLOCImpl() : tail(nullptr), head(nullptr), totalSize(0), numOfBlocks(0) {}
574     static size_t getMaxSize() { return MAX_TOTAL_SIZE; }
575     static const int LOC_HIGH_MARK = HIGH_MARK;
576 #else
577     // no ctor, object must be created in zero-initialized memory
578 #endif
579 };
580 
581 typedef LocalLOCImpl<8,32> LocalLOC; // set production code parameters
582 
583 class TLSData : public TLSRemote {
584     MemoryPool   *memPool;
585 public:
586     Bin           bin[numBlockBinLimit];
587     FreeBlockPool freeSlabBlocks;
588     LocalLOC      lloc;
589     unsigned      currCacheIdx;
590 private:
591     std::atomic<bool> unused;
592 public:
593     TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd), currCacheIdx(0) {}
594     MemoryPool *getMemPool() const { return memPool; }
595     Bin* getAllocationBin(size_t size);
596     void release();
597     bool externalCleanup(bool cleanOnlyUnused, bool cleanBins) {
598         if (!unused.load(std::memory_order_relaxed) && cleanOnlyUnused) return false;
599         // Heavy operation in terms of synchronization complexity,
600         // should be called only for the current thread
601         bool released = cleanBins ? cleanupBlockBins() : false;
602         // both cleanups to be called, and the order is not important
603         bool lloc_cleaned = lloc.externalCleanup(&memPool->extMemPool);
604         bool free_slab_blocks_cleaned = freeSlabBlocks.externalCleanup();
605         return released || lloc_cleaned || free_slab_blocks_cleaned;
606     }
607     bool cleanupBlockBins();
608     void markUsed() { unused.store(false, std::memory_order_relaxed); } // called by owner when TLS touched
609     void markUnused() { unused.store(true, std::memory_order_relaxed); } // can be called by not owner thread
610 };
611 
612 TLSData *TLSKey::createTLS(MemoryPool *memPool, Backend *backend)
613 {
614     MALLOC_ASSERT( sizeof(TLSData) >= sizeof(Bin) * numBlockBins + sizeof(FreeBlockPool), ASSERT_TEXT );
615     TLSData* tls = (TLSData*) memPool->bootStrapBlocks.allocate(memPool, sizeof(TLSData));
616     if ( !tls )
617         return nullptr;
618     new(tls) TLSData(memPool, backend);
619     /* the block contains zeroes after bootStrapMalloc, so bins are initialized */
620 #if MALLOC_DEBUG
621     for (uint32_t i = 0; i < numBlockBinLimit; i++)
622         tls->bin[i].verifyInitState();
623 #endif
624     setThreadMallocTLS(tls);
625     memPool->extMemPool.allLocalCaches.registerThread(tls);
626     return tls;
627 }
628 
629 bool TLSData::cleanupBlockBins()
630 {
631     bool released = false;
632     for (uint32_t i = 0; i < numBlockBinLimit; i++) {
633         released |= bin[i].cleanPublicFreeLists();
634         // After cleaning public free lists, only the active block might be empty.
635         // Do not use processEmptyBlock because it will just restore bumpPtr.
636         Block *block = bin[i].getActiveBlock();
637         if (block && block->empty()) {
638             bin[i].outofTLSBin(block);
639             memPool->returnEmptyBlock(block, /*poolTheBlock=*/false);
640             released = true;
641         }
642     }
643     return released;
644 }
645 
646 bool ExtMemoryPool::releaseAllLocalCaches()
647 {
648     // Iterate all registered TLS data and clean LLOC and Slab pools
649     bool released = allLocalCaches.cleanup(/*cleanOnlyUnused=*/false);
650 
651     // Bins privatization is done only for the current thread
652     if (TLSData *tlsData = tlsPointerKey.getThreadMallocTLS())
653         released |= tlsData->cleanupBlockBins();
654 
655     return released;
656 }
657 
658 void AllLocalCaches::registerThread(TLSRemote *tls)
659 {
660     tls->prev = nullptr;
661     MallocMutex::scoped_lock lock(listLock);
662     MALLOC_ASSERT(head!=tls, ASSERT_TEXT);
663     tls->next = head;
664     if (head)
665         head->prev = tls;
666     head = tls;
667     MALLOC_ASSERT(head->next!=head, ASSERT_TEXT);
668 }
669 
670 void AllLocalCaches::unregisterThread(TLSRemote *tls)
671 {
672     MallocMutex::scoped_lock lock(listLock);
673     MALLOC_ASSERT(head, "Can't unregister thread: no threads are registered.");
674     if (head == tls)
675         head = tls->next;
676     if (tls->next)
677         tls->next->prev = tls->prev;
678     if (tls->prev)
679         tls->prev->next = tls->next;
680     MALLOC_ASSERT(!tls->next || tls->next->next!=tls->next, ASSERT_TEXT);
681 }
682 
683 bool AllLocalCaches::cleanup(bool cleanOnlyUnused)
684 {
685     bool released = false;
686     {
687         MallocMutex::scoped_lock lock(listLock);
688         for (TLSRemote *curr=head; curr; curr=curr->next)
689             released |= static_cast<TLSData*>(curr)->externalCleanup(cleanOnlyUnused, /*cleanBins=*/false);
690     }
691     return released;
692 }
693 
694 void AllLocalCaches::markUnused()
695 {
696     bool locked = false;
697     MallocMutex::scoped_lock lock(listLock, /*block=*/false, &locked);
698     if (!locked) // do not wait to mark if someone else is working with the list
699         return;
700 
701     for (TLSRemote *curr=head; curr; curr=curr->next)
702         static_cast<TLSData*>(curr)->markUnused();
703 }
704 
705 #if MALLOC_CHECK_RECURSION
706 MallocMutex RecursiveMallocCallProtector::rmc_mutex;
707 std::atomic<pthread_t> RecursiveMallocCallProtector::owner_thread;
708 std::atomic<void*> RecursiveMallocCallProtector::autoObjPtr;
709 bool        RecursiveMallocCallProtector::mallocRecursionDetected;
710 #if __FreeBSD__
711 bool        RecursiveMallocCallProtector::canUsePthread;
712 #endif
713 
714 #endif
715 
716 /*********** End code to provide thread ID and a TLS pointer **********/
717 
718 // Parameter for isLargeObject, keeps our expectations on memory origin.
719 // Assertions must use unknownMem to reliably report object invalidity.
720 enum MemoryOrigin {
721     ourMem,    // allocated by TBB allocator
722     unknownMem // can be allocated by system allocator or TBB allocator
723 };
724 
725 template<MemoryOrigin>
726 #if __TBB_USE_THREAD_SANITIZER
727 // We have a real race when accessing the large object header for
728 // non-large objects (e.g. small or foreign objects).
729 // Therefore, we need to hide this access from the thread sanitizer
730 __attribute__((no_sanitize("thread")))
731 #endif
732 bool isLargeObject(void *object);
733 static void *internalMalloc(size_t size);
734 static void internalFree(void *object);
735 static void *internalPoolMalloc(MemoryPool* mPool, size_t size);
736 static bool internalPoolFree(MemoryPool *mPool, void *object, size_t size);
737 
738 #if !MALLOC_DEBUG
739 #if __INTEL_COMPILER || _MSC_VER
740 #define NOINLINE(decl) __declspec(noinline) decl
741 #define ALWAYSINLINE(decl) __forceinline decl
742 #elif __GNUC__
743 #define NOINLINE(decl) decl __attribute__ ((noinline))
744 #define ALWAYSINLINE(decl) decl __attribute__ ((always_inline))
745 #else
746 #define NOINLINE(decl) decl
747 #define ALWAYSINLINE(decl) decl
748 #endif
749 
750 static NOINLINE( bool doInitialization() );
751 ALWAYSINLINE( bool isMallocInitialized() );
752 
753 #undef ALWAYSINLINE
754 #undef NOINLINE
755 #endif /* !MALLOC_DEBUG */
756 
757 
758 /********* Now some rough utility code to deal with indexing the size bins. **************/
759 
760 /*
761  * Given a number, return the position of its highest non-zero bit. It is intended to work with 32-bit values only.
762  * Moreover, on some platforms, for the sake of simplicity and performance, it is narrowed to serve only values from 64 to 1023.
763  * This is enough for the current algorithm of distributing sizes among bins.
764  * __TBB_Log2 is not used here to minimize dependencies on TBB specific sources.
765  */
766 #if _WIN64 && _MSC_VER>=1400 && !__INTEL_COMPILER
767 extern "C" unsigned char _BitScanReverse( unsigned long* i, unsigned long w );
768 #pragma intrinsic(_BitScanReverse)
769 #endif
770 static inline unsigned int highestBitPos(unsigned int n)
771 {
772     MALLOC_ASSERT( n>=64 && n<1024, ASSERT_TEXT ); // only needed for bsr array lookup, but always true
773     unsigned int pos;
774 #if __ARCH_x86_32||__ARCH_x86_64
775 
776 # if __unix__||__APPLE__||__MINGW32__
777     __asm__ ("bsr %1,%0" : "=r"(pos) : "r"(n));
778 # elif (_WIN32 && (!_WIN64 || __INTEL_COMPILER))
779     __asm
780     {
781         bsr eax, n
782         mov pos, eax
783     }
784 # elif _WIN64 && _MSC_VER>=1400
785     _BitScanReverse((unsigned long*)&pos, (unsigned long)n);
786 # else
787 #   error highestBitPos() not implemented for this platform
788 # endif
789 #elif __arm__
790     __asm__ __volatile__
791     (
792        "clz %0, %1\n"
793        "rsb %0, %0, %2\n"
794        :"=r" (pos) :"r" (n), "I" (31)
795     );
796 #else
797     static unsigned int bsr[16] = {0/*N/A*/,6,7,7,8,8,8,8,9,9,9,9,9,9,9,9};
798     pos = bsr[ n>>6 ];
799 #endif /* __ARCH_* */
800     return pos;
801 }
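// A few sample values within the supported 64..1023 range, for illustration:
//     highestBitPos(64)   == 6    // 2^6 == 64
//     highestBitPos(127)  == 6
//     highestBitPos(128)  == 7
//     highestBitPos(1023) == 9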
802 
803 unsigned int getSmallObjectIndex(unsigned int size)
804 {
805     unsigned int result = (size-1)>>3;
806     constexpr bool is_64bit = (8 == sizeof(void*));
807     if (is_64bit) {
808         // For 64-bit malloc, 16 byte alignment is needed except for bin 0.
809         if (result) result |= 1; // 0,1,3,5,7; bins 2,4,6 are not aligned to 16 bytes
810     }
811     return result;
812 }
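// Worked mapping for 64-bit builds (is_64bit == true), derived from the code above:
//     size  1..8   -> index 0    (8-byte objects)
//     size  9..16  -> index 1    (16 bytes)
//     size 17..32  -> index 3    (32 bytes; bin 2 is skipped)
//     size 33..48  -> index 5    (48 bytes; bin 4 is skipped)
//     size 49..64  -> index 7    (64 bytes; bin 6 is skipped)
// so every small size except bin 0 gets 16-byte alignment.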
813 
814 /*
815  * Depending on indexRequest, for a given size return either the index into the bin
816  * for objects of this size, or the actual size of objects in this bin.
817  */
818 template<bool indexRequest>
819 static unsigned int getIndexOrObjectSize (unsigned int size)
820 {
821     if (size <= maxSmallObjectSize) { // selection from 8/16/24/32/40/48/56/64
822         unsigned int index = getSmallObjectIndex( size );
823          /* Bin 0 is for 8 bytes, bin 1 is for 16, and so forth */
824         return indexRequest ? index : (index+1)<<3;
825     }
826     else if (size <= maxSegregatedObjectSize ) { // 80/96/112/128 / 160/192/224/256 / 320/384/448/512 / 640/768/896/1024
827         unsigned int order = highestBitPos(size-1); // which group of bin sizes?
828         MALLOC_ASSERT( 6<=order && order<=9, ASSERT_TEXT );
829         if (indexRequest)
830             return minSegregatedObjectIndex - (4*6) - 4 + (4*order) + ((size-1)>>(order-2));
831         else {
832             unsigned int alignment = 128 >> (9-order); // alignment in the group
833             MALLOC_ASSERT( alignment==16 || alignment==32 || alignment==64 || alignment==128, ASSERT_TEXT );
834             return alignUp(size,alignment);
835         }
836     }
837     else {
838         if( size <= fittingSize3 ) {
839             if( size <= fittingSize2 ) {
840                 if( size <= fittingSize1 )
841                     return indexRequest ? minFittingIndex : fittingSize1;
842                 else
843                     return indexRequest ? minFittingIndex+1 : fittingSize2;
844             } else
845                 return indexRequest ? minFittingIndex+2 : fittingSize3;
846         } else {
847             if( size <= fittingSize5 ) {
848                 if( size <= fittingSize4 )
849                     return indexRequest ? minFittingIndex+3 : fittingSize4;
850                 else
851                     return indexRequest ? minFittingIndex+4 : fittingSize5;
852             } else {
853                 MALLOC_ASSERT( 0,ASSERT_TEXT ); // this should not happen
854                 return ~0U;
855             }
856         }
857     }
858 }
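// Worked example for the segregated branch above, using size == 200:
//     order     == highestBitPos(199) == 7            (since 128 <= 199 < 256)
//     index     == 8 - 24 - 4 + 4*7 + (199 >> 5) == 8 + 6 == 14
//     alignment == 128 >> (9-7) == 32, so the object size is alignUp(200, 32) == 224,
// i.e. a 200-byte request falls into the 224-byte bin of the 160/192/224/256 group.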
859 
860 static unsigned int getIndex (unsigned int size)
861 {
862     return getIndexOrObjectSize</*indexRequest=*/true>(size);
863 }
864 
865 static unsigned int getObjectSize (unsigned int size)
866 {
867     return getIndexOrObjectSize</*indexRequest=*/false>(size);
868 }
869 
870 
871 void *BootStrapBlocks::allocate(MemoryPool *memPool, size_t size)
872 {
873     FreeObject *result;
874 
875     MALLOC_ASSERT( size == sizeof(TLSData), ASSERT_TEXT );
876 
877     { // Lock with acquire
878         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
879 
880         if( bootStrapObjectList) {
881             result = bootStrapObjectList;
882             bootStrapObjectList = bootStrapObjectList->next;
883         } else {
884             if (!bootStrapBlock) {
885                 bootStrapBlock = memPool->getEmptyBlock(size);
886                 if (!bootStrapBlock) return nullptr;
887             }
888             result = bootStrapBlock->bumpPtr;
889             bootStrapBlock->bumpPtr = (FreeObject *)((uintptr_t)bootStrapBlock->bumpPtr - bootStrapBlock->objectSize);
890             if ((uintptr_t)bootStrapBlock->bumpPtr < (uintptr_t)bootStrapBlock+sizeof(Block)) {
891                 bootStrapBlock->bumpPtr = nullptr;
892                 bootStrapBlock->next = bootStrapBlockUsed;
893                 bootStrapBlockUsed = bootStrapBlock;
894                 bootStrapBlock = nullptr;
895             }
896         }
897     } // Unlock with release
898     memset (result, 0, size);
899     return (void*)result;
900 }
901 
902 void BootStrapBlocks::free(void* ptr)
903 {
904     MALLOC_ASSERT( ptr, ASSERT_TEXT );
905     { // Lock with acquire
906         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
907         ((FreeObject*)ptr)->next = bootStrapObjectList;
908         bootStrapObjectList = (FreeObject*)ptr;
909     } // Unlock with release
910 }
911 
912 void BootStrapBlocks::reset()
913 {
914     bootStrapBlock = bootStrapBlockUsed = nullptr;
915     bootStrapObjectList = nullptr;
916 }
917 
918 #if !(FREELIST_NONBLOCKING)
919 static MallocMutex publicFreeListLock; // lock for changes of publicFreeList
920 #endif
921 
922 /********* End rough utility code  **************/
923 
924 /* LifoList assumes zero initialization so a vector of it can be created
925  * by just allocating some space with no call to constructor.
926  * On Linux, it seems to be necessary to avoid linking with C++ libraries.
927  *
928  * By usage convention there is no race on the initialization. */
929 LifoList::LifoList( ) : top(nullptr)
930 {
931     // MallocMutex assumes zero initialization
932     memset(static_cast<void*>(&lock), 0, sizeof(MallocMutex));
933 }
934 
935 void LifoList::push(Block *block)
936 {
937     MallocMutex::scoped_lock scoped_cs(lock);
938     block->next = top.load(std::memory_order_relaxed);
939     top.store(block, std::memory_order_relaxed);
940 }
941 
942 Block *LifoList::pop()
943 {
944     Block* block = nullptr;
945     if (top.load(std::memory_order_relaxed)) {
946         MallocMutex::scoped_lock scoped_cs(lock);
947         block = top.load(std::memory_order_relaxed);
948         if (block) {
949             top.store(block->next, std::memory_order_relaxed);
950         }
951     }
952     return block;
953 }
954 
955 Block *LifoList::grab()
956 {
957     Block *block = nullptr;
958     if (top.load(std::memory_order_relaxed)) {
959         MallocMutex::scoped_lock scoped_cs(lock);
960         block = top.load(std::memory_order_relaxed);
961         top.store(nullptr, std::memory_order_relaxed);
962     }
963     return block;
964 }
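// Note on the three operations above: push() prepends a single block under the lock,
// pop() detaches only the head block, while grab() detaches the entire chain at once;
// a caller of grab() can then walk the returned blocks via Block::next without
// holding the list lock.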
965 
966 /********* Thread and block related code      *************/
967 
968 template<bool poolDestroy> void AllLargeBlocksList::releaseAll(Backend *backend) {
969      LargeMemoryBlock *next, *lmb = loHead;
970      loHead = nullptr;
971 
972      for (; lmb; lmb = next) {
973          next = lmb->gNext;
974          if (poolDestroy) {
975              // as it's pool destruction, no need to return object to backend,
976              // only remove backrefs, as they are global
977              removeBackRef(lmb->backRefIdx);
978          } else {
979              // clean g(Next|Prev) to prevent removing lmb
980              // from AllLargeBlocksList inside returnLargeObject
981              lmb->gNext = lmb->gPrev = nullptr;
982              backend->returnLargeObject(lmb);
983          }
984      }
985 }
986 
987 TLSData* MemoryPool::getTLS(bool create)
988 {
989     TLSData* tls = extMemPool.tlsPointerKey.getThreadMallocTLS();
990     if (create && !tls)
991         tls = extMemPool.tlsPointerKey.createTLS(this, &extMemPool.backend);
992     return tls;
993 }
994 
995 /*
996  * Return the bin for the given size.
997  */
998 inline Bin* TLSData::getAllocationBin(size_t size)
999 {
1000     return bin + getIndex(size);
1001 }
1002 
1003 /* Return an empty uninitialized block in a non-blocking fashion. */
1004 Block *MemoryPool::getEmptyBlock(size_t size)
1005 {
1006     TLSData* tls = getTLS(/*create=*/false);
1007     // try to use per-thread cache, if TLS available
1008     FreeBlockPool::ResOfGet resOfGet = tls?
1009         tls->freeSlabBlocks.getBlock() : FreeBlockPool::ResOfGet(nullptr, false);
1010     Block *result = resOfGet.block;
1011 
1012     if (!result) { // not found in the local cache, ask the backend for slabs
1013         int num = resOfGet.lastAccMiss? Backend::numOfSlabAllocOnMiss : 1;
1014         BackRefIdx backRefIdx[Backend::numOfSlabAllocOnMiss];
1015 
1016         result = static_cast<Block*>(extMemPool.backend.getSlabBlock(num));
1017         if (!result) return nullptr;
1018 
1019         if (!extMemPool.userPool())
1020             for (int i=0; i<num; i++) {
1021                 backRefIdx[i] = BackRefIdx::newBackRef(/*largeObj=*/false);
1022                 if (backRefIdx[i].isInvalid()) {
1023                     // roll back resource allocation
1024                     for (int j=0; j<i; j++)
1025                         removeBackRef(backRefIdx[j]);
1026                     Block *b = result;
1027                     for (int j=0; j<num; b=(Block*)((uintptr_t)b+slabSize), j++)
1028                         extMemPool.backend.putSlabBlock(b);
1029                     return nullptr;
1030                 }
1031             }
1032         // resources were allocated, register blocks
1033         Block *b = result;
1034         for (int i=0; i<num; b=(Block*)((uintptr_t)b+slabSize), i++) {
1035             // slab block in user's pool must have invalid backRefIdx
1036             if (extMemPool.userPool()) {
1037                 new (&b->backRefIdx) BackRefIdx();
1038             } else {
1039                 setBackRef(backRefIdx[i], b);
1040                 b->backRefIdx = backRefIdx[i];
1041             }
1042             b->tlsPtr.store(tls, std::memory_order_relaxed);
1043             b->poolPtr = this;
1044             // all but first one go to per-thread pool
1045             if (i > 0) {
1046                 MALLOC_ASSERT(tls, ASSERT_TEXT);
1047                 tls->freeSlabBlocks.returnBlock(b);
1048             }
1049         }
1050     }
1051     MALLOC_ASSERT(result, ASSERT_TEXT);
1052     result->initEmptyBlock(tls, size);
1053     STAT_increment(getThreadId(), getIndex(result->objectSize), allocBlockNew);
1054     return result;
1055 }
1056 
1057 void MemoryPool::returnEmptyBlock(Block *block, bool poolTheBlock)
1058 {
1059     block->reset();
1060     if (poolTheBlock) {
1061         getTLS(/*create=*/false)->freeSlabBlocks.returnBlock(block);
1062     } else {
1063         // slab blocks in user's pools do not have valid backRefIdx
1064         if (!extMemPool.userPool())
1065             removeBackRef(*(block->getBackRefIdx()));
1066         extMemPool.backend.putSlabBlock(block);
1067     }
1068 }
1069 
1070 bool ExtMemoryPool::init(intptr_t poolId, rawAllocType rawAlloc,
1071                          rawFreeType rawFree, size_t granularity,
1072                          bool keepAllMemory, bool fixedPool)
1073 {
1074     this->poolId = poolId;
1075     this->rawAlloc = rawAlloc;
1076     this->rawFree = rawFree;
1077     this->granularity = granularity;
1078     this->keepAllMemory = keepAllMemory;
1079     this->fixedPool = fixedPool;
1080     this->delayRegsReleasing = false;
1081     if (!initTLS())
1082         return false;
1083     loc.init(this);
1084     backend.init(this);
1085     MALLOC_ASSERT(isPoolValid(), nullptr);
1086     return true;
1087 }
1088 
1089 bool ExtMemoryPool::initTLS() { return tlsPointerKey.init(); }
1090 
1091 bool MemoryPool::init(intptr_t poolId, const MemPoolPolicy *policy)
1092 {
1093     if (!extMemPool.init(poolId, policy->pAlloc, policy->pFree,
1094                policy->granularity? policy->granularity : defaultGranularity,
1095                policy->keepAllMemory, policy->fixedPool))
1096         return false;
1097     {
1098         MallocMutex::scoped_lock lock(memPoolListLock);
1099         next = defaultMemPool->next;
1100         defaultMemPool->next = this;
1101         prev = defaultMemPool;
1102         if (next)
1103             next->prev = this;
1104     }
1105     return true;
1106 }
1107 
1108 bool MemoryPool::reset()
1109 {
1110     MALLOC_ASSERT(extMemPool.userPool(), "No reset for the system pool.");
1111     // memory is not released during pool reset
1112     // TODO: mark regions to release unused on next reset()
1113     extMemPool.delayRegionsReleasing(true);
1114 
1115     bootStrapBlocks.reset();
1116     extMemPool.lmbList.releaseAll</*poolDestroy=*/false>(&extMemPool.backend);
1117     if (!extMemPool.reset())
1118         return false;
1119 
1120     if (!extMemPool.initTLS())
1121         return false;
1122     extMemPool.delayRegionsReleasing(false);
1123     return true;
1124 }
1125 
1126 bool MemoryPool::destroy()
1127 {
1128 #if __TBB_MALLOC_LOCACHE_STAT
1129     extMemPool.loc.reportStat(stdout);
1130 #endif
1131 #if __TBB_MALLOC_BACKEND_STAT
1132     extMemPool.backend.reportStat(stdout);
1133 #endif
1134     {
1135         MallocMutex::scoped_lock lock(memPoolListLock);
1136         // remove itself from global pool list
1137         if (prev)
1138             prev->next = next;
1139         if (next)
1140             next->prev = prev;
1141     }
1142     // slab blocks in non-default pool do not have backreferences,
1143     // only large objects do
1144     if (extMemPool.userPool())
1145         extMemPool.lmbList.releaseAll</*poolDestroy=*/true>(&extMemPool.backend);
1146     else {
1147         // only one non-userPool() is supported now
1148         MALLOC_ASSERT(this==defaultMemPool, nullptr);
1149         // Here and below in extMemPool.destroy(), the initial state is not restored
1150         // for a user pool, because it is just about to be released. But the system
1151         // pool is restored, so that it need not be re-zeroed on a subsequent reload.
1152         bootStrapBlocks.reset();
1153         extMemPool.orphanedBlocks.reset();
1154     }
1155     return extMemPool.destroy();
1156 }
1157 
1158 void MemoryPool::onThreadShutdown(TLSData *tlsData)
1159 {
1160     if (tlsData) { // might be called for "empty" TLS
1161         tlsData->release();
1162         bootStrapBlocks.free(tlsData);
1163         clearTLS();
1164     }
1165 }
1166 
1167 #if MALLOC_DEBUG
1168 void Bin::verifyTLSBin (size_t size) const
1169 {
1170 /* The debug version verifies the TLSBin as needed */
1171     uint32_t objSize = getObjectSize(size);
1172 
1173     if (activeBlk) {
1174         MALLOC_ASSERT( activeBlk->isOwnedByCurrentThread(), ASSERT_TEXT );
1175         MALLOC_ASSERT( activeBlk->objectSize == objSize, ASSERT_TEXT );
1176 #if MALLOC_DEBUG>1
1177         for (Block* temp = activeBlk->next; temp; temp=temp->next) {
1178             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1179             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1180             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1181             MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1182             if (temp->next) {
1183                 MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1184             }
1185         }
1186         for (Block* temp = activeBlk->previous; temp; temp=temp->previous) {
1187             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1188             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1189             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1190             MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1191             if (temp->previous) {
1192                 MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1193             }
1194         }
1195 #endif /* MALLOC_DEBUG>1 */
1196     }
1197 }
1198 #else /* MALLOC_DEBUG */
1199 inline void Bin::verifyTLSBin (size_t) const { }
1200 #endif /* MALLOC_DEBUG */
1201 
1202 /*
1203  * Add a block to the start of this tls bin list.
1204  */
1205 void Bin::pushTLSBin(Block* block)
1206 {
1207     /* The objectSize should be defined and not a parameter
1208        because the function is applied to partially filled blocks as well */
1209     unsigned int size = block->objectSize;
1210 
1211     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1212     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1213     MALLOC_ASSERT( block->next == nullptr, ASSERT_TEXT );
1214     MALLOC_ASSERT( block->previous == nullptr, ASSERT_TEXT );
1215 
1216     MALLOC_ASSERT( this, ASSERT_TEXT );
1217     verifyTLSBin(size);
1218 
1219     block->next = activeBlk;
1220     if( activeBlk ) {
1221         block->previous = activeBlk->previous;
1222         activeBlk->previous = block;
1223         if( block->previous )
1224             block->previous->next = block;
1225     } else {
1226         activeBlk = block;
1227     }
1228 
1229     verifyTLSBin(size);
1230 }
1231 
1232 /*
1233  * Take a block out of its tls bin (e.g. before removal).
1234  */
1235 void Bin::outofTLSBin(Block* block)
1236 {
1237     unsigned int size = block->objectSize;
1238 
1239     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1240     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1241 
1242     MALLOC_ASSERT( this, ASSERT_TEXT );
1243     verifyTLSBin(size);
1244 
1245     if (block == activeBlk) {
1246         activeBlk = block->previous? block->previous : block->next;
1247     }
1248     /* Unlink the block */
1249     if (block->previous) {
1250         MALLOC_ASSERT( block->previous->next == block, ASSERT_TEXT );
1251         block->previous->next = block->next;
1252     }
1253     if (block->next) {
1254         MALLOC_ASSERT( block->next->previous == block, ASSERT_TEXT );
1255         block->next->previous = block->previous;
1256     }
1257     block->next = nullptr;
1258     block->previous = nullptr;
1259 
1260     verifyTLSBin(size);
1261 }
1262 
1263 Block* Bin::getPrivatizedFreeListBlock()
1264 {
1265     Block* block;
1266     MALLOC_ASSERT( this, ASSERT_TEXT );
1267     // if this method is called, active block usage must be unsuccessful
1268     MALLOC_ASSERT( (!activeBlk && !mailbox.load(std::memory_order_relaxed)) || (activeBlk && activeBlk->isFull), ASSERT_TEXT );
1269 
1270 // the counter should be changed    STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1271     if (!mailbox.load(std::memory_order_acquire)) // hotpath is empty mailbox
1272         return nullptr;
1273     else { // mailbox is not empty, take lock and inspect it
1274         MallocMutex::scoped_lock scoped_cs(mailLock);
1275         block = mailbox.load(std::memory_order_relaxed);
1276         if( block ) {
1277             MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1278             MALLOC_ASSERT( !isNotForUse(block->nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1279             mailbox.store(block->nextPrivatizable.load(std::memory_order_relaxed), std::memory_order_relaxed);
1280             block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1281         }
1282     }
1283     if( block ) {
1284         MALLOC_ASSERT( isSolidPtr(block->publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1285         block->privatizePublicFreeList();
1286         block->adjustPositionInBin(this);
1287     }
1288     return block;
1289 }
1290 
1291 void Bin::addPublicFreeListBlock(Block* block)
1292 {
1293     MallocMutex::scoped_lock scoped_cs(mailLock);
1294     block->nextPrivatizable.store(mailbox.load(std::memory_order_relaxed), std::memory_order_relaxed);
1295     mailbox.store(block, std::memory_order_relaxed);
1296 }
1297 
1298 // Process publicly freed objects in all blocks and return empty blocks
1299 // to the backend in order to reduce overall footprint.
1300 bool Bin::cleanPublicFreeLists()
1301 {
1302     Block* block;
1303     if (!mailbox.load(std::memory_order_acquire))
1304         return false;
1305     else {
1306         // Grab all the blocks in the mailbox
1307         MallocMutex::scoped_lock scoped_cs(mailLock);
1308         block = mailbox.load(std::memory_order_relaxed);
1309         mailbox.store(nullptr, std::memory_order_relaxed);
1310     }
1311     bool released = false;
1312     while (block) {
1313         MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1314         Block* tmp = block->nextPrivatizable.load(std::memory_order_relaxed);
1315         block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1316         block->privatizePublicFreeList();
1317         if (block->empty()) {
1318             processEmptyBlock(block, /*poolTheBlock=*/false);
1319             released = true;
1320         } else
1321             block->adjustPositionInBin(this);
1322         block = tmp;
1323     }
1324     return released;
1325 }
1326 
1327 bool Block::adjustFullness()
1328 {
1329     if (bumpPtr) {
1330         /* If we are still using a bump ptr for this block it is empty enough to use. */
1331         STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1332         isFull = false;
1333     } else {
1334         const float threshold = (slabSize - sizeof(Block)) * (1 - emptyEnoughRatio);
1335         /* allocatedCount shows how many objects in the block are in use; however it still counts
1336          * objects freed by other threads, so a prior call to privatizePublicFreeList() is recommended */
1337         isFull = allocatedCount*objectSize > threshold;
1338 #if COLLECT_STATISTICS
1339         if (isFull)
1340             STAT_increment(getThreadId(), getIndex(objectSize), examineNotEmpty);
1341         else
1342             STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1343 #endif
1344     }
1345     return isFull;
1346 }
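// Worked example, assuming slabSize == 16*1024, sizeof(Block) == 128 and
// emptyEnoughRatio == 1/4: threshold == (16384 - 128) * 0.75 == 12192, so for
// objectSize == 64 the block is treated as full while allocatedCount*64 > 12192,
// i.e. while more than 190 of the at most 254 objects are still allocated.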
1347 
1348 // This method resides in class Block, and not in class Bin, in order to avoid
1349 // calling getAllocationBin on a reasonably hot path in Block::freeOwnObject
1350 void Block::adjustPositionInBin(Bin* bin/*=nullptr*/)
1351 {
1352     // If the block was full but has become empty enough to use,
1353     // move it to the front of the list
1354     if (isFull && !adjustFullness()) {
1355         if (!bin)
1356             bin = tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize);
1357         bin->moveBlockToFront(this);
1358     }
1359 }
1360 
1361 /* Restore the bump pointer for an empty block that is planned to be used */
1362 void Block::restoreBumpPtr()
1363 {
1364     MALLOC_ASSERT( allocatedCount == 0, ASSERT_TEXT );
1365     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1366     STAT_increment(getThreadId(), getIndex(objectSize), freeRestoreBumpPtr);
1367     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1368     freeList = nullptr;
1369     isFull = false;
1370 }
1371 
1372 void Block::freeOwnObject(void *object)
1373 {
1374     tlsPtr.load(std::memory_order_relaxed)->markUsed();
1375     allocatedCount--;
1376     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1377 #if COLLECT_STATISTICS
1378     // Note that getAllocationBin is not called on the hottest path with statistics off.
1379     if (tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->getActiveBlock() != this)
1380         STAT_increment(getThreadId(), getIndex(objectSize), freeToInactiveBlock);
1381     else
1382         STAT_increment(getThreadId(), getIndex(objectSize), freeToActiveBlock);
1383 #endif
1384     if (empty()) {
1385         // If the last object of a slab is freed, the slab cannot be marked full
1386         MALLOC_ASSERT(!isFull, ASSERT_TEXT);
1387         tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->processEmptyBlock(this, /*poolTheBlock=*/true);
1388     } else { // hot path
1389         FreeObject *objectToFree = findObjectToFree(object);
1390         objectToFree->next = freeList;
1391         freeList = objectToFree;
1392         adjustPositionInBin();
1393     }
1394 }
1395 
1396 void Block::freePublicObject (FreeObject *objectToFree)
1397 {
1398     FreeObject* localPublicFreeList{};
1399 
1400     MALLOC_ITT_SYNC_RELEASING(&publicFreeList);
1401 #if FREELIST_NONBLOCKING
1402     // TBB_REVAMP_TODO: make it non atomic in non-blocking scenario
1403     localPublicFreeList = publicFreeList.load(std::memory_order_relaxed);
1404     do {
1405         objectToFree->next = localPublicFreeList;
1406         // no backoff necessary because trying to make change, not waiting for a change
1407     } while( !publicFreeList.compare_exchange_strong(localPublicFreeList, objectToFree) );
1408 #else
1409     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1410     {
1411         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1412         localPublicFreeList = objectToFree->next = publicFreeList;
1413         publicFreeList = objectToFree;
1414     }
1415 #endif
1416 
1417     if( localPublicFreeList==nullptr ) {
1418         // if the block is abandoned, its nextPrivatizable pointer should be UNUSABLE
1419         // otherwise, it should point to the bin the block belongs to.
1420         // reading nextPrivatizable is thread-safe below, because:
1421         // 1) the executing thread atomically got publicFreeList==nullptr and changed it to non-nullptr;
1422         // 2) only the owning thread can change it back to nullptr,
1423         // 3) but that cannot be done until the block is put into the mailbox
1424         // So the executing thread is now the only one that can change nextPrivatizable
1425         Block* next = nextPrivatizable.load(std::memory_order_acquire);
1426         if( !isNotForUse(next) ) {
1427             MALLOC_ASSERT( next!=nullptr, ASSERT_TEXT );
1428             Bin* theBin = (Bin*) next;
1429 #if MALLOC_DEBUG && TBB_REVAMP_TODO
1430             // FIXME: The thread that returns the block is not the block's owner.
1431             // The below assertion compares 'theBin' against the caller's local bin, thus, it always fails.
1432             // Need to find a way to get the correct remote bin for comparison.
1433             { // check that nextPrivatizable points to the bin the block belongs to
1434                 uint32_t index = getIndex( objectSize );
1435                 TLSData* tls = getThreadMallocTLS();
1436                 MALLOC_ASSERT( theBin==tls->bin+index, ASSERT_TEXT );
1437             }
1438 #endif // MALLOC_DEBUG
1439             theBin->addPublicFreeListBlock(this);
1440         }
1441     }
1442     STAT_increment(getThreadId(), ThreadCommonCounters, freeToOtherThread);
1443     STAT_increment(ownerTid.load(std::memory_order_relaxed), getIndex(objectSize), freeByOtherThread);
1444 }
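/*
 * The non-blocking branch above is a Treiber-style lock-free LIFO push.
 * A minimal standalone sketch of the same pattern (Node and stackHead are
 * hypothetical names used only for illustration, not tbbmalloc entities):
 *
 *   struct Node { Node *next; };
 *   static std::atomic<Node*> stackHead{nullptr};
 *
 *   void push(Node *n) {
 *       Node *old = stackHead.load(std::memory_order_relaxed);
 *       do {
 *           n->next = old;  // link the new node to the observed head
 *           // on failure, compare_exchange_strong reloads 'old' for us,
 *           // so no explicit backoff is needed: we make a change rather
 *           // than wait for one.
 *       } while (!stackHead.compare_exchange_strong(old, n));
 *   }
 *
 * freePublicObject() additionally inspects the previous head: a nullptr
 * result means this thread published the first foreign object and must
 * notify the owner through the bin mailbox (addPublicFreeListBlock).
 */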
1445 
1446 // Make objects freed by other threads available for use again
1447 void Block::privatizePublicFreeList( bool reset )
1448 {
1449     FreeObject *localPublicFreeList;
1450     // If reset is false, publicFreeList should not be zeroed but set to UNUSABLE
1451     // to properly synchronize with other threads freeing objects to this slab.
1452     const intptr_t endMarker = reset ? 0 : UNUSABLE;
1453 
1454     // Only the owner thread may reset the pointer to nullptr
1455     MALLOC_ASSERT( isOwnedByCurrentThread() || !reset, ASSERT_TEXT );
1456 #if FREELIST_NONBLOCKING
1457     localPublicFreeList = publicFreeList.exchange((FreeObject*)endMarker);
1458 #else
1459     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1460     {
1461         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1462         localPublicFreeList = publicFreeList;
1463         publicFreeList = endMarker;
1464     }
1465 #endif
1466     MALLOC_ITT_SYNC_ACQUIRED(&publicFreeList);
1467     MALLOC_ASSERT( !(reset && isNotForUse(publicFreeList.load(std::memory_order_relaxed))), ASSERT_TEXT );
1468 
1469     // publicFreeList must have been UNUSABLE or valid, but not nullptr
1470     MALLOC_ASSERT( localPublicFreeList!=nullptr, ASSERT_TEXT );
1471     if( isSolidPtr(localPublicFreeList) ) {
1472         MALLOC_ASSERT( allocatedCount <= (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1473         /* other threads did not change the counter freeing our blocks */
1474         allocatedCount--;
1475         FreeObject *temp = localPublicFreeList;
1476         while( isSolidPtr(temp->next) ){ // the list will end with either nullptr or UNUSABLE
1477             temp = temp->next;
1478             allocatedCount--;
1479             MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1480         }
1481         /* merge with local freeList */
1482         temp->next = freeList;
1483         freeList = localPublicFreeList;
1484         STAT_increment(getThreadId(), getIndex(objectSize), allocPrivatized);
1485     }
1486 }
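/*
 * For reference, publicFreeList of a slab moves between three states, as can
 * be traced through freePublicObject(), privatizePublicFreeList(),
 * readyToShare() and shareOrphaned():
 *   - nullptr:      owned block with no pending foreign frees;
 *   - a solid ptr:  list of objects freed by other threads, waiting to be
 *                   privatized by the owner;
 *   - UNUSABLE:     marker left by privatization without reset or by
 *                   orphaning; foreign threads may still push objects on
 *                   top of it, which is why the privatized list ends with
 *                   either nullptr or UNUSABLE but never a dangling value.
 */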
1487 
1488 void Block::privatizeOrphaned(TLSData *tls, unsigned index)
1489 {
1490     Bin* bin = tls->bin + index;
1491     STAT_increment(getThreadId(), index, allocBlockPublic);
1492     next = nullptr;
1493     previous = nullptr;
1494     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) != nullptr, ASSERT_TEXT );
1495     /* There is not a race here since no other thread owns this block */
1496     markOwned(tls);
1497     // It is safe to change nextPrivatizable, as publicFreeList is not null
1498     MALLOC_ASSERT( isNotForUse(nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1499     nextPrivatizable.store((Block*)bin, std::memory_order_relaxed);
1500     // the next call is required to change publicFreeList to 0
1501     privatizePublicFreeList();
1502     if( empty() ) {
1503         restoreBumpPtr();
1504     } else {
1505         adjustFullness(); // check the block fullness and set isFull
1506     }
1507     MALLOC_ASSERT( !isNotForUse(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1508 }
1509 
1510 
1511 bool Block::readyToShare()
1512 {
1513     FreeObject* oldVal = nullptr;
1514 #if FREELIST_NONBLOCKING
1515     publicFreeList.compare_exchange_strong(oldVal, (FreeObject*)UNUSABLE);
1516 #else
1517     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1518     {
1519         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1520         if ( (oldVal=publicFreeList)==nullptr )
1521             (intptr_t&)(publicFreeList) = UNUSABLE;
1522     }
1523 #endif
1524     return oldVal==nullptr;
1525 }
1526 
1527 void Block::shareOrphaned(intptr_t binTag, unsigned index)
1528 {
1529     MALLOC_ASSERT( binTag, ASSERT_TEXT );
1530     // suppress the "unreferenced formal parameter" warning
1531     tbb::detail::suppress_unused_warning(index);
1532     STAT_increment(getThreadId(), index, freeBlockPublic);
1533     markOrphaned();
1534     if ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1535         // First check passed: the block is not in mailbox yet.
1536         // Need to set publicFreeList to non-zero, so other threads
1537         // will not change nextPrivatizable and it can be zeroed.
1538         if ( !readyToShare() ) {
1539             // another thread freed an object; we need to wait until it finishes.
1540             // There is no need for exponential backoff, as the wait here is not for a lock;
1541             // but we need to yield, so the thread we wait for has a chance to run.
1542             // TODO: add a pause to also be friendly to hyperthreads
1543             int count = 256;
1544             while ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1545                 if (--count==0) {
1546                     do_yield();
1547                     count = 256;
1548                 }
1549             }
1550         }
1551     }
1552     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) !=nullptr, ASSERT_TEXT );
1553     // now it is safe to change our data
1554     previous = nullptr;
1555     // it is the caller's responsibility to ensure that the list of blocks
1556     // formed by nextPrivatizable pointers is kept consistent if required;
1557     // if this is only called from thread shutdown code, it does not matter.
1558     nextPrivatizable.store((Block*)UNUSABLE, std::memory_order_relaxed);
1559 }
1560 
1561 void Block::cleanBlockHeader()
1562 {
1563     next = nullptr;
1564     previous = nullptr;
1565     freeList = nullptr;
1566     allocatedCount = 0;
1567     isFull = false;
1568     tlsPtr.store(nullptr, std::memory_order_relaxed);
1569 
1570     publicFreeList.store(nullptr, std::memory_order_relaxed);
1571 }
1572 
1573 void Block::initEmptyBlock(TLSData *tls, size_t size)
1574 {
1575     // Having getIndex and getObjectSize called next to each other
1576     // allows better compiler optimization as they basically share the code.
1577     unsigned int index = getIndex(size);
1578     unsigned int objSz = getObjectSize(size);
1579 
1580     cleanBlockHeader();
1581     objectSize = objSz;
1582     markOwned(tls);
1583     // the bump pointer should be prepared for the first allocation - thus move it down by objectSize
1584     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1585 
1586     // each block should store the address where the head of the list of "privatizable" blocks is kept;
1587     // the only exception is a bootstrap block, which is initialized while TLS is still nullptr
1588     nextPrivatizable.store( tls? (Block*)(tls->bin + index) : nullptr, std::memory_order_relaxed);
1589     TRACEF(( "[ScalableMalloc trace] Empty block %p is initialized, owner is %ld, objectSize is %d, bumpPtr is %p\n",
1590              this, tlsPtr.load(std::memory_order_relaxed) ? getThreadId() : -1, objectSize, bumpPtr ));
1591 }
1592 
1593 Block *OrphanedBlocks::get(TLSData *tls, unsigned int size)
1594 {
1595     // TODO: try to use index from getAllocationBin
1596     unsigned int index = getIndex(size);
1597     Block *block = bins[index].pop();
1598     if (block) {
1599         MALLOC_ITT_SYNC_ACQUIRED(bins+index);
1600         block->privatizeOrphaned(tls, index);
1601     }
1602     return block;
1603 }
1604 
1605 void OrphanedBlocks::put(intptr_t binTag, Block *block)
1606 {
1607     unsigned int index = getIndex(block->getSize());
1608     block->shareOrphaned(binTag, index);
1609     MALLOC_ITT_SYNC_RELEASING(bins+index);
1610     bins[index].push(block);
1611 }
1612 
1613 void OrphanedBlocks::reset()
1614 {
1615     for (uint32_t i=0; i<numBlockBinLimit; i++)
1616         new (bins+i) LifoList();
1617 }
1618 
1619 bool OrphanedBlocks::cleanup(Backend* backend)
1620 {
1621     bool released = false;
1622     for (uint32_t i=0; i<numBlockBinLimit; i++) {
1623         Block* block = bins[i].grab();
1624         MALLOC_ITT_SYNC_ACQUIRED(bins+i);
1625         while (block) {
1626             Block* next = block->next;
1627             block->privatizePublicFreeList( /*reset=*/false ); // do not set publicFreeList to nullptr
1628             if (block->empty()) {
1629                 block->reset();
1630                 // slab blocks in user's pools do not have valid backRefIdx
1631                 if (!backend->inUserPool())
1632                     removeBackRef(*(block->getBackRefIdx()));
1633                 backend->putSlabBlock(block);
1634                 released = true;
1635             } else {
1636                 MALLOC_ITT_SYNC_RELEASING(bins+i);
1637                 bins[i].push(block);
1638             }
1639             block = next;
1640         }
1641     }
1642     return released;
1643 }
1644 
1645 FreeBlockPool::ResOfGet FreeBlockPool::getBlock()
1646 {
1647     Block *b = head.exchange(nullptr);
1648     bool lastAccessMiss;
1649 
1650     if (b) {
1651         size--;
1652         Block *newHead = b->next;
1653         lastAccessMiss = false;
1654         head.store(newHead, std::memory_order_release);
1655     } else {
1656         lastAccessMiss = true;
1657     }
1658     return ResOfGet(b, lastAccessMiss);
1659 }
1660 
1661 void FreeBlockPool::returnBlock(Block *block)
1662 {
1663     MALLOC_ASSERT( size <= POOL_HIGH_MARK, ASSERT_TEXT );
1664     Block *localHead = head.exchange(nullptr);
1665 
1666     if (!localHead) {
1667         size = 0; // head was stolen by externalCleanup, correct size accordingly
1668     } else if (size == POOL_HIGH_MARK) {
1669         // release cold blocks and add the hot one:
1670         // keep POOL_LOW_MARK-1 blocks and add the new block to the head
1671         Block *headToFree = localHead, *helper;
1672         for (int i=0; i<POOL_LOW_MARK-2; i++)
1673             headToFree = headToFree->next;
1674         Block *last = headToFree;
1675         headToFree = headToFree->next;
1676         last->next = nullptr;
1677         size = POOL_LOW_MARK-1;
1678         for (Block *currBl = headToFree; currBl; currBl = helper) {
1679             helper = currBl->next;
1680             // slab blocks in user's pools do not have valid backRefIdx
1681             if (!backend->inUserPool())
1682                 removeBackRef(currBl->backRefIdx);
1683             backend->putSlabBlock(currBl);
1684         }
1685     }
1686     size++;
1687     block->next = localHead;
1688     head.store(block, std::memory_order_release);
1689 }
1690 
1691 bool FreeBlockPool::externalCleanup()
1692 {
1693     Block *helper;
1694     bool released = false;
1695 
1696     for (Block *currBl=head.exchange(nullptr); currBl; currBl=helper) {
1697         helper = currBl->next;
1698         // slab blocks in user's pools do not have valid backRefIdx
1699         if (!backend->inUserPool())
1700             removeBackRef(currBl->backRefIdx);
1701         backend->putSlabBlock(currBl);
1702         released = true;
1703     }
1704     return released;
1705 }
1706 
1707 /* Prepare the block for returning to FreeBlockPool */
1708 void Block::reset()
1709 {
1710     // it is the caller's responsibility to ensure no data is lost before calling this
1711     MALLOC_ASSERT( allocatedCount==0, ASSERT_TEXT );
1712     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1713     if (!isStartupAllocObject())
1714         STAT_increment(getThreadId(), getIndex(objectSize), freeBlockBack);
1715 
1716     cleanBlockHeader();
1717 
1718     nextPrivatizable.store(nullptr, std::memory_order_relaxed);
1719 
1720     objectSize = 0;
1721     // for an empty block, bump pointer should point right after the end of the block
1722     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize);
1723 }
1724 
1725 inline void Bin::setActiveBlock (Block *block)
1726 {
1727 //    MALLOC_ASSERT( bin, ASSERT_TEXT );
1728     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1729     // it is the caller's responsibility to keep the bin consistent (i.e. ensure this block is in the bin list)
1730     activeBlk = block;
1731 }
1732 
1733 inline Block* Bin::setPreviousBlockActive()
1734 {
1735     MALLOC_ASSERT( activeBlk, ASSERT_TEXT );
1736     Block* temp = activeBlk->previous;
1737     if( temp ) {
1738         MALLOC_ASSERT( !(temp->isFull), ASSERT_TEXT );
1739         activeBlk = temp;
1740     }
1741     return temp;
1742 }
1743 
1744 inline bool Block::isOwnedByCurrentThread() const {
1745     return tlsPtr.load(std::memory_order_relaxed) && ownerTid.isCurrentThreadId();
1746 }
1747 
1748 FreeObject *Block::findObjectToFree(const void *object) const
1749 {
1750     FreeObject *objectToFree;
1751     // Due to aligned allocations, a pointer passed to scalable_free
1752     // might differ from the address of the internally allocated object.
1753     // Small objects, however, should always be fine.
1754     if (objectSize <= maxSegregatedObjectSize)
1755         objectToFree = (FreeObject*)object;
1756     // "Fitting size" allocations are suspect only if aligned more strictly than their natural alignment
1757     else {
1758         if ( ! isAligned(object,2*fittingAlignment) )
1759             // TODO: the above check is questionable - it gives false negatives in ~50% cases,
1760             //       so it might even be slower on average than the unconditional use of findAllocatedObject.
1761             // here it should be a "real" object
1762             objectToFree = (FreeObject*)object;
1763         else
1764             // here object can be an aligned address, so applying additional checks
1765             objectToFree = findAllocatedObject(object);
1766         MALLOC_ASSERT( isAligned(objectToFree,fittingAlignment), ASSERT_TEXT );
1767     }
1768     MALLOC_ASSERT( isProperlyPlaced(objectToFree), ASSERT_TEXT );
1769 
1770     return objectToFree;
1771 }
1772 
1773 void TLSData::release()
1774 {
1775     memPool->extMemPool.allLocalCaches.unregisterThread(this);
1776     externalCleanup(/*cleanOnlyUnused=*/false, /*cleanBins=*/false);
1777 
1778     for (unsigned index = 0; index < numBlockBins; index++) {
1779         Block *activeBlk = bin[index].getActiveBlock();
1780         if (!activeBlk)
1781             continue;
1782         Block *threadlessBlock = activeBlk->previous;
1783         bool syncOnMailbox = false;
1784         while (threadlessBlock) {
1785             Block *threadBlock = threadlessBlock->previous;
1786             if (threadlessBlock->empty()) {
1787                 /* the thread is being destroyed, so do not use its block pool */
1788                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1789             } else {
1790                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1791                 syncOnMailbox = true;
1792             }
1793             threadlessBlock = threadBlock;
1794         }
1795         threadlessBlock = activeBlk;
1796         while (threadlessBlock) {
1797             Block *threadBlock = threadlessBlock->next;
1798             if (threadlessBlock->empty()) {
1799                 /* the thread is being destroyed, so do not use its block pool */
1800                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1801             } else {
1802                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1803                 syncOnMailbox = true;
1804             }
1805             threadlessBlock = threadBlock;
1806         }
1807         bin[index].resetActiveBlock();
1808 
1809         if (syncOnMailbox) {
1810             // Although we synchronized on nextPrivatizable inside a block, we still need to
1811             // synchronize on the bin lifetime because the thread releasing an object into the public
1812             // free list is touching the bin (mailbox and mailLock)
1813             MallocMutex::scoped_lock scoped_cs(bin[index].mailLock);
1814         }
1815     }
1816 }
1817 
1818 
1819 #if MALLOC_CHECK_RECURSION
1820 // TODO: Use dedicated heap for this
1821 
1822 /*
1823  * It's a special kind of allocation that can be used when malloc is
1824  * not available (either during startup or when malloc was already called and
1825  * we are, say, inside pthread_setspecific's call).
1826  * A block can contain objects of different sizes.
1827  * Allocations are performed by moving the bump pointer and incrementing an object counter;
1828  * releases are handled via the counter of objects allocated in the block,
1829  * or by moving the bump pointer back when the released object borders it.
1830  * TODO: make the bump pointer grow backward, in the same direction as in all other blocks.
1831  */
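/*
 * A minimal sketch of the same bump-pointer-plus-counter scheme on a plain
 * byte buffer (buf/bump/liveObjects are hypothetical names, for illustration
 * only; StartupBlock itself works on slab memory and also records each
 * object's size in front of it, see msize()):
 *
 *   static char   buf[4096];
 *   static size_t bump = 0;         // grows forward, unlike regular slabs
 *   static size_t liveObjects = 0;
 *
 *   void *exampleAlloc(size_t sz) {
 *       if (bump + sz > sizeof(buf)) return nullptr;
 *       void *p = buf + bump;
 *       bump += sz;
 *       ++liveObjects;
 *       return p;
 *   }
 *   void exampleFree() {
 *       if (--liveObjects == 0) bump = 0;  // whole buffer is reusable again
 *   }
 */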
1832 
1833 class StartupBlock : public Block {
1834     size_t availableSize() const {
1835         return slabSize - ((uintptr_t)bumpPtr - (uintptr_t)this);
1836     }
1837     static StartupBlock *getBlock();
1838 public:
1839     static FreeObject *allocate(size_t size);
1840     static size_t msize(void *ptr) { return *((size_t*)ptr - 1); }
1841     void free(void *ptr);
1842 };
1843 
1844 static MallocMutex startupMallocLock;
1845 static StartupBlock *firstStartupBlock;
1846 
1847 StartupBlock *StartupBlock::getBlock()
1848 {
1849     BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/false);
1850     if (backRefIdx.isInvalid()) return nullptr;
1851 
1852     StartupBlock *block = static_cast<StartupBlock*>(
1853         defaultMemPool->extMemPool.backend.getSlabBlock(1));
1854     if (!block) return nullptr;
1855 
1856     block->cleanBlockHeader();
1857     setBackRef(backRefIdx, block);
1858     block->backRefIdx = backRefIdx;
1859     // use startupAllocObjSizeMark to mark objects allocated from startup blocks
1860     block->objectSize = startupAllocObjSizeMark;
1861     block->bumpPtr = (FreeObject *)((uintptr_t)block + sizeof(StartupBlock));
1862     return block;
1863 }
1864 
1865 FreeObject *StartupBlock::allocate(size_t size)
1866 {
1867     FreeObject *result;
1868     StartupBlock *newBlock = nullptr;
1869 
1870     /* Objects must be aligned on their natural bounds,
1871        and objects bigger than a word on the word bound. */
1872     size = alignUp(size, sizeof(size_t));
1873     // We need size of an object to implement msize.
1874     size_t reqSize = size + sizeof(size_t);
1875     {
1876         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1877         // Check under the lock whether we need a new block (another thread might have already added one)
1878         if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) {
1879             if (!newBlock) {
1880                 newBlock = StartupBlock::getBlock();
1881                 if (!newBlock) return nullptr;
1882             }
1883             newBlock->next = (Block*)firstStartupBlock;
1884             if (firstStartupBlock)
1885                 firstStartupBlock->previous = (Block*)newBlock;
1886             firstStartupBlock = newBlock;
1887         }
1888         result = firstStartupBlock->bumpPtr;
1889         firstStartupBlock->allocatedCount++;
1890         firstStartupBlock->bumpPtr =
1891             (FreeObject *)((uintptr_t)firstStartupBlock->bumpPtr + reqSize);
1892     }
1893 
1894     // keep object size at the negative offset
1895     *((size_t*)result) = size;
1896     return (FreeObject*)((size_t*)result+1);
1897 }
1898 
1899 void StartupBlock::free(void *ptr)
1900 {
1901     Block* blockToRelease = nullptr;
1902     {
1903         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1904 
1905         MALLOC_ASSERT(firstStartupBlock, ASSERT_TEXT);
1906         MALLOC_ASSERT(startupAllocObjSizeMark==objectSize
1907                       && allocatedCount>0, ASSERT_TEXT);
1908         MALLOC_ASSERT((uintptr_t)ptr>=(uintptr_t)this+sizeof(StartupBlock)
1909                       && (uintptr_t)ptr+StartupBlock::msize(ptr)<=(uintptr_t)this+slabSize,
1910                       ASSERT_TEXT);
1911         if (0 == --allocatedCount) {
1912             if (this == firstStartupBlock)
1913                 firstStartupBlock = (StartupBlock*)firstStartupBlock->next;
1914             if (previous)
1915                 previous->next = next;
1916             if (next)
1917                 next->previous = previous;
1918             blockToRelease = this;
1919         } else if ((uintptr_t)ptr + StartupBlock::msize(ptr) == (uintptr_t)bumpPtr) {
1920             // last object in the block released
1921             FreeObject *newBump = (FreeObject*)((size_t*)ptr - 1);
1922             MALLOC_ASSERT((uintptr_t)newBump>(uintptr_t)this+sizeof(StartupBlock),
1923                           ASSERT_TEXT);
1924             bumpPtr = newBump;
1925         }
1926     }
1927     if (blockToRelease) {
1928         blockToRelease->previous = blockToRelease->next = nullptr;
1929         defaultMemPool->returnEmptyBlock(blockToRelease, /*poolTheBlock=*/false);
1930     }
1931 }
1932 
1933 #endif /* MALLOC_CHECK_RECURSION */
1934 
1935 /********* End thread related code  *************/
1936 
1937 /********* Library initialization *************/
1938 
1939 //! Value indicating the state of initialization.
1940 /* 0 = initialization not started.
1941  * 1 = initialization started but not finished.
1942  * 2 = initialization finished.
1943  * In theory, we only need values 0 and 2. But value 1 is nonetheless
1944  * useful for detecting errors in the double-check pattern.
1945  */
1946 static std::atomic<intptr_t> mallocInitialized{0};   // implicitly initialized to 0
1947 static MallocMutex initMutex;
1948 
1949 /** The leading "\0" is here so that applying "strings" to the binary
1950     delivers a clean result. */
1951 static char VersionString[] = "\0" TBBMALLOC_VERSION_STRINGS;
1952 
1953 #if USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED
1954 
1955 /* Decrease the race interval between dynamic library unloading and the pthread key
1956    destructor. Only pthread configurations with supported unloading are protected. */
1957 class ShutdownSync {
1958 /* flag is the number of threads in pthread key dtor body
1959    (i.e., between threadDtorStart() and threadDtorDone())
1960    or the signal to skip dtor, if flag < 0 */
1961     std::atomic<intptr_t> flag;
1962     static const intptr_t skipDtor = INTPTR_MIN/2;
1963 public:
1964     void init() { flag.store(0, std::memory_order_release); }
1965 /* We assume that fewer than 2*abs(skipDtor) threads ever call threadDtorStart()
1966    simultaneously, so flag never becomes negative because of that. */
1967     bool threadDtorStart() {
1968         if (flag.load(std::memory_order_acquire) < 0)
1969             return false;
1970         if (++flag <= 0) { // note that pre-increment returns the new value
1971             flag.fetch_sub(1); // flag is spoiled by us, restore it
1972             return false;
1973         }
1974         return true;
1975     }
1976     void threadDtorDone() {
1977         flag.fetch_sub(1);
1978     }
1979     void processExit() {
1980         if (flag.fetch_add(skipDtor) != 0) {
1981             SpinWaitUntilEq(flag, skipDtor);
1982         }
1983     }
1984 };
1985 
1986 #else
1987 
1988 class ShutdownSync {
1989 public:
1990     void init() { }
1991     bool threadDtorStart() { return true; }
1992     void threadDtorDone() { }
1993     void processExit() { }
1994 };
1995 
1996 #endif // USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED
1997 
1998 static ShutdownSync shutdownSync;
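/*
 * Intended usage of shutdownSync (a sketch; the real calls are in
 * doThreadShutdownNotification() below, and processExit() is expected to be
 * invoked from the process shutdown notification):
 *
 *   // in a pthread key destructor:
 *   if (shutdownSync.threadDtorStart()) {  // false => library is unloading
 *       // ... touch per-thread allocator state ...
 *       shutdownSync.threadDtorDone();
 *   }
 *
 *   // at process shutdown:
 *   shutdownSync.processExit();  // waits for in-flight destructors, then
 *                                // makes further threadDtorStart() calls fail
 */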
1999 
2000 inline bool isMallocInitialized() {
2001     // Load must have acquire fence; otherwise a thread taking the "initialized" path
2002     // might perform textually later loads *before* mallocInitialized becomes 2.
2003     return 2 == mallocInitialized.load(std::memory_order_acquire);
2004 }
2005 
2006 /* Caller is responsible for ensuring this routine is called exactly once. */
2007 extern "C" void MallocInitializeITT() {
2008 #if __TBB_USE_ITT_NOTIFY
2009     if (!usedBySrcIncluded)
2010         tbb::detail::r1::__TBB_load_ittnotify();
2011 #endif
2012 }
2013 
2014 void MemoryPool::initDefaultPool() {
2015     hugePages.init();
2016 }
2017 
2018 /*
2019  * Allocator initialization routine;
2020  * it is called lazily on the very first scalable_malloc call.
2021  */
2022 static bool initMemoryManager()
2023 {
2024     TRACEF(( "[ScalableMalloc trace] sizeof(Block) is %d (expected 128); sizeof(uintptr_t) is %d\n",
2025              sizeof(Block), sizeof(uintptr_t) ));
2026     MALLOC_ASSERT( 2*blockHeaderAlignment == sizeof(Block), ASSERT_TEXT );
2027     MALLOC_ASSERT( sizeof(FreeObject) == sizeof(void*), ASSERT_TEXT );
2028     MALLOC_ASSERT( isAligned(defaultMemPool, sizeof(intptr_t)),
2029                    "Memory pool must be void*-aligned for atomic to work over aligned arguments.");
2030 
2031 #if USE_WINTHREAD
2032     const size_t granularity = 64*1024; // granularity of VirtualAlloc
2033 #else
2034     // POSIX.1-2001-compliant way to get page size
2035     const size_t granularity = sysconf(_SC_PAGESIZE);
2036 #endif
2037     if (!defaultMemPool) {
2038         // Do not rely on static constructors; do the assignment explicitly in case
2039         // the library's static data section has not been initialized yet at this call.
2040         defaultMemPool = (MemoryPool*)defaultMemPool_space;
2041     }
2042     bool initOk = defaultMemPool->
2043         extMemPool.init(0, nullptr, nullptr, granularity,
2044                         /*keepAllMemory=*/false, /*fixedPool=*/false);
2045 // TODO: extMemPool.init() to not allocate memory
2046     if (!initOk || !initBackRefMain(&defaultMemPool->extMemPool.backend) || !ThreadId::init())
2047         return false;
2048     MemoryPool::initDefaultPool();
2049     // init() is required iff initMemoryManager() is called
2050     // after mallocProcessShutdownNotification()
2051     shutdownSync.init();
2052 #if COLLECT_STATISTICS
2053     initStatisticsCollection();
2054 #endif
2055     return true;
2056 }
2057 
2058 static bool GetBoolEnvironmentVariable(const char* name) {
2059     return tbb::detail::r1::GetBoolEnvironmentVariable(name);
2060 }
2061 
2062 //! Ensures that initMemoryManager() is called once and only once.
2063 /** Does not return until initMemoryManager() has been completed by a thread.
2064     There is no need to call this routine if mallocInitialized==2. */
2065 static bool doInitialization()
2066 {
2067     MallocMutex::scoped_lock lock( initMutex );
2068     if (mallocInitialized.load(std::memory_order_relaxed)!=2) {
2069         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==0, ASSERT_TEXT );
2070         mallocInitialized.store(1, std::memory_order_relaxed);
2071         RecursiveMallocCallProtector scoped;
2072         if (!initMemoryManager()) {
2073             mallocInitialized.store(0, std::memory_order_relaxed); // restore and out
2074             return false;
2075         }
2076 #ifdef  MALLOC_EXTRA_INITIALIZATION
2077         MALLOC_EXTRA_INITIALIZATION;
2078 #endif
2079 #if MALLOC_CHECK_RECURSION
2080         RecursiveMallocCallProtector::detectNaiveOverload();
2081 #endif
2082         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==1, ASSERT_TEXT );
2083         // Store must have release fence, otherwise mallocInitialized==2
2084         // might become remotely visible before side effects of
2085         // initMemoryManager() become remotely visible.
2086         mallocInitialized.store(2, std::memory_order_release);
2087         if( GetBoolEnvironmentVariable("TBB_VERSION") ) {
2088             fputs(VersionString+1,stderr);
2089             hugePages.printStatus();
2090         }
2091     }
2092     /* It can't be 0, or we would have initialized it */
2093     MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==2, ASSERT_TEXT );
2094     return true;
2095 }
2096 
2097 /********* End library initialization *************/
2098 
2099 /********* The malloc show begins     *************/
2100 
2101 
2102 FreeObject *Block::allocateFromFreeList()
2103 {
2104     FreeObject *result;
2105 
2106     if (!freeList) return nullptr;
2107 
2108     result = freeList;
2109     MALLOC_ASSERT( result, ASSERT_TEXT );
2110 
2111     freeList = result->next;
2112     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2113     allocatedCount++;
2114     STAT_increment(getThreadId(), getIndex(objectSize), allocFreeListUsed);
2115 
2116     return result;
2117 }
2118 
2119 FreeObject *Block::allocateFromBumpPtr()
2120 {
2121     FreeObject *result = bumpPtr;
2122     if (result) {
2123         bumpPtr = (FreeObject *) ((uintptr_t) bumpPtr - objectSize);
2124         if ( (uintptr_t)bumpPtr < (uintptr_t)this+sizeof(Block) ) {
2125             bumpPtr = nullptr;
2126         }
2127         MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2128         allocatedCount++;
2129         STAT_increment(getThreadId(), getIndex(objectSize), allocBumpPtrUsed);
2130     }
2131     return result;
2132 }
2133 
2134 inline FreeObject* Block::allocate()
2135 {
2136     MALLOC_ASSERT( isOwnedByCurrentThread(), ASSERT_TEXT );
2137 
2138     /* for better cache locality, look in the free list first. */
2139     if ( FreeObject *result = allocateFromFreeList() ) {
2140         return result;
2141     }
2142     MALLOC_ASSERT( !freeList, ASSERT_TEXT );
2143 
2144     /* if the free list is empty, try thread-local bump-pointer allocation. */
2145     if ( FreeObject *result = allocateFromBumpPtr() ) {
2146         return result;
2147     }
2148     MALLOC_ASSERT( !bumpPtr, ASSERT_TEXT );
2149 
2150     /* the block is considered full. */
2151     isFull = true;
2152     return nullptr;
2153 }
2154 
2155 size_t Block::findObjectSize(void *object) const
2156 {
2157     size_t blSize = getSize();
2158 #if MALLOC_CHECK_RECURSION
2159     // Currently, there are no aligned allocations from startup blocks,
2160     // so we can return just StartupBlock::msize().
2161     // TODO: This must be extended if we add aligned allocation from startup blocks.
2162     if (!blSize)
2163         return StartupBlock::msize(object);
2164 #endif
2165     // the object can be aligned, so its real size can be less than the block's object size
2166     size_t size =
2167         blSize - ((uintptr_t)object - (uintptr_t)findObjectToFree(object));
2168     MALLOC_ASSERT(size>0 && size<minLargeObjectSize, ASSERT_TEXT);
2169     return size;
2170 }
2171 
2172 void Bin::moveBlockToFront(Block *block)
2173 {
2174     /* move the block to the front of the bin */
2175     if (block == activeBlk) return;
2176     outofTLSBin(block);
2177     pushTLSBin(block);
2178 }
2179 
2180 void Bin::processEmptyBlock(Block *block, bool poolTheBlock)
2181 {
2182     if (block != activeBlk) {
2183         /* We are not using this block; return it to the pool */
2184         outofTLSBin(block);
2185         block->getMemPool()->returnEmptyBlock(block, poolTheBlock);
2186     } else {
2187         /* all objects are free - let's restore the bump pointer */
2188         block->restoreBumpPtr();
2189     }
2190 }
2191 
2192 template<int LOW_MARK, int HIGH_MARK>
2193 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool)
2194 {
2195     const size_t size = object->unalignedSize;
2196     // do not spoil the cache with too large an object, as that can cause a total cache cleanup
2197     if (size > MAX_TOTAL_SIZE)
2198         return false;
2199     LargeMemoryBlock *localHead = head.exchange(nullptr);
2200 
2201     object->prev = nullptr;
2202     object->next = localHead;
2203     if (localHead)
2204         localHead->prev = object;
2205     else {
2206         // these might not have been cleaned when the local cache was stolen; correct them
2207         totalSize = 0;
2208         numOfBlocks = 0;
2209         tail = object;
2210     }
2211     localHead = object;
2212     totalSize += size;
2213     numOfBlocks++;
2214     // must meet both the size and the number-of-cached-objects constraints
2215     if (totalSize > MAX_TOTAL_SIZE || numOfBlocks >= HIGH_MARK) {
2216         // scan from the tail until the constraints are met
2217         while (totalSize > MAX_TOTAL_SIZE || numOfBlocks > LOW_MARK) {
2218             totalSize -= tail->unalignedSize;
2219             numOfBlocks--;
2220             tail = tail->prev;
2221         }
2222         LargeMemoryBlock *headToRelease = tail->next;
2223         tail->next = nullptr;
2224 
2225         extMemPool->freeLargeObjectList(headToRelease);
2226     }
2227 
2228     head.store(localHead, std::memory_order_release);
2229     return true;
2230 }
2231 
2232 template<int LOW_MARK, int HIGH_MARK>
2233 LargeMemoryBlock *LocalLOCImpl<LOW_MARK, HIGH_MARK>::get(size_t size)
2234 {
2235     LargeMemoryBlock *localHead, *res = nullptr;
2236 
2237     if (size > MAX_TOTAL_SIZE)
2238         return nullptr;
2239 
2240     // TBB_REVAMP_TODO: review this line
2241     if (!head.load(std::memory_order_acquire) || (localHead = head.exchange(nullptr)) == nullptr) {
2242         // do not restore totalSize, numOfBlocks and tail at this point,
2243         // as they are used only in put(), where they must be restored
2244         return nullptr;
2245     }
2246 
2247     for (LargeMemoryBlock *curr = localHead; curr; curr=curr->next) {
2248         if (curr->unalignedSize == size) {
2249             res = curr;
2250             if (curr->next)
2251                 curr->next->prev = curr->prev;
2252             else
2253                 tail = curr->prev;
2254             if (curr != localHead)
2255                 curr->prev->next = curr->next;
2256             else
2257                 localHead = curr->next;
2258             totalSize -= size;
2259             numOfBlocks--;
2260             break;
2261         }
2262     }
2263 
2264     head.store(localHead, std::memory_order_release);
2265     return res;
2266 }
2267 
2268 template<int LOW_MARK, int HIGH_MARK>
2269 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::externalCleanup(ExtMemoryPool *extMemPool)
2270 {
2271     if (LargeMemoryBlock *localHead = head.exchange(nullptr)) {
2272         extMemPool->freeLargeObjectList(localHead);
2273         return true;
2274     }
2275     return false;
2276 }
2277 
2278 void *MemoryPool::getFromLLOCache(TLSData* tls, size_t size, size_t alignment)
2279 {
2280     LargeMemoryBlock *lmb = nullptr;
2281 
2282     size_t headersSize = sizeof(LargeMemoryBlock)+sizeof(LargeObjectHdr);
2283     size_t allocationSize = LargeObjectCache::alignToBin(size+headersSize+alignment);
2284     if (allocationSize < size) // allocationSize is wrapped around after alignToBin
2285         return nullptr;
2286     MALLOC_ASSERT(allocationSize >= alignment, "Overflow must be checked before.");
2287 
2288     if (tls) {
2289         tls->markUsed();
2290         lmb = tls->lloc.get(allocationSize);
2291     }
2292     if (!lmb)
2293         lmb = extMemPool.mallocLargeObject(this, allocationSize);
2294 
2295     if (lmb) {
2296         // when shuffling, we assume that the alignment offset guarantees
2297         // that different cache lines are used
2298         MALLOC_ASSERT(alignment >= estimatedCacheLineSize, ASSERT_TEXT);
2299 
2300         void *alignedArea = (void*)alignUp((uintptr_t)lmb+headersSize, alignment);
2301         uintptr_t alignedRight =
2302             alignDown((uintptr_t)lmb+lmb->unalignedSize - size, alignment);
2303         // Is there room to shuffle the object between cache lines?
2304         // Note that alignedRight and alignedArea are aligned at alignment.
2305         unsigned ptrDelta = alignedRight - (uintptr_t)alignedArea;
2306         if (ptrDelta && tls) { // !tls is cold path
2307             // for the hot path of alignment==estimatedCacheLineSize,
2308             // allow compilers to use shift for division
2309             // (since estimatedCacheLineSize is a power-of-2 constant)
2310             unsigned numOfPossibleOffsets = alignment == estimatedCacheLineSize?
2311                   ptrDelta / estimatedCacheLineSize :
2312                   ptrDelta / alignment;
2313             unsigned myCacheIdx = ++tls->currCacheIdx;
2314             unsigned offset = myCacheIdx % numOfPossibleOffsets;
2315 
2316             // Move the object to a cache line with an offset that differs from the
2317             // previous allocation's. This supposedly allows us to use cache
2318             // associativity more efficiently.
2319             alignedArea = (void*)((uintptr_t)alignedArea + offset*alignment);
2320         }
2321         MALLOC_ASSERT((uintptr_t)lmb+lmb->unalignedSize >=
2322                       (uintptr_t)alignedArea+size, "Object doesn't fit the block.");
2323         LargeObjectHdr *header = (LargeObjectHdr*)alignedArea-1;
2324         header->memoryBlock = lmb;
2325         header->backRefIdx = lmb->backRefIdx;
2326         setBackRef(header->backRefIdx, header);
2327 
2328         lmb->objectSize = size;
2329 
2330         MALLOC_ASSERT( isLargeObject<unknownMem>(alignedArea), ASSERT_TEXT );
2331         MALLOC_ASSERT( isAligned(alignedArea, alignment), ASSERT_TEXT );
2332 
2333         return alignedArea;
2334     }
2335     return nullptr;
2336 }
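/*
 * Worked example of the shuffle above (all numbers are illustrative; in
 * particular the header size is not the real one): with lmb at 0x10000,
 * unalignedSize 0x2000, requested size 0x1000, headersSize 0x60 and
 * alignment 64, alignedArea starts at alignUp(0x10060, 64) = 0x10080 and
 * alignedRight = alignDown(0x11000, 64) = 0x11000, so ptrDelta = 0xF80 and
 * numOfPossibleOffsets = 62.  A thread whose currCacheIdx increments to 5
 * gets offset 5 and the object is placed at 0x10080 + 5*64 = 0x101C0,
 * still ending (at 0x111C0) within the block, which ends at 0x12000.
 */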
2337 
2338 void MemoryPool::putToLLOCache(TLSData *tls, void *object)
2339 {
2340     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2341     // overwrite backRefIdx to simplify double free detection
2342     header->backRefIdx = BackRefIdx();
2343 
2344     if (tls) {
2345         tls->markUsed();
2346         if (tls->lloc.put(header->memoryBlock, &extMemPool))
2347             return;
2348     }
2349     extMemPool.freeLargeObject(header->memoryBlock);
2350 }
2351 
2352 /*
2353  * All aligned allocations fall into one of the following categories:
2354  * 1. if both the request size and the alignment are <= maxSegregatedObjectSize,
2355  *      we just align the size up and request this amount, because for every size
2356  *      aligned to some power of 2, the allocated object is at least that aligned.
2357  * 2. for size<minLargeObjectSize, check if the already guaranteed fittingAlignment is enough.
2358  * 3. if size+alignment<minLargeObjectSize, we take an object of fittingSizeN and align
2359  *      its address up; given such a pointer, scalable_free can find the real object.
2360  *      Wrapping of size+alignment is impossible because the maximal allowed
2361  *      alignment plus minLargeObjectSize can't lead to wrapping.
2362  * 4. otherwise, an aligned large object is allocated.
2363  */
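/*
 * Illustration of category 3 (the exact constants are defined elsewhere and
 * are only assumed here to be in the ranges the comment above implies):
 * a request for 200 bytes aligned to 256 is too strict for categories 1-2,
 * but 200+256 = 456 stays below minLargeObjectSize, so the code below calls
 * internalPoolMalloc(memPool, 456) and returns the result aligned up to 256;
 * scalable_free later recovers the real object start via findObjectToFree()
 * and findAllocatedObject().
 */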
2364 static void *allocateAligned(MemoryPool *memPool, size_t size, size_t alignment)
2365 {
2366     MALLOC_ASSERT( isPowerOfTwo(alignment), ASSERT_TEXT );
2367 
2368     if (!isMallocInitialized())
2369         if (!doInitialization())
2370             return nullptr;
2371 
2372     void *result;
2373     if (size<=maxSegregatedObjectSize && alignment<=maxSegregatedObjectSize)
2374         result = internalPoolMalloc(memPool, alignUp(size? size: sizeof(size_t), alignment));
2375     else if (size<minLargeObjectSize) {
2376         if (alignment<=fittingAlignment)
2377             result = internalPoolMalloc(memPool, size);
2378         else if (size+alignment < minLargeObjectSize) {
2379             void *unaligned = internalPoolMalloc(memPool, size+alignment);
2380             if (!unaligned) return nullptr;
2381             result = alignUp(unaligned, alignment);
2382         } else
2383             goto LargeObjAlloc;
2384     } else {
2385     LargeObjAlloc:
2386         TLSData *tls = memPool->getTLS(/*create=*/true);
2387         // take into account only alignments that are higher than the natural one
2388         result =
2389             memPool->getFromLLOCache(tls, size, largeObjectAlignment>alignment?
2390                                                largeObjectAlignment: alignment);
2391     }
2392 
2393     MALLOC_ASSERT( isAligned(result, alignment), ASSERT_TEXT );
2394     return result;
2395 }
2396 
2397 static void *reallocAligned(MemoryPool *memPool, void *ptr,
2398                             size_t newSize, size_t alignment = 0)
2399 {
2400     void *result;
2401     size_t copySize;
2402 
2403     if (isLargeObject<ourMem>(ptr)) {
2404         LargeMemoryBlock* lmb = ((LargeObjectHdr *)ptr - 1)->memoryBlock;
2405         copySize = lmb->unalignedSize-((uintptr_t)ptr-(uintptr_t)lmb);
2406 
2407         // Apply different strategies if size decreases
2408         if (newSize <= copySize && (0 == alignment || isAligned(ptr, alignment))) {
2409 
2410             // For huge objects (that do not fit in the backend cache), keep the same space unless
2411             // the new size is less than half of the old one
2412             bool isMemoryBlockHuge = copySize > memPool->extMemPool.backend.getMaxBinnedSize();
2413             size_t threshold = isMemoryBlockHuge ? copySize / 2 : 0;
2414             if (newSize > threshold) {
2415                 lmb->objectSize = newSize;
2416                 return ptr;
2417             }
2418             // TODO: For large objects suitable for the backend cache,
2419             // split out the excessive part and put it to the backend.
2420         }
2421         // Reallocate for real
2422         copySize = lmb->objectSize;
2423 #if BACKEND_HAS_MREMAP
2424         if (void *r = memPool->extMemPool.remap(ptr, copySize, newSize,
2425                           alignment < largeObjectAlignment ? largeObjectAlignment : alignment))
2426             return r;
2427 #endif
2428         result = alignment ? allocateAligned(memPool, newSize, alignment) :
2429             internalPoolMalloc(memPool, newSize);
2430 
2431     } else {
2432         Block* block = (Block *)alignDown(ptr, slabSize);
2433         copySize = block->findObjectSize(ptr);
2434 
2435         // TODO: Move object to another bin if size decreases and the current bin is "empty enough".
2436         // Currently, if the size decreases, the old pointer is returned
2437         if (newSize <= copySize && (0==alignment || isAligned(ptr, alignment))) {
2438             return ptr;
2439         } else {
2440             result = alignment ? allocateAligned(memPool, newSize, alignment) :
2441                 internalPoolMalloc(memPool, newSize);
2442         }
2443     }
2444     if (result) {
2445         memcpy(result, ptr, copySize < newSize ? copySize : newSize);
2446         internalPoolFree(memPool, ptr, 0);
2447     }
2448     return result;
2449 }
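/*
 * Shrink-in-place example for the huge-object branch above (sizes are
 * illustrative; the actual limit comes from backend.getMaxBinnedSize()):
 * for a huge block where copySize is 10 MB, a realloc down to 6 MB keeps
 * the block (6 MB > 10 MB / 2) and only updates objectSize, while a realloc
 * down to 4 MB falls through to a real reallocation, a copy of 4 MB, and a
 * free of the old block.
 */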
2450 
2451 #if MALLOC_DEBUG
2452 /* A predicate checks if an object is properly placed inside its block */
2453 inline bool Block::isProperlyPlaced(const void *object) const
2454 {
2455     return 0 == ((uintptr_t)this + slabSize - (uintptr_t)object) % objectSize;
2456 }
2457 #endif
2458 
2459 /* Finds the real object inside the block */
2460 FreeObject *Block::findAllocatedObject(const void *address) const
2461 {
2462     // calculate offset from the end of the block space
2463     uint16_t offset = (uintptr_t)this + slabSize - (uintptr_t)address;
2464     MALLOC_ASSERT( offset<=slabSize-sizeof(Block), ASSERT_TEXT );
2465     // find offset difference from a multiple of allocation size
2466     offset %= objectSize;
2467     // and move the address down to where the real object starts.
2468     return (FreeObject*)((uintptr_t)address - (offset? objectSize-offset: 0));
2469 }
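/*
 * Worked example of the arithmetic above (slabSize and objectSize are taken
 * as 16384 and 80 only for illustration): for an aligned pointer 3000 bytes
 * below the slab end, offset = 3000, 3000 % 80 = 40, so the real object
 * starts 80 - 40 = 40 bytes below the given address, i.e. 3040 bytes below
 * the slab end, which is a multiple of objectSize, as isProperlyPlaced()
 * requires.
 */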
2470 
2471 /*
2472  * A bad dereference caused by a foreign pointer is possible only here, not earlier in the call chain.
2473  * A separate function isolates the SEH code, as it hurts compiler optimization.
2474  */
2475 static inline BackRefIdx safer_dereference (const BackRefIdx *ptr)
2476 {
2477     BackRefIdx id;
2478 #if _MSC_VER
2479     __try {
2480 #endif
2481         id = dereference(ptr);
2482 #if _MSC_VER
2483     } __except( GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION?
2484                 EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH ) {
2485         id = BackRefIdx();
2486     }
2487 #endif
2488     return id;
2489 }
2490 
2491 template<MemoryOrigin memOrigin>
2492 bool isLargeObject(void *object)
2493 {
2494     if (!isAligned(object, largeObjectAlignment))
2495         return false;
2496     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2497     BackRefIdx idx = (memOrigin == unknownMem) ?
2498         safer_dereference(&header->backRefIdx) : dereference(&header->backRefIdx);
2499 
2500     return idx.isLargeObject()
2501         // in a valid LargeObjectHdr, memoryBlock is not nullptr
2502         && header->memoryBlock
2503         // in a valid LargeObjectHdr, memoryBlock points somewhere before the header
2504         // TODO: a stricter check
2505         && (uintptr_t)header->memoryBlock < (uintptr_t)header
2506         && getBackRef(idx) == header;
2507 }
2508 
2509 static inline bool isSmallObject (void *ptr)
2510 {
2511     Block* expectedBlock = (Block*)alignDown(ptr, slabSize);
2512     const BackRefIdx* idx = expectedBlock->getBackRefIdx();
2513 
2514     bool isSmall = expectedBlock == getBackRef(safer_dereference(idx));
2515     if (isSmall)
2516         expectedBlock->checkFreePrecond(ptr);
2517     return isSmall;
2518 }
2519 
2520 /**** Check if an object was allocated by scalable_malloc ****/
2521 static inline bool isRecognized (void* ptr)
2522 {
2523     return defaultMemPool->extMemPool.backend.ptrCanBeValid(ptr) &&
2524         (isLargeObject<unknownMem>(ptr) || isSmallObject(ptr));
2525 }
2526 
2527 static inline void freeSmallObject(void *object)
2528 {
2529     /* mask low bits to get the block */
2530     Block *block = (Block *)alignDown(object, slabSize);
2531     block->checkFreePrecond(object);
2532 
2533 #if MALLOC_CHECK_RECURSION
2534     if (block->isStartupAllocObject()) {
2535         ((StartupBlock *)block)->free(object);
2536         return;
2537     }
2538 #endif
2539     if (block->isOwnedByCurrentThread()) {
2540         block->freeOwnObject(object);
2541     } else { /* Slower path: add to the shared list; allocatedCount is updated by the owner thread in malloc. */
2542         FreeObject *objectToFree = block->findObjectToFree(object);
2543         block->freePublicObject(objectToFree);
2544     }
2545 }
2546 
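/*
 * Small-object allocation below tries the following sources in order:
 * the bin's active block (and its predecessors), then a block whose public
 * free list can be privatized, then an orphaned block abandoned by a
 * terminated thread, then a fresh empty block from the backend; if all of
 * these fail, nullptr is returned.
 */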
2547 static void *internalPoolMalloc(MemoryPool* memPool, size_t size)
2548 {
2549     Bin* bin;
2550     Block * mallocBlock;
2551 
2552     if (!memPool) return nullptr;
2553 
2554     if (!size) size = sizeof(size_t);
2555 
2556     TLSData *tls = memPool->getTLS(/*create=*/true);
2557 
2558     /* Allocate a large object */
2559     if (size >= minLargeObjectSize)
2560         return memPool->getFromLLOCache(tls, size, largeObjectAlignment);
2561 
2562     if (!tls) return nullptr;
2563 
2564     tls->markUsed();
2565     /*
2566      * Get the element of the thread-local array that corresponds to the given size;
2567      * it keeps a pointer to the active block for allocations of this size
2568      */
2569     bin = tls->getAllocationBin(size);
2570     if ( !bin ) return nullptr;
2571 
2572     /* Get a block to try to allocate in. */
2573     for( mallocBlock = bin->getActiveBlock(); mallocBlock;
2574          mallocBlock = bin->setPreviousBlockActive() ) // the previous block should be empty enough
2575     {
2576         if( FreeObject *result = mallocBlock->allocate() )
2577             return result;
2578     }
2579 
2580     /*
2581      * else privatize publicly freed objects in some block and allocate from it
2582      */
2583     mallocBlock = bin->getPrivatizedFreeListBlock();
2584     if (mallocBlock) {
2585         MALLOC_ASSERT( mallocBlock->freeListNonNull(), ASSERT_TEXT );
2586         if ( FreeObject *result = mallocBlock->allocateFromFreeList() )
2587             return result;
2588         /* Otherwise something strange happened; retry from the beginning. */
2589         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in public free list; reentering.\n" ));
2590         return internalPoolMalloc(memPool, size);
2591     }
2592 
2593     /*
2594      * no suitable own blocks, try to get a partial block that some other thread has discarded.
2595      */
2596     mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2597     while (mallocBlock) {
2598         bin->pushTLSBin(mallocBlock);
2599         bin->setActiveBlock(mallocBlock); // TODO: move under the below condition?
2600         if( FreeObject *result = mallocBlock->allocate() )
2601             return result;
2602         mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2603     }
2604 
2605     /*
2606      * else try to get a new empty block
2607      */
2608     mallocBlock = memPool->getEmptyBlock(size);
2609     if (mallocBlock) {
2610         bin->pushTLSBin(mallocBlock);
2611         bin->setActiveBlock(mallocBlock);
2612         if( FreeObject *result = mallocBlock->allocate() )
2613             return result;
2614         /* Otherwise something strange happened; retry from the beginning. */
2615         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in empty block; reentering.\n" ));
2616         return internalPoolMalloc(memPool, size);
2617     }
2618     /*
2619      * else nothing works so return nullptr
2620      */
2621     TRACEF(( "[ScalableMalloc trace] No memory found, returning nullptr.\n" ));
2622     return nullptr;
2623 }
2624 
2625 // When size==0 (i.e. unknown), detect here whether the object is large.
2626 // When the size is known and < minLargeObjectSize, we still need to check
2627 // if the actual object is large, because large objects might be used
2628 // for aligned small allocations.
2629 static bool internalPoolFree(MemoryPool *memPool, void *object, size_t size)
2630 {
2631     if (!memPool || !object) return false;
2632 
2633     // The library is initialized by an allocation call, so a release while it is
2634     // not initialized means a foreign object is being released.
2635     MALLOC_ASSERT(isMallocInitialized(), ASSERT_TEXT);
2636     MALLOC_ASSERT(memPool->extMemPool.userPool() || isRecognized(object),
2637                   "Invalid pointer during object releasing is detected.");
2638 
2639     if (size >= minLargeObjectSize || isLargeObject<ourMem>(object))
2640         memPool->putToLLOCache(memPool->getTLS(/*create=*/false), object);
2641     else
2642         freeSmallObject(object);
2643     return true;
2644 }
2645 
2646 static void *internalMalloc(size_t size)
2647 {
2648     if (!size) size = sizeof(size_t);
2649 
2650 #if MALLOC_CHECK_RECURSION
2651     if (RecursiveMallocCallProtector::sameThreadActive())
2652         return size<minLargeObjectSize? StartupBlock::allocate(size) :
2653             // nested allocation, so skip tls
2654             (FreeObject*)defaultMemPool->getFromLLOCache(nullptr, size, slabSize);
2655 #endif
2656 
2657     if (!isMallocInitialized())
2658         if (!doInitialization())
2659             return nullptr;
2660     return internalPoolMalloc(defaultMemPool, size);
2661 }
2662 
2663 static void internalFree(void *object)
2664 {
2665     internalPoolFree(defaultMemPool, object, 0);
2666 }
2667 
2668 static size_t internalMsize(void* ptr)
2669 {
2670     MALLOC_ASSERT(ptr, "Invalid pointer passed to internalMsize");
2671     if (isLargeObject<ourMem>(ptr)) {
2672         // TODO: return the maximum memory size, that can be written to this object
2673         LargeMemoryBlock* lmb = ((LargeObjectHdr*)ptr - 1)->memoryBlock;
2674         return lmb->objectSize;
2675     } else {
2676         Block *block = (Block*)alignDown(ptr, slabSize);
2677         return block->findObjectSize(ptr);
2678     }
2679 }
2680 
2681 } // namespace internal
2682 
2683 using namespace rml::internal;
2684 
2685 // legacy entry point saved for compatibility with binaries compiled
2686 // with pre-6003 versions of TBB
2687 TBBMALLOC_EXPORT rml::MemoryPool *pool_create(intptr_t pool_id, const MemPoolPolicy *policy)
2688 {
2689     rml::MemoryPool *pool;
2690     MemPoolPolicy pol(policy->pAlloc, policy->pFree, policy->granularity);
2691 
2692     pool_create_v1(pool_id, &pol, &pool);
2693     return pool;
2694 }
2695 
2696 rml::MemPoolError pool_create_v1(intptr_t pool_id, const MemPoolPolicy *policy,
2697                                  rml::MemoryPool **pool)
2698 {
2699     if ( !policy->pAlloc || policy->version<MemPoolPolicy::TBBMALLOC_POOL_VERSION
2700          // empty pFree allowed only for fixed pools
2701          || !(policy->fixedPool || policy->pFree)) {
2702         *pool = nullptr;
2703         return INVALID_POLICY;
2704     }
2705     if ( policy->version>MemPoolPolicy::TBBMALLOC_POOL_VERSION // future versions are not supported
2706          // new flags can be added in place of reserved, but default
2707          // behaviour must be supported by this version
2708          || policy->reserved ) {
2709         *pool = nullptr;
2710         return UNSUPPORTED_POLICY;
2711     }
2712     if (!isMallocInitialized())
2713         if (!doInitialization()) {
2714             *pool = nullptr;
2715             return NO_MEMORY;
2716         }
2717     rml::internal::MemoryPool *memPool =
2718         (rml::internal::MemoryPool*)internalMalloc((sizeof(rml::internal::MemoryPool)));
2719     if (!memPool) {
2720         *pool = nullptr;
2721         return NO_MEMORY;
2722     }
2723     memset(static_cast<void*>(memPool), 0, sizeof(rml::internal::MemoryPool));
2724     if (!memPool->init(pool_id, policy)) {
2725         internalFree(memPool);
2726         *pool = nullptr;
2727         return NO_MEMORY;
2728     }
2729 
2730     *pool = (rml::MemoryPool*)memPool;
2731     return POOL_OK;
2732 }
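/*
 * A minimal client-side sketch of the pool API defined here.  The callback
 * signatures below are assumptions made for the example (check the public
 * header for the authoritative MemPoolPolicy definition); getMoreMemory and
 * releaseMemory are hypothetical user-provided functions:
 *
 *   void *getMoreMemory(intptr_t pool_id, size_t &bytes);            // provides raw memory
 *   int   releaseMemory(intptr_t pool_id, void *raw, size_t bytes);  // returns raw memory
 *
 *   rml::MemPoolPolicy policy(getMoreMemory, releaseMemory);
 *   rml::MemoryPool *pool = nullptr;
 *   if (rml::pool_create_v1(0, &policy, &pool) == rml::POOL_OK) {
 *       void *p = rml::pool_malloc(pool, 100);
 *       rml::pool_free(pool, p);
 *       rml::pool_destroy(pool);
 *   }
 */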
2733 
2734 bool pool_destroy(rml::MemoryPool* memPool)
2735 {
2736     if (!memPool) return false;
2737     bool ret = ((rml::internal::MemoryPool*)memPool)->destroy();
2738     internalFree(memPool);
2739 
2740     return ret;
2741 }
2742 
2743 bool pool_reset(rml::MemoryPool* memPool)
2744 {
2745     if (!memPool) return false;
2746 
2747     return ((rml::internal::MemoryPool*)memPool)->reset();
2748 }
2749 
2750 void *pool_malloc(rml::MemoryPool* mPool, size_t size)
2751 {
2752     return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2753 }
2754 
2755 void *pool_realloc(rml::MemoryPool* mPool, void *object, size_t size)
2756 {
2757     if (!object)
2758         return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2759     if (!size) {
2760         internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2761         return nullptr;
2762     }
2763     return reallocAligned((rml::internal::MemoryPool*)mPool, object, size, 0);
2764 }
2765 
2766 void *pool_aligned_malloc(rml::MemoryPool* mPool, size_t size, size_t alignment)
2767 {
2768     if (!isPowerOfTwo(alignment) || 0==size)
2769         return nullptr;
2770 
2771     return allocateAligned((rml::internal::MemoryPool*)mPool, size, alignment);
2772 }
2773 
2774 void *pool_aligned_realloc(rml::MemoryPool* memPool, void *ptr, size_t size, size_t alignment)
2775 {
2776     if (!isPowerOfTwo(alignment))
2777         return nullptr;
2778     rml::internal::MemoryPool *mPool = (rml::internal::MemoryPool*)memPool;
2779     void *tmp;
2780 
2781     if (!ptr)
2782         tmp = allocateAligned(mPool, size, alignment);
2783     else if (!size) {
2784         internalPoolFree(mPool, ptr, 0);
2785         return nullptr;
2786     } else
2787         tmp = reallocAligned(mPool, ptr, size, alignment);
2788 
2789     return tmp;
2790 }
2791 
2792 bool pool_free(rml::MemoryPool *mPool, void *object)
2793 {
2794     return internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2795 }
2796 
2797 rml::MemoryPool *pool_identify(void *object)
2798 {
2799     rml::internal::MemoryPool *pool;
2800     if (isLargeObject<ourMem>(object)) {
2801         LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2802         pool = header->memoryBlock->pool;
2803     } else {
2804         Block *block = (Block*)alignDown(object, slabSize);
2805         pool = block->getMemPool();
2806     }
2807     // do not return defaultMemPool, as it can't be used in pool_free() etc
2808     __TBB_ASSERT_RELEASE(pool!=defaultMemPool,
2809         "rml::pool_identify() can't be used for scalable_malloc() etc results.");
2810     return (rml::MemoryPool*)pool;
2811 }
2812 
2813 size_t pool_msize(rml::MemoryPool *mPool, void* object)
2814 {
2815     if (object) {
2816         // No assert for object recognition, because objects allocated from a non-default
2817         // memory pool do not participate in range checking and do not have valid backreferences for
2818         // small objects. Instead, check that the object belongs to the specified memory pool.
2819         MALLOC_ASSERT_EX(mPool == pool_identify(object), "Object does not belong to the specified pool");
2820         return internalMsize(object);
2821     }
2822     errno = EINVAL;
2823     // Unlike _msize, return 0 in case of parameter error.
2824     // Returning size_t(-1) would be more likely to cause trouble.
2825     return 0;
2826 }
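
/* Illustrative usage of the pool API above (a sketch, not code from this library).
   It assumes the public declarations from oneapi/tbb/scalable_allocator.h
   (rml::MemPoolPolicy, pool_create_v1, pool_malloc, pool_identify, pool_free,
   pool_destroy); rawAlloc/rawFree are hypothetical user callbacks backed by plain
   malloc/free.

       static void *rawAlloc(intptr_t, size_t &bytes) { return malloc(bytes); }
       static int   rawFree (intptr_t, void *ptr, size_t) { free(ptr); return 0; }

       rml::MemoryPool *pool = nullptr;
       rml::MemPoolPolicy policy(rawAlloc, rawFree);
       if (pool_create_v1(0, &policy, &pool) == POOL_OK) {
           void *obj = pool_malloc(pool, 100);
           bool samePool = (pool_identify(obj) == pool);  // maps the object back to its pool
           pool_free(pool, obj);
           pool_destroy(pool);
       }

   Note that pool_identify() must not be called for objects obtained from
   scalable_malloc() and friends: they belong to defaultMemPool, and the release
   assertion above fires for them. */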
2827 
2828 } // namespace rml
2829 
2830 using namespace rml::internal;
2831 
2832 #if MALLOC_TRACE
2833 static unsigned int threadGoingDownCount = 0;
2834 #endif
2835 
2836 /*
2837  * When a thread is shutting down, this routine should be called to remove all the thread ids
2838  * from the malloc blocks and replace them with a nullptr thread id.
2839  *
2840  * For pthreads, the function is set as a callback in pthread_key_create for TLS bin.
2841  * It will be automatically called at thread exit with the key value as the argument,
2842  * unless that value is nullptr.
2843  * For Windows, it is called from DllMain( DLL_THREAD_DETACH ).
2844  *
2845  * However, neither of the above is called for the main process thread, so the routine
2846  * also needs to be called during the process shutdown.
2847  *
2848 */
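
/* A minimal sketch of the pthreads mechanism mentioned above, written for a standalone
   program (hypothetical code; the allocator's own registration lives in its TLS support
   class): the destructor passed to pthread_key_create() runs at thread exit for every
   thread whose key value is not null.

       #include <pthread.h>
       #include <stdlib.h>

       static pthread_key_t key;
       static void dtor(void *value) { free(value); }   // called at thread exit if value != nullptr

       void setupOnce()                  { pthread_key_create(&key, dtor); }
       void bindCurrentThread(void *tls) { pthread_setspecific(key, tls); }
*/
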
2849 // TODO: Consider making this function part of class MemoryPool.
2850 void doThreadShutdownNotification(TLSData* tls, bool main_thread)
2851 {
2852     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return start %d\n",
2853              getThreadId(),  threadGoingDownCount++ ));
2854 
2855 #if USE_PTHREAD
2856     if (tls) {
2857         if (!shutdownSync.threadDtorStart()) return;
2858         tls->getMemPool()->onThreadShutdown(tls);
2859         shutdownSync.threadDtorDone();
2860     } else
2861 #endif
2862     {
2863         suppress_unused_warning(tls); // not used on Windows
2864         // The default pool is safe to use at this point:
2865         //   on Linux, only the main thread can go here before destroying defaultMemPool;
2866         //   on Windows, shutdown is synchronized via loader lock and isMallocInitialized().
2867         // See also __TBB_mallocProcessShutdownNotification()
2868         defaultMemPool->onThreadShutdown(defaultMemPool->getTLS(/*create=*/false));
2869         // Take lock to walk through other pools; but waiting might be dangerous at this point
2870         // (e.g. on Windows the main thread might deadlock)
2871         bool locked = false;
2872         MallocMutex::scoped_lock lock(MemoryPool::memPoolListLock, /*wait=*/!main_thread, &locked);
2873         if (locked) { // the list is safe to process
2874             for (MemoryPool *memPool = defaultMemPool->next; memPool; memPool = memPool->next)
2875                 memPool->onThreadShutdown(memPool->getTLS(/*create=*/false));
2876         }
2877     }
2878 
2879     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return end\n", getThreadId() ));
2880 }
2881 
2882 #if USE_PTHREAD
2883 void mallocThreadShutdownNotification(void* arg)
2884 {
2885     // The routine is called for each pool (as TLS dtor) on each thread, except for the main thread
2886     if (!isMallocInitialized()) return;
2887     doThreadShutdownNotification((TLSData*)arg, false);
2888 }
2889 #else
2890 extern "C" void __TBB_mallocThreadShutdownNotification()
2891 {
2892     // The routine is called once per thread on Windows
2893     if (!isMallocInitialized()) return;
2894     doThreadShutdownNotification(nullptr, false);
2895 }
2896 #endif
2897 
2898 extern "C" void __TBB_mallocProcessShutdownNotification(bool windows_process_dying)
2899 {
2900     if (!isMallocInitialized()) return;
2901 
2902     // Don't clean allocator internals if the entire process is exiting
2903     if (!windows_process_dying) {
2904         doThreadShutdownNotification(nullptr, /*main_thread=*/true);
2905     }
2906 #if  __TBB_MALLOC_LOCACHE_STAT
2907     printf("cache hit ratio %f, size hit %f\n",
2908            1.*cacheHits/mallocCalls, 1.*memHitKB/memAllocKB);
2909     defaultMemPool->extMemPool.loc.reportStat(stdout);
2910 #endif
2911 
2912     shutdownSync.processExit();
2913 #if __TBB_SOURCE_DIRECTLY_INCLUDED
2914 /* Pthread keys must be deleted as soon as possible, so that the key dtor is not called
2915    at thread termination when the tbbmalloc code may already be unloaded.
2916 */
2917     defaultMemPool->destroy();
2918     destroyBackRefMain(&defaultMemPool->extMemPool.backend);
2919     ThreadId::destroy();      // Delete key for thread id
2920     hugePages.reset();
2921     // a completely new malloc initialization is possible after this point
2922     mallocInitialized.store(0, std::memory_order_release);
2923 #endif // __TBB_SOURCE_DIRECTLY_INCLUDED
2924 
2925 #if COLLECT_STATISTICS
2926     unsigned nThreads = ThreadId::getMaxThreadId();
2927     for( int i=1; i<=nThreads && i<MAX_THREADS; ++i )
2928         STAT_print(i);
2929 #endif
2930     if (!usedBySrcIncluded) {
2931         MALLOC_ITT_FINI_ITTLIB();
2932         MALLOC_ITT_RELEASE_RESOURCES();
2933     }
2934 }
2935 
2936 extern "C" void * scalable_malloc(size_t size)
2937 {
2938     void *ptr = internalMalloc(size);
2939     if (!ptr) errno = ENOMEM;
2940     return ptr;
2941 }
2942 
2943 extern "C" void scalable_free(void *object)
2944 {
2945     internalFree(object);
2946 }
2947 
2948 #if MALLOC_ZONE_OVERLOAD_ENABLED
2949 extern "C" void __TBB_malloc_free_definite_size(void *object, size_t size)
2950 {
2951     internalPoolFree(defaultMemPool, object, size);
2952 }
2953 #endif
2954 
2955 /*
2956  * A variant that provides additional memory safety, by checking whether the given address
2957  * was obtained with this allocator and, if not, redirecting to the provided alternative call.
2958  */
2959 extern "C" TBBMALLOC_EXPORT void __TBB_malloc_safer_free(void *object, void (*original_free)(void*))
2960 {
2961     if (!object)
2962         return;
2963 
2964     // tbbmalloc could have allocated this object only if tbbmalloc has been initialized
2965     if (mallocInitialized.load(std::memory_order_acquire) && defaultMemPool->extMemPool.backend.ptrCanBeValid(object)) {
2966         if (isLargeObject<unknownMem>(object)) {
2967             // must check for a large object first, because the small-object check touches 4 pages
2968             // to the left, which can be inaccessible
2969             TLSData *tls = defaultMemPool->getTLS(/*create=*/false);
2970 
2971             defaultMemPool->putToLLOCache(tls, object);
2972             return;
2973         } else if (isSmallObject(object)) {
2974             freeSmallObject(object);
2975             return;
2976         }
2977     }
2978     if (original_free)
2979         original_free(object);
2980 }
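
/* A sketch of how a replacement layer might use the routine above (hypothetical code;
   the real interception lives in the proxy library). Pointers that tbbmalloc does not
   recognize are forwarded to the original libc free(), looked up here via dlsym().

       #include <dlfcn.h>   // dlsym, RTLD_NEXT (needs _GNU_SOURCE on glibc)

       static void (*orig_free)(void*) =
           (void (*)(void*))dlsym(RTLD_NEXT, "free");

       extern "C" void free(void *ptr) {
           __TBB_malloc_safer_free(ptr, orig_free);
       }
*/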
2981 
2982 /********* End the free code        *************/
2983 
2984 /********* Code for scalable_realloc       ***********/
2985 
2986 /*
2987  * From K&R
2988  * "realloc changes the size of the object pointed to by p to size. The contents will
2989  * be unchanged up to the minimum of the old and the new sizes. If the new size is larger,
2990  * the new space is uninitialized. realloc returns a pointer to the new space, or
2991  * nullptr if the request cannot be satisfied, in which case *p is unchanged."
2992  *
2993  */
2994 extern "C" void* scalable_realloc(void* ptr, size_t size)
2995 {
2996     void *tmp;
2997 
2998     if (!ptr)
2999         tmp = internalMalloc(size);
3000     else if (!size) {
3001         internalFree(ptr);
3002         return nullptr;
3003     } else
3004         tmp = reallocAligned(defaultMemPool, ptr, size, 0);
3005 
3006     if (!tmp) errno = ENOMEM;
3007     return tmp;
3008 }
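
/* Example of the semantics quoted above (a usage sketch, not a test from this repository):

       int *a = (int*)scalable_malloc(4 * sizeof(int));
       int *b = (int*)scalable_realloc(a, 1000 * sizeof(int));  // first 4 ints preserved
       if (!b) {
           // request not satisfied: errno == ENOMEM and 'a' is still valid
           scalable_free(a);
       } else {
           scalable_free(b);
       }
       void *c = scalable_realloc(nullptr, 64);   // behaves as scalable_malloc(64)
       scalable_free(c);
*/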
3009 
3010 /*
3011  * A variant that provides additional memory safety, by checking whether the given address
3012  * was obtained with this allocator and, if not, redirecting to the provided alternative call.
3013  */
3014 extern "C" TBBMALLOC_EXPORT void* __TBB_malloc_safer_realloc(void* ptr, size_t sz, void* original_realloc)
3015 {
3016     void *tmp = nullptr; // initialized to silence spurious "may be used uninitialized" warnings
3017 
3018     if (!ptr) {
3019         tmp = internalMalloc(sz);
3020     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3021         if (!sz) {
3022             internalFree(ptr);
3023             return nullptr;
3024         } else {
3025             tmp = reallocAligned(defaultMemPool, ptr, sz, 0);
3026         }
3027     }
3028 #if USE_WINTHREAD
3029     else if (original_realloc && sz) {
3030         orig_ptrs *original_ptrs = static_cast<orig_ptrs*>(original_realloc);
3031         if ( original_ptrs->msize ){
3032             size_t oldSize = original_ptrs->msize(ptr);
3033             tmp = internalMalloc(sz);
3034             if (tmp) {
3035                 memcpy(tmp, ptr, sz<oldSize? sz : oldSize);
3036                 if ( original_ptrs->free ){
3037                     original_ptrs->free( ptr );
3038                 }
3039             }
3040         } else
3041             tmp = nullptr;
3042     }
3043 #else
3044     else if (original_realloc) {
3045         typedef void* (*realloc_ptr_t)(void*,size_t);
3046         realloc_ptr_t original_realloc_ptr;
3047         (void *&)original_realloc_ptr = original_realloc;
3048         tmp = original_realloc_ptr(ptr,sz);
3049     }
3050 #endif
3051     else tmp = nullptr;
3052 
3053     if (!tmp) errno = ENOMEM;
3054     return tmp;
3055 }
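
/* On Unix the original_realloc argument is a plain function pointer; a proxy might
   forward unrecognized pointers like this (a hypothetical sketch, mirroring the free()
   example above). On Windows the same argument instead carries an orig_ptrs structure
   with the original msize/free entry points, as handled in the USE_WINTHREAD branch.

       #include <dlfcn.h>

       static void *(*orig_realloc)(void*, size_t) =
           (void *(*)(void*, size_t))dlsym(RTLD_NEXT, "realloc");

       extern "C" void *realloc(void *ptr, size_t size) {
           return __TBB_malloc_safer_realloc(ptr, size, (void*)orig_realloc);
       }
*/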
3056 
3057 /********* End code for scalable_realloc   ***********/
3058 
3059 /********* Code for scalable_calloc   ***********/
3060 
3061 /*
3062  * From K&R
3063  * calloc returns a pointer to space for an array of nobj objects,
3064  * each of size size, or nullptr if the request cannot be satisfied.
3065  * The space is initialized to zero bytes.
3066  *
3067  */
3068 
3069 extern "C" void * scalable_calloc(size_t nobj, size_t size)
3070 {
3071     // it's square root of maximal size_t value
3072     // the square root of the maximal size_t value
3073     const size_t arraySize = nobj * size;
3074 
3075     // check for overflow during multiplication:
3076     if (nobj>=mult_not_overflow || size>=mult_not_overflow) // 1) heuristic check
3077         if (nobj && arraySize / nobj != size) {             // 2) exact check
3078             errno = ENOMEM;
3079             return nullptr;
3080         }
3081     void* result = internalMalloc(arraySize);
3082     if (result)
3083         memset(result, 0, arraySize);
3084     else
3085         errno = ENOMEM;
3086     return result;
3087 }
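
/* Worked example of the overflow check above, assuming a 64-bit size_t:
   mult_not_overflow is 2^32, so when both nobj and size are below 2^32 their product is
   below 2^64 and cannot wrap, and the exact check is skipped. When either factor is at
   least 2^32, the division check decides: nobj = 2^33 with size = 6 gives
   arraySize = 3 * 2^34, and arraySize / nobj == 6 == size, so the request proceeds;
   nobj = 2^33 with size = 2^32 wraps to arraySize = 0, and arraySize / nobj != size
   reports the overflow. */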
3088 
3089 /********* End code for scalable_calloc   ***********/
3090 
3091 /********* Code for aligned allocation API **********/
3092 
3093 extern "C" int scalable_posix_memalign(void **memptr, size_t alignment, size_t size)
3094 {
3095     if ( !isPowerOfTwoAtLeast(alignment, sizeof(void*)) )
3096         return EINVAL;
3097     void *result = allocateAligned(defaultMemPool, size, alignment);
3098     if (!result)
3099         return ENOMEM;
3100     *memptr = result;
3101     return 0;
3102 }
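
/* Usage sketch mirroring POSIX posix_memalign(): the alignment must be a power of two
   that is at least sizeof(void*), and errors are reported via the return value, not errno.

       void *p = nullptr;
       int rc = scalable_posix_memalign(&p, 64, 1000);   // 1000 bytes, 64-byte aligned
       if (rc == 0)
           scalable_free(p);
       // otherwise rc is EINVAL (bad alignment) or ENOMEM (allocation failed), and p is untouched
*/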
3103 
3104 extern "C" void * scalable_aligned_malloc(size_t size, size_t alignment)
3105 {
3106     if (!isPowerOfTwo(alignment) || 0==size) {
3107         errno = EINVAL;
3108         return nullptr;
3109     }
3110     void *tmp = allocateAligned(defaultMemPool, size, alignment);
3111     if (!tmp) errno = ENOMEM;
3112     return tmp;
3113 }
3114 
3115 extern "C" void * scalable_aligned_realloc(void *ptr, size_t size, size_t alignment)
3116 {
3117     if (!isPowerOfTwo(alignment)) {
3118         errno = EINVAL;
3119         return nullptr;
3120     }
3121     void *tmp;
3122 
3123     if (!ptr)
3124         tmp = allocateAligned(defaultMemPool, size, alignment);
3125     else if (!size) {
3126         scalable_free(ptr);
3127         return nullptr;
3128     } else
3129         tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3130 
3131     if (!tmp) errno = ENOMEM;
3132     return tmp;
3133 }
3134 
3135 extern "C" TBBMALLOC_EXPORT void * __TBB_malloc_safer_aligned_realloc(void *ptr, size_t size, size_t alignment, void* orig_function)
3136 {
3137     /* corner cases left out of reallocAligned to not deal with errno there */
3138     if (!isPowerOfTwo(alignment)) {
3139         errno = EINVAL;
3140         return nullptr;
3141     }
3142     void *tmp = nullptr;
3143 
3144     if (!ptr) {
3145         tmp = allocateAligned(defaultMemPool, size, alignment);
3146     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3147         if (!size) {
3148             internalFree(ptr);
3149             return nullptr;
3150         } else {
3151             tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3152         }
3153     }
3154 #if USE_WINTHREAD
3155     else {
3156         orig_aligned_ptrs *original_ptrs = static_cast<orig_aligned_ptrs*>(orig_function);
3157         if (size) {
3158             // Without the original aligned_msize, the old size is unknown,
3159             // so just keep the old pointer.
3160             if ( original_ptrs->aligned_msize ){
3161                 // pass minimal alignment and zero offset to get a possibly correct oldSize
3162                 size_t oldSize = original_ptrs->aligned_msize(ptr, sizeof(void*), 0);
3163                 tmp = allocateAligned(defaultMemPool, size, alignment);
3164                 if (tmp) {
3165                     memcpy(tmp, ptr, size<oldSize? size : oldSize);
3166                     if ( original_ptrs->aligned_free ){
3167                         original_ptrs->aligned_free( ptr );
3168                     }
3169                 }
3170             }
3171         } else {
3172             if ( original_ptrs->aligned_free ){
3173                 original_ptrs->aligned_free( ptr );
3174             }
3175             return nullptr;
3176         }
3177     }
3178 #else
3179     // As the original realloc cannot align the result, and there is no way to find
3180     // the size of the object being reallocated, we give up.
3181     suppress_unused_warning(orig_function);
3182 #endif
3183     if (!tmp) errno = ENOMEM;
3184     return tmp;
3185 }
3186 
3187 extern "C" void scalable_aligned_free(void *ptr)
3188 {
3189     internalFree(ptr);
3190 }
3191 
3192 /********* end code for aligned allocation API **********/
3193 
3194 /********* Code for scalable_msize       ***********/
3195 
3196 /*
3197  * Returns the size of a memory block allocated in the heap.
3198  */
3199 extern "C" size_t scalable_msize(void* ptr)
3200 {
3201     if (ptr) {
3202         MALLOC_ASSERT(isRecognized(ptr), "Invalid pointer in scalable_msize detected.");
3203         return internalMsize(ptr);
3204     }
3205     errno = EINVAL;
3206     // Unlike _msize, return 0 in case of parameter error.
3207     // Returning size_t(-1) would be more likely to cause trouble.
3208     return 0;
3209 }
3210 
3211 /*
3212  * A variant that provides additional memory safety, by checking whether the given address
3213  * was obtained with this allocator and, if not, redirecting to the provided alternative call.
3214  */
3215 extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_msize(void *object, size_t (*original_msize)(void*))
3216 {
3217     if (object) {
3218         // Check if the memory was allocated by scalable_malloc
3219         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3220             return internalMsize(object);
3221         else if (original_msize)
3222             return original_msize(object);
3223     }
3224     // object is nullptr or unknown, or foreign and no original_msize
3225 #if USE_WINTHREAD
3226     errno = EINVAL; // errno expected to be set only on this platform
3227 #endif
3228     return 0;
3229 }
3230 
3231 /*
3232  * The same as above but for _aligned_msize case
3233  */
3234 extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_aligned_msize(void *object, size_t alignment, size_t offset, size_t (*orig_aligned_msize)(void*,size_t,size_t))
3235 {
3236     if (object) {
3237         // Check if the memory was allocated by scalable_malloc
3238         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3239             return internalMsize(object);
3240         else if (orig_aligned_msize)
3241             return orig_aligned_msize(object,alignment,offset);
3242     }
3243     // object is nullptr or unknown
3244     errno = EINVAL;
3245     return 0;
3246 }
3247 
3248 /********* End code for scalable_msize   ***********/
3249 
3250 extern "C" int scalable_allocation_mode(int param, intptr_t value)
3251 {
3252     if (param == TBBMALLOC_SET_SOFT_HEAP_LIMIT) {
3253         defaultMemPool->extMemPool.backend.setRecommendedMaxSize((size_t)value);
3254         return TBBMALLOC_OK;
3255     } else if (param == USE_HUGE_PAGES) {
3256 #if __unix__
3257         switch (value) {
3258         case 0:
3259         case 1:
3260             hugePages.setMode(value);
3261             return TBBMALLOC_OK;
3262         default:
3263             return TBBMALLOC_INVALID_PARAM;
3264         }
3265 #else
3266         return TBBMALLOC_NO_EFFECT;
3267 #endif
3268 #if __TBB_SOURCE_DIRECTLY_INCLUDED
3269     } else if (param == TBBMALLOC_INTERNAL_SOURCE_INCLUDED) {
3270         switch (value) {
3271         case 0: // used by dynamic library
3272         case 1: // used by static library or directly included sources
3273             usedBySrcIncluded = value;
3274             return TBBMALLOC_OK;
3275         default:
3276             return TBBMALLOC_INVALID_PARAM;
3277         }
3278 #endif
3279     } else if (param == TBBMALLOC_SET_HUGE_SIZE_THRESHOLD) {
3280         defaultMemPool->extMemPool.loc.setHugeSizeThreshold((size_t)value);
3281         return TBBMALLOC_OK;
3282     }
3283     return TBBMALLOC_INVALID_PARAM;
3284 }
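
/* Illustrative calls for the parameters handled above (the values are only examples):

       // soft limit on memory taken from the OS; exceeding it triggers cleanup of internal buffers
       scalable_allocation_mode(TBBMALLOC_SET_SOFT_HEAP_LIMIT, 100*1024*1024);

       // explicitly enable (1) or disable (0) huge pages; TBBMALLOC_NO_EFFECT where unsupported
       scalable_allocation_mode(USE_HUGE_PAGES, 1);

       // lower bound for the "huge" object threshold used by the large-object cache
       scalable_allocation_mode(TBBMALLOC_SET_HUGE_SIZE_THRESHOLD, 1024*1024);
*/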
3285 
3286 extern "C" int scalable_allocation_command(int cmd, void *param)
3287 {
3288     if (param)
3289         return TBBMALLOC_INVALID_PARAM;
3290 
3291     bool released = false;
3292     switch(cmd) {
3293     case TBBMALLOC_CLEAN_THREAD_BUFFERS:
3294         if (TLSData *tls = defaultMemPool->getTLS(/*create=*/false))
3295             released = tls->externalCleanup(/*cleanOnlyUnused*/false, /*cleanBins=*/true);
3296         break;
3297     case TBBMALLOC_CLEAN_ALL_BUFFERS:
3298         released = defaultMemPool->extMemPool.hardCachesCleanup(true);
3299         break;
3300     default:
3301         return TBBMALLOC_INVALID_PARAM;
3302     }
3303     return released ? TBBMALLOC_OK : TBBMALLOC_NO_EFFECT;
3304 }
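
/* Example: release cached memory (param must be nullptr, as checked above):

       int rc = scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, nullptr);
       // rc is TBBMALLOC_OK if something was released, TBBMALLOC_NO_EFFECT otherwise;
       // TBBMALLOC_CLEAN_THREAD_BUFFERS cleans only the calling thread's caches.
*/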
3305