xref: /oneTBB/src/tbbmalloc/frontend.cpp (revision 2110128e)
1 /*
2     Copyright (c) 2005-2023 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include "tbbmalloc_internal.h"
18 #include <errno.h>
19 #include <new>        /* for placement new */
20 #include <string.h>   /* for memset */
21 
22 #include "oneapi/tbb/version.h"
23 #include "../tbb/environment.h"
24 #include "../tbb/itt_notify.h" // for __TBB_load_ittnotify()
25 
26 #if USE_PTHREAD
27     #define TlsSetValue_func pthread_setspecific
28     #define TlsGetValue_func pthread_getspecific
29     #define GetMyTID() pthread_self()
30     #include <sched.h>
31     extern "C" { static void mallocThreadShutdownNotification(void*); }
32     #if __sun || __SUNPRO_CC
33     #define __asm__ asm
34     #endif
35     #include <unistd.h> // sysconf(_SC_PAGESIZE)
36 #elif USE_WINTHREAD
37     #define GetMyTID() GetCurrentThreadId()
38 #if __TBB_WIN8UI_SUPPORT
39     #include<thread>
40     #define TlsSetValue_func FlsSetValue
41     #define TlsGetValue_func FlsGetValue
42     #define TlsAlloc() FlsAlloc(nullptr)
43     #define TLS_ALLOC_FAILURE FLS_OUT_OF_INDEXES
44     #define TlsFree FlsFree
45 #else
46     #define TlsSetValue_func TlsSetValue
47     #define TlsGetValue_func TlsGetValue
48     #define TLS_ALLOC_FAILURE TLS_OUT_OF_INDEXES
49 #endif
50 #else
51     #error Must define USE_PTHREAD or USE_WINTHREAD
52 #endif
53 
54 #define FREELIST_NONBLOCKING 1
55 
56 namespace rml {
57 class MemoryPool;
58 namespace internal {
59 
60 class Block;
61 class MemoryPool;
62 
63 #if MALLOC_CHECK_RECURSION
64 
65 inline bool isMallocInitialized();
66 
67 #endif // MALLOC_CHECK_RECURSION
68 
69 /** Support for handling the special UNUSABLE pointer state **/
70 const intptr_t UNUSABLE = 0x1;
71 inline bool isSolidPtr( void* ptr ) {
72     return (UNUSABLE|(intptr_t)ptr)!=UNUSABLE;
73 }
74 inline bool isNotForUse( void* ptr ) {
75     return (intptr_t)ptr==UNUSABLE;
76 }
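/* For illustration only (a sketch, not code used by the allocator): the three possible
 * states of such a pointer and what the helpers above report for each, where someObject
 * stands for any real FreeObject address:
 *     isSolidPtr(nullptr)           == false   // empty list
 *     isSolidPtr((void*)UNUSABLE)   == false   // marked unusable
 *     isSolidPtr(someObject)        == true    // points to a real object
 *     isNotForUse(nullptr)          == false
 *     isNotForUse((void*)UNUSABLE)  == true
 */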
77 
78 /*
79  * Block::objectSize value used to mark blocks allocated by startupAlloc
80  */
81 const uint16_t startupAllocObjSizeMark = ~(uint16_t)0;
82 
83 /*
84  * The following constant is used to define the size of struct Block, the block header.
85  * The intent is to have the size of a Block multiple of the cache line size, this allows us to
86  * get good alignment at the cost of some overhead equal to the amount of padding included in the Block.
87  */
88 const int blockHeaderAlignment = estimatedCacheLineSize;
89 
90 /********* The data structures and global objects        **************/
91 
92 /*
93  * The malloc routines themselves need to be able to occasionally malloc some space,
94  * in order to set up the thread-local structures. This
95  * class performs that function.
96  */
97 class BootStrapBlocks {
98     MallocMutex bootStrapLock;
99     Block      *bootStrapBlock;
100     Block      *bootStrapBlockUsed;
101     FreeObject *bootStrapObjectList;
102 public:
103     void *allocate(MemoryPool *memPool, size_t size);
104     void free(void* ptr);
105     void reset();
106 };
107 
108 #if USE_INTERNAL_TID
109 class ThreadId {
110     static tls_key_t Tid_key;
111     static std::atomic<intptr_t> ThreadCount;
112 
113     unsigned int id;
114 
115     static unsigned int tlsNumber() {
116         unsigned int result = reinterpret_cast<intptr_t>(TlsGetValue_func(Tid_key));
117         if( !result ) {
118             RecursiveMallocCallProtector scoped;
119             // Thread-local value is zero -> first call from this thread,
120             // need to initialize with next ID value (IDs start from 1)
121             result = ++ThreadCount; // returned new value!
122             TlsSetValue_func( Tid_key, reinterpret_cast<void*>(result) );
123         }
124         return result;
125     }
126 public:
127     static bool init() {
128 #if USE_WINTHREAD
129         Tid_key = TlsAlloc();
130         if (Tid_key == TLS_ALLOC_FAILURE)
131             return false;
132 #else
133         int status = pthread_key_create( &Tid_key, nullptr );
134         if ( status ) {
135             fprintf (stderr, "The memory manager cannot create tls key during initialization\n");
136             return false;
137         }
138 #endif /* USE_WINTHREAD */
139         return true;
140     }
141 #if __TBB_SOURCE_DIRECTLY_INCLUDED
142     static void destroy() {
143         if( Tid_key ) {
144 #if USE_WINTHREAD
145             BOOL status = !(TlsFree( Tid_key ));  // fail is zero
146 #else
147             int status = pthread_key_delete( Tid_key );
148 #endif /* USE_WINTHREAD */
149             if ( status )
150                 fprintf (stderr, "The memory manager cannot delete tls key\n");
151             Tid_key = 0;
152         }
153     }
154 #endif
155 
156     ThreadId() : id(ThreadId::tlsNumber()) {}
157     bool isCurrentThreadId() const { return id == ThreadId::tlsNumber(); }
158 
159 #if COLLECT_STATISTICS || MALLOC_TRACE
160     friend unsigned int getThreadId() { return ThreadId::tlsNumber(); }
161 #endif
162 #if COLLECT_STATISTICS
163     static unsigned getMaxThreadId() { return ThreadCount.load(std::memory_order_relaxed); }
164 
165     friend int STAT_increment(ThreadId tid, int bin, int ctr);
166 #endif
167 };
168 
169 tls_key_t ThreadId::Tid_key;
170 std::atomic<intptr_t> ThreadId::ThreadCount;
171 
172 #if COLLECT_STATISTICS
173 int STAT_increment(ThreadId tid, int bin, int ctr)
174 {
175     return ::STAT_increment(tid.id, bin, ctr);
176 }
177 #endif
178 
179 #else // USE_INTERNAL_TID
180 
181 class ThreadId {
182 #if USE_PTHREAD
183     std::atomic<pthread_t> tid;
184 #else
185     std::atomic<DWORD>     tid;
186 #endif
187 public:
188     ThreadId() : tid(GetMyTID()) {}
189 #if USE_PTHREAD
190     bool isCurrentThreadId() const { return pthread_equal(pthread_self(), tid.load(std::memory_order_relaxed)); }
191 #else
192     bool isCurrentThreadId() const { return GetCurrentThreadId() == tid.load(std::memory_order_relaxed); }
193 #endif
194     ThreadId& operator=(const ThreadId& other) {
195         tid.store(other.tid.load(std::memory_order_relaxed), std::memory_order_relaxed);
196         return *this;
197     }
198     static bool init() { return true; }
199 #if __TBB_SOURCE_DIRECTLY_INCLUDED
200     static void destroy() {}
201 #endif
202 };
203 
204 #endif // USE_INTERNAL_TID
205 
206 /*********** Code to provide thread ID and a thread-local void pointer **********/
207 
208 bool TLSKey::init()
209 {
210 #if USE_WINTHREAD
211     TLS_pointer_key = TlsAlloc();
212     if (TLS_pointer_key == TLS_ALLOC_FAILURE)
213         return false;
214 #else
215     int status = pthread_key_create( &TLS_pointer_key, mallocThreadShutdownNotification );
216     if ( status )
217         return false;
218 #endif /* USE_WINTHREAD */
219     return true;
220 }
221 
222 bool TLSKey::destroy()
223 {
224 #if USE_WINTHREAD
225     BOOL status1 = !(TlsFree(TLS_pointer_key)); // fail is zero
226 #else
227     int status1 = pthread_key_delete(TLS_pointer_key);
228 #endif /* USE_WINTHREAD */
229     MALLOC_ASSERT(!status1, "The memory manager cannot delete tls key.");
230     return status1==0;
231 }
232 
233 inline TLSData* TLSKey::getThreadMallocTLS() const
234 {
235     return (TLSData *)TlsGetValue_func( TLS_pointer_key );
236 }
237 
238 inline void TLSKey::setThreadMallocTLS( TLSData * newvalue ) {
239     RecursiveMallocCallProtector scoped;
240     TlsSetValue_func( TLS_pointer_key, newvalue );
241 }
242 
243 /* The 'next' field in the block header has to maintain some invariants:
244  *   it needs to be on a 16K boundary and the first field in the block.
245  *   Any value stored there needs to have the lower 14 bits set to 0
246  *   so that various asserts work. This means that if you want to smash this memory
247  *   for debugging purposes you will need to obey this invariant.
248  * The total size of the header needs to be a power of 2 to simplify
249  * the alignment requirements. For now it is a 128 byte structure.
250  * To avoid false sharing, the fields changed only locally are separated
251  * from the fields changed by foreign threads.
252  * Changing the size of the block header would require changing
253  * some bin allocation sizes, in particular "fitting" sizes (see above).
254  */
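/* For illustration only (a sketch, not code from the original sources): because a slab
 * block sits on a 16K (slabSize) boundary, the slab owning a small object can be
 * recovered by masking the low 14 bits of the object's address, e.g.
 *     Block *slab = (Block*)((uintptr_t)object & ~(uintptr_t)(slabSize - 1));
 */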
255 class Bin;
256 class StartupBlock;
257 
258 class MemoryPool {
259     // if no explicit grainsize, expect to see malloc in user's pAlloc
260     // and set a reasonably low granularity
261     static const size_t defaultGranularity = estimatedCacheLineSize;
262 
263     MemoryPool() = delete;                  // deny
264 public:
265     static MallocMutex  memPoolListLock;
266 
267     // list of all active pools is used to release
268     // all TLS data on thread termination or library unload
269     MemoryPool    *next,
270                   *prev;
271     ExtMemoryPool  extMemPool;
272     BootStrapBlocks bootStrapBlocks;
273 
274     static void initDefaultPool();
275 
276     bool init(intptr_t poolId, const MemPoolPolicy* memPoolPolicy);
277     bool reset();
278     bool destroy();
279     void onThreadShutdown(TLSData *tlsData);
280 
281     inline TLSData *getTLS(bool create);
282     void clearTLS() { extMemPool.tlsPointerKey.setThreadMallocTLS(nullptr); }
283 
284     Block *getEmptyBlock(size_t size);
285     void returnEmptyBlock(Block *block, bool poolTheBlock);
286 
287     // get/put large object to/from local large object cache
288     void *getFromLLOCache(TLSData *tls, size_t size, size_t alignment);
289     void putToLLOCache(TLSData *tls, void *object);
290 };
291 
292 static intptr_t defaultMemPool_space[sizeof(MemoryPool)/sizeof(intptr_t) +
293                                      (sizeof(MemoryPool)%sizeof(intptr_t)? 1 : 0)];
294 static MemoryPool *defaultMemPool = (MemoryPool*)defaultMemPool_space;
295 const size_t MemoryPool::defaultGranularity;
296 // zero-initialized
297 MallocMutex  MemoryPool::memPoolListLock;
298 // TODO: move huge page status to default pool, because that's its state
299 HugePagesStatus hugePages;
300 static bool usedBySrcIncluded = false;
301 
302 // Padding helpers
303 template<size_t padd>
304 struct PaddingImpl {
305     size_t       __padding[padd];
306 };
307 
308 template<>
309 struct PaddingImpl<0> {};
310 
311 template<int N>
312 struct Padding : PaddingImpl<N/sizeof(size_t)> {};
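// Note (illustrative): Padding<N> contributes N bytes rounded down to whole size_t slots
// when N > 0 and is an empty base class when it rounds to 0, so the
// Padding<blockHeaderAlignment - sizeof(GlobalBlockFields)> base used below pads the
// shared fields out to the cache-line-sized header alignment at no cost when none is needed.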
313 
314 // Slab block is 16KB-aligned. To prevent false sharing, locally-accessed fields are
315 // separated from fields commonly accessed by non-owner threads.
316 class GlobalBlockFields : public BlockI {
317 protected:
318     std::atomic<FreeObject*> publicFreeList;
319     std::atomic<Block*> nextPrivatizable;
320     MemoryPool  *poolPtr;
321 };
322 
323 class LocalBlockFields : public GlobalBlockFields, Padding<blockHeaderAlignment - sizeof(GlobalBlockFields)>  {
324 protected:
325     Block       *next;
326     Block       *previous;        /* Use doubly linked list to speed up removal */
327     FreeObject  *bumpPtr;         /* Bump pointer moves from the end to the beginning of a block */
328     FreeObject  *freeList;
329     /* Pointer to local data for the owner thread. Used for quickly finding the TLS
330        when releasing an object from a block that the current thread owns.
331        nullptr for orphaned blocks. */
332     std::atomic<TLSData*> tlsPtr;
333     ThreadId     ownerTid;        /* the ID of the thread that owns or last owned the block */
334     BackRefIdx   backRefIdx;
335     uint16_t     allocatedCount;  /* Number of objects allocated (obviously by the owning thread) */
336     uint16_t     objectSize;
337     bool         isFull;
338 
339     friend class FreeBlockPool;
340     friend class StartupBlock;
341     friend class LifoList;
342     friend void *BootStrapBlocks::allocate(MemoryPool *, size_t);
343     friend bool OrphanedBlocks::cleanup(Backend*);
344     friend Block *MemoryPool::getEmptyBlock(size_t);
345 };
346 
347 // Use inheritance to guarantee that user data starts on the next cache line.
348 // A member cannot be used for this because, when LocalBlockFields already ends on a
349 // cache line boundary, there must be no additional memory consumption with any compiler.
350 class Block : public LocalBlockFields,
351               Padding<2*blockHeaderAlignment - sizeof(LocalBlockFields)> {
352 public:
353     bool empty() const {
354         if (allocatedCount > 0) return false;
355         MALLOC_ASSERT(!isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT);
356         return true;
357     }
358     inline FreeObject* allocate();
359     inline FreeObject *allocateFromFreeList();
360 
361     inline bool adjustFullness();
362     void adjustPositionInBin(Bin* bin = nullptr);
363 #if MALLOC_DEBUG
364     bool freeListNonNull() { return freeList; }
365 #endif
366     void freePublicObject(FreeObject *objectToFree);
367     inline void freeOwnObject(void *object);
368     void reset();
369     void privatizePublicFreeList( bool reset = true );
370     void restoreBumpPtr();
371     void privatizeOrphaned(TLSData *tls, unsigned index);
372     bool readyToShare();
373     void shareOrphaned(intptr_t binTag, unsigned index);
374     unsigned int getSize() const {
375         MALLOC_ASSERT(isStartupAllocObject() || objectSize<minLargeObjectSize,
376                       "Invalid object size");
377         return isStartupAllocObject()? 0 : objectSize;
378     }
379     const BackRefIdx *getBackRefIdx() const { return &backRefIdx; }
380     inline bool isOwnedByCurrentThread() const;
381     bool isStartupAllocObject() const { return objectSize == startupAllocObjSizeMark; }
382     inline FreeObject *findObjectToFree(const void *object) const;
383     void checkFreePrecond(const void *object) const {
384 #if MALLOC_DEBUG
385         const char *msg = "Possible double free or heap corruption.";
386         // small objects are always at least sizeof(size_t)-byte aligned;
387         // check this before any dereference, as for invalid objects
388         // the memory may be unreadable
389         MALLOC_ASSERT(isAligned(object, sizeof(size_t)), "Try to free invalid small object");
390 #if !__TBB_USE_THREAD_SANITIZER
391         // releasing to free slab
392         MALLOC_ASSERT(allocatedCount>0, msg);
393 #endif
394         // must not point to slab's header
395         MALLOC_ASSERT((uintptr_t)object - (uintptr_t)this >= sizeof(Block), msg);
396         if (startupAllocObjSizeMark == objectSize) // startup block
397             MALLOC_ASSERT(object<=bumpPtr, msg);
398         else {
399             // non-startup objects are 8-byte aligned
400             MALLOC_ASSERT(isAligned(object, 8), "Try to free invalid small object");
401             FreeObject *toFree = findObjectToFree(object);
402 #if !__TBB_USE_THREAD_SANITIZER
403             MALLOC_ASSERT(allocatedCount <= (slabSize-sizeof(Block))/objectSize
404                           && (!bumpPtr || object>bumpPtr), msg);
405             // check against head of freeList, as this is mostly
406             // expected after double free
407             MALLOC_ASSERT(toFree != freeList, msg);
408 #endif
409             // check against head of publicFreeList, to detect double free
410             // involving foreign thread
411             MALLOC_ASSERT(toFree != publicFreeList.load(std::memory_order_relaxed), msg);
412         }
413 #else
414         suppress_unused_warning(object);
415 #endif
416     }
417     void initEmptyBlock(TLSData *tls, size_t size);
418     size_t findObjectSize(void *object) const;
419     MemoryPool *getMemPool() const { return poolPtr; } // do not use on the hot path!
420 
421 protected:
422     void cleanBlockHeader();
423 
424 private:
425     static const float emptyEnoughRatio; /* Threshold on free space needed to "reactivate" a block */
426 
427     inline FreeObject *allocateFromBumpPtr();
428     inline FreeObject *findAllocatedObject(const void *address) const;
429 #if MALLOC_DEBUG
430     inline bool isProperlyPlaced(const void *object) const;
431 #endif
432     inline void markOwned(TLSData *tls) {
433         MALLOC_ASSERT(!tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
434         ownerTid = ThreadId(); /* save the ID of the current thread */
435         tlsPtr.store(tls, std::memory_order_relaxed);
436     }
437     inline void markOrphaned() {
438         MALLOC_ASSERT(tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
439         tlsPtr.store(nullptr, std::memory_order_relaxed);
440     }
441 
442     friend class Bin;
443     friend class TLSData;
444     friend bool MemoryPool::destroy();
445 };
446 
447 const float Block::emptyEnoughRatio = 1.0 / 4.0;
448 
449 static_assert(sizeof(Block) <= 2*estimatedCacheLineSize,
450     "The class Block does not fit into 2 cache lines on this platform. "
451     "Defining USE_INTERNAL_TID may help to fix it.");
452 
453 class Bin {
454 private:
455 public:
456     Block *activeBlk;
457     std::atomic<Block*> mailbox;
458     MallocMutex mailLock;
459 
460 public:
461     inline Block* getActiveBlock() const { return activeBlk; }
462     void resetActiveBlock() { activeBlk = nullptr; }
463     inline void setActiveBlock(Block *block);
464     inline Block* setPreviousBlockActive();
465     Block* getPrivatizedFreeListBlock();
466     void moveBlockToFront(Block *block);
467     bool cleanPublicFreeLists();
468     void processEmptyBlock(Block *block, bool poolTheBlock);
469     void addPublicFreeListBlock(Block* block);
470 
471     void outofTLSBin(Block* block);
472     void verifyTLSBin(size_t size) const;
473     void pushTLSBin(Block* block);
474 
475 #if MALLOC_DEBUG
476     void verifyInitState() const {
477         MALLOC_ASSERT( !activeBlk, ASSERT_TEXT );
478         MALLOC_ASSERT( !mailbox.load(std::memory_order_relaxed), ASSERT_TEXT );
479     }
480 #endif
481 
482     friend void Block::freePublicObject (FreeObject *objectToFree);
483 };
484 
485 /********* End of the data structures                    **************/
486 
487 /*
488  * There are bins for all 8 byte aligned objects less than this segregated size; 8 bins in total
489  */
490 const uint32_t minSmallObjectIndex = 0;
491 const uint32_t numSmallObjectBins = 8;
492 const uint32_t maxSmallObjectSize = 64;
493 
494 /*
495  * There are 4 bins between each pair of powers of 2 [64-128-256-...]
496  * from maxSmallObjectSize till this size; 16 bins in total
497  */
498 const uint32_t minSegregatedObjectIndex = minSmallObjectIndex+numSmallObjectBins;
499 const uint32_t numSegregatedObjectBins = 16;
500 const uint32_t maxSegregatedObjectSize = 1024;
501 
502 /*
503  * And there are 5 bins with allocation sizes that are multiples of estimatedCacheLineSize
504  * and selected to fit 9, 6, 4, 3, and 2 allocations in a block.
505  */
506 const uint32_t minFittingIndex = minSegregatedObjectIndex+numSegregatedObjectBins;
507 const uint32_t numFittingBins = 5;
508 
509 const uint32_t fittingAlignment = estimatedCacheLineSize;
510 
511 #define SET_FITTING_SIZE(N) ( (slabSize-sizeof(Block))/N ) & ~(fittingAlignment-1)
512 // For slabSize=16*1024, sizeof(Block)=2*estimatedCacheLineSize and fittingAlignment=estimatedCacheLineSize,
513 // the comments show the fitting sizes and the amounts left unused for estimatedCacheLineSize=64/128:
514 const uint32_t fittingSize1 = SET_FITTING_SIZE(9); // 1792/1792 128/000
515 const uint32_t fittingSize2 = SET_FITTING_SIZE(6); // 2688/2688 128/000
516 const uint32_t fittingSize3 = SET_FITTING_SIZE(4); // 4032/3968 128/256
517 const uint32_t fittingSize4 = SET_FITTING_SIZE(3); // 5376/5376 128/000
518 const uint32_t fittingSize5 = SET_FITTING_SIZE(2); // 8128/8064 000/000
519 #undef SET_FITTING_SIZE
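// Worked example for the table above (illustrative, assuming slabSize=16*1024,
// estimatedCacheLineSize=64, hence sizeof(Block)=128 and fittingAlignment=64):
//     SET_FITTING_SIZE(9) = ((16384 - 128) / 9) & ~63 = 1806 & ~63 = 1792
// so nine 1792-byte objects occupy 16128 bytes and 16384 - 128 - 16128 = 128 bytes
// are left unused, matching the "1792 ... 128" entry in the comments above.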
520 
521 /*
522  * The total number of thread-specific Block-based bins
523  */
524 const uint32_t numBlockBins = minFittingIndex+numFittingBins;
525 
526 /*
527  * Objects of this size and larger are considered large objects.
528  */
529 const uint32_t minLargeObjectSize = fittingSize5 + 1;
530 
531 /*
532  * Per-thread pool of slab blocks. The idea behind it is to avoid sharing with other
533  * threads memory that is likely in the local cache(s) of our CPU.
534  */
535 class FreeBlockPool {
536 private:
537     std::atomic<Block*> head;
538     int         size;
539     Backend    *backend;
540     bool        lastAccessMiss;
541 public:
542     static const int POOL_HIGH_MARK = 32;
543     static const int POOL_LOW_MARK  = 8;
544 
545     class ResOfGet {
546         ResOfGet() = delete;
547     public:
548         Block* block;
549         bool   lastAccMiss;
550         ResOfGet(Block *b, bool lastMiss) : block(b), lastAccMiss(lastMiss) {}
551     };
552 
553     // allocated in zero-initialized memory
554     FreeBlockPool(Backend *bknd) : backend(bknd) {}
555     ResOfGet getBlock();
556     void returnBlock(Block *block);
557     bool externalCleanup(); // can be called by another thread
558 };
559 
560 template<int LOW_MARK, int HIGH_MARK>
561 class LocalLOCImpl {
562 private:
563     static const size_t MAX_TOTAL_SIZE = 4*1024*1024;
564     // TODO: can a singly linked list be faster here?
565     LargeMemoryBlock *tail; // needed when releasing on overflow
566     std::atomic<LargeMemoryBlock*> head;
567     size_t            totalSize;
568     int               numOfBlocks;
569 public:
570     bool put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool);
571     LargeMemoryBlock *get(size_t size);
572     bool externalCleanup(ExtMemoryPool *extMemPool);
573 #if __TBB_MALLOC_WHITEBOX_TEST
574     LocalLOCImpl() : tail(nullptr), head(nullptr), totalSize(0), numOfBlocks(0) {}
575     static size_t getMaxSize() { return MAX_TOTAL_SIZE; }
576     static const int LOC_HIGH_MARK = HIGH_MARK;
577 #else
578     // no ctor, object must be created in zero-initialized memory
579 #endif
580 };
581 
582 typedef LocalLOCImpl<8,32> LocalLOC; // set production code parameters
583 
584 class TLSData : public TLSRemote {
585     MemoryPool   *memPool;
586 public:
587     Bin           bin[numBlockBinLimit];
588     FreeBlockPool freeSlabBlocks;
589     LocalLOC      lloc;
590     unsigned      currCacheIdx;
591 private:
592     std::atomic<bool> unused;
593 public:
594     TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd) {}
595     MemoryPool *getMemPool() const { return memPool; }
596     Bin* getAllocationBin(size_t size);
597     void release();
598     bool externalCleanup(bool cleanOnlyUnused, bool cleanBins) {
599         if (!unused.load(std::memory_order_relaxed) && cleanOnlyUnused) return false;
600         // Heavy operation in terms of synchronization complexity,
601         // should be called only for the current thread
602         bool released = cleanBins ? cleanupBlockBins() : false;
603         // both cleanups must be called, and the order is not important
604         return released | lloc.externalCleanup(&memPool->extMemPool) | freeSlabBlocks.externalCleanup();
605     }
606     bool cleanupBlockBins();
607     void markUsed() { unused.store(false, std::memory_order_relaxed); } // called by owner when TLS touched
608     void markUnused() { unused.store(true, std::memory_order_relaxed); } // can be called by a non-owner thread
609 };
610 
611 TLSData *TLSKey::createTLS(MemoryPool *memPool, Backend *backend)
612 {
613     MALLOC_ASSERT( sizeof(TLSData) >= sizeof(Bin) * numBlockBins + sizeof(FreeBlockPool), ASSERT_TEXT );
614     TLSData* tls = (TLSData*) memPool->bootStrapBlocks.allocate(memPool, sizeof(TLSData));
615     if ( !tls )
616         return nullptr;
617     new(tls) TLSData(memPool, backend);
618     /* the block contains zeroes after bootStrapMalloc, so bins are initialized */
619 #if MALLOC_DEBUG
620     for (uint32_t i = 0; i < numBlockBinLimit; i++)
621         tls->bin[i].verifyInitState();
622 #endif
623     setThreadMallocTLS(tls);
624     memPool->extMemPool.allLocalCaches.registerThread(tls);
625     return tls;
626 }
627 
628 bool TLSData::cleanupBlockBins()
629 {
630     bool released = false;
631     for (uint32_t i = 0; i < numBlockBinLimit; i++) {
632         released |= bin[i].cleanPublicFreeLists();
633         // After cleaning public free lists, only the active block might be empty.
634         // Do not use processEmptyBlock because it will just restore bumpPtr.
635         Block *block = bin[i].getActiveBlock();
636         if (block && block->empty()) {
637             bin[i].outofTLSBin(block);
638             memPool->returnEmptyBlock(block, /*poolTheBlock=*/false);
639             released = true;
640         }
641     }
642     return released;
643 }
644 
645 bool ExtMemoryPool::releaseAllLocalCaches()
646 {
647     // Iterate all registered TLS data and clean LLOC and Slab pools
648     bool released = allLocalCaches.cleanup(/*cleanOnlyUnused=*/false);
649 
650     // Bins privatization is done only for the current thread
651     if (TLSData *tlsData = tlsPointerKey.getThreadMallocTLS())
652         released |= tlsData->cleanupBlockBins();
653 
654     return released;
655 }
656 
657 void AllLocalCaches::registerThread(TLSRemote *tls)
658 {
659     tls->prev = nullptr;
660     MallocMutex::scoped_lock lock(listLock);
661     MALLOC_ASSERT(head!=tls, ASSERT_TEXT);
662     tls->next = head;
663     if (head)
664         head->prev = tls;
665     head = tls;
666     MALLOC_ASSERT(head->next!=head, ASSERT_TEXT);
667 }
668 
669 void AllLocalCaches::unregisterThread(TLSRemote *tls)
670 {
671     MallocMutex::scoped_lock lock(listLock);
672     MALLOC_ASSERT(head, "Can't unregister thread: no threads are registered.");
673     if (head == tls)
674         head = tls->next;
675     if (tls->next)
676         tls->next->prev = tls->prev;
677     if (tls->prev)
678         tls->prev->next = tls->next;
679     MALLOC_ASSERT(!tls->next || tls->next->next!=tls->next, ASSERT_TEXT);
680 }
681 
682 bool AllLocalCaches::cleanup(bool cleanOnlyUnused)
683 {
684     bool released = false;
685     {
686         MallocMutex::scoped_lock lock(listLock);
687         for (TLSRemote *curr=head; curr; curr=curr->next)
688             released |= static_cast<TLSData*>(curr)->externalCleanup(cleanOnlyUnused, /*cleanBins=*/false);
689     }
690     return released;
691 }
692 
693 void AllLocalCaches::markUnused()
694 {
695     bool locked;
696     MallocMutex::scoped_lock lock(listLock, /*block=*/false, &locked);
697     if (!locked) // do not wait for marking if another thread is working with the list
698         return;
699 
700     for (TLSRemote *curr=head; curr; curr=curr->next)
701         static_cast<TLSData*>(curr)->markUnused();
702 }
703 
704 #if MALLOC_CHECK_RECURSION
705 MallocMutex RecursiveMallocCallProtector::rmc_mutex;
706 std::atomic<pthread_t> RecursiveMallocCallProtector::owner_thread;
707 std::atomic<void*> RecursiveMallocCallProtector::autoObjPtr;
708 bool        RecursiveMallocCallProtector::mallocRecursionDetected;
709 #if __FreeBSD__
710 bool        RecursiveMallocCallProtector::canUsePthread;
711 #endif
712 
713 #endif
714 
715 /*********** End code to provide thread ID and a TLS pointer **********/
716 
717 // Parameter for isLargeObject, keeps our expectations on memory origin.
718 // Assertions must use unknownMem to reliably report object invalidity.
719 enum MemoryOrigin {
720     ourMem,    // allocated by TBB allocator
721     unknownMem // can be allocated by system allocator or TBB allocator
722 };
723 
724 template<MemoryOrigin>
725 #if __TBB_USE_THREAD_SANITIZER
726 // We have a real race when accessing the large object header for
727 // non-large objects (e.g. small or foreign objects).
728 // Therefore, we need to hide this access from the thread sanitizer
729 __attribute__((no_sanitize("thread")))
730 #endif
731 bool isLargeObject(void *object);
732 static void *internalMalloc(size_t size);
733 static void internalFree(void *object);
734 static void *internalPoolMalloc(MemoryPool* mPool, size_t size);
735 static bool internalPoolFree(MemoryPool *mPool, void *object, size_t size);
736 
737 #if !MALLOC_DEBUG
738 #if __INTEL_COMPILER || _MSC_VER
739 #define NOINLINE(decl) __declspec(noinline) decl
740 #define ALWAYSINLINE(decl) __forceinline decl
741 #elif __GNUC__
742 #define NOINLINE(decl) decl __attribute__ ((noinline))
743 #define ALWAYSINLINE(decl) decl __attribute__ ((always_inline))
744 #else
745 #define NOINLINE(decl) decl
746 #define ALWAYSINLINE(decl) decl
747 #endif
748 
749 static NOINLINE( bool doInitialization() );
750 ALWAYSINLINE( bool isMallocInitialized() );
751 
752 #undef ALWAYSINLINE
753 #undef NOINLINE
754 #endif /* !MALLOC_DEBUG */
755 
756 
757 /********* Now some rough utility code to deal with indexing the size bins. **************/
758 
759 /*
760  * Given a number, return the position of the highest set bit in it. It is intended to work with 32-bit values only.
761  * Moreover, on some platforms, for the sake of simplicity and performance, it is narrowed to serve only values from 64 to 1023.
762  * This is enough for the current algorithm of distributing sizes among bins.
763  * __TBB_Log2 is not used here to minimize dependencies on TBB specific sources.
764  */
765 #if _WIN64 && _MSC_VER>=1400 && !__INTEL_COMPILER
766 extern "C" unsigned char _BitScanReverse( unsigned long* i, unsigned long w );
767 #pragma intrinsic(_BitScanReverse)
768 #endif
769 static inline unsigned int highestBitPos(unsigned int n)
770 {
771     MALLOC_ASSERT( n>=64 && n<1024, ASSERT_TEXT ); // only needed for bsr array lookup, but always true
772     unsigned int pos;
773 #if __ARCH_x86_32||__ARCH_x86_64
774 
775 # if __unix__||__APPLE__||__MINGW32__
776     __asm__ ("bsr %1,%0" : "=r"(pos) : "r"(n));
777 # elif (_WIN32 && (!_WIN64 || __INTEL_COMPILER))
778     __asm
779     {
780         bsr eax, n
781         mov pos, eax
782     }
783 # elif _WIN64 && _MSC_VER>=1400
784     _BitScanReverse((unsigned long*)&pos, (unsigned long)n);
785 # else
786 #   error highestBitPos() not implemented for this platform
787 # endif
788 #elif __arm__
789     __asm__ __volatile__
790     (
791        "clz %0, %1\n"
792        "rsb %0, %0, %2\n"
793        :"=r" (pos) :"r" (n), "I" (31)
794     );
795 #else
796     static unsigned int bsr[16] = {0/*N/A*/,6,7,7,8,8,8,8,9,9,9,9,9,9,9,9};
797     pos = bsr[ n>>6 ];
798 #endif /* __ARCH_* */
799     return pos;
800 }
801 
802 unsigned int getSmallObjectIndex(unsigned int size)
803 {
804     unsigned int result = (size-1)>>3;
805     constexpr bool is_64bit = (8 == sizeof(void*));
806     if (is_64bit) {
807         // For 64-bit malloc, 16 byte alignment is needed except for bin 0.
808         if (result) result |= 1; // 0,1,3,5,7; bins 2,4,6 are not aligned to 16 bytes
809     }
810     return result;
811 }
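// Worked example (illustrative): on a 64-bit build, a request of 24 bytes gives
// (24-1)>>3 = 2, which is then bumped to index 3 (a 32-byte bin) so that the returned
// object keeps 16-byte alignment; a request of 8 bytes stays in index 0 (the 8-byte bin).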
812 
813 /*
814  * Depending on indexRequest, for a given size return either the index into the bin
815  * for objects of this size, or the actual size of objects in this bin.
816  */
817 template<bool indexRequest>
818 static unsigned int getIndexOrObjectSize (unsigned int size)
819 {
820     if (size <= maxSmallObjectSize) { // selection from 8/16/24/32/40/48/56/64
821         unsigned int index = getSmallObjectIndex( size );
822          /* Bin 0 is for 8 bytes, bin 1 is for 16, and so forth */
823         return indexRequest ? index : (index+1)<<3;
824     }
825     else if (size <= maxSegregatedObjectSize ) { // 80/96/112/128 / 160/192/224/256 / 320/384/448/512 / 640/768/896/1024
826         unsigned int order = highestBitPos(size-1); // which group of bin sizes?
827         MALLOC_ASSERT( 6<=order && order<=9, ASSERT_TEXT );
828         if (indexRequest)
829             return minSegregatedObjectIndex - (4*6) - 4 + (4*order) + ((size-1)>>(order-2));
830         else {
831             unsigned int alignment = 128 >> (9-order); // alignment in the group
832             MALLOC_ASSERT( alignment==16 || alignment==32 || alignment==64 || alignment==128, ASSERT_TEXT );
833             return alignUp(size,alignment);
834         }
835     }
836     else {
837         if( size <= fittingSize3 ) {
838             if( size <= fittingSize2 ) {
839                 if( size <= fittingSize1 )
840                     return indexRequest ? minFittingIndex : fittingSize1;
841                 else
842                     return indexRequest ? minFittingIndex+1 : fittingSize2;
843             } else
844                 return indexRequest ? minFittingIndex+2 : fittingSize3;
845         } else {
846             if( size <= fittingSize5 ) {
847                 if( size <= fittingSize4 )
848                     return indexRequest ? minFittingIndex+3 : fittingSize4;
849                 else
850                     return indexRequest ? minFittingIndex+4 : fittingSize5;
851             } else {
852                 MALLOC_ASSERT( 0,ASSERT_TEXT ); // this should not happen
853                 return ~0U;
854             }
855         }
856     }
857 }
858 
859 static unsigned int getIndex (unsigned int size)
860 {
861     return getIndexOrObjectSize</*indexRequest=*/true>(size);
862 }
863 
864 static unsigned int getObjectSize (unsigned int size)
865 {
866     return getIndexOrObjectSize</*indexRequest=*/false>(size);
867 }
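// Worked example (illustrative) for the segregated range handled above: a request of
// 200 bytes gives order = highestBitPos(199) = 7 (since 128 <= 199 < 256), so
//     getIndex(200)      = 8 - 24 - 4 + 4*7 + (199 >> 5) = 8 + 6 = 14
//     getObjectSize(200) = alignUp(200, 128 >> (9-7))    = alignUp(200, 32) = 224
// i.e. the request lands in the 224-byte bin of the 160/192/224/256 group.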
868 
869 
870 void *BootStrapBlocks::allocate(MemoryPool *memPool, size_t size)
871 {
872     FreeObject *result;
873 
874     MALLOC_ASSERT( size == sizeof(TLSData), ASSERT_TEXT );
875 
876     { // Lock with acquire
877         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
878 
879         if( bootStrapObjectList) {
880             result = bootStrapObjectList;
881             bootStrapObjectList = bootStrapObjectList->next;
882         } else {
883             if (!bootStrapBlock) {
884                 bootStrapBlock = memPool->getEmptyBlock(size);
885                 if (!bootStrapBlock) return nullptr;
886             }
887             result = bootStrapBlock->bumpPtr;
888             bootStrapBlock->bumpPtr = (FreeObject *)((uintptr_t)bootStrapBlock->bumpPtr - bootStrapBlock->objectSize);
889             if ((uintptr_t)bootStrapBlock->bumpPtr < (uintptr_t)bootStrapBlock+sizeof(Block)) {
890                 bootStrapBlock->bumpPtr = nullptr;
891                 bootStrapBlock->next = bootStrapBlockUsed;
892                 bootStrapBlockUsed = bootStrapBlock;
893                 bootStrapBlock = nullptr;
894             }
895         }
896     } // Unlock with release
897     memset (result, 0, size);
898     return (void*)result;
899 }
900 
901 void BootStrapBlocks::free(void* ptr)
902 {
903     MALLOC_ASSERT( ptr, ASSERT_TEXT );
904     { // Lock with acquire
905         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
906         ((FreeObject*)ptr)->next = bootStrapObjectList;
907         bootStrapObjectList = (FreeObject*)ptr;
908     } // Unlock with release
909 }
910 
911 void BootStrapBlocks::reset()
912 {
913     bootStrapBlock = bootStrapBlockUsed = nullptr;
914     bootStrapObjectList = nullptr;
915 }
916 
917 #if !(FREELIST_NONBLOCKING)
918 static MallocMutex publicFreeListLock; // lock for changes of publicFreeList
919 #endif
920 
921 /********* End rough utility code  **************/
922 
923 /* LifoList assumes zero initialization so a vector of it can be created
924  * by just allocating some space with no call to constructor.
925  * On Linux, it seems to be necessary to avoid linking with C++ libraries.
926  *
927  * By usage convention there is no race on the initialization. */
928 LifoList::LifoList( ) : top(nullptr)
929 {
930     // MallocMutex assumes zero initialization
931     memset(static_cast<void*>(&lock), 0, sizeof(MallocMutex));
932 }
933 
934 void LifoList::push(Block *block)
935 {
936     MallocMutex::scoped_lock scoped_cs(lock);
937     block->next = top.load(std::memory_order_relaxed);
938     top.store(block, std::memory_order_relaxed);
939 }
940 
941 Block *LifoList::pop()
942 {
943     Block* block = nullptr;
944     if (top.load(std::memory_order_relaxed)) {
945         MallocMutex::scoped_lock scoped_cs(lock);
946         block = top.load(std::memory_order_relaxed);
947         if (block) {
948             top.store(block->next, std::memory_order_relaxed);
949         }
950     }
951     return block;
952 }
953 
954 Block *LifoList::grab()
955 {
956     Block *block = nullptr;
957     if (top.load(std::memory_order_relaxed)) {
958         MallocMutex::scoped_lock scoped_cs(lock);
959         block = top.load(std::memory_order_relaxed);
960         top.store(nullptr, std::memory_order_relaxed);
961     }
962     return block;
963 }
964 
965 /********* Thread and block related code      *************/
966 
967 template<bool poolDestroy> void AllLargeBlocksList::releaseAll(Backend *backend) {
968      LargeMemoryBlock *next, *lmb = loHead;
969      loHead = nullptr;
970 
971      for (; lmb; lmb = next) {
972          next = lmb->gNext;
973          if (poolDestroy) {
974              // as this is pool destruction, there is no need to return the object to the backend,
975              // only remove backrefs, as they are global
976              removeBackRef(lmb->backRefIdx);
977          } else {
978              // clean g(Next|Prev) to prevent removing lmb
979              // from AllLargeBlocksList inside returnLargeObject
980              lmb->gNext = lmb->gPrev = nullptr;
981              backend->returnLargeObject(lmb);
982          }
983      }
984 }
985 
986 TLSData* MemoryPool::getTLS(bool create)
987 {
988     TLSData* tls = extMemPool.tlsPointerKey.getThreadMallocTLS();
989     if (create && !tls)
990         tls = extMemPool.tlsPointerKey.createTLS(this, &extMemPool.backend);
991     return tls;
992 }
993 
994 /*
995  * Return the bin for the given size.
996  */
997 inline Bin* TLSData::getAllocationBin(size_t size)
998 {
999     return bin + getIndex(size);
1000 }
1001 
1002 /* Return an empty uninitialized block in a non-blocking fashion. */
1003 Block *MemoryPool::getEmptyBlock(size_t size)
1004 {
1005     TLSData* tls = getTLS(/*create=*/false);
1006     // try to use per-thread cache, if TLS available
1007     FreeBlockPool::ResOfGet resOfGet = tls?
1008         tls->freeSlabBlocks.getBlock() : FreeBlockPool::ResOfGet(nullptr, false);
1009     Block *result = resOfGet.block;
1010 
1011     if (!result) { // not found in local cache, ask backend for slabs
1012         int num = resOfGet.lastAccMiss? Backend::numOfSlabAllocOnMiss : 1;
1013         BackRefIdx backRefIdx[Backend::numOfSlabAllocOnMiss];
1014 
1015         result = static_cast<Block*>(extMemPool.backend.getSlabBlock(num));
1016         if (!result) return nullptr;
1017 
1018         if (!extMemPool.userPool())
1019             for (int i=0; i<num; i++) {
1020                 backRefIdx[i] = BackRefIdx::newBackRef(/*largeObj=*/false);
1021                 if (backRefIdx[i].isInvalid()) {
1022                     // roll back resource allocation
1023                     for (int j=0; j<i; j++)
1024                         removeBackRef(backRefIdx[j]);
1025                     Block *b = result;
1026                     for (int j=0; j<num; b=(Block*)((uintptr_t)b+slabSize), j++)
1027                         extMemPool.backend.putSlabBlock(b);
1028                     return nullptr;
1029                 }
1030             }
1031         // resources were allocated, register blocks
1032         Block *b = result;
1033         for (int i=0; i<num; b=(Block*)((uintptr_t)b+slabSize), i++) {
1034             // slab block in user's pool must have invalid backRefIdx
1035             if (extMemPool.userPool()) {
1036                 new (&b->backRefIdx) BackRefIdx();
1037             } else {
1038                 setBackRef(backRefIdx[i], b);
1039                 b->backRefIdx = backRefIdx[i];
1040             }
1041             b->tlsPtr.store(tls, std::memory_order_relaxed);
1042             b->poolPtr = this;
1043             // all but first one go to per-thread pool
1044             if (i > 0) {
1045                 MALLOC_ASSERT(tls, ASSERT_TEXT);
1046                 tls->freeSlabBlocks.returnBlock(b);
1047             }
1048         }
1049     }
1050     MALLOC_ASSERT(result, ASSERT_TEXT);
1051     result->initEmptyBlock(tls, size);
1052     STAT_increment(getThreadId(), getIndex(result->objectSize), allocBlockNew);
1053     return result;
1054 }
1055 
1056 void MemoryPool::returnEmptyBlock(Block *block, bool poolTheBlock)
1057 {
1058     block->reset();
1059     if (poolTheBlock) {
1060         getTLS(/*create=*/false)->freeSlabBlocks.returnBlock(block);
1061     } else {
1062         // slab blocks in user's pools do not have valid backRefIdx
1063         if (!extMemPool.userPool())
1064             removeBackRef(*(block->getBackRefIdx()));
1065         extMemPool.backend.putSlabBlock(block);
1066     }
1067 }
1068 
1069 bool ExtMemoryPool::init(intptr_t poolId, rawAllocType rawAlloc,
1070                          rawFreeType rawFree, size_t granularity,
1071                          bool keepAllMemory, bool fixedPool)
1072 {
1073     this->poolId = poolId;
1074     this->rawAlloc = rawAlloc;
1075     this->rawFree = rawFree;
1076     this->granularity = granularity;
1077     this->keepAllMemory = keepAllMemory;
1078     this->fixedPool = fixedPool;
1079     this->delayRegsReleasing = false;
1080     if (!initTLS())
1081         return false;
1082     loc.init(this);
1083     backend.init(this);
1084     MALLOC_ASSERT(isPoolValid(), nullptr);
1085     return true;
1086 }
1087 
1088 bool ExtMemoryPool::initTLS() { return tlsPointerKey.init(); }
1089 
1090 bool MemoryPool::init(intptr_t poolId, const MemPoolPolicy *policy)
1091 {
1092     if (!extMemPool.init(poolId, policy->pAlloc, policy->pFree,
1093                policy->granularity? policy->granularity : defaultGranularity,
1094                policy->keepAllMemory, policy->fixedPool))
1095         return false;
1096     {
1097         MallocMutex::scoped_lock lock(memPoolListLock);
1098         next = defaultMemPool->next;
1099         defaultMemPool->next = this;
1100         prev = defaultMemPool;
1101         if (next)
1102             next->prev = this;
1103     }
1104     return true;
1105 }
1106 
1107 bool MemoryPool::reset()
1108 {
1109     MALLOC_ASSERT(extMemPool.userPool(), "No reset for the system pool.");
1110     // memory is not released during pool reset
1111     // TODO: mark regions to release unused on next reset()
1112     extMemPool.delayRegionsReleasing(true);
1113 
1114     bootStrapBlocks.reset();
1115     extMemPool.lmbList.releaseAll</*poolDestroy=*/false>(&extMemPool.backend);
1116     if (!extMemPool.reset())
1117         return false;
1118 
1119     if (!extMemPool.initTLS())
1120         return false;
1121     extMemPool.delayRegionsReleasing(false);
1122     return true;
1123 }
1124 
1125 bool MemoryPool::destroy()
1126 {
1127 #if __TBB_MALLOC_LOCACHE_STAT
1128     extMemPool.loc.reportStat(stdout);
1129 #endif
1130 #if __TBB_MALLOC_BACKEND_STAT
1131     extMemPool.backend.reportStat(stdout);
1132 #endif
1133     {
1134         MallocMutex::scoped_lock lock(memPoolListLock);
1135         // remove itself from global pool list
1136         if (prev)
1137             prev->next = next;
1138         if (next)
1139             next->prev = prev;
1140     }
1141     // slab blocks in non-default pool do not have backreferences,
1142     // only large objects do
1143     if (extMemPool.userPool())
1144         extMemPool.lmbList.releaseAll</*poolDestroy=*/true>(&extMemPool.backend);
1145     else {
1146         // only one non-userPool() is supported now
1147         MALLOC_ASSERT(this==defaultMemPool, nullptr);
1148         // Here and below in extMemPool.destroy(), do not restore the initial state
1149         // for a user pool, because it is just about to be released. But the system
1150         // pool is restored, since we do not want to have to zero it on a subsequent reload.
1151         bootStrapBlocks.reset();
1152         extMemPool.orphanedBlocks.reset();
1153     }
1154     return extMemPool.destroy();
1155 }
1156 
1157 void MemoryPool::onThreadShutdown(TLSData *tlsData)
1158 {
1159     if (tlsData) { // might be called for "empty" TLS
1160         tlsData->release();
1161         bootStrapBlocks.free(tlsData);
1162         clearTLS();
1163     }
1164 }
1165 
1166 #if MALLOC_DEBUG
1167 void Bin::verifyTLSBin (size_t size) const
1168 {
1169 /* The debug version verifies the TLSBin as needed */
1170     uint32_t objSize = getObjectSize(size);
1171 
1172     if (activeBlk) {
1173         MALLOC_ASSERT( activeBlk->isOwnedByCurrentThread(), ASSERT_TEXT );
1174         MALLOC_ASSERT( activeBlk->objectSize == objSize, ASSERT_TEXT );
1175 #if MALLOC_DEBUG>1
1176         for (Block* temp = activeBlk->next; temp; temp=temp->next) {
1177             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1178             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1179             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1180             MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1181             if (temp->next) {
1182                 MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1183             }
1184         }
1185         for (Block* temp = activeBlk->previous; temp; temp=temp->previous) {
1186             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1187             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1188             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1189             MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1190             if (temp->previous) {
1191                 MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1192             }
1193         }
1194 #endif /* MALLOC_DEBUG>1 */
1195     }
1196 }
1197 #else /* MALLOC_DEBUG */
1198 inline void Bin::verifyTLSBin (size_t) const { }
1199 #endif /* MALLOC_DEBUG */
1200 
1201 /*
1202  * Add a block to the start of this tls bin list.
1203  */
1204 void Bin::pushTLSBin(Block* block)
1205 {
1206     /* The objectSize is taken from the block rather than passed as a parameter
1207        because the function is applied to partially filled blocks as well */
1208     unsigned int size = block->objectSize;
1209 
1210     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1211     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1212     MALLOC_ASSERT( block->next == nullptr, ASSERT_TEXT );
1213     MALLOC_ASSERT( block->previous == nullptr, ASSERT_TEXT );
1214 
1215     MALLOC_ASSERT( this, ASSERT_TEXT );
1216     verifyTLSBin(size);
1217 
1218     block->next = activeBlk;
1219     if( activeBlk ) {
1220         block->previous = activeBlk->previous;
1221         activeBlk->previous = block;
1222         if( block->previous )
1223             block->previous->next = block;
1224     } else {
1225         activeBlk = block;
1226     }
1227 
1228     verifyTLSBin(size);
1229 }
1230 
1231 /*
1232  * Take a block out of its tls bin (e.g. before removal).
1233  */
1234 void Bin::outofTLSBin(Block* block)
1235 {
1236     unsigned int size = block->objectSize;
1237 
1238     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1239     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1240 
1241     MALLOC_ASSERT( this, ASSERT_TEXT );
1242     verifyTLSBin(size);
1243 
1244     if (block == activeBlk) {
1245         activeBlk = block->previous? block->previous : block->next;
1246     }
1247     /* Unlink the block */
1248     if (block->previous) {
1249         MALLOC_ASSERT( block->previous->next == block, ASSERT_TEXT );
1250         block->previous->next = block->next;
1251     }
1252     if (block->next) {
1253         MALLOC_ASSERT( block->next->previous == block, ASSERT_TEXT );
1254         block->next->previous = block->previous;
1255     }
1256     block->next = nullptr;
1257     block->previous = nullptr;
1258 
1259     verifyTLSBin(size);
1260 }
1261 
1262 Block* Bin::getPrivatizedFreeListBlock()
1263 {
1264     Block* block;
1265     MALLOC_ASSERT( this, ASSERT_TEXT );
1266     // if this method is called, the attempt to use the active block must have been unsuccessful
1267     MALLOC_ASSERT( (!activeBlk && !mailbox.load(std::memory_order_relaxed)) || (activeBlk && activeBlk->isFull), ASSERT_TEXT );
1268 
1269 // the counter should be changed    STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1270     if (!mailbox.load(std::memory_order_acquire)) // hotpath is empty mailbox
1271         return nullptr;
1272     else { // mailbox is not empty, take lock and inspect it
1273         MallocMutex::scoped_lock scoped_cs(mailLock);
1274         block = mailbox.load(std::memory_order_relaxed);
1275         if( block ) {
1276             MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1277             MALLOC_ASSERT( !isNotForUse(block->nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1278             mailbox.store(block->nextPrivatizable.load(std::memory_order_relaxed), std::memory_order_relaxed);
1279             block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1280         }
1281     }
1282     if( block ) {
1283         MALLOC_ASSERT( isSolidPtr(block->publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1284         block->privatizePublicFreeList();
1285         block->adjustPositionInBin(this);
1286     }
1287     return block;
1288 }
1289 
1290 void Bin::addPublicFreeListBlock(Block* block)
1291 {
1292     MallocMutex::scoped_lock scoped_cs(mailLock);
1293     block->nextPrivatizable.store(mailbox.load(std::memory_order_relaxed), std::memory_order_relaxed);
1294     mailbox.store(block, std::memory_order_relaxed);
1295 }
1296 
1297 // Process publicly freed objects in all blocks and return empty blocks
1298 // to the backend in order to reduce overall footprint.
1299 bool Bin::cleanPublicFreeLists()
1300 {
1301     Block* block;
1302     if (!mailbox.load(std::memory_order_acquire))
1303         return false;
1304     else {
1305         // Grab all the blocks in the mailbox
1306         MallocMutex::scoped_lock scoped_cs(mailLock);
1307         block = mailbox.load(std::memory_order_relaxed);
1308         mailbox.store(nullptr, std::memory_order_relaxed);
1309     }
1310     bool released = false;
1311     while (block) {
1312         MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1313         Block* tmp = block->nextPrivatizable.load(std::memory_order_relaxed);
1314         block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1315         block->privatizePublicFreeList();
1316         if (block->empty()) {
1317             processEmptyBlock(block, /*poolTheBlock=*/false);
1318             released = true;
1319         } else
1320             block->adjustPositionInBin(this);
1321         block = tmp;
1322     }
1323     return released;
1324 }
1325 
1326 bool Block::adjustFullness()
1327 {
1328     if (bumpPtr) {
1329         /* If we are still using a bump ptr for this block it is empty enough to use. */
1330         STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1331         isFull = false;
1332     } else {
1333         const float threshold = (slabSize - sizeof(Block)) * (1 - emptyEnoughRatio);
1334         /* allocatedCount shows how many objects in the block are in use; however it still counts
1335          * objects freed by other threads; so a prior call to privatizePublicFreeList() is recommended */
1336         isFull = (allocatedCount*objectSize > threshold) ? true : false;
1337 #if COLLECT_STATISTICS
1338         if (isFull)
1339             STAT_increment(getThreadId(), getIndex(objectSize), examineNotEmpty);
1340         else
1341             STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1342 #endif
1343     }
1344     return isFull;
1345 }
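// Illustrative arithmetic (not from the original sources): with emptyEnoughRatio = 1/4,
// a block is treated as full once more than 3/4 of its payload is in use. For example,
// for 64-byte objects in a 16KB slab with a 128-byte header, threshold = 16256 * 0.75
// = 12192, so the block is marked full once allocatedCount*64 > 12192, i.e. at 191 of
// its 254 objects.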
1346 
1347 // This method resides in class Block, and not in class Bin, in order to avoid
1348 // calling getAllocationBin on a reasonably hot path in Block::freeOwnObject
1349 void Block::adjustPositionInBin(Bin* bin/*=nullptr*/)
1350 {
1351     // If the block was full but became empty enough to use,
1352     // move it to the front of the list
1353     if (isFull && !adjustFullness()) {
1354         if (!bin)
1355             bin = tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize);
1356         bin->moveBlockToFront(this);
1357     }
1358 }
1359 
1360 /* Restore the bump pointer for an empty block that is planned to be used */
1361 void Block::restoreBumpPtr()
1362 {
1363     MALLOC_ASSERT( allocatedCount == 0, ASSERT_TEXT );
1364     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1365     STAT_increment(getThreadId(), getIndex(objectSize), freeRestoreBumpPtr);
1366     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1367     freeList = nullptr;
1368     isFull = false;
1369 }
1370 
1371 void Block::freeOwnObject(void *object)
1372 {
1373     tlsPtr.load(std::memory_order_relaxed)->markUsed();
1374     allocatedCount--;
1375     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1376 #if COLLECT_STATISTICS
1377     // Note that getAllocationBin is not called on the hottest path with statistics off.
1378     if (tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->getActiveBlock() != this)
1379         STAT_increment(getThreadId(), getIndex(objectSize), freeToInactiveBlock);
1380     else
1381         STAT_increment(getThreadId(), getIndex(objectSize), freeToActiveBlock);
1382 #endif
1383     if (empty()) {
1384         // If the last object of a slab is freed, the slab cannot be marked full
1385         MALLOC_ASSERT(!isFull, ASSERT_TEXT);
1386         tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->processEmptyBlock(this, /*poolTheBlock=*/true);
1387     } else { // hot path
1388         FreeObject *objectToFree = findObjectToFree(object);
1389         objectToFree->next = freeList;
1390         freeList = objectToFree;
1391         adjustPositionInBin();
1392     }
1393 }
1394 
1395 void Block::freePublicObject (FreeObject *objectToFree)
1396 {
1397     FreeObject* localPublicFreeList{};
1398 
1399     MALLOC_ITT_SYNC_RELEASING(&publicFreeList);
1400 #if FREELIST_NONBLOCKING
1401     // TBB_REVAMP_TODO: make it non atomic in non-blocking scenario
1402     localPublicFreeList = publicFreeList.load(std::memory_order_relaxed);
1403     do {
1404         objectToFree->next = localPublicFreeList;
1405         // no backoff necessary because we are trying to make a change, not waiting for one
1406     } while( !publicFreeList.compare_exchange_strong(localPublicFreeList, objectToFree) );
1407 #else
1408     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1409     {
1410         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1411         localPublicFreeList = objectToFree->next = publicFreeList;
1412         publicFreeList = objectToFree;
1413     }
1414 #endif
1415 
1416     if( localPublicFreeList==nullptr ) {
1417         // if the block is abandoned, its nextPrivatizable pointer should be UNUSABLE;
1418         // otherwise, it should point to the bin the block belongs to.
1419         // reading nextPrivatizable is thread-safe below, because:
1420         // 1) the executing thread atomically got publicFreeList==nullptr and changed it to non-nullptr;
1421         // 2) only the owning thread can change it back to nullptr,
1422         // 3) but that cannot be done until the block is put into the mailbox.
1423         // So the executing thread is now the only one that can change nextPrivatizable
1424         Block* next = nextPrivatizable.load(std::memory_order_acquire);
1425         if( !isNotForUse(next) ) {
1426             MALLOC_ASSERT( next!=nullptr, ASSERT_TEXT );
1427             Bin* theBin = (Bin*) next;
1428 #if MALLOC_DEBUG && TBB_REVAMP_TODO
1429             // FIXME: The thread that returns the block is not the block's owner.
1430             // The below assertion compares 'theBin' against the caller's local bin, thus, it always fails.
1431             // Need to find a way to get the correct remote bin for comparison.
1432             { // check that nextPrivatizable points to the bin the block belongs to
1433                 uint32_t index = getIndex( objectSize );
1434                 TLSData* tls = getThreadMallocTLS();
1435                 MALLOC_ASSERT( theBin==tls->bin+index, ASSERT_TEXT );
1436             }
1437 #endif // MALLOC_DEBUG
1438             theBin->addPublicFreeListBlock(this);
1439         }
1440     }
1441     STAT_increment(getThreadId(), ThreadCommonCounters, freeToOtherThread);
1442     STAT_increment(ownerTid.load(std::memory_order_relaxed), getIndex(objectSize), freeByOtherThread);
1443 }
1444 
1445 // Make objects freed by other threads available for use again
1446 void Block::privatizePublicFreeList( bool reset )
1447 {
1448     FreeObject *localPublicFreeList;
1449     // If reset is false, publicFreeList should not be zeroed but set to UNUSABLE
1450     // to properly synchronize with other threads freeing objects to this slab.
1451     const intptr_t endMarker = reset ? 0 : UNUSABLE;
1452 
1453     // Only the owner thread may reset the pointer to nullptr
1454     MALLOC_ASSERT( isOwnedByCurrentThread() || !reset, ASSERT_TEXT );
1455 #if FREELIST_NONBLOCKING
1456     localPublicFreeList = publicFreeList.exchange((FreeObject*)endMarker);
1457 #else
1458     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1459     {
1460         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1461         localPublicFreeList = publicFreeList;
1462         publicFreeList = endMarker;
1463     }
1464 #endif
1465     MALLOC_ITT_SYNC_ACQUIRED(&publicFreeList);
1466     MALLOC_ASSERT( !(reset && isNotForUse(publicFreeList)), ASSERT_TEXT );
1467 
1468     // publicFreeList must have been UNUSABLE or valid, but not nullptr
1469     MALLOC_ASSERT( localPublicFreeList!=nullptr, ASSERT_TEXT );
1470     if( isSolidPtr(localPublicFreeList) ) {
1471         MALLOC_ASSERT( allocatedCount <= (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1472         /* other threads did not change the counter when freeing objects from our block */
1473         allocatedCount--;
1474         FreeObject *temp = localPublicFreeList;
1475         while( isSolidPtr(temp->next) ){ // the list will end with either nullptr or UNUSABLE
1476             temp = temp->next;
1477             allocatedCount--;
1478             MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1479         }
1480         /* merge with local freeList */
1481         temp->next = freeList;
1482         freeList = localPublicFreeList;
1483         STAT_increment(getThreadId(), getIndex(objectSize), allocPrivatized);
1484     }
1485 }
1486 
1487 void Block::privatizeOrphaned(TLSData *tls, unsigned index)
1488 {
1489     Bin* bin = tls->bin + index;
1490     STAT_increment(getThreadId(), index, allocBlockPublic);
1491     next = nullptr;
1492     previous = nullptr;
1493     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) != nullptr, ASSERT_TEXT );
1494     /* There is no race here since no other thread owns this block */
1495     markOwned(tls);
1496     // It is safe to change nextPrivatizable, as publicFreeList is not null
1497     MALLOC_ASSERT( isNotForUse(nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1498     nextPrivatizable.store((Block*)bin, std::memory_order_relaxed);
1499     // the next call is required to change publicFreeList to 0
1500     privatizePublicFreeList();
1501     if( empty() ) {
1502         restoreBumpPtr();
1503     } else {
1504         adjustFullness(); // check the block fullness and set isFull
1505     }
1506     MALLOC_ASSERT( !isNotForUse(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1507 }
1508 
1509 
1510 bool Block::readyToShare()
1511 {
1512     FreeObject* oldVal = nullptr;
1513 #if FREELIST_NONBLOCKING
1514     publicFreeList.compare_exchange_strong(oldVal, (FreeObject*)UNUSABLE);
1515 #else
1516     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1517     {
1518         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1519         if ( (oldVal=publicFreeList)==nullptr )
1520             (intptr_t&)(publicFreeList) = UNUSABLE;
1521     }
1522 #endif
1523     return oldVal==nullptr;
1524 }
1525 
1526 void Block::shareOrphaned(intptr_t binTag, unsigned index)
1527 {
1528     MALLOC_ASSERT( binTag, ASSERT_TEXT );
1529     // suppress the unreferenced formal parameter warning
1530     tbb::detail::suppress_unused_warning(index);
1531     STAT_increment(getThreadId(), index, freeBlockPublic);
1532     markOrphaned();
1533     if ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1534         // First check passed: the block is not in the mailbox yet.
1535         // Need to set publicFreeList to non-zero, so that other threads
1536         // will not change nextPrivatizable and we can safely update it.
1537         if ( !readyToShare() ) {
1538             // another thread freed an object; we need to wait until it finishes.
1539             // There is no need for exponential backoff, as the wait here is not for a lock;
1540             // but we need to yield, so the thread we wait for has a chance to run.
1541             // TODO: add a pause to also be friendly to hyperthreads
1542             int count = 256;
1543             while ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1544                 if (--count==0) {
1545                     do_yield();
1546                     count = 256;
1547                 }
1548             }
1549         }
1550     }
1551     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) !=nullptr, ASSERT_TEXT );
1552     // now it is safe to change our data
1553     previous = nullptr;
1554     // it is the caller's responsibility to ensure that the list of blocks
1555     // formed by nextPrivatizable pointers is kept consistent if required;
1556     // if this is only called from thread shutdown code, it does not matter.
1557     nextPrivatizable.store((Block*)UNUSABLE, std::memory_order_relaxed);
1558 }
1559 
1560 void Block::cleanBlockHeader()
1561 {
1562     next = nullptr;
1563     previous = nullptr;
1564     freeList = nullptr;
1565     allocatedCount = 0;
1566     isFull = false;
1567     tlsPtr.store(nullptr, std::memory_order_relaxed);
1568 
1569     publicFreeList.store(nullptr, std::memory_order_relaxed);
1570 }
1571 
1572 void Block::initEmptyBlock(TLSData *tls, size_t size)
1573 {
1574     // Having getIndex and getObjectSize called next to each other
1575     // allows better compiler optimization as they basically share the code.
1576     unsigned int index = getIndex(size);
1577     unsigned int objSz = getObjectSize(size);
1578 
1579     cleanBlockHeader();
1580     objectSize = objSz;
1581     markOwned(tls);
1582     // the bump pointer should be prepared for the first allocation - thus move it down by objectSize
1583     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1584 
1585     // each block should have the address where the head of the list of "privatizable" blocks is kept;
1586     // the only exception is the bootstrap block, which is initialized while TLS is still nullptr
1587     nextPrivatizable.store( tls? (Block*)(tls->bin + index) : nullptr, std::memory_order_relaxed);
1588     TRACEF(( "[ScalableMalloc trace] Empty block %p is initialized, owner is %ld, objectSize is %d, bumpPtr is %p\n",
1589              this, tlsPtr.load(std::memory_order_relaxed) ? getThreadId() : -1, objectSize, bumpPtr ));
1590 }
1591 
1592 Block *OrphanedBlocks::get(TLSData *tls, unsigned int size)
1593 {
1594     // TODO: try to use index from getAllocationBin
1595     unsigned int index = getIndex(size);
1596     Block *block = bins[index].pop();
1597     if (block) {
1598         MALLOC_ITT_SYNC_ACQUIRED(bins+index);
1599         block->privatizeOrphaned(tls, index);
1600     }
1601     return block;
1602 }
1603 
1604 void OrphanedBlocks::put(intptr_t binTag, Block *block)
1605 {
1606     unsigned int index = getIndex(block->getSize());
1607     block->shareOrphaned(binTag, index);
1608     MALLOC_ITT_SYNC_RELEASING(bins+index);
1609     bins[index].push(block);
1610 }
1611 
1612 void OrphanedBlocks::reset()
1613 {
1614     for (uint32_t i=0; i<numBlockBinLimit; i++)
1615         new (bins+i) LifoList();
1616 }
1617 
1618 bool OrphanedBlocks::cleanup(Backend* backend)
1619 {
1620     bool released = false;
1621     for (uint32_t i=0; i<numBlockBinLimit; i++) {
1622         Block* block = bins[i].grab();
1623         MALLOC_ITT_SYNC_ACQUIRED(bins+i);
1624         while (block) {
1625             Block* next = block->next;
1626             block->privatizePublicFreeList( /*reset=*/false ); // do not set publicFreeList to nullptr
1627             if (block->empty()) {
1628                 block->reset();
1629                 // slab blocks in user's pools do not have valid backRefIdx
1630                 if (!backend->inUserPool())
1631                     removeBackRef(*(block->getBackRefIdx()));
1632                 backend->putSlabBlock(block);
1633                 released = true;
1634             } else {
1635                 MALLOC_ITT_SYNC_RELEASING(bins+i);
1636                 bins[i].push(block);
1637             }
1638             block = next;
1639         }
1640     }
1641     return released;
1642 }
1643 
1644 FreeBlockPool::ResOfGet FreeBlockPool::getBlock()
1645 {
1646     Block *b = head.exchange(nullptr);
1647 
1648     if (b) {
1649         size--;
1650         Block *newHead = b->next;
1651         lastAccessMiss = false;
1652         head.store(newHead, std::memory_order_release);
1653     } else {
1654         lastAccessMiss = true;
1655     }
1656     return ResOfGet(b, lastAccessMiss);
1657 }
1658 
1659 void FreeBlockPool::returnBlock(Block *block)
1660 {
1661     MALLOC_ASSERT( size <= POOL_HIGH_MARK, ASSERT_TEXT );
1662     Block *localHead = head.exchange(nullptr);
1663 
1664     if (!localHead) {
1665         size = 0; // head was stolen by externalClean, correct size accordingly
1666     } else if (size == POOL_HIGH_MARK) {
1667         // release cold blocks and add hot one,
1668         // so keep POOL_LOW_MARK-1 blocks and add new block to head
1669         Block *headToFree = localHead, *helper;
1670         for (int i=0; i<POOL_LOW_MARK-2; i++)
1671             headToFree = headToFree->next;
1672         Block *last = headToFree;
1673         headToFree = headToFree->next;
1674         last->next = nullptr;
1675         size = POOL_LOW_MARK-1;
1676         for (Block *currBl = headToFree; currBl; currBl = helper) {
1677             helper = currBl->next;
1678             // slab blocks in user's pools do not have valid backRefIdx
1679             if (!backend->inUserPool())
1680                 removeBackRef(currBl->backRefIdx);
1681             backend->putSlabBlock(currBl);
1682         }
1683     }
1684     size++;
1685     block->next = localHead;
1686     head.store(block, std::memory_order_release);
1687 }
1688 
1689 bool FreeBlockPool::externalCleanup()
1690 {
1691     Block *helper;
1692     bool released = false;
1693 
1694     for (Block *currBl=head.exchange(nullptr); currBl; currBl=helper) {
1695         helper = currBl->next;
1696         // slab blocks in user's pools do not have valid backRefIdx
1697         if (!backend->inUserPool())
1698             removeBackRef(currBl->backRefIdx);
1699         backend->putSlabBlock(currBl);
1700         released = true;
1701     }
1702     return released;
1703 }
1704 
1705 /* Prepare the block for returning to FreeBlockPool */
1706 void Block::reset()
1707 {
1708     // it is the caller's responsibility to ensure no data is lost before calling this
1709     MALLOC_ASSERT( allocatedCount==0, ASSERT_TEXT );
1710     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1711     if (!isStartupAllocObject())
1712         STAT_increment(getThreadId(), getIndex(objectSize), freeBlockBack);
1713 
1714     cleanBlockHeader();
1715 
1716     nextPrivatizable.store(nullptr, std::memory_order_relaxed);
1717 
1718     objectSize = 0;
1719     // for an empty block, bump pointer should point right after the end of the block
1720     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize);
1721 }
1722 
1723 inline void Bin::setActiveBlock (Block *block)
1724 {
1725 //    MALLOC_ASSERT( bin, ASSERT_TEXT );
1726     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1727     // it is the caller's responsibility to keep the bin consistent (i.e., ensure this block is in the bin's list)
1728     activeBlk = block;
1729 }
1730 
1731 inline Block* Bin::setPreviousBlockActive()
1732 {
1733     MALLOC_ASSERT( activeBlk, ASSERT_TEXT );
1734     Block* temp = activeBlk->previous;
1735     if( temp ) {
1736         MALLOC_ASSERT( !(temp->isFull), ASSERT_TEXT );
1737         activeBlk = temp;
1738     }
1739     return temp;
1740 }
1741 
1742 inline bool Block::isOwnedByCurrentThread() const {
1743     return tlsPtr.load(std::memory_order_relaxed) && ownerTid.isCurrentThreadId();
1744 }
1745 
1746 FreeObject *Block::findObjectToFree(const void *object) const
1747 {
1748     FreeObject *objectToFree;
1749     // Due to aligned allocations, a pointer passed to scalable_free
1750     // might differ from the address of the internally allocated object.
1751     // Small objects, however, are always fine.
1752     if (objectSize <= maxSegregatedObjectSize)
1753         objectToFree = (FreeObject*)object;
1754     // "Fitting size" allocations are suspicious if aligned higher than naturally
1755     else {
1756         if ( ! isAligned(object,2*fittingAlignment) )
1757             // TODO: the above check is questionable - it gives false negatives in ~50% of cases,
1758             //       so it might even be slower on average than unconditionally using findAllocatedObject.
1759             // here it should be a "real" object
1760             objectToFree = (FreeObject*)object;
1761         else
1762             // here the object can be an aligned address, so apply additional checks
1763             objectToFree = findAllocatedObject(object);
1764         MALLOC_ASSERT( isAligned(objectToFree,fittingAlignment), ASSERT_TEXT );
1765     }
1766     MALLOC_ASSERT( isProperlyPlaced(objectToFree), ASSERT_TEXT );
1767 
1768     return objectToFree;
1769 }
1770 
1771 void TLSData::release()
1772 {
1773     memPool->extMemPool.allLocalCaches.unregisterThread(this);
1774     externalCleanup(/*cleanOnlyUnused=*/false, /*cleanBins=*/false);
1775 
1776     for (unsigned index = 0; index < numBlockBins; index++) {
1777         Block *activeBlk = bin[index].getActiveBlock();
1778         if (!activeBlk)
1779             continue;
1780         Block *threadlessBlock = activeBlk->previous;
1781         bool syncOnMailbox = false;
1782         while (threadlessBlock) {
1783             Block *threadBlock = threadlessBlock->previous;
1784             if (threadlessBlock->empty()) {
1785                 /* the thread is being destroyed, so do not use its block pool */
1786                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1787             } else {
1788                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1789                 syncOnMailbox = true;
1790             }
1791             threadlessBlock = threadBlock;
1792         }
1793         threadlessBlock = activeBlk;
1794         while (threadlessBlock) {
1795             Block *threadBlock = threadlessBlock->next;
1796             if (threadlessBlock->empty()) {
1797                 /* the thread is being destroyed, so do not use its block pool */
1798                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1799             } else {
1800                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1801                 syncOnMailbox = true;
1802             }
1803             threadlessBlock = threadBlock;
1804         }
1805         bin[index].resetActiveBlock();
1806 
1807         if (syncOnMailbox) {
1808             // Although we synchronized on nextPrivatizable inside a block, we still need to
1809             // synchronize on the bin lifetime because the thread releasing an object into the public
1810             // free list is touching the bin (mailbox and mailLock)
1811             MallocMutex::scoped_lock scoped_cs(bin[index].mailLock);
1812         }
1813     }
1814 }
1815 
1816 
1817 #if MALLOC_CHECK_RECURSION
1818 // TODO: Use dedicated heap for this
1819 
1820 /*
1821  * It's a special kind of allocation that can be used when malloc is
1822  * not available (either during startup or when malloc was already called and
1823  * we are, say, inside pthread_setspecific's call).
1824  * A block can contain objects of different sizes;
1825  * allocations are performed by moving the bump pointer and incrementing the object counter;
1826  * releasing is done via the counter of objects allocated in the block,
1827  * or by moving the bump pointer if the released object is at the boundary.
1828  * TODO: make the bump pointer grow backward, in the same direction as all the others.
1829  */
1830 
1831 class StartupBlock : public Block {
1832     size_t availableSize() const {
1833         return slabSize - ((uintptr_t)bumpPtr - (uintptr_t)this);
1834     }
1835     static StartupBlock *getBlock();
1836 public:
1837     static FreeObject *allocate(size_t size);
1838     static size_t msize(void *ptr) { return *((size_t*)ptr - 1); }
1839     void free(void *ptr);
1840 };
1841 
1842 static MallocMutex startupMallocLock;
1843 static StartupBlock *firstStartupBlock;
1844 
1845 StartupBlock *StartupBlock::getBlock()
1846 {
1847     BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/false);
1848     if (backRefIdx.isInvalid()) return nullptr;
1849 
1850     StartupBlock *block = static_cast<StartupBlock*>(
1851         defaultMemPool->extMemPool.backend.getSlabBlock(1));
1852     if (!block) return nullptr;
1853 
1854     block->cleanBlockHeader();
1855     setBackRef(backRefIdx, block);
1856     block->backRefIdx = backRefIdx;
1857     // use startupAllocObjSizeMark to mark objects allocated from startup blocks
1858     block->objectSize = startupAllocObjSizeMark;
1859     block->bumpPtr = (FreeObject *)((uintptr_t)block + sizeof(StartupBlock));
1860     return block;
1861 }
1862 
1863 FreeObject *StartupBlock::allocate(size_t size)
1864 {
1865     FreeObject *result;
1866     StartupBlock *newBlock = nullptr;
1867 
1868     /* Objects must be aligned on their natural bounds,
1869        and objects bigger than a word on the word boundary. */
1870     size = alignUp(size, sizeof(size_t));
1871     // We need size of an object to implement msize.
1872     size_t reqSize = size + sizeof(size_t);
1873     {
1874         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1875         // Re-check whether we need a new block (conditions might have changed)
1876         if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) {
1877             if (!newBlock) {
1878                 newBlock = StartupBlock::getBlock();
1879                 if (!newBlock) return nullptr;
1880             }
1881             newBlock->next = (Block*)firstStartupBlock;
1882             if (firstStartupBlock)
1883                 firstStartupBlock->previous = (Block*)newBlock;
1884             firstStartupBlock = newBlock;
1885         }
1886         result = firstStartupBlock->bumpPtr;
1887         firstStartupBlock->allocatedCount++;
1888         firstStartupBlock->bumpPtr =
1889             (FreeObject *)((uintptr_t)firstStartupBlock->bumpPtr + reqSize);
1890     }
1891 
1892     // keep object size at the negative offset
1893     *((size_t*)result) = size;
1894     return (FreeObject*)((size_t*)result+1);
1895 }
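
// Illustrative layout note (added; not in the original source): StartupBlock::allocate()
// stores the requested size in the word immediately preceding the pointer it returns,
// which is exactly what StartupBlock::msize() reads back:
//
//     result -> | size_t size | object bytes ... |
//                             ^-- pointer returned to the caller
//
// hence msize(ptr) == *((size_t*)ptr - 1), matching the code above.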
1896 
1897 void StartupBlock::free(void *ptr)
1898 {
1899     Block* blockToRelease = nullptr;
1900     {
1901         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1902 
1903         MALLOC_ASSERT(firstStartupBlock, ASSERT_TEXT);
1904         MALLOC_ASSERT(startupAllocObjSizeMark==objectSize
1905                       && allocatedCount>0, ASSERT_TEXT);
1906         MALLOC_ASSERT((uintptr_t)ptr>=(uintptr_t)this+sizeof(StartupBlock)
1907                       && (uintptr_t)ptr+StartupBlock::msize(ptr)<=(uintptr_t)this+slabSize,
1908                       ASSERT_TEXT);
1909         if (0 == --allocatedCount) {
1910             if (this == firstStartupBlock)
1911                 firstStartupBlock = (StartupBlock*)firstStartupBlock->next;
1912             if (previous)
1913                 previous->next = next;
1914             if (next)
1915                 next->previous = previous;
1916             blockToRelease = this;
1917         } else if ((uintptr_t)ptr + StartupBlock::msize(ptr) == (uintptr_t)bumpPtr) {
1918             // last object in the block released
1919             FreeObject *newBump = (FreeObject*)((size_t*)ptr - 1);
1920             MALLOC_ASSERT((uintptr_t)newBump>(uintptr_t)this+sizeof(StartupBlock),
1921                           ASSERT_TEXT);
1922             bumpPtr = newBump;
1923         }
1924     }
1925     if (blockToRelease) {
1926         blockToRelease->previous = blockToRelease->next = nullptr;
1927         defaultMemPool->returnEmptyBlock(blockToRelease, /*poolTheBlock=*/false);
1928     }
1929 }
1930 
1931 #endif /* MALLOC_CHECK_RECURSION */
1932 
1933 /********* End thread related code  *************/
1934 
1935 /********* Library initialization *************/
1936 
1937 //! Value indicating the state of initialization.
1938 /* 0 = initialization not started.
1939  * 1 = initialization started but not finished.
1940  * 2 = initialization finished.
1941  * In theory, we only need values 0 and 2. But value 1 is nonetheless
1942  * useful for detecting errors in the double-check pattern.
1943  */
1944 static std::atomic<intptr_t> mallocInitialized{0};   // implicitly initialized to 0
1945 static MallocMutex initMutex;
1946 
1947 /** The leading "\0" is here so that applying "strings" to the binary
1948     delivers a clean result. */
1949 static char VersionString[] = "\0" TBBMALLOC_VERSION_STRINGS;
1950 
1951 #if USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED
1952 
1953 /* Decrease race interval between dynamic library unloading and pthread key
1954    destructor. Only pthreads with supported unloading are protected. */
1955 class ShutdownSync {
1956 /* flag is the number of threads in pthread key dtor body
1957    (i.e., between threadDtorStart() and threadDtorDone())
1958    or the signal to skip dtor, if flag < 0 */
1959     std::atomic<intptr_t> flag;
1960     static const intptr_t skipDtor = INTPTR_MIN/2;
1961 public:
1962     void init() { flag.store(0, std::memory_order_release); }
1963 /* We assume that fewer than 2*abs(skipDtor) threads ever call threadDtorStart()
1964    simultaneously, so flag never becomes negative because of that. */
1965     bool threadDtorStart() {
1966         if (flag.load(std::memory_order_acquire) < 0)
1967             return false;
1968         if (++flag <= 0) { // note that ++ returns the new value
1969             flag.fetch_sub(1); // flag is spoiled by us, restore it
1970             return false;
1971         }
1972         return true;
1973     }
1974     void threadDtorDone() {
1975         flag.fetch_sub(1);
1976     }
1977     void processExit() {
1978         if (flag.fetch_add(skipDtor) != 0) {
1979             SpinWaitUntilEq(flag, skipDtor);
1980         }
1981     }
1982 };
1983 
1984 #else
1985 
1986 class ShutdownSync {
1987 public:
1988     void init() { }
1989     bool threadDtorStart() { return true; }
1990     void threadDtorDone() { }
1991     void processExit() { }
1992 };
1993 
1994 #endif // USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED
1995 
1996 static ShutdownSync shutdownSync;
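
// Illustrative usage sketch (added; not part of the original source): the expected call
// pattern around a pthread TLS destructor body, mirroring what doThreadShutdownNotification()
// does further below.
//
//     if (shutdownSync.threadDtorStart()) {      // false => shutdown already started, skip work
//         tls->getMemPool()->onThreadShutdown(tls);
//         shutdownSync.threadDtorDone();
//     }
//     // and once, during process shutdown:
//     shutdownSync.processExit();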
1997 
1998 inline bool isMallocInitialized() {
1999     // The load must have an acquire fence; otherwise a thread taking the "initialized" path
2000     // might perform textually later loads *before* mallocInitialized becomes 2.
2001     return 2 == mallocInitialized.load(std::memory_order_acquire);
2002 }
2003 
2004 /* Caller is responsible for ensuring this routine is called exactly once. */
2005 extern "C" void MallocInitializeITT() {
2006 #if __TBB_USE_ITT_NOTIFY
2007     if (!usedBySrcIncluded)
2008         tbb::detail::r1::__TBB_load_ittnotify();
2009 #endif
2010 }
2011 
2012 void MemoryPool::initDefaultPool() {
2013     hugePages.init();
2014 }
2015 
2016 /*
2017  * Allocator initialization routine;
2018  * it is called lazily on the very first scalable_malloc call.
2019  */
2020 static bool initMemoryManager()
2021 {
2022     TRACEF(( "[ScalableMalloc trace] sizeof(Block) is %d (expected 128); sizeof(uintptr_t) is %d\n",
2023              sizeof(Block), sizeof(uintptr_t) ));
2024     MALLOC_ASSERT( 2*blockHeaderAlignment == sizeof(Block), ASSERT_TEXT );
2025     MALLOC_ASSERT( sizeof(FreeObject) == sizeof(void*), ASSERT_TEXT );
2026     MALLOC_ASSERT( isAligned(defaultMemPool, sizeof(intptr_t)),
2027                    "Memory pool must be void*-aligned for atomic to work over aligned arguments.");
2028 
2029 #if USE_WINTHREAD
2030     const size_t granularity = 64*1024; // granularity of VirtualAlloc
2031 #else
2032     // POSIX.1-2001-compliant way to get page size
2033     const size_t granularity = sysconf(_SC_PAGESIZE);
2034 #endif
2035     if (!defaultMemPool) {
2036         // Do not rely on static constructors; do the assignment in case
2037         // the library's static section has not been initialized at this call yet.
2038         defaultMemPool = (MemoryPool*)defaultMemPool_space;
2039     }
2040     bool initOk = defaultMemPool->
2041         extMemPool.init(0, nullptr, nullptr, granularity,
2042                         /*keepAllMemory=*/false, /*fixedPool=*/false);
2043 // TODO: make extMemPool.init() not allocate memory
2044     if (!initOk || !initBackRefMain(&defaultMemPool->extMemPool.backend) || !ThreadId::init())
2045         return false;
2046     MemoryPool::initDefaultPool();
2047     // init() is required iff initMemoryManager() is called
2048     // after mallocProcessShutdownNotification()
2049     shutdownSync.init();
2050 #if COLLECT_STATISTICS
2051     initStatisticsCollection();
2052 #endif
2053     return true;
2054 }
2055 
2056 static bool GetBoolEnvironmentVariable(const char* name) {
2057     return tbb::detail::r1::GetBoolEnvironmentVariable(name);
2058 }
2059 
2060 //! Ensures that initMemoryManager() is called once and only once.
2061 /** Does not return until initMemoryManager() has been completed by a thread.
2062     There is no need to call this routine if mallocInitialized==2 . */
2063 static bool doInitialization()
2064 {
2065     MallocMutex::scoped_lock lock( initMutex );
2066     if (mallocInitialized.load(std::memory_order_relaxed)!=2) {
2067         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==0, ASSERT_TEXT );
2068         mallocInitialized.store(1, std::memory_order_relaxed);
2069         RecursiveMallocCallProtector scoped;
2070         if (!initMemoryManager()) {
2071             mallocInitialized.store(0, std::memory_order_relaxed); // restore and out
2072             return false;
2073         }
2074 #ifdef  MALLOC_EXTRA_INITIALIZATION
2075         MALLOC_EXTRA_INITIALIZATION;
2076 #endif
2077 #if MALLOC_CHECK_RECURSION
2078         RecursiveMallocCallProtector::detectNaiveOverload();
2079 #endif
2080         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==1, ASSERT_TEXT );
2081         // Store must have release fence, otherwise mallocInitialized==2
2082         // might become remotely visible before side effects of
2083         // initMemoryManager() become remotely visible.
2084         mallocInitialized.store(2, std::memory_order_release);
2085         if( GetBoolEnvironmentVariable("TBB_VERSION") ) {
2086             fputs(VersionString+1,stderr);
2087             hugePages.printStatus();
2088         }
2089     }
2090     /* It can't be 0, or we would have initialized it */
2091     MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==2, ASSERT_TEXT );
2092     return true;
2093 }
2094 
2095 /********* End library initialization *************/
2096 
2097 /********* The malloc show begins     *************/
2098 
2099 
2100 FreeObject *Block::allocateFromFreeList()
2101 {
2102     FreeObject *result;
2103 
2104     if (!freeList) return nullptr;
2105 
2106     result = freeList;
2107     MALLOC_ASSERT( result, ASSERT_TEXT );
2108 
2109     freeList = result->next;
2110     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2111     allocatedCount++;
2112     STAT_increment(getThreadId(), getIndex(objectSize), allocFreeListUsed);
2113 
2114     return result;
2115 }
2116 
2117 FreeObject *Block::allocateFromBumpPtr()
2118 {
2119     FreeObject *result = bumpPtr;
2120     if (result) {
2121         bumpPtr = (FreeObject *) ((uintptr_t) bumpPtr - objectSize);
2122         if ( (uintptr_t)bumpPtr < (uintptr_t)this+sizeof(Block) ) {
2123             bumpPtr = nullptr;
2124         }
2125         MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2126         allocatedCount++;
2127         STAT_increment(getThreadId(), getIndex(objectSize), allocBumpPtrUsed);
2128     }
2129     return result;
2130 }
2131 
2132 inline FreeObject* Block::allocate()
2133 {
2134     MALLOC_ASSERT( isOwnedByCurrentThread(), ASSERT_TEXT );
2135 
2136     /* for better cache locality, first look in the free list. */
2137     if ( FreeObject *result = allocateFromFreeList() ) {
2138         return result;
2139     }
2140     MALLOC_ASSERT( !freeList, ASSERT_TEXT );
2141 
2142     /* if the free list is empty, try thread-local bump-pointer allocation. */
2143     if ( FreeObject *result = allocateFromBumpPtr() ) {
2144         return result;
2145     }
2146     MALLOC_ASSERT( !bumpPtr, ASSERT_TEXT );
2147 
2148     /* the block is considered full. */
2149     isFull = true;
2150     return nullptr;
2151 }
2152 
2153 size_t Block::findObjectSize(void *object) const
2154 {
2155     size_t blSize = getSize();
2156 #if MALLOC_CHECK_RECURSION
2157     // Currently, there are no aligned allocations from startup blocks,
2158     // so we can return just StartupBlock::msize().
2159     // TODO: This must be extended if we add aligned allocation from startup blocks.
2160     if (!blSize)
2161         return StartupBlock::msize(object);
2162 #endif
2163     // the object can be aligned, so its real size can be less than the block's object size
2164     size_t size =
2165         blSize - ((uintptr_t)object - (uintptr_t)findObjectToFree(object));
2166     MALLOC_ASSERT(size>0 && size<minLargeObjectSize, ASSERT_TEXT);
2167     return size;
2168 }
2169 
2170 void Bin::moveBlockToFront(Block *block)
2171 {
2172     /* move the block to the front of the bin */
2173     if (block == activeBlk) return;
2174     outofTLSBin(block);
2175     pushTLSBin(block);
2176 }
2177 
2178 void Bin::processEmptyBlock(Block *block, bool poolTheBlock)
2179 {
2180     if (block != activeBlk) {
2181         /* We are not using this block; return it to the pool */
2182         outofTLSBin(block);
2183         block->getMemPool()->returnEmptyBlock(block, poolTheBlock);
2184     } else {
2185         /* all objects are free - let's restore the bump pointer */
2186         block->restoreBumpPtr();
2187     }
2188 }
2189 
2190 template<int LOW_MARK, int HIGH_MARK>
2191 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool)
2192 {
2193     const size_t size = object->unalignedSize;
2194     // do not spoil the cache with a too-large object, which could cause a total cache cleanup
2195     if (size > MAX_TOTAL_SIZE)
2196         return false;
2197     LargeMemoryBlock *localHead = head.exchange(nullptr);
2198 
2199     object->prev = nullptr;
2200     object->next = localHead;
2201     if (localHead)
2202         localHead->prev = object;
2203     else {
2204         // these might not have been cleaned when the local cache was stolen; correct them
2205         totalSize = 0;
2206         numOfBlocks = 0;
2207         tail = object;
2208     }
2209     localHead = object;
2210     totalSize += size;
2211     numOfBlocks++;
2212     // must meet constraints on both total size and number of cached objects
2213     if (totalSize > MAX_TOTAL_SIZE || numOfBlocks >= HIGH_MARK) {
2214         // scan from the tail until the constraints are met
2215         while (totalSize > MAX_TOTAL_SIZE || numOfBlocks > LOW_MARK) {
2216             totalSize -= tail->unalignedSize;
2217             numOfBlocks--;
2218             tail = tail->prev;
2219         }
2220         LargeMemoryBlock *headToRelease = tail->next;
2221         tail->next = nullptr;
2222 
2223         extMemPool->freeLargeObjectList(headToRelease);
2224     }
2225 
2226     head.store(localHead, std::memory_order_release);
2227     return true;
2228 }
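
// Added note (not in the original source): put() keeps the per-thread large-object cache
// bounded - once HIGH_MARK blocks or more than MAX_TOTAL_SIZE bytes accumulate, the list is
// trimmed from the tail (the coldest entries) until it is back to at most LOW_MARK blocks
// and MAX_TOTAL_SIZE bytes, and the trimmed tail is returned via freeLargeObjectList().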
2229 
2230 template<int LOW_MARK, int HIGH_MARK>
2231 LargeMemoryBlock *LocalLOCImpl<LOW_MARK, HIGH_MARK>::get(size_t size)
2232 {
2233     LargeMemoryBlock *localHead, *res = nullptr;
2234 
2235     if (size > MAX_TOTAL_SIZE)
2236         return nullptr;
2237 
2238     // TBB_REVAMP_TODO: review this line
2239     if (!head.load(std::memory_order_acquire) || (localHead = head.exchange(nullptr)) == nullptr) {
2240         // do not restore totalSize, numOfBlocks and tail at this point,
2241         // as they are used only in put(), where they must be restored
2242         return nullptr;
2243     }
2244 
2245     for (LargeMemoryBlock *curr = localHead; curr; curr=curr->next) {
2246         if (curr->unalignedSize == size) {
2247             res = curr;
2248             if (curr->next)
2249                 curr->next->prev = curr->prev;
2250             else
2251                 tail = curr->prev;
2252             if (curr != localHead)
2253                 curr->prev->next = curr->next;
2254             else
2255                 localHead = curr->next;
2256             totalSize -= size;
2257             numOfBlocks--;
2258             break;
2259         }
2260     }
2261 
2262     head.store(localHead, std::memory_order_release);
2263     return res;
2264 }
2265 
2266 template<int LOW_MARK, int HIGH_MARK>
2267 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::externalCleanup(ExtMemoryPool *extMemPool)
2268 {
2269     if (LargeMemoryBlock *localHead = head.exchange(nullptr)) {
2270         extMemPool->freeLargeObjectList(localHead);
2271         return true;
2272     }
2273     return false;
2274 }
2275 
2276 void *MemoryPool::getFromLLOCache(TLSData* tls, size_t size, size_t alignment)
2277 {
2278     LargeMemoryBlock *lmb = nullptr;
2279 
2280     size_t headersSize = sizeof(LargeMemoryBlock)+sizeof(LargeObjectHdr);
2281     size_t allocationSize = LargeObjectCache::alignToBin(size+headersSize+alignment);
2282     if (allocationSize < size) // allocationSize wrapped around after alignToBin
2283         return nullptr;
2284     MALLOC_ASSERT(allocationSize >= alignment, "Overflow must be checked before.");
2285 
2286     if (tls) {
2287         tls->markUsed();
2288         lmb = tls->lloc.get(allocationSize);
2289     }
2290     if (!lmb)
2291         lmb = extMemPool.mallocLargeObject(this, allocationSize);
2292 
2293     if (lmb) {
2294         // when doing the shuffle, we assume that the alignment offset guarantees
2295         // that different cache lines are in use
2296         MALLOC_ASSERT(alignment >= estimatedCacheLineSize, ASSERT_TEXT);
2297 
2298         void *alignedArea = (void*)alignUp((uintptr_t)lmb+headersSize, alignment);
2299         uintptr_t alignedRight =
2300             alignDown((uintptr_t)lmb+lmb->unalignedSize - size, alignment);
2301         // Is there some room to shuffle the object between cache lines?
2302         // Note that alignedRight and alignedArea are aligned at alignment.
2303         unsigned ptrDelta = alignedRight - (uintptr_t)alignedArea;
2304         if (ptrDelta && tls) { // !tls is cold path
2305             // for the hot path of alignment==estimatedCacheLineSize,
2306             // allow compilers to use shift for division
2307             // (since estimatedCacheLineSize is a power-of-2 constant)
2308             unsigned numOfPossibleOffsets = alignment == estimatedCacheLineSize?
2309                   ptrDelta / estimatedCacheLineSize :
2310                   ptrDelta / alignment;
2311             unsigned myCacheIdx = ++tls->currCacheIdx;
2312             unsigned offset = myCacheIdx % numOfPossibleOffsets;
2313 
2314             // Move object to a cache line with an offset that is different from
2315             // previous allocation. This supposedly allows us to use cache
2316             // associativity more efficiently.
2317             alignedArea = (void*)((uintptr_t)alignedArea + offset*alignment);
2318         }
2319         MALLOC_ASSERT((uintptr_t)lmb+lmb->unalignedSize >=
2320                       (uintptr_t)alignedArea+size, "Object doesn't fit the block.");
2321         LargeObjectHdr *header = (LargeObjectHdr*)alignedArea-1;
2322         header->memoryBlock = lmb;
2323         header->backRefIdx = lmb->backRefIdx;
2324         setBackRef(header->backRefIdx, header);
2325 
2326         lmb->objectSize = size;
2327 
2328         MALLOC_ASSERT( isLargeObject<unknownMem>(alignedArea), ASSERT_TEXT );
2329         MALLOC_ASSERT( isAligned(alignedArea, alignment), ASSERT_TEXT );
2330 
2331         return alignedArea;
2332     }
2333     return nullptr;
2334 }
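
// Added note on the shuffle above (not in the original source): if the distance between the
// lowest and the highest suitably aligned placements spans, say, four alignment units, then
// numOfPossibleOffsets == 4 and successive large allocations from the same thread rotate the
// object start across those four offsets (tls->currCacheIdx % numOfPossibleOffsets), which is
// intended to spread objects across cache associativity sets.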
2335 
2336 void MemoryPool::putToLLOCache(TLSData *tls, void *object)
2337 {
2338     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2339     // overwrite backRefIdx to simplify double free detection
2340     header->backRefIdx = BackRefIdx();
2341 
2342     if (tls) {
2343         tls->markUsed();
2344         if (tls->lloc.put(header->memoryBlock, &extMemPool))
2345             return;
2346     }
2347     extMemPool.freeLargeObject(header->memoryBlock);
2348 }
2349 
2350 /*
2351  * All aligned allocations fall into one of the following categories:
2352  *  1. if both the requested size and the alignment are <= maxSegregatedObjectSize,
2353  *       we just align the size up and request that amount, because for every size
2354  *       aligned to some power of 2, the allocated object is at least that aligned.
2355  *  2. for size<minLargeObjectSize, check whether the already guaranteed fittingAlignment is enough.
2356  *  3. if size+alignment<minLargeObjectSize, we take an object of fittingSizeN and align
2357  *       its address up; given such a pointer, scalable_free can find the real object.
2358  *       Wrapping of size+alignment is impossible because the maximal allowed
2359  *       alignment plus minLargeObjectSize cannot lead to wrapping.
2360  *  4. otherwise, an aligned large object is allocated.
2361  */
2362 static void *allocateAligned(MemoryPool *memPool, size_t size, size_t alignment)
2363 {
2364     MALLOC_ASSERT( isPowerOfTwo(alignment), ASSERT_TEXT );
2365 
2366     if (!isMallocInitialized())
2367         if (!doInitialization())
2368             return nullptr;
2369 
2370     void *result;
2371     if (size<=maxSegregatedObjectSize && alignment<=maxSegregatedObjectSize)
2372         result = internalPoolMalloc(memPool, alignUp(size? size: sizeof(size_t), alignment));
2373     else if (size<minLargeObjectSize) {
2374         if (alignment<=fittingAlignment)
2375             result = internalPoolMalloc(memPool, size);
2376         else if (size+alignment < minLargeObjectSize) {
2377             void *unaligned = internalPoolMalloc(memPool, size+alignment);
2378             if (!unaligned) return nullptr;
2379             result = alignUp(unaligned, alignment);
2380         } else
2381             goto LargeObjAlloc;
2382     } else {
2383     LargeObjAlloc:
2384         TLSData *tls = memPool->getTLS(/*create=*/true);
2385         // take into account only alignments that are higher than the natural one
2386         result =
2387             memPool->getFromLLOCache(tls, size, largeObjectAlignment>alignment?
2388                                                largeObjectAlignment: alignment);
2389     }
2390 
2391     MALLOC_ASSERT( isAligned(result, alignment), ASSERT_TEXT );
2392     return result;
2393 }
2394 
2395 static void *reallocAligned(MemoryPool *memPool, void *ptr,
2396                             size_t newSize, size_t alignment = 0)
2397 {
2398     void *result;
2399     size_t copySize;
2400 
2401     if (isLargeObject<ourMem>(ptr)) {
2402         LargeMemoryBlock* lmb = ((LargeObjectHdr *)ptr - 1)->memoryBlock;
2403         copySize = lmb->unalignedSize-((uintptr_t)ptr-(uintptr_t)lmb);
2404 
2405         // Apply different strategies if size decreases
2406         if (newSize <= copySize && (0 == alignment || isAligned(ptr, alignment))) {
2407 
2408             // For huge objects (that do not fit in backend cache), keep the same space unless
2409             // the new size is at most half of the old one
2410             bool isMemoryBlockHuge = copySize > memPool->extMemPool.backend.getMaxBinnedSize();
2411             size_t threshold = isMemoryBlockHuge ? copySize / 2 : 0;
2412             if (newSize > threshold) {
2413                 lmb->objectSize = newSize;
2414                 return ptr;
2415             }
2416             // TODO: For large objects suitable for the backend cache,
2417             // split out the excessive part and put it to the backend.
2418         }
2419         // Reallocate for real
2420         copySize = lmb->objectSize;
2421 #if BACKEND_HAS_MREMAP
2422         if (void *r = memPool->extMemPool.remap(ptr, copySize, newSize,
2423                           alignment < largeObjectAlignment ? largeObjectAlignment : alignment))
2424             return r;
2425 #endif
2426         result = alignment ? allocateAligned(memPool, newSize, alignment) :
2427             internalPoolMalloc(memPool, newSize);
2428 
2429     } else {
2430         Block* block = (Block *)alignDown(ptr, slabSize);
2431         copySize = block->findObjectSize(ptr);
2432 
2433         // TODO: Move object to another bin if size decreases and the current bin is "empty enough".
2434         // Currently, if the size decreases, the old pointer is returned
2435         if (newSize <= copySize && (0==alignment || isAligned(ptr, alignment))) {
2436             return ptr;
2437         } else {
2438             result = alignment ? allocateAligned(memPool, newSize, alignment) :
2439                 internalPoolMalloc(memPool, newSize);
2440         }
2441     }
2442     if (result) {
2443         memcpy(result, ptr, copySize < newSize ? copySize : newSize);
2444         internalPoolFree(memPool, ptr, 0);
2445     }
2446     return result;
2447 }
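
// Illustrative example of the shrink policy above (added; not in the original source):
// for a huge object whose usable space (copySize) is 100 MB - well above the backend's
// maximal binned size - shrinking to 60 MB keeps the same memory block (60 MB exceeds the
// copySize/2 == 50 MB threshold, so only objectSize is updated), whereas shrinking to
// 40 MB falls below the threshold and triggers a real reallocation followed by a copy.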
2448 
2449 #if MALLOC_DEBUG
2450 /* A predicate checks if an object is properly placed inside its block */
2451 inline bool Block::isProperlyPlaced(const void *object) const
2452 {
2453     return 0 == ((uintptr_t)this + slabSize - (uintptr_t)object) % objectSize;
2454 }
2455 #endif
2456 
2457 /* Finds the real object inside the block */
2458 FreeObject *Block::findAllocatedObject(const void *address) const
2459 {
2460     // calculate offset from the end of the block space
2461     uint16_t offset = (uintptr_t)this + slabSize - (uintptr_t)address;
2462     MALLOC_ASSERT( offset<=slabSize-sizeof(Block), ASSERT_TEXT );
2463     // find offset difference from a multiple of allocation size
2464     offset %= objectSize;
2465     // and move the address down to where the real object starts.
2466     return (FreeObject*)((uintptr_t)address - (offset? objectSize-offset: 0));
2467 }
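
// Worked example for the arithmetic above (added; not in the original source): objects are
// laid out back to back from the end of the slab, so valid object starts are at offsets that
// are multiples of objectSize from (this + slabSize). With objectSize == 64, an aligned
// pointer located 100 bytes before the slab end gives offset == 100, offset % 64 == 36,
// so the real object starts 64 - 36 == 28 bytes below the passed address
// (i.e., 128 bytes before the slab end).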
2468 
2469 /*
2470  * A bad dereference caused by a foreign pointer is possible only here, not earlier in the call chain.
2471  * A separate function isolates the SEH code, as it has a bad influence on compiler optimization.
2472  */
2473 static inline BackRefIdx safer_dereference (const BackRefIdx *ptr)
2474 {
2475     BackRefIdx id;
2476 #if _MSC_VER
2477     __try {
2478 #endif
2479         id = dereference(ptr);
2480 #if _MSC_VER
2481     } __except( GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION?
2482                 EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH ) {
2483         id = BackRefIdx();
2484     }
2485 #endif
2486     return id;
2487 }
2488 
2489 template<MemoryOrigin memOrigin>
2490 bool isLargeObject(void *object)
2491 {
2492     if (!isAligned(object, largeObjectAlignment))
2493         return false;
2494     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2495     BackRefIdx idx = (memOrigin == unknownMem) ?
2496         safer_dereference(&header->backRefIdx) : dereference(&header->backRefIdx);
2497 
2498     return idx.isLargeObject()
2499         // in a valid LargeObjectHdr, memoryBlock is not nullptr
2500         && header->memoryBlock
2501         // in a valid LargeObjectHdr, memoryBlock points somewhere before the header
2502         // TODO: more strict check
2503         && (uintptr_t)header->memoryBlock < (uintptr_t)header
2504         && getBackRef(idx) == header;
2505 }
2506 
2507 static inline bool isSmallObject (void *ptr)
2508 {
2509     Block* expectedBlock = (Block*)alignDown(ptr, slabSize);
2510     const BackRefIdx* idx = expectedBlock->getBackRefIdx();
2511 
2512     bool isSmall = expectedBlock == getBackRef(safer_dereference(idx));
2513     if (isSmall)
2514         expectedBlock->checkFreePrecond(ptr);
2515     return isSmall;
2516 }
2517 
2518 /**** Check if an object was allocated by scalable_malloc ****/
2519 static inline bool isRecognized (void* ptr)
2520 {
2521     return defaultMemPool->extMemPool.backend.ptrCanBeValid(ptr) &&
2522         (isLargeObject<unknownMem>(ptr) || isSmallObject(ptr));
2523 }
2524 
2525 static inline void freeSmallObject(void *object)
2526 {
2527     /* mask low bits to get the block */
2528     Block *block = (Block *)alignDown(object, slabSize);
2529     block->checkFreePrecond(object);
2530 
2531 #if MALLOC_CHECK_RECURSION
2532     if (block->isStartupAllocObject()) {
2533         ((StartupBlock *)block)->free(object);
2534         return;
2535     }
2536 #endif
2537     if (block->isOwnedByCurrentThread()) {
2538         block->freeOwnObject(object);
2539     } else { /* Slower path: add to the shared list; allocatedCount is updated by the owner thread in malloc. */
2540         FreeObject *objectToFree = block->findObjectToFree(object);
2541         block->freePublicObject(objectToFree);
2542     }
2543 }
2544 
2545 static void *internalPoolMalloc(MemoryPool* memPool, size_t size)
2546 {
2547     Bin* bin;
2548     Block * mallocBlock;
2549 
2550     if (!memPool) return nullptr;
2551 
2552     if (!size) size = sizeof(size_t);
2553 
2554     TLSData *tls = memPool->getTLS(/*create=*/true);
2555 
2556     /* Allocate a large object */
2557     if (size >= minLargeObjectSize)
2558         return memPool->getFromLLOCache(tls, size, largeObjectAlignment);
2559 
2560     if (!tls) return nullptr;
2561 
2562     tls->markUsed();
2563     /*
2564      * Get the element of the thread-local array corresponding to the given size;
2565      * it keeps a pointer to the active block for allocations of this size
2566      */
2567     bin = tls->getAllocationBin(size);
2568     if ( !bin ) return nullptr;
2569 
2570     /* Get a block to try to allocate in. */
2571     for( mallocBlock = bin->getActiveBlock(); mallocBlock;
2572          mallocBlock = bin->setPreviousBlockActive() ) // the previous block should be empty enough
2573     {
2574         if( FreeObject *result = mallocBlock->allocate() )
2575             return result;
2576     }
2577 
2578     /*
2579      * else privatize publicly freed objects in some block and allocate from it
2580      */
2581     mallocBlock = bin->getPrivatizedFreeListBlock();
2582     if (mallocBlock) {
2583         MALLOC_ASSERT( mallocBlock->freeListNonNull(), ASSERT_TEXT );
2584         if ( FreeObject *result = mallocBlock->allocateFromFreeList() )
2585             return result;
2586         /* Else something strange happened, need to retry from the beginning; */
2587         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in public free list; reentering.\n" ));
2588         return internalPoolMalloc(memPool, size);
2589     }
2590 
2591     /*
2592      * no suitable own blocks, try to get a partial block that some other thread has discarded.
2593      */
2594     mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2595     while (mallocBlock) {
2596         bin->pushTLSBin(mallocBlock);
2597         bin->setActiveBlock(mallocBlock); // TODO: move under the below condition?
2598         if( FreeObject *result = mallocBlock->allocate() )
2599             return result;
2600         mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2601     }
2602 
2603     /*
2604      * else try to get a new empty block
2605      */
2606     mallocBlock = memPool->getEmptyBlock(size);
2607     if (mallocBlock) {
2608         bin->pushTLSBin(mallocBlock);
2609         bin->setActiveBlock(mallocBlock);
2610         if( FreeObject *result = mallocBlock->allocate() )
2611             return result;
2612         /* Else something strange happened, need to retry from the beginning; */
2613         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in empty block; reentering.\n" ));
2614         return internalPoolMalloc(memPool, size);
2615     }
2616     /*
2617      * else nothing works so return nullptr
2618      */
2619     TRACEF(( "[ScalableMalloc trace] No memory found, returning nullptr.\n" ));
2620     return nullptr;
2621 }
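
// Summary of the small-object fallback chain above (added note, not in the original source):
// the active block of the TLS bin -> a block whose public free list can be privatized ->
// an orphaned block abandoned by another thread -> a fresh empty block from the backend ->
// nullptr when all of these fail.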
2622 
2623 // When size==0 (i.e. unknown), detect here whether the object is large.
2624 // When the size is known and < minLargeObjectSize, we still need to check
2625 // if the actual object is large, because large objects might be used
2626 // for aligned small allocations.
2627 static bool internalPoolFree(MemoryPool *memPool, void *object, size_t size)
2628 {
2629     if (!memPool || !object) return false;
2630 
2631     // The library is initialized on the first allocation call, so a release while
2632     // not initialized means a foreign object is being released.
2633     MALLOC_ASSERT(isMallocInitialized(), ASSERT_TEXT);
2634     MALLOC_ASSERT(memPool->extMemPool.userPool() || isRecognized(object),
2635                   "Invalid pointer during object releasing is detected.");
2636 
2637     if (size >= minLargeObjectSize || isLargeObject<ourMem>(object))
2638         memPool->putToLLOCache(memPool->getTLS(/*create=*/false), object);
2639     else
2640         freeSmallObject(object);
2641     return true;
2642 }
2643 
2644 static void *internalMalloc(size_t size)
2645 {
2646     if (!size) size = sizeof(size_t);
2647 
2648 #if MALLOC_CHECK_RECURSION
2649     if (RecursiveMallocCallProtector::sameThreadActive())
2650         return size<minLargeObjectSize? StartupBlock::allocate(size) :
2651             // nested allocation, so skip tls
2652             (FreeObject*)defaultMemPool->getFromLLOCache(nullptr, size, slabSize);
2653 #endif
2654 
2655     if (!isMallocInitialized())
2656         if (!doInitialization())
2657             return nullptr;
2658     return internalPoolMalloc(defaultMemPool, size);
2659 }
2660 
2661 static void internalFree(void *object)
2662 {
2663     internalPoolFree(defaultMemPool, object, 0);
2664 }
2665 
2666 static size_t internalMsize(void* ptr)
2667 {
2668     MALLOC_ASSERT(ptr, "Invalid pointer passed to internalMsize");
2669     if (isLargeObject<ourMem>(ptr)) {
2670         // TODO: return the maximum memory size that can be written to this object
2671         LargeMemoryBlock* lmb = ((LargeObjectHdr*)ptr - 1)->memoryBlock;
2672         return lmb->objectSize;
2673     } else {
2674         Block *block = (Block*)alignDown(ptr, slabSize);
2675         return block->findObjectSize(ptr);
2676     }
2677 }
2678 
2679 } // namespace internal
2680 
2681 using namespace rml::internal;
2682 
2683 // legacy entry point saved for compatibility with binaries compiled
2684 // with pre-6003 versions of TBB
2685 TBBMALLOC_EXPORT rml::MemoryPool *pool_create(intptr_t pool_id, const MemPoolPolicy *policy)
2686 {
2687     rml::MemoryPool *pool;
2688     MemPoolPolicy pol(policy->pAlloc, policy->pFree, policy->granularity);
2689 
2690     pool_create_v1(pool_id, &pol, &pool);
2691     return pool;
2692 }
2693 
2694 rml::MemPoolError pool_create_v1(intptr_t pool_id, const MemPoolPolicy *policy,
2695                                  rml::MemoryPool **pool)
2696 {
2697     if ( !policy->pAlloc || policy->version<MemPoolPolicy::TBBMALLOC_POOL_VERSION
2698          // empty pFree allowed only for fixed pools
2699          || !(policy->fixedPool || policy->pFree)) {
2700         *pool = nullptr;
2701         return INVALID_POLICY;
2702     }
2703     if ( policy->version>MemPoolPolicy::TBBMALLOC_POOL_VERSION // future versions are not supported
2704          // new flags can be added in place of reserved, but default
2705          // behaviour must be supported by this version
2706          || policy->reserved ) {
2707         *pool = nullptr;
2708         return UNSUPPORTED_POLICY;
2709     }
2710     if (!isMallocInitialized())
2711         if (!doInitialization()) {
2712             *pool = nullptr;
2713             return NO_MEMORY;
2714         }
2715     rml::internal::MemoryPool *memPool =
2716         (rml::internal::MemoryPool*)internalMalloc((sizeof(rml::internal::MemoryPool)));
2717     if (!memPool) {
2718         *pool = nullptr;
2719         return NO_MEMORY;
2720     }
2721     memset(static_cast<void*>(memPool), 0, sizeof(rml::internal::MemoryPool));
2722     if (!memPool->init(pool_id, policy)) {
2723         internalFree(memPool);
2724         *pool = nullptr;
2725         return NO_MEMORY;
2726     }
2727 
2728     *pool = (rml::MemoryPool*)memPool;
2729     return POOL_OK;
2730 }
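
// Illustrative usage sketch of the pool API exported here (added; not part of the original
// source). The callback names and bodies are hypothetical; their signatures are assumed to
// follow rml::MemPoolPolicy's rawAllocType/rawFreeType declarations in scalable_allocator.h,
// so verify against that header before relying on this sketch.
//
//     static void *rawAlloc(intptr_t /*pool_id*/, size_t &bytes) {
//         return malloc(bytes);                   // hand raw memory to the pool
//     }
//     static int rawFree(intptr_t /*pool_id*/, void *raw, size_t /*bytes*/) {
//         free(raw); return 0;
//     }
//
//     rml::MemPoolPolicy policy(rawAlloc, rawFree, /*granularity=*/0);
//     rml::MemoryPool *pool = nullptr;
//     if (rml::pool_create_v1(/*pool_id=*/0, &policy, &pool) == rml::POOL_OK) {
//         void *p = rml::pool_malloc(pool, 1024);
//         rml::pool_free(pool, p);
//         rml::pool_destroy(pool);
//     }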
2731 
2732 bool pool_destroy(rml::MemoryPool* memPool)
2733 {
2734     if (!memPool) return false;
2735     bool ret = ((rml::internal::MemoryPool*)memPool)->destroy();
2736     internalFree(memPool);
2737 
2738     return ret;
2739 }
2740 
2741 bool pool_reset(rml::MemoryPool* memPool)
2742 {
2743     if (!memPool) return false;
2744 
2745     return ((rml::internal::MemoryPool*)memPool)->reset();
2746 }
2747 
2748 void *pool_malloc(rml::MemoryPool* mPool, size_t size)
2749 {
2750     return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2751 }
2752 
2753 void *pool_realloc(rml::MemoryPool* mPool, void *object, size_t size)
2754 {
2755     if (!object)
2756         return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2757     if (!size) {
2758         internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2759         return nullptr;
2760     }
2761     return reallocAligned((rml::internal::MemoryPool*)mPool, object, size, 0);
2762 }
2763 
2764 void *pool_aligned_malloc(rml::MemoryPool* mPool, size_t size, size_t alignment)
2765 {
2766     if (!isPowerOfTwo(alignment) || 0==size)
2767         return nullptr;
2768 
2769     return allocateAligned((rml::internal::MemoryPool*)mPool, size, alignment);
2770 }
2771 
2772 void *pool_aligned_realloc(rml::MemoryPool* memPool, void *ptr, size_t size, size_t alignment)
2773 {
2774     if (!isPowerOfTwo(alignment))
2775         return nullptr;
2776     rml::internal::MemoryPool *mPool = (rml::internal::MemoryPool*)memPool;
2777     void *tmp;
2778 
2779     if (!ptr)
2780         tmp = allocateAligned(mPool, size, alignment);
2781     else if (!size) {
2782         internalPoolFree(mPool, ptr, 0);
2783         return nullptr;
2784     } else
2785         tmp = reallocAligned(mPool, ptr, size, alignment);
2786 
2787     return tmp;
2788 }
2789 
2790 bool pool_free(rml::MemoryPool *mPool, void *object)
2791 {
2792     return internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2793 }
2794 
2795 rml::MemoryPool *pool_identify(void *object)
2796 {
2797     rml::internal::MemoryPool *pool;
2798     if (isLargeObject<ourMem>(object)) {
2799         LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2800         pool = header->memoryBlock->pool;
2801     } else {
2802         Block *block = (Block*)alignDown(object, slabSize);
2803         pool = block->getMemPool();
2804     }
2805     // do not return defaultMemPool, as it can't be used in pool_free() etc
2806     __TBB_ASSERT_RELEASE(pool!=defaultMemPool,
2807         "rml::pool_identify() can't be used for scalable_malloc() etc results.");
2808     return (rml::MemoryPool*)pool;
2809 }
2810 
2811 size_t pool_msize(rml::MemoryPool *mPool, void* object)
2812 {
2813     if (object) {
2814         // No assert for object recognition, because objects allocated from a non-default
2815         // memory pool do not participate in range checking and do not have valid backreferences for
2816         // small objects. Instead, check that the object belongs to the specified memory pool.
2817         MALLOC_ASSERT_EX(mPool == pool_identify(object), "Object does not belong to the specified pool");
2818         return internalMsize(object);
2819     }
2820     errno = EINVAL;
2821     // Unlike _msize, return 0 in case of a parameter error.
2822     // Returning size_t(-1) would be more likely to cause trouble.
2823     return 0;
2824 }
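/*
 * Illustrative usage sketch for the pool_* API above (hedged: not compiled here;
 * `pool` is assumed to have been obtained from the pool creation routine earlier
 * in this file, and assert() merely stands in for whatever error handling the
 * caller prefers):
 *
 *     void *p = rml::pool_malloc(pool, 64);               // plain allocation
 *     p = rml::pool_realloc(pool, p, 128);                // may move the object
 *     assert(rml::pool_identify(p) == pool);              // map an object back to its pool
 *     assert(rml::pool_msize(pool, p) >= 128);            // usable size >= requested size
 *     rml::pool_free(pool, p);
 *
 *     void *a = rml::pool_aligned_malloc(pool, 256, 64);  // alignment must be a power of two
 *     a = rml::pool_aligned_realloc(pool, a, 512, 64);
 *     rml::pool_free(pool, a);
 *
 *     rml::pool_destroy(pool);                            // releases all memory owned by the pool
 */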
2825 
2826 } // namespace rml
2827 
2828 using namespace rml::internal;
2829 
2830 #if MALLOC_TRACE
2831 static unsigned int threadGoingDownCount = 0;
2832 #endif
2833 
2834 /*
2835  * When a thread is shutting down, this routine should be called to remove all the thread ids
2836  * from the malloc blocks and replace them with a nullptr thread id.
2837  *
2838  * For pthreads, the function is set as the destructor callback in pthread_key_create for the TLS bin.
2839  * It will be automatically called at thread exit with the key value as the argument,
2840  * unless that value is nullptr.
2841  * For Windows, it is called from DllMain( DLL_THREAD_DETACH ).
2842  *
2843  * However, neither of the above is called for the main process thread, so the routine
2844  * also needs to be called during process shutdown.
2845  *
2846 */
2847 // TODO: Consider making this function part of class MemoryPool.
2848 void doThreadShutdownNotification(TLSData* tls, bool main_thread)
2849 {
2850     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return start %d\n",
2851              getThreadId(),  threadGoingDownCount++ ));
2852 
2853 #if USE_PTHREAD
2854     if (tls) {
2855         if (!shutdownSync.threadDtorStart()) return;
2856         tls->getMemPool()->onThreadShutdown(tls);
2857         shutdownSync.threadDtorDone();
2858     } else
2859 #endif
2860     {
2861         suppress_unused_warning(tls); // not used on Windows
2862         // The default pool is safe to use at this point:
2863         //   on Linux, only the main thread can go here before destroying defaultMemPool;
2864         //   on Windows, shutdown is synchronized via loader lock and isMallocInitialized().
2865         // See also __TBB_mallocProcessShutdownNotification()
2866         defaultMemPool->onThreadShutdown(defaultMemPool->getTLS(/*create=*/false));
2867         // Take lock to walk through other pools; but waiting might be dangerous at this point
2868         // (e.g. on Windows the main thread might deadlock)
2869         bool locked;
2870         MallocMutex::scoped_lock lock(MemoryPool::memPoolListLock, /*wait=*/!main_thread, &locked);
2871         if (locked) { // the list is safe to process
2872             for (MemoryPool *memPool = defaultMemPool->next; memPool; memPool = memPool->next)
2873                 memPool->onThreadShutdown(memPool->getTLS(/*create=*/false));
2874         }
2875     }
2876 
2877     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return end\n", getThreadId() ));
2878 }
2879 
2880 #if USE_PTHREAD
2881 void mallocThreadShutdownNotification(void* arg)
2882 {
2883     // The routine is called for each pool (as TLS dtor) on each thread, except for the main thread
2884     if (!isMallocInitialized()) return;
2885     doThreadShutdownNotification((TLSData*)arg, false);
2886 }
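/*
 * Illustrative sketch only: the TLS key whose destructor triggers the callback above
 * is created elsewhere in tbbmalloc; the names below are hypothetical.
 *
 *     pthread_key_t mallocTlsKey;
 *     pthread_key_create(&mallocTlsKey, mallocThreadShutdownNotification);
 *     // per thread: pthread_setspecific(mallocTlsKey, tlsData);
 *     // at thread exit, pthread invokes mallocThreadShutdownNotification(tlsData)
 *     // for every key whose stored value is non-null.
 */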
2887 #else
2888 extern "C" void __TBB_mallocThreadShutdownNotification()
2889 {
2890     // The routine is called once per thread on Windows
2891     if (!isMallocInitialized()) return;
2892     doThreadShutdownNotification(nullptr, false);
2893 }
2894 #endif
2895 
2896 extern "C" void __TBB_mallocProcessShutdownNotification(bool windows_process_dying)
2897 {
2898     if (!isMallocInitialized()) return;
2899 
2900     // Don't clean allocator internals if the entire process is exiting
2901     if (!windows_process_dying) {
2902         doThreadShutdownNotification(nullptr, /*main_thread=*/true);
2903     }
2904 #if  __TBB_MALLOC_LOCACHE_STAT
2905     printf("cache hit ratio %f, size hit %f\n",
2906            1.*cacheHits/mallocCalls, 1.*memHitKB/memAllocKB);
2907     defaultMemPool->extMemPool.loc.reportStat(stdout);
2908 #endif
2909 
2910     shutdownSync.processExit();
2911 #if __TBB_SOURCE_DIRECTLY_INCLUDED
2912 /* Pthread keys must be deleted as soon as possible so that the key dtor is not called
2913    on thread termination when the tbbmalloc code may already have been unloaded.
2914 */
2915     defaultMemPool->destroy();
2916     destroyBackRefMain(&defaultMemPool->extMemPool.backend);
2917     ThreadId::destroy();      // Delete key for thread id
2918     hugePages.reset();
2919     // new total malloc initialization is possible after this point
2920     mallocInitialized.store(0, std::memory_order_release);
2921 #endif // __TBB_SOURCE_DIRECTLY_INCLUDED
2922 
2923 #if COLLECT_STATISTICS
2924     unsigned nThreads = ThreadId::getMaxThreadId();
2925     for( int i=1; i<=nThreads && i<MAX_THREADS; ++i )
2926         STAT_print(i);
2927 #endif
2928     if (!usedBySrcIncluded) {
2929         MALLOC_ITT_FINI_ITTLIB();
2930         MALLOC_ITT_RELEASE_RESOURCES();
2931     }
2932 }
2933 
2934 extern "C" void * scalable_malloc(size_t size)
2935 {
2936     void *ptr = internalMalloc(size);
2937     if (!ptr) errno = ENOMEM;
2938     return ptr;
2939 }
2940 
2941 extern "C" void scalable_free(void *object)
2942 {
2943     internalFree(object);
2944 }
2945 
2946 #if MALLOC_ZONE_OVERLOAD_ENABLED
2947 extern "C" void __TBB_malloc_free_definite_size(void *object, size_t size)
2948 {
2949     internalPoolFree(defaultMemPool, object, size);
2950 }
2951 #endif
2952 
2953 /*
2954  * A variant that provides additional memory safety, by checking whether the given address
2955  * was obtained with this allocator, and if not redirecting to the provided alternative call.
2956  */
2957 extern "C" TBBMALLOC_EXPORT void __TBB_malloc_safer_free(void *object, void (*original_free)(void*))
2958 {
2959     if (!object)
2960         return;
2961 
2962     // tbbmalloc can allocate object only when tbbmalloc has been initialized
2963     if (mallocInitialized.load(std::memory_order_acquire) && defaultMemPool->extMemPool.backend.ptrCanBeValid(object)) {
2964         if (isLargeObject<unknownMem>(object)) {
2965             // must check for a large object first, because the small object check touches 4 pages
2966             // to the left, which can be inaccessible
2967             TLSData *tls = defaultMemPool->getTLS(/*create=*/false);
2968 
2969             defaultMemPool->putToLLOCache(tls, object);
2970             return;
2971         } else if (isSmallObject(object)) {
2972             freeSmallObject(object);
2973             return;
2974         }
2975     }
2976     if (original_free)
2977         original_free(object);
2978 }
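/*
 * Illustrative call site (hedged sketch; in practice this entry point is used by
 * malloc-replacement layers rather than by end-user code):
 *
 *     __TBB_malloc_safer_free(ptr, &free);  // frees via tbbmalloc if ptr was allocated here,
 *                                           // otherwise falls back to the supplied free()
 */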
2979 
2980 /********* End the free code        *************/
2981 
2982 /********* Code for scalable_realloc       ***********/
2983 
2984 /*
2985  * From K&R
2986  * "realloc changes the size of the object pointed to by p to size. The contents will
2987  * be unchanged up to the minimum of the old and the new sizes. If the new size is larger,
2988  * the new space is uninitialized. realloc returns a pointer to the new space, or
2989  * nullptr if the request cannot be satisfied, in which case *p is unchanged."
2990  *
2991  */
2992 extern "C" void* scalable_realloc(void* ptr, size_t size)
2993 {
2994     void *tmp;
2995 
2996     if (!ptr)
2997         tmp = internalMalloc(size);
2998     else if (!size) {
2999         internalFree(ptr);
3000         return nullptr;
3001     } else
3002         tmp = reallocAligned(defaultMemPool, ptr, size, 0);
3003 
3004     if (!tmp) errno = ENOMEM;
3005     return tmp;
3006 }
3007 
3008 /*
3009  * A variant that provides additional memory safety, by checking whether the given address
3010  * was obtained with this allocator, and if not redirecting to the provided alternative call.
3011  */
3012 extern "C" TBBMALLOC_EXPORT void* __TBB_malloc_safer_realloc(void* ptr, size_t sz, void* original_realloc)
3013 {
3014     void *tmp = nullptr; // initialized to silence warnings about potentially uninitialized use
3015 
3016     if (!ptr) {
3017         tmp = internalMalloc(sz);
3018     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3019         if (!sz) {
3020             internalFree(ptr);
3021             return nullptr;
3022         } else {
3023             tmp = reallocAligned(defaultMemPool, ptr, sz, 0);
3024         }
3025     }
3026 #if USE_WINTHREAD
3027     else if (original_realloc && sz) {
3028         orig_ptrs *original_ptrs = static_cast<orig_ptrs*>(original_realloc);
3029         if ( original_ptrs->msize ){
3030             size_t oldSize = original_ptrs->msize(ptr);
3031             tmp = internalMalloc(sz);
3032             if (tmp) {
3033                 memcpy(tmp, ptr, sz<oldSize? sz : oldSize);
3034                 if ( original_ptrs->free ){
3035                     original_ptrs->free( ptr );
3036                 }
3037             }
3038         } else
3039             tmp = nullptr;
3040     }
3041 #else
3042     else if (original_realloc) {
3043         typedef void* (*realloc_ptr_t)(void*,size_t);
3044         realloc_ptr_t original_realloc_ptr;
3045         (void *&)original_realloc_ptr = original_realloc;
3046         tmp = original_realloc_ptr(ptr,sz);
3047     }
3048 #endif
3049     else tmp = nullptr;
3050 
3051     if (!tmp) errno = ENOMEM;
3052     return tmp;
3053 }
3054 
3055 /********* End code for scalable_realloc   ***********/
3056 
3057 /********* Code for scalable_calloc   ***********/
3058 
3059 /*
3060  * From K&R
3061  * calloc returns a pointer to space for an array of nobj objects,
3062  * each of size size, or nullptr if the request cannot be satisfied.
3063  * The space is initialized to zero bytes.
3064  *
3065  */
3066 
3067 extern "C" void * scalable_calloc(size_t nobj, size_t size)
3068 {
3069     // it's square root of maximal size_t value
3070     const size_t mult_not_overflow = size_t(1) << (sizeof(size_t)*CHAR_BIT/2);
3071     const size_t arraySize = nobj * size;
3072 
3073     // check for overflow during multiplication:
3074     if (nobj>=mult_not_overflow || size>=mult_not_overflow) // 1) heuristic check
3075         if (nobj && arraySize / nobj != size) {             // 2) exact check
3076             errno = ENOMEM;
3077             return nullptr;
3078         }
3079     void* result = internalMalloc(arraySize);
3080     if (result)
3081         memset(result, 0, arraySize);
3082     else
3083         errno = ENOMEM;
3084     return result;
3085 }
3086 
3087 /********* End code for scalable_calloc   ***********/
3088 
3089 /********* Code for aligned allocation API **********/
3090 
3091 extern "C" int scalable_posix_memalign(void **memptr, size_t alignment, size_t size)
3092 {
3093     if ( !isPowerOfTwoAtLeast(alignment, sizeof(void*)) )
3094         return EINVAL;
3095     void *result = allocateAligned(defaultMemPool, size, alignment);
3096     if (!result)
3097         return ENOMEM;
3098     *memptr = result;
3099     return 0;
3100 }
3101 
3102 extern "C" void * scalable_aligned_malloc(size_t size, size_t alignment)
3103 {
3104     if (!isPowerOfTwo(alignment) || 0==size) {
3105         errno = EINVAL;
3106         return nullptr;
3107     }
3108     void *tmp = allocateAligned(defaultMemPool, size, alignment);
3109     if (!tmp) errno = ENOMEM;
3110     return tmp;
3111 }
3112 
3113 extern "C" void * scalable_aligned_realloc(void *ptr, size_t size, size_t alignment)
3114 {
3115     if (!isPowerOfTwo(alignment)) {
3116         errno = EINVAL;
3117         return nullptr;
3118     }
3119     void *tmp;
3120 
3121     if (!ptr)
3122         tmp = allocateAligned(defaultMemPool, size, alignment);
3123     else if (!size) {
3124         scalable_free(ptr);
3125         return nullptr;
3126     } else
3127         tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3128 
3129     if (!tmp) errno = ENOMEM;
3130     return tmp;
3131 }
3132 
3133 extern "C" TBBMALLOC_EXPORT void * __TBB_malloc_safer_aligned_realloc(void *ptr, size_t size, size_t alignment, void* orig_function)
3134 {
3135     /* corner cases are handled here, outside reallocAligned, to keep errno handling out of it */
3136     if (!isPowerOfTwo(alignment)) {
3137         errno = EINVAL;
3138         return nullptr;
3139     }
3140     void *tmp = nullptr;
3141 
3142     if (!ptr) {
3143         tmp = allocateAligned(defaultMemPool, size, alignment);
3144     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3145         if (!size) {
3146             internalFree(ptr);
3147             return nullptr;
3148         } else {
3149             tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3150         }
3151     }
3152 #if USE_WINTHREAD
3153     else {
3154         orig_aligned_ptrs *original_ptrs = static_cast<orig_aligned_ptrs*>(orig_function);
3155         if (size) {
3156             // Without aligned_msize we cannot determine the old size,
3157             // so just keep the old pointer unchanged.
3158             if ( original_ptrs->aligned_msize ){
3159                 // use minimal alignment and zero offset to get a plausibly correct oldSize
3160                 size_t oldSize = original_ptrs->aligned_msize(ptr, sizeof(void*), 0);
3161                 tmp = allocateAligned(defaultMemPool, size, alignment);
3162                 if (tmp) {
3163                     memcpy(tmp, ptr, size<oldSize? size : oldSize);
3164                     if ( original_ptrs->aligned_free ){
3165                         original_ptrs->aligned_free( ptr );
3166                     }
3167                 }
3168             }
3169         } else {
3170             if ( original_ptrs->aligned_free ){
3171                 original_ptrs->aligned_free( ptr );
3172             }
3173             return nullptr;
3174         }
3175     }
3176 #else
3177     // Since original_realloc cannot align the result, and there is no way to find
3178     // the size of the object being reallocated, we give up and return nullptr.
3179     suppress_unused_warning(orig_function);
3180 #endif
3181     if (!tmp) errno = ENOMEM;
3182     return tmp;
3183 }
3184 
3185 extern "C" void scalable_aligned_free(void *ptr)
3186 {
3187     internalFree(ptr);
3188 }
3189 
3190 /********* end code for aligned allocation API **********/
3191 
3192 /********* Code for scalable_msize       ***********/
3193 
3194 /*
3195  * Returns the size of a memory block allocated in the heap.
3196  */
3197 extern "C" size_t scalable_msize(void* ptr)
3198 {
3199     if (ptr) {
3200         MALLOC_ASSERT(isRecognized(ptr), "Invalid pointer in scalable_msize detected.");
3201         return internalMsize(ptr);
3202     }
3203     errno = EINVAL;
3204     // Unlike _msize, return 0 in case of a parameter error.
3205     // Returning size_t(-1) would be more likely to cause trouble.
3206     return 0;
3207 }
3208 
3209 /*
3210  * A variant that provides additional memory safety, by checking whether the given address
3211  * was obtained with this allocator, and if not redirecting to the provided alternative call.
3212  */
3213 extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_msize(void *object, size_t (*original_msize)(void*))
3214 {
3215     if (object) {
3216         // Check if the memory was allocated by scalable_malloc
3217         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3218             return internalMsize(object);
3219         else if (original_msize)
3220             return original_msize(object);
3221     }
3222     // object is nullptr or unknown, or foreign and no original_msize
3223 #if USE_WINTHREAD
3224     errno = EINVAL; // errno expected to be set only on this platform
3225 #endif
3226     return 0;
3227 }
3228 
3229 /*
3230  * The same as above but for _aligned_msize case
3231  */
3232 extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_aligned_msize(void *object, size_t alignment, size_t offset, size_t (*orig_aligned_msize)(void*,size_t,size_t))
3233 {
3234     if (object) {
3235         // Check if the memory was allocated by scalable_malloc
3236         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3237             return internalMsize(object);
3238         else if (orig_aligned_msize)
3239             return orig_aligned_msize(object,alignment,offset);
3240     }
3241     // object is nullptr or unknown
3242     errno = EINVAL;
3243     return 0;
3244 }
3245 
3246 /********* End code for scalable_msize   ***********/
3247 
3248 extern "C" int scalable_allocation_mode(int param, intptr_t value)
3249 {
3250     if (param == TBBMALLOC_SET_SOFT_HEAP_LIMIT) {
3251         defaultMemPool->extMemPool.backend.setRecommendedMaxSize((size_t)value);
3252         return TBBMALLOC_OK;
3253     } else if (param == USE_HUGE_PAGES) {
3254 #if __unix__
3255         switch (value) {
3256         case 0:
3257         case 1:
3258             hugePages.setMode(value);
3259             return TBBMALLOC_OK;
3260         default:
3261             return TBBMALLOC_INVALID_PARAM;
3262         }
3263 #else
3264         return TBBMALLOC_NO_EFFECT;
3265 #endif
3266 #if __TBB_SOURCE_DIRECTLY_INCLUDED
3267     } else if (param == TBBMALLOC_INTERNAL_SOURCE_INCLUDED) {
3268         switch (value) {
3269         case 0: // used by dynamic library
3270         case 1: // used by static library or directly included sources
3271             usedBySrcIncluded = value;
3272             return TBBMALLOC_OK;
3273         default:
3274             return TBBMALLOC_INVALID_PARAM;
3275         }
3276 #endif
3277     } else if (param == TBBMALLOC_SET_HUGE_SIZE_THRESHOLD) {
3278         defaultMemPool->extMemPool.loc.setHugeSizeThreshold((size_t)value);
3279         return TBBMALLOC_OK;
3280     }
3281     return TBBMALLOC_INVALID_PARAM;
3282 }
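/*
 * Illustrative usage of the mode parameters handled above (values are examples only):
 *
 *     scalable_allocation_mode(TBBMALLOC_SET_SOFT_HEAP_LIMIT, 512*1024*1024); // recommended cap for kept memory
 *     scalable_allocation_mode(USE_HUGE_PAGES, 1);  // TBBMALLOC_OK on Linux, TBBMALLOC_NO_EFFECT elsewhere
 */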
3283 
3284 extern "C" int scalable_allocation_command(int cmd, void *param)
3285 {
3286     if (param)
3287         return TBBMALLOC_INVALID_PARAM;
3288 
3289     bool released = false;
3290     switch(cmd) {
3291     case TBBMALLOC_CLEAN_THREAD_BUFFERS:
3292         if (TLSData *tls = defaultMemPool->getTLS(/*create=*/false))
3293             released = tls->externalCleanup(/*cleanOnlyUnused*/false, /*cleanBins=*/true);
3294         break;
3295     case TBBMALLOC_CLEAN_ALL_BUFFERS:
3296         released = defaultMemPool->extMemPool.hardCachesCleanup();
3297         break;
3298     default:
3299         return TBBMALLOC_INVALID_PARAM;
3300     }
3301     return released ? TBBMALLOC_OK : TBBMALLOC_NO_EFFECT;
3302 }
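/*
 * Illustrative usage (param must be nullptr, as checked above):
 *
 *     int res = scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, nullptr);
 *     // res == TBBMALLOC_OK if the cleanup released any memory, TBBMALLOC_NO_EFFECT otherwise
 */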
3303