xref: /oneTBB/src/tbbmalloc/frontend.cpp (revision b9a0ab45)
1 /*
2     Copyright (c) 2005-2023 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include "tbbmalloc_internal.h"
18 #include <errno.h>
19 #include <new>        /* for placement new */
20 #include <string.h>   /* for memset */
21 
22 #include "oneapi/tbb/version.h"
23 #include "../tbb/environment.h"
24 #include "../tbb/itt_notify.h" // for __TBB_load_ittnotify()
25 
26 #if USE_PTHREAD
27     #define TlsSetValue_func pthread_setspecific
28     #define TlsGetValue_func pthread_getspecific
29     #define GetMyTID() pthread_self()
30     #include <sched.h>
31     extern "C" { static void mallocThreadShutdownNotification(void*); }
32     #if __sun || __SUNPRO_CC
33     #define __asm__ asm
34     #endif
35     #include <unistd.h> // sysconf(_SC_PAGESIZE)
36 #elif USE_WINTHREAD
37     #define GetMyTID() GetCurrentThreadId()
38 #if __TBB_WIN8UI_SUPPORT
39     #include <thread>
40     #define TlsSetValue_func FlsSetValue
41     #define TlsGetValue_func FlsGetValue
42     #define TlsAlloc() FlsAlloc(nullptr)
43     #define TLS_ALLOC_FAILURE FLS_OUT_OF_INDEXES
44     #define TlsFree FlsFree
45 #else
46     #define TlsSetValue_func TlsSetValue
47     #define TlsGetValue_func TlsGetValue
48     #define TLS_ALLOC_FAILURE TLS_OUT_OF_INDEXES
49 #endif
50 #else
51     #error Must define USE_PTHREAD or USE_WINTHREAD
52 #endif
53 
54 #define FREELIST_NONBLOCKING 1
55 
56 namespace rml {
57 class MemoryPool;
58 namespace internal {
59 
60 class Block;
61 class MemoryPool;
62 
63 #if MALLOC_CHECK_RECURSION
64 
65 inline bool isMallocInitialized();
66 
67 #endif // MALLOC_CHECK_RECURSION
68 
69 /** Support for handling the special UNUSABLE pointer state **/
70 const intptr_t UNUSABLE = 0x1;
71 inline bool isSolidPtr( void* ptr ) {
72     return (UNUSABLE|(intptr_t)ptr)!=UNUSABLE;
73 }
74 inline bool isNotForUse( void* ptr ) {
75     return (intptr_t)ptr==UNUSABLE;
76 }
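/* A note on the encoding (derived from the code below): publicFreeList and nextPrivatizable
   can each be in one of three states: nullptr, the special UNUSABLE marker, or a "solid"
   (real) pointer. isSolidPtr() is true only for the last case, and isNotForUse() detects the
   UNUSABLE marker; real object pointers are aligned and therefore never equal 0x1. */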
77 
78 /*
79  * Block::objectSize value used to mark blocks allocated by startupAlloc
80  */
81 const uint16_t startupAllocObjSizeMark = ~(uint16_t)0;
82 
83 /*
84  * The following constant is used to define the size of struct Block, the block header.
85  * The intent is to have the size of a Block be a multiple of the cache line size; this allows us to
86  * get good alignment at the cost of some overhead equal to the amount of padding included in the Block.
87  */
88 const int blockHeaderAlignment = estimatedCacheLineSize;
89 
90 /********* The data structures and global objects        **************/
91 
92 /*
93  * The malloc routines themselves need to be able to occasionally malloc some space,
94  * in order to set up the data used by the thread-local structures. This
95  * routine performs that function.
96  */
97 class BootStrapBlocks {
98     MallocMutex bootStrapLock;
99     Block      *bootStrapBlock;
100     Block      *bootStrapBlockUsed;
101     FreeObject *bootStrapObjectList;
102 public:
103     void *allocate(MemoryPool *memPool, size_t size);
104     void free(void* ptr);
105     void reset();
106 };
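// Usage note: these bootstrap chunks serve only TLSData objects. allocate() asserts that
// size == sizeof(TLSData), hands out zero-filled chunks carved from a dedicated slab
// (see BootStrapBlocks::allocate below), and free() returns a chunk to a simple free list.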
107 
108 #if USE_INTERNAL_TID
109 class ThreadId {
110     static tls_key_t Tid_key;
111     static std::atomic<intptr_t> ThreadCount;
112 
113     unsigned int id;
114 
115     static unsigned int tlsNumber() {
116         unsigned int result = reinterpret_cast<intptr_t>(TlsGetValue_func(Tid_key));
117         if( !result ) {
118             RecursiveMallocCallProtector scoped;
119             // Thread-local value is zero -> first call from this thread,
120             // need to initialize with next ID value (IDs start from 1)
121             result = ++ThreadCount; // returned new value!
122             TlsSetValue_func( Tid_key, reinterpret_cast<void*>(result) );
123         }
124         return result;
125     }
126 public:
127     static bool init() {
128 #if USE_WINTHREAD
129         Tid_key = TlsAlloc();
130         if (Tid_key == TLS_ALLOC_FAILURE)
131             return false;
132 #else
133         int status = pthread_key_create( &Tid_key, nullptr );
134         if ( status ) {
135             fprintf (stderr, "The memory manager cannot create tls key during initialization\n");
136             return false;
137         }
138 #endif /* USE_WINTHREAD */
139         return true;
140     }
141 #if __TBB_SOURCE_DIRECTLY_INCLUDED
142     static void destroy() {
143         if( Tid_key ) {
144 #if USE_WINTHREAD
145             BOOL status = !(TlsFree( Tid_key ));  // fail is zero
146 #else
147             int status = pthread_key_delete( Tid_key );
148 #endif /* USE_WINTHREAD */
149             if ( status )
150                 fprintf (stderr, "The memory manager cannot delete tls key\n");
151             Tid_key = 0;
152         }
153     }
154 #endif
155 
156     ThreadId() : id(ThreadId::tlsNumber()) {}
157     bool isCurrentThreadId() const { return id == ThreadId::tlsNumber(); }
158 
159 #if COLLECT_STATISTICS || MALLOC_TRACE
160     friend unsigned int getThreadId() { return ThreadId::tlsNumber(); }
161 #endif
162 #if COLLECT_STATISTICS
163     static unsigned getMaxThreadId() { return ThreadCount.load(std::memory_order_relaxed); }
164 
165     friend int STAT_increment(ThreadId tid, int bin, int ctr);
166 #endif
167 };
168 
169 tls_key_t ThreadId::Tid_key;
170 std::atomic<intptr_t> ThreadId::ThreadCount;
171 
172 #if COLLECT_STATISTICS
173 int STAT_increment(ThreadId tid, int bin, int ctr)
174 {
175     return ::STAT_increment(tid.id, bin, ctr);
176 }
177 #endif
178 
179 #else // USE_INTERNAL_TID
180 
181 class ThreadId {
182 #if USE_PTHREAD
183     std::atomic<pthread_t> tid;
184 #else
185     std::atomic<DWORD>     tid;
186 #endif
187 public:
188     ThreadId() : tid(GetMyTID()) {}
189     ThreadId(ThreadId &other) {
190         tid.store(other.tid.load(std::memory_order_relaxed), std::memory_order_relaxed);
191     }
192     ~ThreadId() {}
193 
194 #if USE_PTHREAD
195     bool isCurrentThreadId() const { return pthread_equal(pthread_self(), tid.load(std::memory_order_relaxed)); }
196 #else
197     bool isCurrentThreadId() const { return GetCurrentThreadId() == tid.load(std::memory_order_relaxed); }
198 #endif
199     ThreadId& operator=(const ThreadId& other) {
200         tid.store(other.tid.load(std::memory_order_relaxed), std::memory_order_relaxed);
201         return *this;
202     }
203     static bool init() { return true; }
204 #if __TBB_SOURCE_DIRECTLY_INCLUDED
205     static void destroy() {}
206 #endif
207 };
208 
209 #endif // USE_INTERNAL_TID
210 
211 /*********** Code to provide thread ID and a thread-local void pointer **********/
212 
213 bool TLSKey::init()
214 {
215 #if USE_WINTHREAD
216     TLS_pointer_key = TlsAlloc();
217     if (TLS_pointer_key == TLS_ALLOC_FAILURE)
218         return false;
219 #else
220     int status = pthread_key_create( &TLS_pointer_key, mallocThreadShutdownNotification );
221     if ( status )
222         return false;
223 #endif /* USE_WINTHREAD */
224     return true;
225 }
226 
227 bool TLSKey::destroy()
228 {
229 #if USE_WINTHREAD
230     BOOL status1 = !(TlsFree(TLS_pointer_key)); // fail is zero
231 #else
232     int status1 = pthread_key_delete(TLS_pointer_key);
233 #endif /* USE_WINTHREAD */
234     MALLOC_ASSERT(!status1, "The memory manager cannot delete tls key.");
235     return status1==0;
236 }
237 
238 inline TLSData* TLSKey::getThreadMallocTLS() const
239 {
240     return (TLSData *)TlsGetValue_func( TLS_pointer_key );
241 }
242 
243 inline void TLSKey::setThreadMallocTLS( TLSData * newvalue ) {
244     RecursiveMallocCallProtector scoped;
245     TlsSetValue_func( TLS_pointer_key, newvalue );
246 }
247 
248 /* The 'next' field in the block header has to maintain some invariants:
249  *   it needs to be on a 16K boundary and the first field in the block.
250  *   Any value stored there needs to have the lower 14 bits set to 0
251  *   so that various asserts work. This means that if you want to smash this memory
252  *   for debugging purposes you will need to obey this invariant.
253  * The total size of the header needs to be a power of 2 to simplify
254  * the alignment requirements. For now it is a 128 byte structure.
255  * To avoid false sharing, the fields changed only locally are separated
256  * from the fields changed by foreign threads.
257  * Changing the size of the block header would require changing
258  * some bin allocation sizes, in particular the "fitting" sizes (see below).
259  */
260 class Bin;
261 class StartupBlock;
262 
263 class MemoryPool {
264     // if there is no explicit grainsize, expect to see malloc in the user's pAlloc
265     // and set a reasonably low granularity
266     static const size_t defaultGranularity = estimatedCacheLineSize;
267 
268     MemoryPool() = delete;                  // deny
269 public:
270     static MallocMutex  memPoolListLock;
271 
272     // list of all active pools is used to release
273     // all TLS data on thread termination or library unload
274     MemoryPool    *next,
275                   *prev;
276     ExtMemoryPool  extMemPool;
277     BootStrapBlocks bootStrapBlocks;
278 
279     static void initDefaultPool();
280 
281     bool init(intptr_t poolId, const MemPoolPolicy* memPoolPolicy);
282     bool reset();
283     bool destroy();
284     void onThreadShutdown(TLSData *tlsData);
285 
286     inline TLSData *getTLS(bool create);
287     void clearTLS() { extMemPool.tlsPointerKey.setThreadMallocTLS(nullptr); }
288 
289     Block *getEmptyBlock(size_t size);
290     void returnEmptyBlock(Block *block, bool poolTheBlock);
291 
292     // get/put large object to/from local large object cache
293     void *getFromLLOCache(TLSData *tls, size_t size, size_t alignment);
294     void putToLLOCache(TLSData *tls, void *object);
295 };
296 
297 static intptr_t defaultMemPool_space[sizeof(MemoryPool)/sizeof(intptr_t) +
298                                      (sizeof(MemoryPool)%sizeof(intptr_t)? 1 : 0)];
299 static MemoryPool *defaultMemPool = (MemoryPool*)defaultMemPool_space;
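// The array above reserves ceil(sizeof(MemoryPool)/sizeof(intptr_t)) elements of static,
// zero-initialized storage, so the default pool needs no dynamic allocation;
// defaultMemPool is simply that storage reinterpreted as a MemoryPool*.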
300 const size_t MemoryPool::defaultGranularity;
301 // zero-initialized
302 MallocMutex  MemoryPool::memPoolListLock;
303 // TODO: move huge page status to the default pool, because that's its state
304 HugePagesStatus hugePages;
305 static bool usedBySrcIncluded = false;
306 
307 // Padding helpers
308 template<size_t padd>
309 struct PaddingImpl {
310     size_t       __padding[padd];
311 };
312 
313 template<>
314 struct PaddingImpl<0> {};
315 
316 template<int N>
317 struct Padding : PaddingImpl<N/sizeof(size_t)> {};
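// A minimal illustration with hypothetical sizes (assuming blockHeaderAlignment == 64 and a
// 64-bit build): if sizeof(GlobalBlockFields) were 24 bytes, then
//     Padding<64 - 24> == PaddingImpl<40/sizeof(size_t)> == { size_t __padding[5]; }
// i.e. 40 bytes of padding, so the fields added by LocalBlockFields start on the next cache
// line; PaddingImpl<0> is an empty specialization, so a perfectly sized base adds nothing.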
318 
319 // Slab block is 16KB-aligned. To prevent false sharing, separate locally-accessed
320 // fields from fields commonly accessed by non-owner threads.
321 class GlobalBlockFields : public BlockI {
322 protected:
323     std::atomic<FreeObject*> publicFreeList;
324     std::atomic<Block*> nextPrivatizable;
325     MemoryPool  *poolPtr;
326 };
327 
328 class LocalBlockFields : public GlobalBlockFields, Padding<blockHeaderAlignment - sizeof(GlobalBlockFields)>  {
329 protected:
330     Block       *next;
331     Block       *previous;        /* Use a doubly-linked list to speed up removal */
332     FreeObject  *bumpPtr;         /* Bump pointer moves from the end to the beginning of a block */
333     FreeObject  *freeList;
334     /* Pointer to local data for the owner thread. Used for fast finding tls
335        when releasing object from a block that current thread owned.
336        nullptr for orphaned blocks. */
337     std::atomic<TLSData*> tlsPtr;
338     ThreadId     ownerTid;        /* the ID of the thread that owns or last owned the block */
339     BackRefIdx   backRefIdx;
340     uint16_t     allocatedCount;  /* Number of objects allocated (obviously by the owning thread) */
341     uint16_t     objectSize;
342     bool         isFull;
343 
344     friend class FreeBlockPool;
345     friend class StartupBlock;
346     friend class LifoList;
347     friend void *BootStrapBlocks::allocate(MemoryPool *, size_t);
348     friend bool OrphanedBlocks::cleanup(Backend*);
349     friend Block *MemoryPool::getEmptyBlock(size_t);
350 };
351 
352 // Use inheritance to guarantee that user data starts on the next cache line.
353 // Can't use a member for it, because when LocalBlockFields already ends on a cache line,
354 // we must have no additional memory consumption with any compiler.
355 class Block : public LocalBlockFields,
356               Padding<2*blockHeaderAlignment - sizeof(LocalBlockFields)> {
357 public:
358     bool empty() const {
359         if (allocatedCount > 0) return false;
360         MALLOC_ASSERT(!isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT);
361         return true;
362     }
363     inline FreeObject* allocate();
364     inline FreeObject *allocateFromFreeList();
365 
366     inline bool adjustFullness();
367     void adjustPositionInBin(Bin* bin = nullptr);
368 #if MALLOC_DEBUG
369     bool freeListNonNull() { return freeList; }
370 #endif
371     void freePublicObject(FreeObject *objectToFree);
372     inline void freeOwnObject(void *object);
373     void reset();
374     void privatizePublicFreeList( bool reset = true );
375     void restoreBumpPtr();
376     void privatizeOrphaned(TLSData *tls, unsigned index);
377     bool readyToShare();
378     void shareOrphaned(intptr_t binTag, unsigned index);
379     unsigned int getSize() const {
380         MALLOC_ASSERT(isStartupAllocObject() || objectSize<minLargeObjectSize,
381                       "Invalid object size");
382         return isStartupAllocObject()? 0 : objectSize;
383     }
384     const BackRefIdx *getBackRefIdx() const { return &backRefIdx; }
385     inline bool isOwnedByCurrentThread() const;
386     bool isStartupAllocObject() const { return objectSize == startupAllocObjSizeMark; }
387     inline FreeObject *findObjectToFree(const void *object) const;
388     void checkFreePrecond(const void *object) const {
389 #if MALLOC_DEBUG
390         const char *msg = "Possible double free or heap corruption.";
391         // small objects are always at least sizeof(size_t)-byte aligned;
392         // check this before any dereference, as for invalid objects
393         // the memory may be unreadable
394         MALLOC_ASSERT(isAligned(object, sizeof(size_t)), "Try to free invalid small object");
395 #if !__TBB_USE_THREAD_SANITIZER
396         // releasing to free slab
397         MALLOC_ASSERT(allocatedCount>0, msg);
398 #endif
399         // must not point to slab's header
400         MALLOC_ASSERT((uintptr_t)object - (uintptr_t)this >= sizeof(Block), msg);
401         if (startupAllocObjSizeMark == objectSize) // startup block
402             MALLOC_ASSERT(object<=bumpPtr, msg);
403         else {
404             // non-startup objects are 8-byte aligned
405             MALLOC_ASSERT(isAligned(object, 8), "Try to free invalid small object");
406             FreeObject *toFree = findObjectToFree(object);
407 #if !__TBB_USE_THREAD_SANITIZER
408             MALLOC_ASSERT(allocatedCount <= (slabSize-sizeof(Block))/objectSize
409                           && (!bumpPtr || object>bumpPtr), msg);
410             // check against head of freeList, as this is mostly
411             // expected after double free
412             MALLOC_ASSERT(toFree != freeList, msg);
413 #endif
414             // check against head of publicFreeList, to detect double free
415             // involving foreign thread
416             MALLOC_ASSERT(toFree != publicFreeList.load(std::memory_order_relaxed), msg);
417         }
418 #else
419         suppress_unused_warning(object);
420 #endif
421     }
422     void initEmptyBlock(TLSData *tls, size_t size);
423     size_t findObjectSize(void *object) const;
424     MemoryPool *getMemPool() const { return poolPtr; } // do not use on the hot path!
425 
426 protected:
427     void cleanBlockHeader();
428 
429 private:
430     static const float emptyEnoughRatio; /* Threshold on free space needed to "reactivate" a block */
431 
432     inline FreeObject *allocateFromBumpPtr();
433     inline FreeObject *findAllocatedObject(const void *address) const;
434 #if MALLOC_DEBUG
435     inline bool isProperlyPlaced(const void *object) const;
436 #endif
437     inline void markOwned(TLSData *tls) {
438         MALLOC_ASSERT(!tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
439         ownerTid = ThreadId(); /* save the ID of the current thread */
440         tlsPtr.store(tls, std::memory_order_relaxed);
441     }
442     inline void markOrphaned() {
443         MALLOC_ASSERT(tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
444         tlsPtr.store(nullptr, std::memory_order_relaxed);
445     }
446 
447     friend class Bin;
448     friend class TLSData;
449     friend bool MemoryPool::destroy();
450 };
451 
452 const float Block::emptyEnoughRatio = 1.0 / 4.0;
453 
454 static_assert(sizeof(Block) <= 2*estimatedCacheLineSize,
455     "The class Block does not fit into 2 cache lines on this platform. "
456     "Defining USE_INTERNAL_TID may help to fix it.");
457 
458 class Bin {
459 private:
460 public:
461     Block *activeBlk;
462     std::atomic<Block*> mailbox;
463     MallocMutex mailLock;
464 
465 public:
466     inline Block* getActiveBlock() const { return activeBlk; }
467     void resetActiveBlock() { activeBlk = nullptr; }
468     inline void setActiveBlock(Block *block);
469     inline Block* setPreviousBlockActive();
470     Block* getPrivatizedFreeListBlock();
471     void moveBlockToFront(Block *block);
472     bool cleanPublicFreeLists();
473     void processEmptyBlock(Block *block, bool poolTheBlock);
474     void addPublicFreeListBlock(Block* block);
475 
476     void outofTLSBin(Block* block);
477     void verifyTLSBin(size_t size) const;
478     void pushTLSBin(Block* block);
479 
480 #if MALLOC_DEBUG
481     void verifyInitState() const {
482         MALLOC_ASSERT( !activeBlk, ASSERT_TEXT );
483         MALLOC_ASSERT( !mailbox.load(std::memory_order_relaxed), ASSERT_TEXT );
484     }
485 #endif
486 
487     friend void Block::freePublicObject (FreeObject *objectToFree);
488 };
489 
490 /********* End of the data structures                    **************/
491 
492 /*
493  * There are bins for all 8-byte aligned objects not larger than this size; 8 bins in total
494  */
495 const uint32_t minSmallObjectIndex = 0;
496 const uint32_t numSmallObjectBins = 8;
497 const uint32_t maxSmallObjectSize = 64;
498 
499 /*
500  * There are 4 bins between each pair of powers of 2 [64-128-256-...]
501  * from maxSmallObjectSize up to this size; 16 bins in total
502  */
503 const uint32_t minSegregatedObjectIndex = minSmallObjectIndex+numSmallObjectBins;
504 const uint32_t numSegregatedObjectBins = 16;
505 const uint32_t maxSegregatedObjectSize = 1024;
506 
507 /*
508  * And there are 5 bins with allocation sizes that are multiples of estimatedCacheLineSize
509  * and selected to fit 9, 6, 4, 3, and 2 allocations in a block.
510  */
511 const uint32_t minFittingIndex = minSegregatedObjectIndex+numSegregatedObjectBins;
512 const uint32_t numFittingBins = 5;
513 
514 const uint32_t fittingAlignment = estimatedCacheLineSize;
515 
516 #define SET_FITTING_SIZE(N) ( (slabSize-sizeof(Block))/N ) & ~(fittingAlignment-1)
517 // For blockSize=16*1024, sizeof(Block)=2*estimatedCacheLineSize and fittingAlignment=estimatedCacheLineSize,
518 // the comments show the fitting sizes and the amounts left unused for estimatedCacheLineSize=64/128:
519 const uint32_t fittingSize1 = SET_FITTING_SIZE(9); // 1792/1792 128/000
520 const uint32_t fittingSize2 = SET_FITTING_SIZE(6); // 2688/2688 128/000
521 const uint32_t fittingSize3 = SET_FITTING_SIZE(4); // 4032/3968 128/256
522 const uint32_t fittingSize4 = SET_FITTING_SIZE(3); // 5376/5376 128/000
523 const uint32_t fittingSize5 = SET_FITTING_SIZE(2); // 8128/8064 000/000
524 #undef SET_FITTING_SIZE
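// Worked example for the first line above, assuming estimatedCacheLineSize == 64 (so
// sizeof(Block) == 128) and slabSize == 16*1024:
//     SET_FITTING_SIZE(9) == ((16384 - 128)/9) & ~63 == 1806 & ~63 == 1792
// Nine 1792-byte objects occupy 16128 bytes, which together with the 128-byte header leaves
// 16384 - 128 - 9*1792 == 128 bytes unused, matching the first "128" in the comment above.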
525 
526 /*
527  * The total number of thread-specific Block-based bins
528  */
529 const uint32_t numBlockBins = minFittingIndex+numFittingBins;
530 
531 /*
532  * Objects of this size and larger are considered large objects.
533  */
534 const uint32_t minLargeObjectSize = fittingSize5 + 1;
535 
536 /*
537  * Per-thread pool of slab blocks. The idea behind it is to avoid sharing with other
538  * threads memory that is likely in the local cache(s) of our CPU.
539  */
540 class FreeBlockPool {
541 private:
542     std::atomic<Block*> head;
543     int         size;
544     Backend    *backend;
545 public:
546     static const int POOL_HIGH_MARK = 32;
547     static const int POOL_LOW_MARK  = 8;
548 
549     class ResOfGet {
550         ResOfGet() = delete;
551     public:
552         Block* block;
553         bool   lastAccMiss;
554         ResOfGet(Block *b, bool lastMiss) : block(b), lastAccMiss(lastMiss) {}
555     };
556 
557     // allocated in zero-initialized memory
558     FreeBlockPool(Backend *bknd) : backend(bknd) {}
559     ResOfGet getBlock();
560     void returnBlock(Block *block);
561     bool externalCleanup(); // can be called by another thread
562 };
563 
564 template<int LOW_MARK, int HIGH_MARK>
565 class LocalLOCImpl {
566 private:
567     static const size_t MAX_TOTAL_SIZE = 4*1024*1024;
568     // TODO: can a singly-linked list be faster here?
569     LargeMemoryBlock *tail; // needed when releasing on overflow
570     std::atomic<LargeMemoryBlock*> head;
571     size_t            totalSize;
572     int               numOfBlocks;
573 public:
574     bool put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool);
575     LargeMemoryBlock *get(size_t size);
576     bool externalCleanup(ExtMemoryPool *extMemPool);
577 #if __TBB_MALLOC_WHITEBOX_TEST
578     LocalLOCImpl() : tail(nullptr), head(nullptr), totalSize(0), numOfBlocks(0) {}
579     static size_t getMaxSize() { return MAX_TOTAL_SIZE; }
580     static const int LOC_HIGH_MARK = HIGH_MARK;
581 #else
582     // no ctor, object must be created in zero-initialized memory
583 #endif
584 };
585 
586 typedef LocalLOCImpl<8,32> LocalLOC; // set production code parameters
587 
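// TLSData (below) aggregates all per-thread caches: the slab bins (bin[]), the pool of free
// slab blocks (freeSlabBlocks), and the local large-object cache (lloc). Through its
// TLSRemote base it is linked into AllLocalCaches, so other threads can request its cleanup.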
588 class TLSData : public TLSRemote {
589     MemoryPool   *memPool;
590 public:
591     Bin           bin[numBlockBinLimit];
592     FreeBlockPool freeSlabBlocks;
593     LocalLOC      lloc;
594     unsigned      currCacheIdx;
595 private:
596     std::atomic<bool> unused;
597 public:
598     TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd), currCacheIdx(0) {}
599     MemoryPool *getMemPool() const { return memPool; }
600     Bin* getAllocationBin(size_t size);
601     void release();
602     bool externalCleanup(bool cleanOnlyUnused, bool cleanBins) {
603         if (!unused.load(std::memory_order_relaxed) && cleanOnlyUnused) return false;
604         // Heavy operation in terms of synchronization complexity,
605         // should be called only for the current thread
606         bool released = cleanBins ? cleanupBlockBins() : false;
607         // both cleanups to be called, and the order is not important
608         bool lloc_cleaned = lloc.externalCleanup(&memPool->extMemPool);
609         bool free_slab_blocks_cleaned = freeSlabBlocks.externalCleanup();
610         return released || lloc_cleaned || free_slab_blocks_cleaned;
611     }
612     bool cleanupBlockBins();
613     void markUsed() { unused.store(false, std::memory_order_relaxed); } // called by owner when TLS touched
614     void markUnused() { unused.store(true, std::memory_order_relaxed); } // can be called by not owner thread
615 };
616 
617 TLSData *TLSKey::createTLS(MemoryPool *memPool, Backend *backend)
618 {
619     MALLOC_ASSERT( sizeof(TLSData) >= sizeof(Bin) * numBlockBins + sizeof(FreeBlockPool), ASSERT_TEXT );
620     TLSData* tls = (TLSData*) memPool->bootStrapBlocks.allocate(memPool, sizeof(TLSData));
621     if ( !tls )
622         return nullptr;
623     new(tls) TLSData(memPool, backend);
624     /* the block contains zeroes after bootStrapMalloc, so bins are initialized */
625 #if MALLOC_DEBUG
626     for (uint32_t i = 0; i < numBlockBinLimit; i++)
627         tls->bin[i].verifyInitState();
628 #endif
629     setThreadMallocTLS(tls);
630     memPool->extMemPool.allLocalCaches.registerThread(tls);
631     return tls;
632 }
633 
634 bool TLSData::cleanupBlockBins()
635 {
636     bool released = false;
637     for (uint32_t i = 0; i < numBlockBinLimit; i++) {
638         released |= bin[i].cleanPublicFreeLists();
639         // After cleaning public free lists, only the active block might be empty.
640         // Do not use processEmptyBlock because it will just restore bumpPtr.
641         Block *block = bin[i].getActiveBlock();
642         if (block && block->empty()) {
643             bin[i].outofTLSBin(block);
644             memPool->returnEmptyBlock(block, /*poolTheBlock=*/false);
645             released = true;
646         }
647     }
648     return released;
649 }
650 
651 bool ExtMemoryPool::releaseAllLocalCaches()
652 {
653     // Iterate all registered TLS data and clean LLOC and Slab pools
654     bool released = allLocalCaches.cleanup(/*cleanOnlyUnused=*/false);
655 
656     // Bins privatization is done only for the current thread
657     if (TLSData *tlsData = tlsPointerKey.getThreadMallocTLS())
658         released |= tlsData->cleanupBlockBins();
659 
660     return released;
661 }
662 
663 void AllLocalCaches::registerThread(TLSRemote *tls)
664 {
665     tls->prev = nullptr;
666     MallocMutex::scoped_lock lock(listLock);
667     MALLOC_ASSERT(head!=tls, ASSERT_TEXT);
668     tls->next = head;
669     if (head)
670         head->prev = tls;
671     head = tls;
672     MALLOC_ASSERT(head->next!=head, ASSERT_TEXT);
673 }
674 
675 void AllLocalCaches::unregisterThread(TLSRemote *tls)
676 {
677     MallocMutex::scoped_lock lock(listLock);
678     MALLOC_ASSERT(head, "Can't unregister thread: no threads are registered.");
679     if (head == tls)
680         head = tls->next;
681     if (tls->next)
682         tls->next->prev = tls->prev;
683     if (tls->prev)
684         tls->prev->next = tls->next;
685     MALLOC_ASSERT(!tls->next || tls->next->next!=tls->next, ASSERT_TEXT);
686 }
687 
688 bool AllLocalCaches::cleanup(bool cleanOnlyUnused)
689 {
690     bool released = false;
691     {
692         MallocMutex::scoped_lock lock(listLock);
693         for (TLSRemote *curr=head; curr; curr=curr->next)
694             released |= static_cast<TLSData*>(curr)->externalCleanup(cleanOnlyUnused, /*cleanBins=*/false);
695     }
696     return released;
697 }
698 
699 void AllLocalCaches::markUnused()
700 {
701     bool locked = false;
702     MallocMutex::scoped_lock lock(listLock, /*block=*/false, &locked);
703     if (!locked) // don't wait for marking if someone is doing something with the list
704         return;
705 
706     for (TLSRemote *curr=head; curr; curr=curr->next)
707         static_cast<TLSData*>(curr)->markUnused();
708 }
709 
710 #if MALLOC_CHECK_RECURSION
711 MallocMutex RecursiveMallocCallProtector::rmc_mutex;
712 std::atomic<pthread_t> RecursiveMallocCallProtector::owner_thread;
713 std::atomic<void*> RecursiveMallocCallProtector::autoObjPtr;
714 bool        RecursiveMallocCallProtector::mallocRecursionDetected;
715 #if __FreeBSD__
716 bool        RecursiveMallocCallProtector::canUsePthread;
717 #endif
718 
719 #endif
720 
721 /*********** End code to provide thread ID and a TLS pointer **********/
722 
723 // Parameter for isLargeObject, encodes our expectation of the memory's origin.
724 // Assertions must use unknownMem to reliably report object invalidity.
725 enum MemoryOrigin {
726     ourMem,    // allocated by TBB allocator
727     unknownMem // can be allocated by system allocator or TBB allocator
728 };
729 
730 template<MemoryOrigin>
731 #if __TBB_USE_THREAD_SANITIZER
732 // We have a real race when accessing the large object header for
733 // non large objects (e.g. small or foreign objects).
734 // Therefore, we need to hide this access from the thread sanitizer
735 __attribute__((no_sanitize("thread")))
736 #endif
737 bool isLargeObject(void *object);
738 static void *internalMalloc(size_t size);
739 static void internalFree(void *object);
740 static void *internalPoolMalloc(MemoryPool* mPool, size_t size);
741 static bool internalPoolFree(MemoryPool *mPool, void *object, size_t size);
742 
743 #if !MALLOC_DEBUG
744 #if __INTEL_COMPILER || _MSC_VER
745 #define NOINLINE(decl) __declspec(noinline) decl
746 #define ALWAYSINLINE(decl) __forceinline decl
747 #elif __GNUC__
748 #define NOINLINE(decl) decl __attribute__ ((noinline))
749 #define ALWAYSINLINE(decl) decl __attribute__ ((always_inline))
750 #else
751 #define NOINLINE(decl) decl
752 #define ALWAYSINLINE(decl) decl
753 #endif
754 
755 static NOINLINE( bool doInitialization() );
756 ALWAYSINLINE( bool isMallocInitialized() );
757 
758 #undef ALWAYSINLINE
759 #undef NOINLINE
760 #endif /* !MALLOC_DEBUG */
761 
762 
763 /********* Now some rough utility code to deal with indexing the size bins. **************/
764 
765 /*
766  * Given a number, return the position of its highest set bit. It is intended to work with 32-bit values only.
767  * Moreover, on some platforms, for the sake of simplicity and performance, it is narrowed to serve only values from 64 to 1023.
768  * This is enough for the current algorithm of distributing sizes among bins.
769  * __TBB_Log2 is not used here to minimize dependencies on TBB specific sources.
770  */
771 #if _WIN64 && _MSC_VER>=1400 && !__INTEL_COMPILER
772 extern "C" unsigned char _BitScanReverse( unsigned long* i, unsigned long w );
773 #pragma intrinsic(_BitScanReverse)
774 #endif
775 static inline unsigned int highestBitPos(unsigned int n)
776 {
777     MALLOC_ASSERT( n>=64 && n<1024, ASSERT_TEXT ); // only needed for bsr array lookup, but always true
778     unsigned int pos;
779 #if __ARCH_x86_32||__ARCH_x86_64
780 
781 # if __unix__||__APPLE__||__MINGW32__
782     __asm__ ("bsr %1,%0" : "=r"(pos) : "r"(n));
783 # elif (_WIN32 && (!_WIN64 || __INTEL_COMPILER))
784     __asm
785     {
786         bsr eax, n
787         mov pos, eax
788     }
789 # elif _WIN64 && _MSC_VER>=1400
790     _BitScanReverse((unsigned long*)&pos, (unsigned long)n);
791 # else
792 #   error highestBitPos() not implemented for this platform
793 # endif
794 #elif __arm__
795     __asm__ __volatile__
796     (
797        "clz %0, %1\n"
798        "rsb %0, %0, %2\n"
799        :"=r" (pos) :"r" (n), "I" (31)
800     );
801 #else
802     static unsigned int bsr[16] = {0/*N/A*/,6,7,7,8,8,8,8,9,9,9,9,9,9,9,9};
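    // The table works because n is restricted to [64, 1024): n>>6 falls in [1, 15], and
    // bsr[i] holds the highest-set-bit position shared by all n with n>>6 == i
    // (6 for 64..127, 7 for 128..255, 8 for 256..511, 9 for 512..1023).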
803     pos = bsr[ n>>6 ];
804 #endif /* __ARCH_* */
805     return pos;
806 }
807 
808 unsigned int getSmallObjectIndex(unsigned int size)
809 {
810     unsigned int result = (size-1)>>3;
811     constexpr bool is_64bit = (8 == sizeof(void*));
812     if (is_64bit) {
813         // For 64-bit malloc, 16 byte alignment is needed except for bin 0.
814         if (result) result |= 1; // 0,1,3,5,7; bins 2,4,6 are not aligned to 16 bytes
815     }
816     return result;
817 }
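// A sketch of the mapping (not exercised here): on a 64-bit build a request of 24 bytes gives
// (24-1)>>3 == 2, which is then bumped to index 3, i.e. a 32-byte object (see
// getIndexOrObjectSize below), preserving 16-byte alignment; on a 32-bit build the index
// stays 2 and the object size is 24. Requests of 1..8 bytes always map to bin 0.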
818 
819 /*
820  * Depending on indexRequest, for a given size return either the index into the bin
821  * for objects of this size, or the actual size of objects in this bin.
822  */
823 template<bool indexRequest>
824 static unsigned int getIndexOrObjectSize (unsigned int size)
825 {
826     if (size <= maxSmallObjectSize) { // selection from 8/16/24/32/40/48/56/64
827         unsigned int index = getSmallObjectIndex( size );
828          /* Bin 0 is for 8 bytes, bin 1 is for 16, and so forth */
829         return indexRequest ? index : (index+1)<<3;
830     }
831     else if (size <= maxSegregatedObjectSize ) { // 80/96/112/128 / 160/192/224/256 / 320/384/448/512 / 640/768/896/1024
832         unsigned int order = highestBitPos(size-1); // which group of bin sizes?
833         MALLOC_ASSERT( 6<=order && order<=9, ASSERT_TEXT );
834         if (indexRequest)
835             return minSegregatedObjectIndex - (4*6) - 4 + (4*order) + ((size-1)>>(order-2));
836         else {
837             unsigned int alignment = 128 >> (9-order); // alignment in the group
838             MALLOC_ASSERT( alignment==16 || alignment==32 || alignment==64 || alignment==128, ASSERT_TEXT );
839             return alignUp(size,alignment);
840         }
841     }
842     else {
843         if( size <= fittingSize3 ) {
844             if( size <= fittingSize2 ) {
845                 if( size <= fittingSize1 )
846                     return indexRequest ? minFittingIndex : fittingSize1;
847                 else
848                     return indexRequest ? minFittingIndex+1 : fittingSize2;
849             } else
850                 return indexRequest ? minFittingIndex+2 : fittingSize3;
851         } else {
852             if( size <= fittingSize5 ) {
853                 if( size <= fittingSize4 )
854                     return indexRequest ? minFittingIndex+3 : fittingSize4;
855                 else
856                     return indexRequest ? minFittingIndex+4 : fittingSize5;
857             } else {
858                 MALLOC_ASSERT( 0,ASSERT_TEXT ); // this should not happen
859                 return ~0U;
860             }
861         }
862     }
863 }
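/* Worked example for the segregated range: for size == 200, highestBitPos(199) == 7, so the
 * index is (8 - 24 - 4 + 4*7) + ((200-1)>>5) == 8 + 6 == 14, and the object size is
 * alignUp(200, 128>>(9-7)) == alignUp(200, 32) == 224. */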
864 
865 static unsigned int getIndex (unsigned int size)
866 {
867     return getIndexOrObjectSize</*indexRequest=*/true>(size);
868 }
869 
870 static unsigned int getObjectSize (unsigned int size)
871 {
872     return getIndexOrObjectSize</*indexRequest=*/false>(size);
873 }
874 
875 
876 void *BootStrapBlocks::allocate(MemoryPool *memPool, size_t size)
877 {
878     FreeObject *result;
879 
880     MALLOC_ASSERT( size == sizeof(TLSData), ASSERT_TEXT );
881 
882     { // Lock with acquire
883         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
884 
885         if( bootStrapObjectList) {
886             result = bootStrapObjectList;
887             bootStrapObjectList = bootStrapObjectList->next;
888         } else {
889             if (!bootStrapBlock) {
890                 bootStrapBlock = memPool->getEmptyBlock(size);
891                 if (!bootStrapBlock) return nullptr;
892             }
893             result = bootStrapBlock->bumpPtr;
894             bootStrapBlock->bumpPtr = (FreeObject *)((uintptr_t)bootStrapBlock->bumpPtr - bootStrapBlock->objectSize);
895             if ((uintptr_t)bootStrapBlock->bumpPtr < (uintptr_t)bootStrapBlock+sizeof(Block)) {
896                 bootStrapBlock->bumpPtr = nullptr;
897                 bootStrapBlock->next = bootStrapBlockUsed;
898                 bootStrapBlockUsed = bootStrapBlock;
899                 bootStrapBlock = nullptr;
900             }
901         }
902     } // Unlock with release
903     memset (result, 0, size);
904     return (void*)result;
905 }
906 
907 void BootStrapBlocks::free(void* ptr)
908 {
909     MALLOC_ASSERT( ptr, ASSERT_TEXT );
910     { // Lock with acquire
911         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
912         ((FreeObject*)ptr)->next = bootStrapObjectList;
913         bootStrapObjectList = (FreeObject*)ptr;
914     } // Unlock with release
915 }
916 
917 void BootStrapBlocks::reset()
918 {
919     bootStrapBlock = bootStrapBlockUsed = nullptr;
920     bootStrapObjectList = nullptr;
921 }
922 
923 #if !(FREELIST_NONBLOCKING)
924 static MallocMutex publicFreeListLock; // lock for changes of publicFreeList
925 #endif
926 
927 /********* End rough utility code  **************/
928 
929 /* LifoList assumes zero initialization so a vector of it can be created
930  * by just allocating some space with no call to a constructor.
931  * On Linux, it seems to be necessary to avoid linking with C++ libraries.
932  *
933  * By usage convention there is no race on the initialization. */
934 LifoList::LifoList( ) : top(nullptr)
935 {
936     // MallocMutex assumes zero initialization
937     memset(static_cast<void*>(&lock), 0, sizeof(MallocMutex));
938 }
939 
940 void LifoList::push(Block *block)
941 {
942     MallocMutex::scoped_lock scoped_cs(lock);
943     block->next = top.load(std::memory_order_relaxed);
944     top.store(block, std::memory_order_relaxed);
945 }
946 
947 Block *LifoList::pop()
948 {
949     Block* block = nullptr;
950     if (top.load(std::memory_order_relaxed)) {
951         MallocMutex::scoped_lock scoped_cs(lock);
952         block = top.load(std::memory_order_relaxed);
953         if (block) {
954             top.store(block->next, std::memory_order_relaxed);
955         }
956     }
957     return block;
958 }
959 
960 Block *LifoList::grab()
961 {
962     Block *block = nullptr;
963     if (top.load(std::memory_order_relaxed)) {
964         MallocMutex::scoped_lock scoped_cs(lock);
965         block = top.load(std::memory_order_relaxed);
966         top.store(nullptr, std::memory_order_relaxed);
967     }
968     return block;
969 }
970 
971 /********* Thread and block related code      *************/
972 
973 template<bool poolDestroy> void AllLargeBlocksList::releaseAll(Backend *backend) {
974      LargeMemoryBlock *next, *lmb = loHead;
975      loHead = nullptr;
976 
977      for (; lmb; lmb = next) {
978          next = lmb->gNext;
979          if (poolDestroy) {
980              // as it's pool destruction, there is no need to return objects to the backend,
981              // only remove backrefs, as they are global
982              removeBackRef(lmb->backRefIdx);
983          } else {
984              // clean g(Next|Prev) to prevent removing lmb
985              // from AllLargeBlocksList inside returnLargeObject
986              lmb->gNext = lmb->gPrev = nullptr;
987              backend->returnLargeObject(lmb);
988          }
989      }
990 }
991 
992 TLSData* MemoryPool::getTLS(bool create)
993 {
994     TLSData* tls = extMemPool.tlsPointerKey.getThreadMallocTLS();
995     if (create && !tls)
996         tls = extMemPool.tlsPointerKey.createTLS(this, &extMemPool.backend);
997     return tls;
998 }
999 
1000 /*
1001  * Return the bin for the given size.
1002  */
1003 inline Bin* TLSData::getAllocationBin(size_t size)
1004 {
1005     return bin + getIndex(size);
1006 }
1007 
1008 /* Return an empty uninitialized block in a non-blocking fashion. */
1009 Block *MemoryPool::getEmptyBlock(size_t size)
1010 {
1011     TLSData* tls = getTLS(/*create=*/false);
1012     // try to use per-thread cache, if TLS available
1013     FreeBlockPool::ResOfGet resOfGet = tls?
1014         tls->freeSlabBlocks.getBlock() : FreeBlockPool::ResOfGet(nullptr, false);
1015     Block *result = resOfGet.block;
1016 
1017     if (!result) { // not found in the local cache, ask the backend for slabs
1018         int num = resOfGet.lastAccMiss? Backend::numOfSlabAllocOnMiss : 1;
1019         BackRefIdx backRefIdx[Backend::numOfSlabAllocOnMiss];
1020 
1021         result = static_cast<Block*>(extMemPool.backend.getSlabBlock(num));
1022         if (!result) return nullptr;
1023 
1024         if (!extMemPool.userPool())
1025             for (int i=0; i<num; i++) {
1026                 backRefIdx[i] = BackRefIdx::newBackRef(/*largeObj=*/false);
1027                 if (backRefIdx[i].isInvalid()) {
1028                     // roll back resource allocation
1029                     for (int j=0; j<i; j++)
1030                         removeBackRef(backRefIdx[j]);
1031                     Block *b = result;
1032                     for (int j=0; j<num; b=(Block*)((uintptr_t)b+slabSize), j++)
1033                         extMemPool.backend.putSlabBlock(b);
1034                     return nullptr;
1035                 }
1036             }
1037         // resources were allocated, register blocks
1038         Block *b = result;
1039         for (int i=0; i<num; b=(Block*)((uintptr_t)b+slabSize), i++) {
1040             // slab block in user's pool must have invalid backRefIdx
1041             if (extMemPool.userPool()) {
1042                 new (&b->backRefIdx) BackRefIdx();
1043             } else {
1044                 setBackRef(backRefIdx[i], b);
1045                 b->backRefIdx = backRefIdx[i];
1046             }
1047             b->tlsPtr.store(tls, std::memory_order_relaxed);
1048             b->poolPtr = this;
1049             // all but first one go to per-thread pool
1050             if (i > 0) {
1051                 MALLOC_ASSERT(tls, ASSERT_TEXT);
1052                 tls->freeSlabBlocks.returnBlock(b);
1053             }
1054         }
1055     }
1056     MALLOC_ASSERT(result, ASSERT_TEXT);
1057     result->initEmptyBlock(tls, size);
1058     STAT_increment(getThreadId(), getIndex(result->objectSize), allocBlockNew);
1059     return result;
1060 }
1061 
1062 void MemoryPool::returnEmptyBlock(Block *block, bool poolTheBlock)
1063 {
1064     block->reset();
1065     if (poolTheBlock) {
1066         getTLS(/*create=*/false)->freeSlabBlocks.returnBlock(block);
1067     } else {
1068         // slab blocks in user's pools do not have valid backRefIdx
1069         if (!extMemPool.userPool())
1070             removeBackRef(*(block->getBackRefIdx()));
1071         extMemPool.backend.putSlabBlock(block);
1072     }
1073 }
1074 
1075 bool ExtMemoryPool::init(intptr_t poolId, rawAllocType rawAlloc,
1076                          rawFreeType rawFree, size_t granularity,
1077                          bool keepAllMemory, bool fixedPool)
1078 {
1079     this->poolId = poolId;
1080     this->rawAlloc = rawAlloc;
1081     this->rawFree = rawFree;
1082     this->granularity = granularity;
1083     this->keepAllMemory = keepAllMemory;
1084     this->fixedPool = fixedPool;
1085     this->delayRegsReleasing = false;
1086     if (!initTLS())
1087         return false;
1088     loc.init(this);
1089     backend.init(this);
1090     MALLOC_ASSERT(isPoolValid(), nullptr);
1091     return true;
1092 }
1093 
1094 bool ExtMemoryPool::initTLS() { return tlsPointerKey.init(); }
1095 
1096 bool MemoryPool::init(intptr_t poolId, const MemPoolPolicy *policy)
1097 {
1098     if (!extMemPool.init(poolId, policy->pAlloc, policy->pFree,
1099                policy->granularity? policy->granularity : defaultGranularity,
1100                policy->keepAllMemory, policy->fixedPool))
1101         return false;
1102     {
1103         MallocMutex::scoped_lock lock(memPoolListLock);
1104         next = defaultMemPool->next;
1105         defaultMemPool->next = this;
1106         prev = defaultMemPool;
1107         if (next)
1108             next->prev = this;
1109     }
1110     return true;
1111 }
1112 
1113 bool MemoryPool::reset()
1114 {
1115     MALLOC_ASSERT(extMemPool.userPool(), "No reset for the system pool.");
1116     // memory is not released during pool reset
1117     // TODO: mark regions to release unused memory on the next reset()
1118     extMemPool.delayRegionsReleasing(true);
1119 
1120     bootStrapBlocks.reset();
1121     extMemPool.lmbList.releaseAll</*poolDestroy=*/false>(&extMemPool.backend);
1122     if (!extMemPool.reset())
1123         return false;
1124 
1125     if (!extMemPool.initTLS())
1126         return false;
1127     extMemPool.delayRegionsReleasing(false);
1128     return true;
1129 }
1130 
1131 bool MemoryPool::destroy()
1132 {
1133 #if __TBB_MALLOC_LOCACHE_STAT
1134     extMemPool.loc.reportStat(stdout);
1135 #endif
1136 #if __TBB_MALLOC_BACKEND_STAT
1137     extMemPool.backend.reportStat(stdout);
1138 #endif
1139     {
1140         MallocMutex::scoped_lock lock(memPoolListLock);
1141         // remove itself from global pool list
1142         if (prev)
1143             prev->next = next;
1144         if (next)
1145             next->prev = prev;
1146     }
1147     // slab blocks in non-default pool do not have backreferences,
1148     // only large objects do
1149     if (extMemPool.userPool())
1150         extMemPool.lmbList.releaseAll</*poolDestroy=*/true>(&extMemPool.backend);
1151     else {
1152         // only one non-userPool() is supported now
1153         MALLOC_ASSERT(this==defaultMemPool, nullptr);
1154         // Here and below in extMemPool.destroy(), do not restore the initial state
1155         // for a user pool, because it is just about to be released. But the system
1156         // pool is restored, because we do not want to zero it on a subsequent reload.
1157         bootStrapBlocks.reset();
1158         extMemPool.orphanedBlocks.reset();
1159     }
1160     return extMemPool.destroy();
1161 }
1162 
1163 void MemoryPool::onThreadShutdown(TLSData *tlsData)
1164 {
1165     if (tlsData) { // might be called for "empty" TLS
1166         tlsData->release();
1167         bootStrapBlocks.free(tlsData);
1168         clearTLS();
1169     }
1170 }
1171 
1172 #if MALLOC_DEBUG
1173 void Bin::verifyTLSBin (size_t size) const
1174 {
1175 /* The debug version verifies the TLSBin as needed */
1176     uint32_t objSize = getObjectSize(size);
1177 
1178     if (activeBlk) {
1179         MALLOC_ASSERT( activeBlk->isOwnedByCurrentThread(), ASSERT_TEXT );
1180         MALLOC_ASSERT( activeBlk->objectSize == objSize, ASSERT_TEXT );
1181 #if MALLOC_DEBUG>1
1182         for (Block* temp = activeBlk->next; temp; temp=temp->next) {
1183             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1184             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1185             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1186             MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1187             if (temp->next) {
1188                 MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1189             }
1190         }
1191         for (Block* temp = activeBlk->previous; temp; temp=temp->previous) {
1192             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1193             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1194             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1195             MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1196             if (temp->previous) {
1197                 MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1198             }
1199         }
1200 #endif /* MALLOC_DEBUG>1 */
1201     }
1202 }
1203 #else /* MALLOC_DEBUG */
1204 inline void Bin::verifyTLSBin (size_t) const { }
1205 #endif /* MALLOC_DEBUG */
1206 
1207 /*
1208  * Add a block to the start of this tls bin list.
1209  */
1210 void Bin::pushTLSBin(Block* block)
1211 {
1212     /* The objectSize should be defined and not a parameter
1213        because the function is applied to partially filled blocks as well */
1214     unsigned int size = block->objectSize;
1215 
1216     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1217     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1218     MALLOC_ASSERT( block->next == nullptr, ASSERT_TEXT );
1219     MALLOC_ASSERT( block->previous == nullptr, ASSERT_TEXT );
1220 
1221     MALLOC_ASSERT( this, ASSERT_TEXT );
1222     verifyTLSBin(size);
1223 
1224     block->next = activeBlk;
1225     if( activeBlk ) {
1226         block->previous = activeBlk->previous;
1227         activeBlk->previous = block;
1228         if( block->previous )
1229             block->previous->next = block;
1230     } else {
1231         activeBlk = block;
1232     }
1233 
1234     verifyTLSBin(size);
1235 }
1236 
1237 /*
1238  * Take a block out of its tls bin (e.g. before removal).
1239  */
1240 void Bin::outofTLSBin(Block* block)
1241 {
1242     unsigned int size = block->objectSize;
1243 
1244     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1245     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1246 
1247     MALLOC_ASSERT( this, ASSERT_TEXT );
1248     verifyTLSBin(size);
1249 
1250     if (block == activeBlk) {
1251         activeBlk = block->previous? block->previous : block->next;
1252     }
1253     /* Unlink the block */
1254     if (block->previous) {
1255         MALLOC_ASSERT( block->previous->next == block, ASSERT_TEXT );
1256         block->previous->next = block->next;
1257     }
1258     if (block->next) {
1259         MALLOC_ASSERT( block->next->previous == block, ASSERT_TEXT );
1260         block->next->previous = block->previous;
1261     }
1262     block->next = nullptr;
1263     block->previous = nullptr;
1264 
1265     verifyTLSBin(size);
1266 }
1267 
1268 Block* Bin::getPrivatizedFreeListBlock()
1269 {
1270     Block* block;
1271     MALLOC_ASSERT( this, ASSERT_TEXT );
1272     // if this method is called, active block usage must be unsuccessful
1273     MALLOC_ASSERT( (!activeBlk && !mailbox.load(std::memory_order_relaxed)) || (activeBlk && activeBlk->isFull), ASSERT_TEXT );
1274 
1275 // TODO: the counter should be changed: STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1276     if (!mailbox.load(std::memory_order_acquire)) // hotpath is empty mailbox
1277         return nullptr;
1278     else { // mailbox is not empty, take lock and inspect it
1279         MallocMutex::scoped_lock scoped_cs(mailLock);
1280         block = mailbox.load(std::memory_order_relaxed);
1281         if( block ) {
1282             MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1283             MALLOC_ASSERT( !isNotForUse(block->nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1284             mailbox.store(block->nextPrivatizable.load(std::memory_order_relaxed), std::memory_order_relaxed);
1285             block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1286         }
1287     }
1288     if( block ) {
1289         MALLOC_ASSERT( isSolidPtr(block->publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1290         block->privatizePublicFreeList();
1291         block->adjustPositionInBin(this);
1292     }
1293     return block;
1294 }
1295 
1296 void Bin::addPublicFreeListBlock(Block* block)
1297 {
1298     MallocMutex::scoped_lock scoped_cs(mailLock);
1299     block->nextPrivatizable.store(mailbox.load(std::memory_order_relaxed), std::memory_order_relaxed);
1300     mailbox.store(block, std::memory_order_relaxed);
1301 }
1302 
1303 // Process publicly freed objects in all blocks and return empty blocks
1304 // to the backend in order to reduce overall footprint.
1305 bool Bin::cleanPublicFreeLists()
1306 {
1307     Block* block;
1308     if (!mailbox.load(std::memory_order_acquire))
1309         return false;
1310     else {
1311         // Grab all the blocks in the mailbox
1312         MallocMutex::scoped_lock scoped_cs(mailLock);
1313         block = mailbox.load(std::memory_order_relaxed);
1314         mailbox.store(nullptr, std::memory_order_relaxed);
1315     }
1316     bool released = false;
1317     while (block) {
1318         MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1319         Block* tmp = block->nextPrivatizable.load(std::memory_order_relaxed);
1320         block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1321         block->privatizePublicFreeList();
1322         if (block->empty()) {
1323             processEmptyBlock(block, /*poolTheBlock=*/false);
1324             released = true;
1325         } else
1326             block->adjustPositionInBin(this);
1327         block = tmp;
1328     }
1329     return released;
1330 }
1331 
1332 bool Block::adjustFullness()
1333 {
1334     if (bumpPtr) {
1335         /* If we are still using a bump ptr for this block it is empty enough to use. */
1336         STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1337         isFull = false;
1338     } else {
1339         const float threshold = (slabSize - sizeof(Block)) * (1 - emptyEnoughRatio);
1340         /* allocatedCount shows how many objects in the block are in use; however, it still counts
1341          * objects freed by other threads; so a prior call to privatizePublicFreeList() is recommended */
1342         isFull = allocatedCount*objectSize > threshold;
1343 #if COLLECT_STATISTICS
1344         if (isFull)
1345             STAT_increment(getThreadId(), getIndex(objectSize), examineNotEmpty);
1346         else
1347             STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1348 #endif
1349     }
1350     return isFull;
1351 }
1352 
1353 // This method resides in class Block, and not in class Bin, in order to avoid
1354 // calling getAllocationBin on a reasonably hot path in Block::freeOwnObject
1355 void Block::adjustPositionInBin(Bin* bin/*=nullptr*/)
1356 {
1357     // If the block was full, but became empty enough to use,
1358     // move it to the front of the list
1359     if (isFull && !adjustFullness()) {
1360         if (!bin)
1361             bin = tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize);
1362         bin->moveBlockToFront(this);
1363     }
1364 }
1365 
1366 /* Restore the bump pointer for an empty block that is planned to be used */
1367 void Block::restoreBumpPtr()
1368 {
1369     MALLOC_ASSERT( allocatedCount == 0, ASSERT_TEXT );
1370     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1371     STAT_increment(getThreadId(), getIndex(objectSize), freeRestoreBumpPtr);
1372     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1373     freeList = nullptr;
1374     isFull = false;
1375 }
1376 
1377 void Block::freeOwnObject(void *object)
1378 {
1379     tlsPtr.load(std::memory_order_relaxed)->markUsed();
1380     allocatedCount--;
1381     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1382 #if COLLECT_STATISTICS
1383     // Note that getAllocationBin is not called on the hottest path with statistics off.
1384     if (tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->getActiveBlock() != this)
1385         STAT_increment(getThreadId(), getIndex(objectSize), freeToInactiveBlock);
1386     else
1387         STAT_increment(getThreadId(), getIndex(objectSize), freeToActiveBlock);
1388 #endif
1389     if (empty()) {
1390         // If the last object of a slab is freed, the slab cannot be marked full
1391         MALLOC_ASSERT(!isFull, ASSERT_TEXT);
1392         tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->processEmptyBlock(this, /*poolTheBlock=*/true);
1393     } else { // hot path
1394         FreeObject *objectToFree = findObjectToFree(object);
1395         objectToFree->next = freeList;
1396         freeList = objectToFree;
1397         adjustPositionInBin();
1398     }
1399 }
1400 
1401 void Block::freePublicObject (FreeObject *objectToFree)
1402 {
1403     FreeObject* localPublicFreeList{};
1404 
1405     MALLOC_ITT_SYNC_RELEASING(&publicFreeList);
1406 #if FREELIST_NONBLOCKING
1407     // TBB_REVAMP_TODO: make it non atomic in non-blocking scenario
1408     localPublicFreeList = publicFreeList.load(std::memory_order_relaxed);
1409     do {
1410         objectToFree->next = localPublicFreeList;
1411         // no backoff necessary because we are trying to make a change, not waiting for one
1412     } while( !publicFreeList.compare_exchange_strong(localPublicFreeList, objectToFree) );
1413 #else
1414     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1415     {
1416         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1417         localPublicFreeList = objectToFree->next = publicFreeList;
1418         publicFreeList = objectToFree;
1419     }
1420 #endif
1421 
1422     if( localPublicFreeList==nullptr ) {
1423         // if the block is abandoned, its nextPrivatizable pointer should be UNUSABLE
1424         // otherwise, it should point to the bin the block belongs to.
1425         // reading nextPrivatizable is thread-safe below, because:
1426         // 1) the executing thread atomically got publicFreeList==nullptr and changed it to non-nullptr;
1427         // 2) only the owning thread can change it back to nullptr,
1428         // 3) but that cannot be done until the block is put into the mailbox
1429         // So the executing thread is now the only one that can change nextPrivatizable
1430         Block* next = nextPrivatizable.load(std::memory_order_acquire);
1431         if( !isNotForUse(next) ) {
1432             MALLOC_ASSERT( next!=nullptr, ASSERT_TEXT );
1433             Bin* theBin = (Bin*) next;
1434 #if MALLOC_DEBUG && TBB_REVAMP_TODO
1435             // FIXME: The thread that returns the block is not the block's owner.
1436             // The below assertion compares 'theBin' against the caller's local bin, thus, it always fails.
1437             // Need to find a way to get the correct remote bin for comparison.
1438             { // check that nextPrivatizable points to the bin the block belongs to
1439                 uint32_t index = getIndex( objectSize );
1440                 TLSData* tls = getThreadMallocTLS();
1441                 MALLOC_ASSERT( theBin==tls->bin+index, ASSERT_TEXT );
1442             }
1443 #endif // MALLOC_DEBUG
1444             theBin->addPublicFreeListBlock(this);
1445         }
1446     }
1447     STAT_increment(getThreadId(), ThreadCommonCounters, freeToOtherThread);
1448     STAT_increment(ownerTid.load(std::memory_order_relaxed), getIndex(objectSize), freeByOtherThread);
1449 }
1450 
1451 // Make objects freed by other threads available for use again
1452 void Block::privatizePublicFreeList( bool reset )
1453 {
1454     FreeObject *localPublicFreeList;
1455     // If reset is false, publicFreeList should not be zeroed but set to UNUSABLE
1456     // to properly synchronize with other threads freeing objects to this slab.
1457     const intptr_t endMarker = reset ? 0 : UNUSABLE;
1458 
1459     // Only the owner thread may reset the pointer to nullptr
1460     MALLOC_ASSERT( isOwnedByCurrentThread() || !reset, ASSERT_TEXT );
1461 #if FREELIST_NONBLOCKING
1462     localPublicFreeList = publicFreeList.exchange((FreeObject*)endMarker);
1463 #else
1464     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1465     {
1466         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1467         localPublicFreeList = publicFreeList;
1468         publicFreeList = endMarker;
1469     }
1470 #endif
1471     MALLOC_ITT_SYNC_ACQUIRED(&publicFreeList);
1472     MALLOC_ASSERT( !(reset && isNotForUse(publicFreeList)), ASSERT_TEXT );
1473 
1474     // publicFreeList must have been UNUSABLE or valid, but not nullptr
1475     MALLOC_ASSERT( localPublicFreeList!=nullptr, ASSERT_TEXT );
1476     if( isSolidPtr(localPublicFreeList) ) {
1477         MALLOC_ASSERT( allocatedCount <= (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1478         /* other threads did not change the counter freeing our blocks */
1479         allocatedCount--;
1480         FreeObject *temp = localPublicFreeList;
1481         while( isSolidPtr(temp->next) ){ // the list will end with either nullptr or UNUSABLE
1482             temp = temp->next;
1483             allocatedCount--;
1484             MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1485         }
1486         /* merge with local freeList */
1487         temp->next = freeList;
1488         freeList = localPublicFreeList;
1489         STAT_increment(getThreadId(), getIndex(objectSize), allocPrivatized);
1490     }
1491 }
1492 
1493 void Block::privatizeOrphaned(TLSData *tls, unsigned index)
1494 {
1495     Bin* bin = tls->bin + index;
1496     STAT_increment(getThreadId(), index, allocBlockPublic);
1497     next = nullptr;
1498     previous = nullptr;
1499     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) != nullptr, ASSERT_TEXT );
1500     /* There is no race here since no other thread owns this block */
1501     markOwned(tls);
1502     // It is safe to change nextPrivatizable, as publicFreeList is not null
1503     MALLOC_ASSERT( isNotForUse(nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1504     nextPrivatizable.store((Block*)bin, std::memory_order_relaxed);
1505     // the next call is required to change publicFreeList to 0
1506     privatizePublicFreeList();
1507     if( empty() ) {
1508         restoreBumpPtr();
1509     } else {
1510         adjustFullness(); // check the block fullness and set isFull
1511     }
1512     MALLOC_ASSERT( !isNotForUse(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1513 }
1514 
1515 
1516 bool Block::readyToShare()
1517 {
1518     FreeObject* oldVal = nullptr;
1519 #if FREELIST_NONBLOCKING
1520     publicFreeList.compare_exchange_strong(oldVal, (FreeObject*)UNUSABLE);
1521 #else
1522     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1523     {
1524         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1525         if ( (oldVal=publicFreeList)==nullptr )
1526             (intptr_t&)(publicFreeList) = UNUSABLE;
1527     }
1528 #endif
1529     return oldVal==nullptr;
1530 }
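// Summary of the protocol above: readyToShare() atomically claims an empty publicFreeList by
// turning nullptr into the UNUSABLE sentinel, so remote threads that free an object afterwards
// see a non-null list and will not touch nextPrivatizable; if the claim fails, some thread has
// already published an object, and shareOrphaned() below waits for it to finish.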
1531 
1532 void Block::shareOrphaned(intptr_t binTag, unsigned index)
1533 {
1534     MALLOC_ASSERT( binTag, ASSERT_TEXT );
1535     // suppress the unreferenced formal parameter warning
1536     tbb::detail::suppress_unused_warning(index);
1537     STAT_increment(getThreadId(), index, freeBlockPublic);
1538     markOrphaned();
1539     if ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1540         // First check passed: the block is not in mailbox yet.
1541         // Need to set publicFreeList to non-zero, so other threads
1542         // will not change nextPrivatizable and it can be zeroed.
1543         if ( !readyToShare() ) {
1544             // another thread freed an object; we need to wait until it finishes.
1545             // There is no need for exponential backoff, as the wait here is not for a lock;
1546             // but we need to yield, so the thread we wait for has a chance to run.
1547             // TODO: add a pause to also be friendly to hyperthreads
1548             int count = 256;
1549             while ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1550                 if (--count==0) {
1551                     do_yield();
1552                     count = 256;
1553                 }
1554             }
1555         }
1556     }
1557     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) !=nullptr, ASSERT_TEXT );
1558     // now it is safe to change our data
1559     previous = nullptr;
1560     // it is the caller's responsibility to ensure that the list of blocks
1561     // formed by nextPrivatizable pointers is kept consistent if required;
1562     // if this is only called from thread shutdown code, it does not matter.
1563     nextPrivatizable.store((Block*)UNUSABLE, std::memory_order_relaxed);
1564 }
1565 
1566 void Block::cleanBlockHeader()
1567 {
1568     next = nullptr;
1569     previous = nullptr;
1570     freeList = nullptr;
1571     allocatedCount = 0;
1572     isFull = false;
1573     tlsPtr.store(nullptr, std::memory_order_relaxed);
1574 
1575     publicFreeList.store(nullptr, std::memory_order_relaxed);
1576 }
1577 
1578 void Block::initEmptyBlock(TLSData *tls, size_t size)
1579 {
1580     // Having getIndex and getObjectSize called next to each other
1581     // allows better compiler optimization as they basically share the code.
1582     unsigned int index = getIndex(size);
1583     unsigned int objSz = getObjectSize(size);
1584 
1585     cleanBlockHeader();
1586     objectSize = objSz;
1587     markOwned(tls);
1588     // bump pointer should be prepared for the first allocation - thus move it down by objectSize
1589     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1590 
1591     // each block should have the address where the head of the list of "privatizable" blocks is kept;
1592     // the only exception is a bootstrap block, which is initialized while TLS is still nullptr
1593     nextPrivatizable.store( tls? (Block*)(tls->bin + index) : nullptr, std::memory_order_relaxed);
1594     TRACEF(( "[ScalableMalloc trace] Empty block %p is initialized, owner is %ld, objectSize is %d, bumpPtr is %p\n",
1595              this, tlsPtr.load(std::memory_order_relaxed) ? getThreadId() : -1, objectSize, bumpPtr ));
1596 }
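// Illustrative note on the slab layout set up above: the Block header occupies the start of the
// slab, bumpPtr initially points at this + slabSize - objectSize (the last object slot), and
// every bump allocation moves it down by objectSize until it would cross the header
// (see allocateFromBumpPtr()).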
1597 
1598 Block *OrphanedBlocks::get(TLSData *tls, unsigned int size)
1599 {
1600     // TODO: try to use index from getAllocationBin
1601     unsigned int index = getIndex(size);
1602     Block *block = bins[index].pop();
1603     if (block) {
1604         MALLOC_ITT_SYNC_ACQUIRED(bins+index);
1605         block->privatizeOrphaned(tls, index);
1606     }
1607     return block;
1608 }
1609 
1610 void OrphanedBlocks::put(intptr_t binTag, Block *block)
1611 {
1612     unsigned int index = getIndex(block->getSize());
1613     block->shareOrphaned(binTag, index);
1614     MALLOC_ITT_SYNC_RELEASING(bins+index);
1615     bins[index].push(block);
1616 }
1617 
1618 void OrphanedBlocks::reset()
1619 {
1620     for (uint32_t i=0; i<numBlockBinLimit; i++)
1621         new (bins+i) LifoList();
1622 }
1623 
1624 bool OrphanedBlocks::cleanup(Backend* backend)
1625 {
1626     bool released = false;
1627     for (uint32_t i=0; i<numBlockBinLimit; i++) {
1628         Block* block = bins[i].grab();
1629         MALLOC_ITT_SYNC_ACQUIRED(bins+i);
1630         while (block) {
1631             Block* next = block->next;
1632             block->privatizePublicFreeList( /*reset=*/false ); // do not set publicFreeList to nullptr
1633             if (block->empty()) {
1634                 block->reset();
1635                 // slab blocks in user's pools do not have valid backRefIdx
1636                 if (!backend->inUserPool())
1637                     removeBackRef(*(block->getBackRefIdx()));
1638                 backend->putSlabBlock(block);
1639                 released = true;
1640             } else {
1641                 MALLOC_ITT_SYNC_RELEASING(bins+i);
1642                 bins[i].push(block);
1643             }
1644             block = next;
1645         }
1646     }
1647     return released;
1648 }
1649 
1650 FreeBlockPool::ResOfGet FreeBlockPool::getBlock()
1651 {
1652     Block *b = head.exchange(nullptr);
1653     bool lastAccessMiss;
1654 
1655     if (b) {
1656         size--;
1657         Block *newHead = b->next;
1658         lastAccessMiss = false;
1659         head.store(newHead, std::memory_order_release);
1660     } else {
1661         lastAccessMiss = true;
1662     }
1663     return ResOfGet(b, lastAccessMiss);
1664 }
1665 
1666 void FreeBlockPool::returnBlock(Block *block)
1667 {
1668     MALLOC_ASSERT( size <= POOL_HIGH_MARK, ASSERT_TEXT );
1669     Block *localHead = head.exchange(nullptr);
1670 
1671     if (!localHead) {
1672         size = 0; // head was stolen by externalClean, correct size accordingly
1673     } else if (size == POOL_HIGH_MARK) {
1674         // release the cold blocks and add the hot one:
1675         // keep POOL_LOW_MARK-1 blocks and add the new block to the head
1676         Block *headToFree = localHead, *helper;
1677         for (int i=0; i<POOL_LOW_MARK-2; i++)
1678             headToFree = headToFree->next;
1679         Block *last = headToFree;
1680         headToFree = headToFree->next;
1681         last->next = nullptr;
1682         size = POOL_LOW_MARK-1;
1683         for (Block *currBl = headToFree; currBl; currBl = helper) {
1684             helper = currBl->next;
1685             // slab blocks in user's pools do not have valid backRefIdx
1686             if (!backend->inUserPool())
1687                 removeBackRef(currBl->backRefIdx);
1688             backend->putSlabBlock(currBl);
1689         }
1690     }
1691     size++;
1692     block->next = localHead;
1693     head.store(block, std::memory_order_release);
1694 }
1695 
1696 bool FreeBlockPool::externalCleanup()
1697 {
1698     Block *helper;
1699     bool released = false;
1700 
1701     for (Block *currBl=head.exchange(nullptr); currBl; currBl=helper) {
1702         helper = currBl->next;
1703         // slab blocks in user's pools do not have valid backRefIdx
1704         if (!backend->inUserPool())
1705             removeBackRef(currBl->backRefIdx);
1706         backend->putSlabBlock(currBl);
1707         released = true;
1708     }
1709     return released;
1710 }
1711 
1712 /* Prepare the block for returning to FreeBlockPool */
1713 void Block::reset()
1714 {
1715     // it is caller's responsibility to ensure no data is lost before calling this
1716     MALLOC_ASSERT( allocatedCount==0, ASSERT_TEXT );
1717     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1718     if (!isStartupAllocObject())
1719         STAT_increment(getThreadId(), getIndex(objectSize), freeBlockBack);
1720 
1721     cleanBlockHeader();
1722 
1723     nextPrivatizable.store(nullptr, std::memory_order_relaxed);
1724 
1725     objectSize = 0;
1726     // for an empty block, bump pointer should point right after the end of the block
1727     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize);
1728 }
1729 
1730 inline void Bin::setActiveBlock (Block *block)
1731 {
1732 //    MALLOC_ASSERT( bin, ASSERT_TEXT );
1733     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1734     // it is the caller's responsibility to keep the bin consistent (i.e. ensure this block is in the bin list)
1735     activeBlk = block;
1736 }
1737 
1738 inline Block* Bin::setPreviousBlockActive()
1739 {
1740     MALLOC_ASSERT( activeBlk, ASSERT_TEXT );
1741     Block* temp = activeBlk->previous;
1742     if( temp ) {
1743         MALLOC_ASSERT( !(temp->isFull), ASSERT_TEXT );
1744         activeBlk = temp;
1745     }
1746     return temp;
1747 }
1748 
1749 inline bool Block::isOwnedByCurrentThread() const {
1750     return tlsPtr.load(std::memory_order_relaxed) && ownerTid.isCurrentThreadId();
1751 }
1752 
1753 FreeObject *Block::findObjectToFree(const void *object) const
1754 {
1755     FreeObject *objectToFree;
1756     // Due to aligned allocations, a pointer passed to scalable_free
1757     // might differ from the address of internally allocated object.
1758     // Small objects however should always be fine.
1759     if (objectSize <= maxSegregatedObjectSize)
1760         objectToFree = (FreeObject*)object;
1761     // "Fitting size" allocations are suspicious if aligned higher than naturally
1762     else {
1763         if ( ! isAligned(object,2*fittingAlignment) )
1764             // TODO: the above check is questionable - it gives false negatives in ~50% cases,
1765             //       so might even be slower in average than unconditional use of findAllocatedObject.
1766             // here it should be a "real" object
1767             objectToFree = (FreeObject*)object;
1768         else
1769             // here object can be an aligned address, so applying additional checks
1770             objectToFree = findAllocatedObject(object);
1771         MALLOC_ASSERT( isAligned(objectToFree,fittingAlignment), ASSERT_TEXT );
1772     }
1773     MALLOC_ASSERT( isProperlyPlaced(objectToFree), ASSERT_TEXT );
1774 
1775     return objectToFree;
1776 }
1777 
1778 void TLSData::release()
1779 {
1780     memPool->extMemPool.allLocalCaches.unregisterThread(this);
1781     externalCleanup(/*cleanOnlyUnused=*/false, /*cleanBins=*/false);
1782 
1783     for (unsigned index = 0; index < numBlockBins; index++) {
1784         Block *activeBlk = bin[index].getActiveBlock();
1785         if (!activeBlk)
1786             continue;
1787         Block *threadlessBlock = activeBlk->previous;
1788         bool syncOnMailbox = false;
1789         while (threadlessBlock) {
1790             Block *threadBlock = threadlessBlock->previous;
1791             if (threadlessBlock->empty()) {
1792                 /* we are destroying the thread, so do not use its block pool */
1793                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1794             } else {
1795                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1796                 syncOnMailbox = true;
1797             }
1798             threadlessBlock = threadBlock;
1799         }
1800         threadlessBlock = activeBlk;
1801         while (threadlessBlock) {
1802             Block *threadBlock = threadlessBlock->next;
1803             if (threadlessBlock->empty()) {
1804                 /* we are destroying the thread, so do not use its block pool */
1805                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1806             } else {
1807                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1808                 syncOnMailbox = true;
1809             }
1810             threadlessBlock = threadBlock;
1811         }
1812         bin[index].resetActiveBlock();
1813 
1814         if (syncOnMailbox) {
1815             // Although we synchronized on nextPrivatizable inside a block, we still need to
1816             // synchronize on the bin lifetime because the thread releasing an object into the public
1817             // free list is touching the bin (mailbox and mailLock)
1818             MallocMutex::scoped_lock scoped_cs(bin[index].mailLock);
1819         }
1820     }
1821 }
1822 
1823 
1824 #if MALLOC_CHECK_RECURSION
1825 // TODO: Use dedicated heap for this
1826 
1827 /*
1828  * It's a special kind of allocation that can be used when malloc is
1829  * not available (either during startup or when malloc was already called and
1830  * we are, say, inside pthread_setspecific's call).
1831  * A block can contain objects of different sizes;
1832  * allocations are performed by moving the bump pointer and increasing the object counter;
1833  * releasing is done via the counter of objects allocated in the block,
1834  * or by moving the bump pointer back if the released object is adjacent to it.
1835  * TODO: make the bump pointer grow in the same backward direction as all the others.
1836  */
1837 
1838 class StartupBlock : public Block {
1839     size_t availableSize() const {
1840         return slabSize - ((uintptr_t)bumpPtr - (uintptr_t)this);
1841     }
1842     static StartupBlock *getBlock();
1843 public:
1844     static FreeObject *allocate(size_t size);
1845     static size_t msize(void *ptr) { return *((size_t*)ptr - 1); }
1846     void free(void *ptr);
1847 };
1848 
1849 static MallocMutex startupMallocLock;
1850 static StartupBlock *firstStartupBlock;
1851 
1852 StartupBlock *StartupBlock::getBlock()
1853 {
1854     BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/false);
1855     if (backRefIdx.isInvalid()) return nullptr;
1856 
1857     StartupBlock *block = static_cast<StartupBlock*>(
1858         defaultMemPool->extMemPool.backend.getSlabBlock(1));
1859     if (!block) return nullptr;
1860 
1861     block->cleanBlockHeader();
1862     setBackRef(backRefIdx, block);
1863     block->backRefIdx = backRefIdx;
1864     // use startupAllocObjSizeMark to mark objects allocated from a startup block
1865     block->objectSize = startupAllocObjSizeMark;
1866     block->bumpPtr = (FreeObject *)((uintptr_t)block + sizeof(StartupBlock));
1867     return block;
1868 }
1869 
1870 FreeObject *StartupBlock::allocate(size_t size)
1871 {
1872     FreeObject *result;
1873     StartupBlock *newBlock = nullptr;
1874 
1875     /* Objects must be aligned on their natural bounds,
1876        and objects bigger than a word on a word boundary. */
1877     size = alignUp(size, sizeof(size_t));
1878     // We need size of an object to implement msize.
1879     size_t reqSize = size + sizeof(size_t);
1880     {
1881         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1882         // Re-check whether we need a new block (conditions might have changed)
1883         if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) {
1884             if (!newBlock) {
1885                 newBlock = StartupBlock::getBlock();
1886                 if (!newBlock) return nullptr;
1887             }
1888             newBlock->next = (Block*)firstStartupBlock;
1889             if (firstStartupBlock)
1890                 firstStartupBlock->previous = (Block*)newBlock;
1891             firstStartupBlock = newBlock;
1892         }
1893         result = firstStartupBlock->bumpPtr;
1894         firstStartupBlock->allocatedCount++;
1895         firstStartupBlock->bumpPtr =
1896             (FreeObject *)((uintptr_t)firstStartupBlock->bumpPtr + reqSize);
1897     }
1898 
1899     // keep object size at the negative offset
1900     *((size_t*)result) = size;
1901     return (FreeObject*)((size_t*)result+1);
1902 }
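// Each startup object is laid out as [ size_t size | payload ]; the payload pointer is returned,
// so StartupBlock::msize() and StartupBlock::free() recover the size from the preceding word.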
1903 
1904 void StartupBlock::free(void *ptr)
1905 {
1906     Block* blockToRelease = nullptr;
1907     {
1908         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1909 
1910         MALLOC_ASSERT(firstStartupBlock, ASSERT_TEXT);
1911         MALLOC_ASSERT(startupAllocObjSizeMark==objectSize
1912                       && allocatedCount>0, ASSERT_TEXT);
1913         MALLOC_ASSERT((uintptr_t)ptr>=(uintptr_t)this+sizeof(StartupBlock)
1914                       && (uintptr_t)ptr+StartupBlock::msize(ptr)<=(uintptr_t)this+slabSize,
1915                       ASSERT_TEXT);
1916         if (0 == --allocatedCount) {
1917             if (this == firstStartupBlock)
1918                 firstStartupBlock = (StartupBlock*)firstStartupBlock->next;
1919             if (previous)
1920                 previous->next = next;
1921             if (next)
1922                 next->previous = previous;
1923             blockToRelease = this;
1924         } else if ((uintptr_t)ptr + StartupBlock::msize(ptr) == (uintptr_t)bumpPtr) {
1925             // last object in the block released
1926             FreeObject *newBump = (FreeObject*)((size_t*)ptr - 1);
1927             MALLOC_ASSERT((uintptr_t)newBump>(uintptr_t)this+sizeof(StartupBlock),
1928                           ASSERT_TEXT);
1929             bumpPtr = newBump;
1930         }
1931     }
1932     if (blockToRelease) {
1933         blockToRelease->previous = blockToRelease->next = nullptr;
1934         defaultMemPool->returnEmptyBlock(blockToRelease, /*poolTheBlock=*/false);
1935     }
1936 }
1937 
1938 #endif /* MALLOC_CHECK_RECURSION */
1939 
1940 /********* End thread related code  *************/
1941 
1942 /********* Library initialization *************/
1943 
1944 //! Value indicating the state of initialization.
1945 /* 0 = initialization not started.
1946  * 1 = initialization started but not finished.
1947  * 2 = initialization finished.
1948  * In theory, we only need values 0 and 2. But value 1 is nonetheless
1949  * useful for detecting errors in the double-check pattern.
1950  */
1951 static std::atomic<intptr_t> mallocInitialized{0};   // implicitly initialized to 0
1952 static MallocMutex initMutex;
1953 
1954 /** The leading "\0" is here so that applying "strings" to the binary
1955     delivers a clean result. */
1956 static char VersionString[] = "\0" TBBMALLOC_VERSION_STRINGS;
1957 
1958 #if USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED
1959 
1960 /* Decrease race interval between dynamic library unloading and pthread key
1961    destructor. Protect only Pthreads with supported unloading. */
1962 class ShutdownSync {
1963 /* flag is the number of threads in pthread key dtor body
1964    (i.e., between threadDtorStart() and threadDtorDone())
1965    or the signal to skip dtor, if flag < 0 */
1966     std::atomic<intptr_t> flag;
1967     static const intptr_t skipDtor = INTPTR_MIN/2;
1968 public:
1969     void init() { flag.store(0, std::memory_order_release); }
1970 /* Assume that fewer than 2*abs(skipDtor) threads ever call threadDtorStart()
1971    simultaneously, so flag never becomes negative because of that. */
1972     bool threadDtorStart() {
1973         if (flag.load(std::memory_order_acquire) < 0)
1974             return false;
1975         if (++flag <= 0) { // note that new value returned
1976             flag.fetch_sub(1); // flag is spoiled by us, restore it
1977             return false;
1978         }
1979         return true;
1980     }
1981     void threadDtorDone() {
1982         flag.fetch_sub(1);
1983     }
1984     void processExit() {
1985         if (flag.fetch_add(skipDtor) != 0) {
1986             SpinWaitUntilEq(flag, skipDtor);
1987         }
1988     }
1989 };
1990 
1991 #else
1992 
1993 class ShutdownSync {
1994 public:
1995     void init() { }
1996     bool threadDtorStart() { return true; }
1997     void threadDtorDone() { }
1998     void processExit() { }
1999 };
2000 
2001 #endif // USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED
2002 
2003 static ShutdownSync shutdownSync;
2004 
2005 inline bool isMallocInitialized() {
2006     // Load must have acquire fence; otherwise thread taking "initialized" path
2007     // might perform textually later loads *before* mallocInitialized becomes 2.
2008     return 2 == mallocInitialized.load(std::memory_order_acquire);
2009 }
2010 
2011 /* Caller is responsible for ensuring this routine is called exactly once. */
2012 extern "C" void MallocInitializeITT() {
2013 #if __TBB_USE_ITT_NOTIFY
2014     if (!usedBySrcIncluded)
2015         tbb::detail::r1::__TBB_load_ittnotify();
2016 #endif
2017 }
2018 
2019 void MemoryPool::initDefaultPool() {
2020     hugePages.init();
2021 }
2022 
2023 /*
2024  * Allocator initialization routine;
2025  * it is called lazily on the very first scalable_malloc call.
2026  */
2027 static bool initMemoryManager()
2028 {
2029     TRACEF(( "[ScalableMalloc trace] sizeof(Block) is %d (expected 128); sizeof(uintptr_t) is %d\n",
2030              sizeof(Block), sizeof(uintptr_t) ));
2031     MALLOC_ASSERT( 2*blockHeaderAlignment == sizeof(Block), ASSERT_TEXT );
2032     MALLOC_ASSERT( sizeof(FreeObject) == sizeof(void*), ASSERT_TEXT );
2033     MALLOC_ASSERT( isAligned(defaultMemPool, sizeof(intptr_t)),
2034                    "Memory pool must be void*-aligned for atomic to work over aligned arguments.");
2035 
2036 #if USE_WINTHREAD
2037     const size_t granularity = 64*1024; // granularity of VirtualAlloc
2038 #else
2039     // POSIX.1-2001-compliant way to get page size
2040     const size_t granularity = sysconf(_SC_PAGESIZE);
2041 #endif
2042     if (!defaultMemPool) {
2043         // Do not rely on static constructors and do the assignment in case
2044         // of library static section not initialized at this call yet.
2045         defaultMemPool = (MemoryPool*)defaultMemPool_space;
2046     }
2047     bool initOk = defaultMemPool->
2048         extMemPool.init(0, nullptr, nullptr, granularity,
2049                         /*keepAllMemory=*/false, /*fixedPool=*/false);
2050 // TODO: extMemPool.init() to not allocate memory
2051     if (!initOk || !initBackRefMain(&defaultMemPool->extMemPool.backend) || !ThreadId::init())
2052         return false;
2053     MemoryPool::initDefaultPool();
2054     // init() is required iff initMemoryManager() is called
2055     // after mallocProcessShutdownNotification()
2056     shutdownSync.init();
2057 #if COLLECT_STATISTICS
2058     initStatisticsCollection();
2059 #endif
2060     return true;
2061 }
2062 
2063 static bool GetBoolEnvironmentVariable(const char* name) {
2064     return tbb::detail::r1::GetBoolEnvironmentVariable(name);
2065 }
2066 
2067 //! Ensures that initMemoryManager() is called once and only once.
2068 /** Does not return until initMemoryManager() has been completed by a thread.
2069     There is no need to call this routine if mallocInitialized==2 . */
2070 static bool doInitialization()
2071 {
2072     MallocMutex::scoped_lock lock( initMutex );
2073     if (mallocInitialized.load(std::memory_order_relaxed)!=2) {
2074         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==0, ASSERT_TEXT );
2075         mallocInitialized.store(1, std::memory_order_relaxed);
2076         RecursiveMallocCallProtector scoped;
2077         if (!initMemoryManager()) {
2078             mallocInitialized.store(0, std::memory_order_relaxed); // restore the initial state and bail out
2079             return false;
2080         }
2081 #ifdef  MALLOC_EXTRA_INITIALIZATION
2082         MALLOC_EXTRA_INITIALIZATION;
2083 #endif
2084 #if MALLOC_CHECK_RECURSION
2085         RecursiveMallocCallProtector::detectNaiveOverload();
2086 #endif
2087         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==1, ASSERT_TEXT );
2088         // Store must have release fence, otherwise mallocInitialized==2
2089         // might become remotely visible before side effects of
2090         // initMemoryManager() become remotely visible.
2091         mallocInitialized.store(2, std::memory_order_release);
2092         if( GetBoolEnvironmentVariable("TBB_VERSION") ) {
2093             fputs(VersionString+1,stderr);
2094             hugePages.printStatus();
2095         }
2096     }
2097     /* It can't be 0 or I would have initialized it */
2098     MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==2, ASSERT_TEXT );
2099     return true;
2100 }
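// Typical call-site pattern (see internalMalloc() and allocateAligned() below): the fast path
// pairs the acquire load in isMallocInitialized() with the release store above, e.g.
//
//     if (!isMallocInitialized())
//         if (!doInitialization())
//             return nullptr;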
2101 
2102 /********* End library initialization *************/
2103 
2104 /********* The malloc show begins     *************/
2105 
2106 
2107 FreeObject *Block::allocateFromFreeList()
2108 {
2109     FreeObject *result;
2110 
2111     if (!freeList) return nullptr;
2112 
2113     result = freeList;
2114     MALLOC_ASSERT( result, ASSERT_TEXT );
2115 
2116     freeList = result->next;
2117     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2118     allocatedCount++;
2119     STAT_increment(getThreadId(), getIndex(objectSize), allocFreeListUsed);
2120 
2121     return result;
2122 }
2123 
2124 FreeObject *Block::allocateFromBumpPtr()
2125 {
2126     FreeObject *result = bumpPtr;
2127     if (result) {
2128         bumpPtr = (FreeObject *) ((uintptr_t) bumpPtr - objectSize);
2129         if ( (uintptr_t)bumpPtr < (uintptr_t)this+sizeof(Block) ) {
2130             bumpPtr = nullptr;
2131         }
2132         MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2133         allocatedCount++;
2134         STAT_increment(getThreadId(), getIndex(objectSize), allocBumpPtrUsed);
2135     }
2136     return result;
2137 }
2138 
2139 inline FreeObject* Block::allocate()
2140 {
2141     MALLOC_ASSERT( isOwnedByCurrentThread(), ASSERT_TEXT );
2142 
2143     /* for better cache locality, look in the free list first. */
2144     if ( FreeObject *result = allocateFromFreeList() ) {
2145         return result;
2146     }
2147     MALLOC_ASSERT( !freeList, ASSERT_TEXT );
2148 
2149     /* if free list is empty, try thread local bump pointer allocation. */
2150     if ( FreeObject *result = allocateFromBumpPtr() ) {
2151         return result;
2152     }
2153     MALLOC_ASSERT( !bumpPtr, ASSERT_TEXT );
2154 
2155     /* the block is considered full. */
2156     isFull = true;
2157     return nullptr;
2158 }
2159 
2160 size_t Block::findObjectSize(void *object) const
2161 {
2162     size_t blSize = getSize();
2163 #if MALLOC_CHECK_RECURSION
2164     // Currently, there are no aligned allocations from startup blocks,
2165     // so we can return just StartupBlock::msize().
2166     // TODO: This must be extended if we add aligned allocation from startup blocks.
2167     if (!blSize)
2168         return StartupBlock::msize(object);
2169 #endif
2170     // object can be aligned, so real size can be less than block's
2171     size_t size =
2172         blSize - ((uintptr_t)object - (uintptr_t)findObjectToFree(object));
2173     MALLOC_ASSERT(size>0 && size<minLargeObjectSize, ASSERT_TEXT);
2174     return size;
2175 }
2176 
2177 void Bin::moveBlockToFront(Block *block)
2178 {
2179     /* move the block to the front of the bin */
2180     if (block == activeBlk) return;
2181     outofTLSBin(block);
2182     pushTLSBin(block);
2183 }
2184 
2185 void Bin::processEmptyBlock(Block *block, bool poolTheBlock)
2186 {
2187     if (block != activeBlk) {
2188         /* We are not using this block; return it to the pool */
2189         outofTLSBin(block);
2190         block->getMemPool()->returnEmptyBlock(block, poolTheBlock);
2191     } else {
2192         /* all objects are free - let's restore the bump pointer */
2193         block->restoreBumpPtr();
2194     }
2195 }
2196 
2197 template<int LOW_MARK, int HIGH_MARK>
2198 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool)
2199 {
2200     const size_t size = object->unalignedSize;
2201     // do not spoil the cache with too large an object, as that can cause its total cleanup
2202     if (size > MAX_TOTAL_SIZE)
2203         return false;
2204     LargeMemoryBlock *localHead = head.exchange(nullptr);
2205 
2206     object->prev = nullptr;
2207     object->next = localHead;
2208     if (localHead)
2209         localHead->prev = object;
2210     else {
2211         // these might not have been cleaned up during local cache stealing; correct them
2212         totalSize = 0;
2213         numOfBlocks = 0;
2214         tail = object;
2215     }
2216     localHead = object;
2217     totalSize += size;
2218     numOfBlocks++;
2219     // must meet both the size and the number-of-cached-objects constraints
2220     if (totalSize > MAX_TOTAL_SIZE || numOfBlocks >= HIGH_MARK) {
2221         // scan from the tail until the conditions are met
2222         while (totalSize > MAX_TOTAL_SIZE || numOfBlocks > LOW_MARK) {
2223             totalSize -= tail->unalignedSize;
2224             numOfBlocks--;
2225             tail = tail->prev;
2226         }
2227         LargeMemoryBlock *headToRelease = tail->next;
2228         tail->next = nullptr;
2229 
2230         extMemPool->freeLargeObjectList(headToRelease);
2231     }
2232 
2233     head.store(localHead, std::memory_order_release);
2234     return true;
2235 }
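// In short: put() pushes the freed block at the head of a doubly-linked MRU list; once the total
// cached size exceeds MAX_TOTAL_SIZE or the block count reaches HIGH_MARK, the coldest blocks are
// trimmed from the tail until at most LOW_MARK blocks remain and the size constraint holds, and
// the trimmed part is released via extMemPool->freeLargeObjectList().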
2236 
2237 template<int LOW_MARK, int HIGH_MARK>
2238 LargeMemoryBlock *LocalLOCImpl<LOW_MARK, HIGH_MARK>::get(size_t size)
2239 {
2240     LargeMemoryBlock *localHead, *res = nullptr;
2241 
2242     if (size > MAX_TOTAL_SIZE)
2243         return nullptr;
2244 
2245     // TBB_REVAMP_TODO: review this line
2246     if (!head.load(std::memory_order_acquire) || (localHead = head.exchange(nullptr)) == nullptr) {
2247         // do not restore totalSize, numOfBlocks and tail at this point,
2248         // as they are used only in put(), where they must be restored
2249         return nullptr;
2250     }
2251 
2252     for (LargeMemoryBlock *curr = localHead; curr; curr=curr->next) {
2253         if (curr->unalignedSize == size) {
2254             res = curr;
2255             if (curr->next)
2256                 curr->next->prev = curr->prev;
2257             else
2258                 tail = curr->prev;
2259             if (curr != localHead)
2260                 curr->prev->next = curr->next;
2261             else
2262                 localHead = curr->next;
2263             totalSize -= size;
2264             numOfBlocks--;
2265             break;
2266         }
2267     }
2268 
2269     head.store(localHead, std::memory_order_release);
2270     return res;
2271 }
2272 
2273 template<int LOW_MARK, int HIGH_MARK>
2274 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::externalCleanup(ExtMemoryPool *extMemPool)
2275 {
2276     if (LargeMemoryBlock *localHead = head.exchange(nullptr)) {
2277         extMemPool->freeLargeObjectList(localHead);
2278         return true;
2279     }
2280     return false;
2281 }
2282 
2283 void *MemoryPool::getFromLLOCache(TLSData* tls, size_t size, size_t alignment)
2284 {
2285     LargeMemoryBlock *lmb = nullptr;
2286 
2287     size_t headersSize = sizeof(LargeMemoryBlock)+sizeof(LargeObjectHdr);
2288     size_t allocationSize = LargeObjectCache::alignToBin(size+headersSize+alignment);
2289     if (allocationSize < size) // allocationSize is wrapped around after alignToBin
2290         return nullptr;
2291     MALLOC_ASSERT(allocationSize >= alignment, "Overflow must be checked before.");
2292 
2293     if (tls) {
2294         tls->markUsed();
2295         lmb = tls->lloc.get(allocationSize);
2296     }
2297     if (!lmb)
2298         lmb = extMemPool.mallocLargeObject(this, allocationSize);
2299 
2300     if (lmb) {
2301         // when doing the shuffle, we assume that the alignment offset guarantees
2302         // that different cache lines are in use
2303         MALLOC_ASSERT(alignment >= estimatedCacheLineSize, ASSERT_TEXT);
2304 
2305         void *alignedArea = (void*)alignUp((uintptr_t)lmb+headersSize, alignment);
2306         uintptr_t alignedRight =
2307             alignDown((uintptr_t)lmb+lmb->unalignedSize - size, alignment);
2308         // Is there some room to shuffle the object between cache lines?
2309         // Note that alignedRight and alignedArea are aligned at alignment.
2310         unsigned ptrDelta = alignedRight - (uintptr_t)alignedArea;
2311         if (ptrDelta && tls) { // !tls is cold path
2312             // for the hot path of alignment==estimatedCacheLineSize,
2313             // allow compilers to use shift for division
2314             // (since estimatedCacheLineSize is a power-of-2 constant)
2315             unsigned numOfPossibleOffsets = alignment == estimatedCacheLineSize?
2316                   ptrDelta / estimatedCacheLineSize :
2317                   ptrDelta / alignment;
2318             unsigned myCacheIdx = ++tls->currCacheIdx;
2319             unsigned offset = myCacheIdx % numOfPossibleOffsets;
2320 
2321             // Move the object to a cache line with an offset that is different from
2322             // the previous allocation's. This supposedly allows us to use cache
2323             // associativity more efficiently.
2324             alignedArea = (void*)((uintptr_t)alignedArea + offset*alignment);
2325         }
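        // Illustrative example (hypothetical numbers): with alignment == estimatedCacheLineSize
        // and ptrDelta == 4*alignment there are 4 possible placements, and successive allocations
        // from this thread rotate through them via currCacheIdx % numOfPossibleOffsets, so
        // equally-sized large objects do not all start at the same cache-line offset.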
2326         MALLOC_ASSERT((uintptr_t)lmb+lmb->unalignedSize >=
2327                       (uintptr_t)alignedArea+size, "Object doesn't fit the block.");
2328         LargeObjectHdr *header = (LargeObjectHdr*)alignedArea-1;
2329         header->memoryBlock = lmb;
2330         header->backRefIdx = lmb->backRefIdx;
2331         setBackRef(header->backRefIdx, header);
2332 
2333         lmb->objectSize = size;
2334 
2335         MALLOC_ASSERT( isLargeObject<unknownMem>(alignedArea), ASSERT_TEXT );
2336         MALLOC_ASSERT( isAligned(alignedArea, alignment), ASSERT_TEXT );
2337 
2338         return alignedArea;
2339     }
2340     return nullptr;
2341 }
2342 
2343 void MemoryPool::putToLLOCache(TLSData *tls, void *object)
2344 {
2345     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2346     // overwrite backRefIdx to simplify double free detection
2347     header->backRefIdx = BackRefIdx();
2348 
2349     if (tls) {
2350         tls->markUsed();
2351         if (tls->lloc.put(header->memoryBlock, &extMemPool))
2352             return;
2353     }
2354     extMemPool.freeLargeObject(header->memoryBlock);
2355 }
2356 
2357 /*
2358  * All aligned allocations fall into one of the following categories:
2359  *  1. if both request size and alignment are <= maxSegregatedObjectSize,
2360  *       we just align the size up, and request this amount, because for every size
2361  *       aligned to some power of 2, the allocated object is at least that aligned.
2362  * 2. for size<minLargeObjectSize, check if already guaranteed fittingAlignment is enough.
2363  * 3. if size+alignment<minLargeObjectSize, we take an object of fittingSizeN and align
2364  *       its address up; given such pointer, scalable_free could find the real object.
2365  *       Wrapping of size+alignment is impossible because maximal allowed
2366  *       alignment plus minLargeObjectSize can't lead to wrapping.
2367  * 4. otherwise, aligned large object is allocated.
2368  */
2369 static void *allocateAligned(MemoryPool *memPool, size_t size, size_t alignment)
2370 {
2371     MALLOC_ASSERT( isPowerOfTwo(alignment), ASSERT_TEXT );
2372 
2373     if (!isMallocInitialized())
2374         if (!doInitialization())
2375             return nullptr;
2376 
2377     void *result;
2378     if (size<=maxSegregatedObjectSize && alignment<=maxSegregatedObjectSize)
2379         result = internalPoolMalloc(memPool, alignUp(size? size: sizeof(size_t), alignment));
2380     else if (size<minLargeObjectSize) {
2381         if (alignment<=fittingAlignment)
2382             result = internalPoolMalloc(memPool, size);
2383         else if (size+alignment < minLargeObjectSize) {
2384             void *unaligned = internalPoolMalloc(memPool, size+alignment);
2385             if (!unaligned) return nullptr;
2386             result = alignUp(unaligned, alignment);
2387         } else
2388             goto LargeObjAlloc;
2389     } else {
2390     LargeObjAlloc:
2391         TLSData *tls = memPool->getTLS(/*create=*/true);
2392         // take into account only alignments that are higher than the natural one
2393         result =
2394             memPool->getFromLLOCache(tls, size, largeObjectAlignment>alignment?
2395                                                largeObjectAlignment: alignment);
2396     }
2397 
2398     MALLOC_ASSERT( isAligned(result, alignment), ASSERT_TEXT );
2399     return result;
2400 }
2401 
2402 static void *reallocAligned(MemoryPool *memPool, void *ptr,
2403                             size_t newSize, size_t alignment = 0)
2404 {
2405     void *result;
2406     size_t copySize;
2407 
2408     if (isLargeObject<ourMem>(ptr)) {
2409         LargeMemoryBlock* lmb = ((LargeObjectHdr *)ptr - 1)->memoryBlock;
2410         copySize = lmb->unalignedSize-((uintptr_t)ptr-(uintptr_t)lmb);
2411 
2412         // Apply different strategies if size decreases
2413         if (newSize <= copySize && (0 == alignment || isAligned(ptr, alignment))) {
2414 
2415             // For huge objects (that do not fit in backend cache), keep the same space unless
2416             // the new size is at most half of the current one
2417             bool isMemoryBlockHuge = copySize > memPool->extMemPool.backend.getMaxBinnedSize();
2418             size_t threshold = isMemoryBlockHuge ? copySize / 2 : 0;
2419             if (newSize > threshold) {
2420                 lmb->objectSize = newSize;
2421                 return ptr;
2422             }
2423             // TODO: For large objects suitable for the backend cache,
2424             // split out the excessive part and put it to the backend.
2425         }
2426         // Reallocate for real
2427         copySize = lmb->objectSize;
2428 #if BACKEND_HAS_MREMAP
2429         if (void *r = memPool->extMemPool.remap(ptr, copySize, newSize,
2430                           alignment < largeObjectAlignment ? largeObjectAlignment : alignment))
2431             return r;
2432 #endif
2433         result = alignment ? allocateAligned(memPool, newSize, alignment) :
2434             internalPoolMalloc(memPool, newSize);
2435 
2436     } else {
2437         Block* block = (Block *)alignDown(ptr, slabSize);
2438         copySize = block->findObjectSize(ptr);
2439 
2440         // TODO: Move object to another bin if size decreases and the current bin is "empty enough".
2441         // Currently, if the size decreases, the old pointer is returned
2442         if (newSize <= copySize && (0==alignment || isAligned(ptr, alignment))) {
2443             return ptr;
2444         } else {
2445             result = alignment ? allocateAligned(memPool, newSize, alignment) :
2446                 internalPoolMalloc(memPool, newSize);
2447         }
2448     }
2449     if (result) {
2450         memcpy(result, ptr, copySize < newSize ? copySize : newSize);
2451         internalPoolFree(memPool, ptr, 0);
2452     }
2453     return result;
2454 }
2455 
2456 #if MALLOC_DEBUG
2457 /* A predicate checks if an object is properly placed inside its block */
2458 inline bool Block::isProperlyPlaced(const void *object) const
2459 {
2460     return 0 == ((uintptr_t)this + slabSize - (uintptr_t)object) % objectSize;
2461 }
2462 #endif
2463 
2464 /* Finds the real object inside the block */
2465 FreeObject *Block::findAllocatedObject(const void *address) const
2466 {
2467     // calculate offset from the end of the block space
2468     uint16_t offset = (uintptr_t)this + slabSize - (uintptr_t)address;
2469     MALLOC_ASSERT( offset<=slabSize-sizeof(Block), ASSERT_TEXT );
2470     // find offset difference from a multiple of allocation size
2471     offset %= objectSize;
2472     // and move the address down to where the real object starts.
2473     return (FreeObject*)((uintptr_t)address - (offset? objectSize-offset: 0));
2474 }
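// Worked example (hypothetical numbers): with objectSize == 1024, objects start at multiples of
// 1024 counted back from this + slabSize. For an aligned address 476 bytes past its object start,
// offset % objectSize == 548, so the address is moved down by 1024 - 548 == 476 bytes, landing
// exactly on the object start.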
2475 
2476 /*
2477  * Bad dereference caused by a foreign pointer is possible only here, not earlier in call chain.
2478  * Separate function isolates SEH code, as it has bad influence on compiler optimization.
2479  */
2480 static inline BackRefIdx safer_dereference (const BackRefIdx *ptr)
2481 {
2482     BackRefIdx id;
2483 #if _MSC_VER
2484     __try {
2485 #endif
2486         id = dereference(ptr);
2487 #if _MSC_VER
2488     } __except( GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION?
2489                 EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH ) {
2490         id = BackRefIdx();
2491     }
2492 #endif
2493     return id;
2494 }
2495 
2496 template<MemoryOrigin memOrigin>
2497 bool isLargeObject(void *object)
2498 {
2499     if (!isAligned(object, largeObjectAlignment))
2500         return false;
2501     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2502     BackRefIdx idx = (memOrigin == unknownMem) ?
2503         safer_dereference(&header->backRefIdx) : dereference(&header->backRefIdx);
2504 
2505     return idx.isLargeObject()
2506         // in valid LargeObjectHdr memoryBlock is not nullptr
2507         && header->memoryBlock
2508         // in valid LargeObjectHdr memoryBlock points somewhere before header
2509         // TODO: more strict check
2510         && (uintptr_t)header->memoryBlock < (uintptr_t)header
2511         && getBackRef(idx) == header;
2512 }
2513 
2514 static inline bool isSmallObject (void *ptr)
2515 {
2516     Block* expectedBlock = (Block*)alignDown(ptr, slabSize);
2517     const BackRefIdx* idx = expectedBlock->getBackRefIdx();
2518 
2519     bool isSmall = expectedBlock == getBackRef(safer_dereference(idx));
2520     if (isSmall)
2521         expectedBlock->checkFreePrecond(ptr);
2522     return isSmall;
2523 }
2524 
2525 /**** Check if an object was allocated by scalable_malloc ****/
2526 static inline bool isRecognized (void* ptr)
2527 {
2528     return defaultMemPool->extMemPool.backend.ptrCanBeValid(ptr) &&
2529         (isLargeObject<unknownMem>(ptr) || isSmallObject(ptr));
2530 }
2531 
2532 static inline void freeSmallObject(void *object)
2533 {
2534     /* mask low bits to get the block */
2535     Block *block = (Block *)alignDown(object, slabSize);
2536     block->checkFreePrecond(object);
2537 
2538 #if MALLOC_CHECK_RECURSION
2539     if (block->isStartupAllocObject()) {
2540         ((StartupBlock *)block)->free(object);
2541         return;
2542     }
2543 #endif
2544     if (block->isOwnedByCurrentThread()) {
2545         block->freeOwnObject(object);
2546     } else { /* Slower path to add to the shared list, the allocatedCount is updated by the owner thread in malloc. */
2547         FreeObject *objectToFree = block->findObjectToFree(object);
2548         block->freePublicObject(objectToFree);
2549     }
2550 }
2551 
2552 static void *internalPoolMalloc(MemoryPool* memPool, size_t size)
2553 {
2554     Bin* bin;
2555     Block * mallocBlock;
2556 
2557     if (!memPool) return nullptr;
2558 
2559     if (!size) size = sizeof(size_t);
2560 
2561     TLSData *tls = memPool->getTLS(/*create=*/true);
2562 
2563     /* Allocate a large object */
2564     if (size >= minLargeObjectSize)
2565         return memPool->getFromLLOCache(tls, size, largeObjectAlignment);
2566 
2567     if (!tls) return nullptr;
2568 
2569     tls->markUsed();
2570     /*
2571      * Get an element in thread-local array corresponding to the given size;
2572      * It keeps ptr to the active block for allocations of this size
2573      */
2574     bin = tls->getAllocationBin(size);
2575     if ( !bin ) return nullptr;
2576 
2577     /* Get a block to try to allocate in. */
2578     for( mallocBlock = bin->getActiveBlock(); mallocBlock;
2579          mallocBlock = bin->setPreviousBlockActive() ) // the previous block should be empty enough
2580     {
2581         if( FreeObject *result = mallocBlock->allocate() )
2582             return result;
2583     }
2584 
2585     /*
2586      * else privatize publicly freed objects in some block and allocate from it
2587      */
2588     mallocBlock = bin->getPrivatizedFreeListBlock();
2589     if (mallocBlock) {
2590         MALLOC_ASSERT( mallocBlock->freeListNonNull(), ASSERT_TEXT );
2591         if ( FreeObject *result = mallocBlock->allocateFromFreeList() )
2592             return result;
2593         /* Else something strange happened, need to retry from the beginning; */
2594         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in public free list; reentering.\n" ));
2595         return internalPoolMalloc(memPool, size);
2596     }
2597 
2598     /*
2599      * no suitable own blocks, try to get a partial block that some other thread has discarded.
2600      */
2601     mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2602     while (mallocBlock) {
2603         bin->pushTLSBin(mallocBlock);
2604         bin->setActiveBlock(mallocBlock); // TODO: move under the below condition?
2605         if( FreeObject *result = mallocBlock->allocate() )
2606             return result;
2607         mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2608     }
2609 
2610     /*
2611      * else try to get a new empty block
2612      */
2613     mallocBlock = memPool->getEmptyBlock(size);
2614     if (mallocBlock) {
2615         bin->pushTLSBin(mallocBlock);
2616         bin->setActiveBlock(mallocBlock);
2617         if( FreeObject *result = mallocBlock->allocate() )
2618             return result;
2619         /* Else something strange happened, need to retry from the beginning; */
2620         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in empty block; reentering.\n" ));
2621         return internalPoolMalloc(memPool, size);
2622     }
2623     /*
2624      * else nothing works so return nullptr
2625      */
2626     TRACEF(( "[ScalableMalloc trace] No memory found, returning nullptr.\n" ));
2627     return nullptr;
2628 }
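// Allocation fallback order used above: the bin's active block (and, if needed, the blocks
// preceding it in the bin), then a block whose public free list can be privatized, then an
// orphaned block discarded by another thread, then a fresh empty block from the backend;
// if all of these fail, nullptr is returned.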
2629 
2630 // When size==0 (i.e. unknown), detect here whether the object is large.
2631 // When the size is known and < minLargeObjectSize, we still need to check
2632 // if the actual object is large, because large objects might be used
2633 // for aligned small allocations.
2634 static bool internalPoolFree(MemoryPool *memPool, void *object, size_t size)
2635 {
2636     if (!memPool || !object) return false;
2637 
2638     // The library is initialized at allocation call, so releasing while
2639     // not initialized means a foreign object is being released.
2640     MALLOC_ASSERT(isMallocInitialized(), ASSERT_TEXT);
2641     MALLOC_ASSERT(memPool->extMemPool.userPool() || isRecognized(object),
2642                   "Invalid pointer during object releasing is detected.");
2643 
2644     if (size >= minLargeObjectSize || isLargeObject<ourMem>(object))
2645         memPool->putToLLOCache(memPool->getTLS(/*create=*/false), object);
2646     else
2647         freeSmallObject(object);
2648     return true;
2649 }
2650 
2651 static void *internalMalloc(size_t size)
2652 {
2653     if (!size) size = sizeof(size_t);
2654 
2655 #if MALLOC_CHECK_RECURSION
2656     if (RecursiveMallocCallProtector::sameThreadActive())
2657         return size<minLargeObjectSize? StartupBlock::allocate(size) :
2658             // nested allocation, so skip tls
2659             (FreeObject*)defaultMemPool->getFromLLOCache(nullptr, size, slabSize);
2660 #endif
2661 
2662     if (!isMallocInitialized())
2663         if (!doInitialization())
2664             return nullptr;
2665     return internalPoolMalloc(defaultMemPool, size);
2666 }
2667 
2668 static void internalFree(void *object)
2669 {
2670     internalPoolFree(defaultMemPool, object, 0);
2671 }
2672 
2673 static size_t internalMsize(void* ptr)
2674 {
2675     MALLOC_ASSERT(ptr, "Invalid pointer passed to internalMsize");
2676     if (isLargeObject<ourMem>(ptr)) {
2677         // TODO: return the maximum memory size, that can be written to this object
2678         LargeMemoryBlock* lmb = ((LargeObjectHdr*)ptr - 1)->memoryBlock;
2679         return lmb->objectSize;
2680     } else {
2681         Block *block = (Block*)alignDown(ptr, slabSize);
2682         return block->findObjectSize(ptr);
2683     }
2684 }
2685 
2686 } // namespace internal
2687 
2688 using namespace rml::internal;
2689 
2690 // legacy entry point saved for compatibility with binaries compiled
2691 // with pre-6003 versions of TBB
2692 TBBMALLOC_EXPORT rml::MemoryPool *pool_create(intptr_t pool_id, const MemPoolPolicy *policy)
2693 {
2694     rml::MemoryPool *pool;
2695     MemPoolPolicy pol(policy->pAlloc, policy->pFree, policy->granularity);
2696 
2697     pool_create_v1(pool_id, &pol, &pool);
2698     return pool;
2699 }
2700 
2701 rml::MemPoolError pool_create_v1(intptr_t pool_id, const MemPoolPolicy *policy,
2702                                  rml::MemoryPool **pool)
2703 {
2704     if ( !policy->pAlloc || policy->version<MemPoolPolicy::TBBMALLOC_POOL_VERSION
2705          // empty pFree allowed only for fixed pools
2706          || !(policy->fixedPool || policy->pFree)) {
2707         *pool = nullptr;
2708         return INVALID_POLICY;
2709     }
2710     if ( policy->version>MemPoolPolicy::TBBMALLOC_POOL_VERSION // future versions are not supported
2711          // new flags can be added in place of reserved, but default
2712          // behaviour must be supported by this version
2713          || policy->reserved ) {
2714         *pool = nullptr;
2715         return UNSUPPORTED_POLICY;
2716     }
2717     if (!isMallocInitialized())
2718         if (!doInitialization()) {
2719             *pool = nullptr;
2720             return NO_MEMORY;
2721         }
2722     rml::internal::MemoryPool *memPool =
2723         (rml::internal::MemoryPool*)internalMalloc((sizeof(rml::internal::MemoryPool)));
2724     if (!memPool) {
2725         *pool = nullptr;
2726         return NO_MEMORY;
2727     }
2728     memset(static_cast<void*>(memPool), 0, sizeof(rml::internal::MemoryPool));
2729     if (!memPool->init(pool_id, policy)) {
2730         internalFree(memPool);
2731         *pool = nullptr;
2732         return NO_MEMORY;
2733     }
2734 
2735     *pool = (rml::MemoryPool*)memPool;
2736     return POOL_OK;
2737 }
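// A minimal usage sketch (illustrative only; rawAlloc and rawFree stand for user-supplied
// callbacks with the signatures required by MemPoolPolicy):
//
//     MemPoolPolicy policy(rawAlloc, rawFree, /*granularity=*/0);
//     rml::MemoryPool *pool;
//     if (pool_create_v1(/*pool_id=*/0, &policy, &pool) == POOL_OK) {
//         void *p = pool_malloc(pool, 1024);
//         pool_free(pool, p);
//         pool_destroy(pool);
//     }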
2738 
2739 bool pool_destroy(rml::MemoryPool* memPool)
2740 {
2741     if (!memPool) return false;
2742     bool ret = ((rml::internal::MemoryPool*)memPool)->destroy();
2743     internalFree(memPool);
2744 
2745     return ret;
2746 }
2747 
2748 bool pool_reset(rml::MemoryPool* memPool)
2749 {
2750     if (!memPool) return false;
2751 
2752     return ((rml::internal::MemoryPool*)memPool)->reset();
2753 }
2754 
2755 void *pool_malloc(rml::MemoryPool* mPool, size_t size)
2756 {
2757     return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2758 }
2759 
2760 void *pool_realloc(rml::MemoryPool* mPool, void *object, size_t size)
2761 {
2762     if (!object)
2763         return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2764     if (!size) {
2765         internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2766         return nullptr;
2767     }
2768     return reallocAligned((rml::internal::MemoryPool*)mPool, object, size, 0);
2769 }
2770 
2771 void *pool_aligned_malloc(rml::MemoryPool* mPool, size_t size, size_t alignment)
2772 {
2773     if (!isPowerOfTwo(alignment) || 0==size)
2774         return nullptr;
2775 
2776     return allocateAligned((rml::internal::MemoryPool*)mPool, size, alignment);
2777 }
2778 
2779 void *pool_aligned_realloc(rml::MemoryPool* memPool, void *ptr, size_t size, size_t alignment)
2780 {
2781     if (!isPowerOfTwo(alignment))
2782         return nullptr;
2783     rml::internal::MemoryPool *mPool = (rml::internal::MemoryPool*)memPool;
2784     void *tmp;
2785 
2786     if (!ptr)
2787         tmp = allocateAligned(mPool, size, alignment);
2788     else if (!size) {
2789         internalPoolFree(mPool, ptr, 0);
2790         return nullptr;
2791     } else
2792         tmp = reallocAligned(mPool, ptr, size, alignment);
2793 
2794     return tmp;
2795 }
2796 
2797 bool pool_free(rml::MemoryPool *mPool, void *object)
2798 {
2799     return internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2800 }
2801 
2802 rml::MemoryPool *pool_identify(void *object)
2803 {
2804     rml::internal::MemoryPool *pool;
2805     if (isLargeObject<ourMem>(object)) {
2806         LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2807         pool = header->memoryBlock->pool;
2808     } else {
2809         Block *block = (Block*)alignDown(object, slabSize);
2810         pool = block->getMemPool();
2811     }
2812     // do not return defaultMemPool, as it can't be used in pool_free() etc
2813     __TBB_ASSERT_RELEASE(pool!=defaultMemPool,
2814         "rml::pool_identify() can't be used for scalable_malloc() etc results.");
2815     return (rml::MemoryPool*)pool;
2816 }
2817 
2818 size_t pool_msize(rml::MemoryPool *mPool, void* object)
2819 {
2820     if (object) {
2821         // No assert for object recognition, because objects allocated from a non-default
2822         // memory pool do not participate in range checking and do not have valid backreferences for
2823         // small objects. Instead, check that the object belongs to the specified memory pool.
2824         MALLOC_ASSERT_EX(mPool == pool_identify(object), "Object does not belong to the specified pool");
2825         return internalMsize(object);
2826     }
2827     errno = EINVAL;
2828     // Unlike _msize, return 0 in case of parameter error.
2829     // Returning size_t(-1) looks more like the way to troubles.
2830     return 0;
2831 }
2832 
2833 } // namespace rml
2834 
2835 using namespace rml::internal;
2836 
2837 #if MALLOC_TRACE
2838 static unsigned int threadGoingDownCount = 0;
2839 #endif
2840 
2841 /*
2842  * When a thread is shutting down, this routine should be called to remove all the thread ids
2843  * from the malloc blocks and replace them with a nullptr thread id.
2844  *
2845  * For pthreads, the function is set as a callback in pthread_key_create for the TLS bin.
2846  * It will be automatically called at thread exit with the key value as the argument,
2847  * unless that value is nullptr.
2848  * For Windows, it is called from DllMain( DLL_THREAD_DETACH ).
2849  *
2850  * However, neither of the above is called for the main process thread, so the routine
2851  * also needs to be called during process shutdown.
2852  *
2853 */
2854 // TODO: Consider making this function part of class MemoryPool.
2855 void doThreadShutdownNotification(TLSData* tls, bool main_thread)
2856 {
2857     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return start %d\n",
2858              getThreadId(),  threadGoingDownCount++ ));
2859 
2860 #if USE_PTHREAD
2861     if (tls) {
2862         if (!shutdownSync.threadDtorStart()) return;
2863         tls->getMemPool()->onThreadShutdown(tls);
2864         shutdownSync.threadDtorDone();
2865     } else
2866 #endif
2867     {
2868         suppress_unused_warning(tls); // not used on Windows
2869         // The default pool is safe to use at this point:
2870         //   on Linux, only the main thread can go here before destroying defaultMemPool;
2871         //   on Windows, shutdown is synchronized via loader lock and isMallocInitialized().
2872         // See also __TBB_mallocProcessShutdownNotification()
2873         defaultMemPool->onThreadShutdown(defaultMemPool->getTLS(/*create=*/false));
2874         // Take lock to walk through other pools; but waiting might be dangerous at this point
2875         // (e.g. on Windows the main thread might deadlock)
2876         bool locked = false;
2877         MallocMutex::scoped_lock lock(MemoryPool::memPoolListLock, /*wait=*/!main_thread, &locked);
2878         if (locked) { // the list is safe to process
2879             for (MemoryPool *memPool = defaultMemPool->next; memPool; memPool = memPool->next)
2880                 memPool->onThreadShutdown(memPool->getTLS(/*create=*/false));
2881         }
2882     }
2883 
2884     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return end\n", getThreadId() ));
2885 }
2886 
2887 #if USE_PTHREAD
2888 void mallocThreadShutdownNotification(void* arg)
2889 {
2890     // The routine is called for each pool (as TLS dtor) on each thread, except for the main thread
2891     if (!isMallocInitialized()) return;
2892     doThreadShutdownNotification((TLSData*)arg, false);
2893 }
2894 #else
2895 extern "C" void __TBB_mallocThreadShutdownNotification()
2896 {
2897     // The routine is called once per thread on Windows
2898     if (!isMallocInitialized()) return;
2899     doThreadShutdownNotification(nullptr, false);
2900 }
2901 #endif
2902 
2903 extern "C" void __TBB_mallocProcessShutdownNotification(bool windows_process_dying)
2904 {
2905     if (!isMallocInitialized()) return;
2906 
2907     // Don't clean allocator internals if the entire process is exiting
2908     if (!windows_process_dying) {
2909         doThreadShutdownNotification(nullptr, /*main_thread=*/true);
2910     }
2911 #if  __TBB_MALLOC_LOCACHE_STAT
2912     printf("cache hit ratio %f, size hit %f\n",
2913            1.*cacheHits/mallocCalls, 1.*memHitKB/memAllocKB);
2914     defaultMemPool->extMemPool.loc.reportStat(stdout);
2915 #endif
2916 
2917     shutdownSync.processExit();
2918 #if __TBB_SOURCE_DIRECTLY_INCLUDED
2919 /* Pthread keys must be deleted as soon as possible so that the key destructor is not called
2920    at thread termination, when the tbbmalloc code may already be unloaded.
2921 */
2922     defaultMemPool->destroy();
2923     destroyBackRefMain(&defaultMemPool->extMemPool.backend);
2924     ThreadId::destroy();      // Delete key for thread id
2925     hugePages.reset();
2926     // new total malloc initialization is possible after this point
2927     mallocInitialized.store(0, std::memory_order_release);
2928 #endif // __TBB_SOURCE_DIRECTLY_INCLUDED
2929 
2930 #if COLLECT_STATISTICS
2931     unsigned nThreads = ThreadId::getMaxThreadId();
2932     for( int i=1; i<=nThreads && i<MAX_THREADS; ++i )
2933         STAT_print(i);
2934 #endif
2935     if (!usedBySrcIncluded) {
2936         MALLOC_ITT_FINI_ITTLIB();
2937         MALLOC_ITT_RELEASE_RESOURCES();
2938     }
2939 }
2940 
2941 extern "C" void * scalable_malloc(size_t size)
2942 {
2943     void *ptr = internalMalloc(size);
2944     if (!ptr) errno = ENOMEM;
2945     return ptr;
2946 }
2947 
2948 extern "C" void scalable_free(void *object)
2949 {
2950     internalFree(object);
2951 }
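
/* Illustrative usage sketch (not compiled here): scalable_malloc() reports failure the same way
   as C malloc, returning nullptr and setting errno to ENOMEM; like free(), scalable_free()
   accepts a null pointer.

   #include <cerrno>

   void malloc_usage_sketch() {
       void *p = scalable_malloc(4096);
       if (!p)
           return;          // errno == ENOMEM here
       scalable_free(p);
   }
*/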
2952 
2953 #if MALLOC_ZONE_OVERLOAD_ENABLED
2954 extern "C" void __TBB_malloc_free_definite_size(void *object, size_t size)
2955 {
2956     internalPoolFree(defaultMemPool, object, size);
2957 }
2958 #endif
2959 
2960 /*
2961  * A variant that provides additional memory safety, by checking whether the given address
2962  * was obtained with this allocator, and if not redirecting to the provided alternative call.
2963  */
2964 extern "C" TBBMALLOC_EXPORT void __TBB_malloc_safer_free(void *object, void (*original_free)(void*))
2965 {
2966     if (!object)
2967         return;
2968 
2969     // tbbmalloc can allocate object only when tbbmalloc has been initialized
2970     if (mallocInitialized.load(std::memory_order_acquire) && defaultMemPool->extMemPool.backend.ptrCanBeValid(object)) {
2971         if (isLargeObject<unknownMem>(object)) {
2972             // Must check for a large object first, because the small-object check touches
2973             // 4 pages to the left, which may be inaccessible.
2974             TLSData *tls = defaultMemPool->getTLS(/*create=*/false);
2975 
2976             defaultMemPool->putToLLOCache(tls, object);
2977             return;
2978         } else if (isSmallObject(object)) {
2979             freeSmallObject(object);
2980             return;
2981         }
2982     }
2983     if (original_free)
2984         original_free(object);
2985 }
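
/* Illustrative sketch (not compiled here) of how a malloc-replacement layer is expected to use
   the "safer" entry point above: route every free() through __TBB_malloc_safer_free() and pass
   the original allocator's free as the fallback, so that foreign pointers are still released
   correctly. free_replacement() and original_free_ptr are hypothetical proxy-layer names.

   static void (*original_free_ptr)(void*);   // resolved by the proxy layer at load time

   void free_replacement(void *ptr) {
       __TBB_malloc_safer_free(ptr, original_free_ptr);
   }
*/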
2986 
2987 /********* End the free code        *************/
2988 
2989 /********* Code for scalable_realloc       ***********/
2990 
2991 /*
2992  * From K&R
2993  * "realloc changes the size of the object pointed to by p to size. The contents will
2994  * be unchanged up to the minimum of the old and the new sizes. If the new size is larger,
2995  * the new space is uninitialized. realloc returns a pointer to the new space, or
2996  * nullptr if the request cannot be satisfied, in which case *p is unchanged."
2997  *
2998  */
2999 extern "C" void* scalable_realloc(void* ptr, size_t size)
3000 {
3001     void *tmp;
3002 
3003     if (!ptr)
3004         tmp = internalMalloc(size);
3005     else if (!size) {
3006         internalFree(ptr);
3007         return nullptr;
3008     } else
3009         tmp = reallocAligned(defaultMemPool, ptr, size, 0);
3010 
3011     if (!tmp) errno = ENOMEM;
3012     return tmp;
3013 }
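
/* Illustrative usage note (not compiled here): as the K&R quote above states, the original block
   is left unchanged when scalable_realloc() fails, so callers should assign the result to a
   temporary to avoid losing the old pointer. grow_buffer() is a hypothetical helper name.

   void *grow_buffer(void *buf, size_t newSize) {
       void *tmp = scalable_realloc(buf, newSize);
       return tmp ? tmp : buf;   // on failure (errno == ENOMEM) keep the old block
   }
*/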
3014 
3015 /*
3016  * A variant that provides additional memory safety, by checking whether the given address
3017  * was obtained with this allocator, and if not redirecting to the provided alternative call.
3018  */
3019 extern "C" TBBMALLOC_EXPORT void* __TBB_malloc_safer_realloc(void* ptr, size_t sz, void* original_realloc)
3020 {
3021     void *tmp = nullptr; // initialized to avoid warnings about potentially uninitialized use
3022 
3023     if (!ptr) {
3024         tmp = internalMalloc(sz);
3025     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3026         if (!sz) {
3027             internalFree(ptr);
3028             return nullptr;
3029         } else {
3030             tmp = reallocAligned(defaultMemPool, ptr, sz, 0);
3031         }
3032     }
3033 #if USE_WINTHREAD
3034     else if (original_realloc && sz) {
3035         orig_ptrs *original_ptrs = static_cast<orig_ptrs*>(original_realloc);
3036         if ( original_ptrs->msize ){
3037             size_t oldSize = original_ptrs->msize(ptr);
3038             tmp = internalMalloc(sz);
3039             if (tmp) {
3040                 memcpy(tmp, ptr, sz<oldSize? sz : oldSize);
3041                 if ( original_ptrs->free ){
3042                     original_ptrs->free( ptr );
3043                 }
3044             }
3045         } else
3046             tmp = nullptr;
3047     }
3048 #else
3049     else if (original_realloc) {
3050         typedef void* (*realloc_ptr_t)(void*,size_t);
3051         realloc_ptr_t original_realloc_ptr;
3052         (void *&)original_realloc_ptr = original_realloc;
3053         tmp = original_realloc_ptr(ptr,sz);
3054     }
3055 #endif
3056     else tmp = nullptr;
3057 
3058     if (!tmp) errno = ENOMEM;
3059     return tmp;
3060 }
3061 
3062 /********* End code for scalable_realloc   ***********/
3063 
3064 /********* Code for scalable_calloc   ***********/
3065 
3066 /*
3067  * From K&R
3068  * calloc returns a pointer to space for an array of nobj objects,
3069  * each of size size, or nullptr if the request cannot be satisfied.
3070  * The space is initialized to zero bytes.
3071  *
3072  */
3073 
3074 extern "C" void * scalable_calloc(size_t nobj, size_t size)
3075 {
3076     // 2^(half the bits of size_t), i.e. the square root of SIZE_MAX + 1
3077     const size_t mult_not_overflow = size_t(1) << (sizeof(size_t)*CHAR_BIT/2);
3078     const size_t arraySize = nobj * size;
3079 
3080     // check for overflow during multiplication:
3081     if (nobj>=mult_not_overflow || size>=mult_not_overflow) // 1) heuristic check
3082         if (nobj && arraySize / nobj != size) {             // 2) exact check
3083             errno = ENOMEM;
3084             return nullptr;
3085         }
3086     void* result = internalMalloc(arraySize);
3087     if (result)
3088         memset(result, 0, arraySize);
3089     else
3090         errno = ENOMEM;
3091     return result;
3092 }
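
/* Worked example for the overflow check above, assuming a 64-bit size_t:
   mult_not_overflow == 2^32. If both nobj and size are below 2^32, their product is below 2^64
   and cannot wrap, so the cheap heuristic test is enough. Otherwise the exact check divides
   back: e.g. nobj == 2^33 and size == 2^33 gives arraySize == 0 (mod 2^64), and
   arraySize / nobj == 0 != size, so the call fails with ENOMEM instead of returning a too-small
   zeroed buffer.
*/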
3093 
3094 /********* End code for scalable_calloc   ***********/
3095 
3096 /********* Code for aligned allocation API **********/
3097 
3098 extern "C" int scalable_posix_memalign(void **memptr, size_t alignment, size_t size)
3099 {
3100     if ( !isPowerOfTwoAtLeast(alignment, sizeof(void*)) )
3101         return EINVAL;
3102     void *result = allocateAligned(defaultMemPool, size, alignment);
3103     if (!result)
3104         return ENOMEM;
3105     *memptr = result;
3106     return 0;
3107 }
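
/* Illustrative usage sketch (not compiled here): unlike the other entry points in this file,
   scalable_posix_memalign() follows POSIX conventions and reports problems through its return
   value rather than through errno.

   void posix_memalign_sketch() {
       void *p = nullptr;
       // 0 on success, EINVAL if alignment is not a power of two at least sizeof(void*),
       // ENOMEM if the request cannot be satisfied.
       if (scalable_posix_memalign(&p, 64, 256) == 0)
           scalable_free(p);
   }
*/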
3108 
3109 extern "C" void * scalable_aligned_malloc(size_t size, size_t alignment)
3110 {
3111     if (!isPowerOfTwo(alignment) || 0==size) {
3112         errno = EINVAL;
3113         return nullptr;
3114     }
3115     void *tmp = allocateAligned(defaultMemPool, size, alignment);
3116     if (!tmp) errno = ENOMEM;
3117     return tmp;
3118 }
3119 
3120 extern "C" void * scalable_aligned_realloc(void *ptr, size_t size, size_t alignment)
3121 {
3122     if (!isPowerOfTwo(alignment)) {
3123         errno = EINVAL;
3124         return nullptr;
3125     }
3126     void *tmp;
3127 
3128     if (!ptr)
3129         tmp = allocateAligned(defaultMemPool, size, alignment);
3130     else if (!size) {
3131         scalable_free(ptr);
3132         return nullptr;
3133     } else
3134         tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3135 
3136     if (!tmp) errno = ENOMEM;
3137     return tmp;
3138 }
3139 
3140 extern "C" TBBMALLOC_EXPORT void * __TBB_malloc_safer_aligned_realloc(void *ptr, size_t size, size_t alignment, void* orig_function)
3141 {
3142     /* corner cases left out of reallocAligned to not deal with errno there */
3143     if (!isPowerOfTwo(alignment)) {
3144         errno = EINVAL;
3145         return nullptr;
3146     }
3147     void *tmp = nullptr;
3148 
3149     if (!ptr) {
3150         tmp = allocateAligned(defaultMemPool, size, alignment);
3151     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3152         if (!size) {
3153             internalFree(ptr);
3154             return nullptr;
3155         } else {
3156             tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3157         }
3158     }
3159 #if USE_WINTHREAD
3160     else {
3161         orig_aligned_ptrs *original_ptrs = static_cast<orig_aligned_ptrs*>(orig_function);
3162         if (size) {
3163             // Without aligned_msize, the old size cannot be determined,
3164             // so leave the old pointer untouched.
3165             if ( original_ptrs->aligned_msize ){
3166                 // pass the minimal alignment and zero offset in the hope of getting a correct oldSize
3167                 size_t oldSize = original_ptrs->aligned_msize(ptr, sizeof(void*), 0);
3168                 tmp = allocateAligned(defaultMemPool, size, alignment);
3169                 if (tmp) {
3170                     memcpy(tmp, ptr, size<oldSize? size : oldSize);
3171                     if ( original_ptrs->aligned_free ){
3172                         original_ptrs->aligned_free( ptr );
3173                     }
3174                 }
3175             }
3176         } else {
3177             if ( original_ptrs->aligned_free ){
3178                 original_ptrs->aligned_free( ptr );
3179             }
3180             return nullptr;
3181         }
3182     }
3183 #else
3184     // Since original_realloc cannot align the result, and there is no way to find
3185     // the size of the object being reallocated, we give up.
3186     suppress_unused_warning(orig_function);
3187 #endif
3188     if (!tmp) errno = ENOMEM;
3189     return tmp;
3190 }
3191 
3192 extern "C" void scalable_aligned_free(void *ptr)
3193 {
3194     internalFree(ptr);
3195 }
3196 
3197 /********* end code for aligned allocation API **********/
3198 
3199 /********* Code for scalable_msize       ***********/
3200 
3201 /*
3202  * Returns the size of a memory block allocated in the heap.
3203  */
3204 extern "C" size_t scalable_msize(void* ptr)
3205 {
3206     if (ptr) {
3207         MALLOC_ASSERT(isRecognized(ptr), "Invalid pointer in scalable_msize detected.");
3208         return internalMsize(ptr);
3209     }
3210     errno = EINVAL;
3211     // Unlike _msize, return 0 in case of a parameter error;
3212     // returning size_t(-1) would be more likely to lead the caller into trouble.
3213     return 0;
3214 }
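
/* Illustrative usage note (not compiled here): scalable_msize() reports the usable size of the
   block, which may exceed the originally requested size because of size-class rounding.

   void msize_sketch() {
       void *p = scalable_malloc(100);
       if (p) {
           size_t usable = scalable_msize(p);   // usable >= 100
           (void)usable;
           scalable_free(p);
       }
   }
*/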
3215 
3216 /*
3217  * A variant that provides additional memory safety, by checking whether the given address
3218  * was obtained with this allocator, and if not redirecting to the provided alternative call.
3219  */
3220 extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_msize(void *object, size_t (*original_msize)(void*))
3221 {
3222     if (object) {
3223         // Check if the memory was allocated by scalable_malloc
3224         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3225             return internalMsize(object);
3226         else if (original_msize)
3227             return original_msize(object);
3228     }
3229     // object is nullptr or unknown, or foreign and no original_msize
3230 #if USE_WINTHREAD
3231     errno = EINVAL; // errno expected to be set only on this platform
3232 #endif
3233     return 0;
3234 }
3235 
3236 /*
3237  * The same as above but for _aligned_msize case
3238  */
3239 extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_aligned_msize(void *object, size_t alignment, size_t offset, size_t (*orig_aligned_msize)(void*,size_t,size_t))
3240 {
3241     if (object) {
3242         // Check if the memory was allocated by scalable_malloc
3243         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3244             return internalMsize(object);
3245         else if (orig_aligned_msize)
3246             return orig_aligned_msize(object,alignment,offset);
3247     }
3248     // object is nullptr or unknown
3249     errno = EINVAL;
3250     return 0;
3251 }
3252 
3253 /********* End code for scalable_msize   ***********/
3254 
3255 extern "C" int scalable_allocation_mode(int param, intptr_t value)
3256 {
3257     if (param == TBBMALLOC_SET_SOFT_HEAP_LIMIT) {
3258         defaultMemPool->extMemPool.backend.setRecommendedMaxSize((size_t)value);
3259         return TBBMALLOC_OK;
3260     } else if (param == USE_HUGE_PAGES) {
3261 #if __unix__
3262         switch (value) {
3263         case 0:
3264         case 1:
3265             hugePages.setMode(value);
3266             return TBBMALLOC_OK;
3267         default:
3268             return TBBMALLOC_INVALID_PARAM;
3269         }
3270 #else
3271         return TBBMALLOC_NO_EFFECT;
3272 #endif
3273 #if __TBB_SOURCE_DIRECTLY_INCLUDED
3274     } else if (param == TBBMALLOC_INTERNAL_SOURCE_INCLUDED) {
3275         switch (value) {
3276         case 0: // used by dynamic library
3277         case 1: // used by static library or directly included sources
3278             usedBySrcIncluded = value;
3279             return TBBMALLOC_OK;
3280         default:
3281             return TBBMALLOC_INVALID_PARAM;
3282         }
3283 #endif
3284     } else if (param == TBBMALLOC_SET_HUGE_SIZE_THRESHOLD) {
3285         defaultMemPool->extMemPool.loc.setHugeSizeThreshold((size_t)value);
3286         return TBBMALLOC_OK;
3287     }
3288     return TBBMALLOC_INVALID_PARAM;
3289 }
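
/* Illustrative usage sketch (not compiled here) for the tuning parameters handled above; the
   constants come from oneapi/tbb/scalable_allocator.h, and the 1 GB figure is an arbitrary
   example value.

   void allocation_mode_sketch() {
       // Soft limit (a hint, not a hard cap) on how much memory the allocator keeps cached.
       scalable_allocation_mode(TBBMALLOC_SET_SOFT_HEAP_LIMIT, (intptr_t)1024 * 1024 * 1024);

       // Request huge pages; on builds without huge-page support this returns TBBMALLOC_NO_EFFECT.
       int res = scalable_allocation_mode(USE_HUGE_PAGES, 1);
       (void)res;
   }
*/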
3290 
3291 extern "C" int scalable_allocation_command(int cmd, void *param)
3292 {
3293     if (param)
3294         return TBBMALLOC_INVALID_PARAM;
3295 
3296     bool released = false;
3297     switch(cmd) {
3298     case TBBMALLOC_CLEAN_THREAD_BUFFERS:
3299         if (TLSData *tls = defaultMemPool->getTLS(/*create=*/false))
3300             released = tls->externalCleanup(/*cleanOnlyUnused*/false, /*cleanBins=*/true);
3301         break;
3302     case TBBMALLOC_CLEAN_ALL_BUFFERS:
3303         released = defaultMemPool->extMemPool.hardCachesCleanup(true);
3304         break;
3305     default:
3306         return TBBMALLOC_INVALID_PARAM;
3307     }
3308     return released ? TBBMALLOC_OK : TBBMALLOC_NO_EFFECT;
3309 }
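
/* Illustrative usage sketch (not compiled here) for the cleanup commands handled above; the
   second argument is reserved and must be nullptr.

   void cleanup_sketch() {
       // Return the calling thread's cached blocks to the common state.
       scalable_allocation_command(TBBMALLOC_CLEAN_THREAD_BUFFERS, nullptr);

       // Drop the internal caches for all threads; TBBMALLOC_NO_EFFECT means nothing was released.
       int res = scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, nullptr);
       (void)res;
   }
*/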
3310