xref: /oneTBB/src/tbbmalloc/frontend.cpp (revision 6caecf96)
1 /*
2     Copyright (c) 2005-2021 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include "tbbmalloc_internal.h"
18 #include <errno.h>
19 #include <new>        /* for placement new */
20 #include <string.h>   /* for memset */
21 
22 #include "oneapi/tbb/version.h"
23 #include "../tbb/environment.h"
24 #include "../tbb/itt_notify.h" // for __TBB_load_ittnotify()
25 
26 #if USE_PTHREAD
27     #define TlsSetValue_func pthread_setspecific
28     #define TlsGetValue_func pthread_getspecific
29     #define GetMyTID() pthread_self()
30     #include <sched.h>
31     inline void do_yield() {sched_yield();}
32     extern "C" { static void mallocThreadShutdownNotification(void*); }
33     #if __sun || __SUNPRO_CC
34     #define __asm__ asm
35     #endif
36     #include <unistd.h> // sysconf(_SC_PAGESIZE)
37 #elif USE_WINTHREAD
38     #define GetMyTID() GetCurrentThreadId()
39 #if __TBB_WIN8UI_SUPPORT
40     #include <thread>
41     #define TlsSetValue_func FlsSetValue
42     #define TlsGetValue_func FlsGetValue
43     #define TlsAlloc() FlsAlloc(NULL)
44     #define TLS_ALLOC_FAILURE FLS_OUT_OF_INDEXES
45     #define TlsFree FlsFree
46     inline void do_yield() {std::this_thread::yield();}
47 #else
48     #define TlsSetValue_func TlsSetValue
49     #define TlsGetValue_func TlsGetValue
50     #define TLS_ALLOC_FAILURE TLS_OUT_OF_INDEXES
51     inline void do_yield() {SwitchToThread();}
52 #endif
53 #else
54     #error Must define USE_PTHREAD or USE_WINTHREAD
55 #endif
56 
57 #define FREELIST_NONBLOCKING 1
58 
59 namespace rml {
60 class MemoryPool;
61 namespace internal {
62 
63 class Block;
64 class MemoryPool;
65 
66 #if MALLOC_CHECK_RECURSION
67 
68 inline bool isMallocInitialized();
69 
70 #endif // MALLOC_CHECK_RECURSION
71 
72 /** Support for handling the special UNUSABLE pointer state **/
73 const intptr_t UNUSABLE = 0x1;
74 inline bool isSolidPtr( void* ptr ) {
75     return (UNUSABLE|(intptr_t)ptr)!=UNUSABLE;
76 }
77 inline bool isNotForUse( void* ptr ) {
78     return (intptr_t)ptr==UNUSABLE;
79 }
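// An illustrative sketch of the pointer states distinguished above (not part of the
// allocator logic): a field such as Block::publicFreeList encodes three states in one
// pointer value, which the helpers classify as follows:
//     isSolidPtr(p)                  // true  for a real object pointer p
//     isSolidPtr(nullptr)            // false - the list is empty
//     isNotForUse((void*)UNUSABLE)   // true  - the list is marked "not for use"
// so a single load is enough to tell "empty", "unusable" and "has objects" apart.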
80 
81 /*
82  * Block::objectSize value used to mark blocks allocated by startupAlloc
83  */
84 const uint16_t startupAllocObjSizeMark = ~(uint16_t)0;
85 
86 /*
87  * The following constant is used to define the size of struct Block, the block header.
88  * The intent is to have the size of a Block be a multiple of the cache line size; this allows us to
89  * get good alignment at the cost of some overhead equal to the amount of padding included in the Block.
90  */
91 const int blockHeaderAlignment = estimatedCacheLineSize;
92 
93 /********* The data structures and global objects        **************/
94 
95 /*
96  * The malloc routines themselves occasionally need to allocate some space
97  * in order to set up the structures used by the thread-local data. This
98  * class performs that function.
99  */
100 class BootStrapBlocks {
101     MallocMutex bootStrapLock;
102     Block      *bootStrapBlock;
103     Block      *bootStrapBlockUsed;
104     FreeObject *bootStrapObjectList;
105 public:
106     void *allocate(MemoryPool *memPool, size_t size);
107     void free(void* ptr);
108     void reset();
109 };
110 
111 #if USE_INTERNAL_TID
112 class ThreadId {
113     static tls_key_t Tid_key;
114     static std::atomic<intptr_t> ThreadCount;
115 
116     unsigned int id;
117 
118     static unsigned int tlsNumber() {
119         unsigned int result = reinterpret_cast<intptr_t>(TlsGetValue_func(Tid_key));
120         if( !result ) {
121             RecursiveMallocCallProtector scoped;
122             // Thread-local value is zero -> first call from this thread,
123             // need to initialize with next ID value (IDs start from 1)
124             result = ++ThreadCount; // returned new value!
125             TlsSetValue_func( Tid_key, reinterpret_cast<void*>(result) );
126         }
127         return result;
128     }
129 public:
130     static bool init() {
131 #if USE_WINTHREAD
132         Tid_key = TlsAlloc();
133         if (Tid_key == TLS_ALLOC_FAILURE)
134             return false;
135 #else
136         int status = pthread_key_create( &Tid_key, NULL );
137         if ( status ) {
138             fprintf (stderr, "The memory manager cannot create tls key during initialization\n");
139             return false;
140         }
141 #endif /* USE_WINTHREAD */
142         return true;
143     }
144 #if __TBB_SOURCE_DIRECTLY_INCLUDED
145     static void destroy() {
146         if( Tid_key ) {
147 #if USE_WINTHREAD
148             BOOL status = !(TlsFree( Tid_key ));  // fail is zero
149 #else
150             int status = pthread_key_delete( Tid_key );
151 #endif /* USE_WINTHREAD */
152             if ( status )
153                 fprintf (stderr, "The memory manager cannot delete tls key\n");
154             Tid_key = 0;
155         }
156     }
157 #endif
158 
159     ThreadId() : id(ThreadId::tlsNumber()) {}
160     bool isCurrentThreadId() const { return id == ThreadId::tlsNumber(); }
161 
162 #if COLLECT_STATISTICS || MALLOC_TRACE
163     friend unsigned int getThreadId() { return ThreadId::tlsNumber(); }
164 #endif
165 #if COLLECT_STATISTICS
166     static unsigned getMaxThreadId() { return ThreadCount.load(std::memory_order_relaxed); }
167 
168     friend int STAT_increment(ThreadId tid, int bin, int ctr);
169 #endif
170 };
171 
172 tls_key_t ThreadId::Tid_key;
173 std::atomic<intptr_t> ThreadId::ThreadCount;
174 
175 #if COLLECT_STATISTICS
176 int STAT_increment(ThreadId tid, int bin, int ctr)
177 {
178     return ::STAT_increment(tid.id, bin, ctr);
179 }
180 #endif
181 
182 #else // USE_INTERNAL_TID
183 
184 class ThreadId {
185 #if USE_PTHREAD
186     std::atomic<pthread_t> tid;
187 #else
188     std::atomic<DWORD>     tid;
189 #endif
190 public:
191     ThreadId() : tid(GetMyTID()) {}
192 #if USE_PTHREAD
193     bool isCurrentThreadId() const { return pthread_equal(pthread_self(), tid.load(std::memory_order_relaxed)); }
194 #else
195     bool isCurrentThreadId() const { return GetCurrentThreadId() == tid.load(std::memory_order_relaxed); }
196 #endif
197     ThreadId& operator=(const ThreadId& other) {
198         tid.store(other.tid.load(std::memory_order_relaxed), std::memory_order_relaxed);
199         return *this;
200     }
201     static bool init() { return true; }
202 #if __TBB_SOURCE_DIRECTLY_INCLUDED
203     static void destroy() {}
204 #endif
205 };
206 
207 #endif // USE_INTERNAL_TID
208 
209 /*********** Code to provide thread ID and a thread-local void pointer **********/
210 
211 bool TLSKey::init()
212 {
213 #if USE_WINTHREAD
214     TLS_pointer_key = TlsAlloc();
215     if (TLS_pointer_key == TLS_ALLOC_FAILURE)
216         return false;
217 #else
218     int status = pthread_key_create( &TLS_pointer_key, mallocThreadShutdownNotification );
219     if ( status )
220         return false;
221 #endif /* USE_WINTHREAD */
222     return true;
223 }
224 
225 bool TLSKey::destroy()
226 {
227 #if USE_WINTHREAD
228     BOOL status1 = !(TlsFree(TLS_pointer_key)); // fail is zero
229 #else
230     int status1 = pthread_key_delete(TLS_pointer_key);
231 #endif /* USE_WINTHREAD */
232     MALLOC_ASSERT(!status1, "The memory manager cannot delete tls key.");
233     return status1==0;
234 }
235 
236 inline TLSData* TLSKey::getThreadMallocTLS() const
237 {
238     return (TLSData *)TlsGetValue_func( TLS_pointer_key );
239 }
240 
241 inline void TLSKey::setThreadMallocTLS( TLSData * newvalue ) {
242     RecursiveMallocCallProtector scoped;
243     TlsSetValue_func( TLS_pointer_key, newvalue );
244 }
245 
246 /* The 'next' field in the block header has to maintain some invariants:
247  *   it needs to be on a 16K boundary and the first field in the block.
248  *   Any value stored there needs to have the lower 14 bits set to 0
249  *   so that various asserts work. This means that if you want to smash this memory
250  *   for debugging purposes you will need to obey this invariant.
251  * The total size of the header needs to be a power of 2 to simplify
252  * the alignment requirements. For now it is a 128 byte structure.
253  * To avoid false sharing, the fields changed only locally are separated
254  * from the fields changed by foreign threads.
255  * Changing the size of the block header would require changing
256  * some bin allocation sizes, in particular the "fitting" sizes (see below).
257  */
258 class Bin;
259 class StartupBlock;
260 
261 class MemoryPool {
262     // if no explicit grainsize, expect to see malloc in user's pAlloc
263     // and set a reasonably low granularity
264     static const size_t defaultGranularity = estimatedCacheLineSize;
265 
266     MemoryPool();                  // deny
267 public:
268     static MallocMutex  memPoolListLock;
269 
270     // list of all active pools is used to release
271     // all TLS data on thread termination or library unload
272     MemoryPool    *next,
273                   *prev;
274     ExtMemoryPool  extMemPool;
275     BootStrapBlocks bootStrapBlocks;
276 
277     static void initDefaultPool();
278 
279     bool init(intptr_t poolId, const MemPoolPolicy* memPoolPolicy);
280     bool reset();
281     bool destroy();
282     void onThreadShutdown(TLSData *tlsData);
283 
284     inline TLSData *getTLS(bool create);
285     void clearTLS() { extMemPool.tlsPointerKey.setThreadMallocTLS(NULL); }
286 
287     Block *getEmptyBlock(size_t size);
288     void returnEmptyBlock(Block *block, bool poolTheBlock);
289 
290     // get/put large object to/from local large object cache
291     void *getFromLLOCache(TLSData *tls, size_t size, size_t alignment);
292     void putToLLOCache(TLSData *tls, void *object);
293 };
294 
295 static intptr_t defaultMemPool_space[sizeof(MemoryPool)/sizeof(intptr_t) +
296                                      (sizeof(MemoryPool)%sizeof(intptr_t)? 1 : 0)];
297 static MemoryPool *defaultMemPool = (MemoryPool*)defaultMemPool_space;
298 const size_t MemoryPool::defaultGranularity;
299 // zero-initialized
300 MallocMutex  MemoryPool::memPoolListLock;
301 // TODO: move huge page status to the default pool, because that's its state
302 HugePagesStatus hugePages;
303 static bool usedBySrcIncluded = false;
304 
305 // Padding helpers
306 template<size_t padd>
307 struct PaddingImpl {
308     size_t       __padding[padd];
309 };
310 
311 template<>
312 struct PaddingImpl<0> {};
313 
314 template<int N>
315 struct Padding : PaddingImpl<N/sizeof(size_t)> {};
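// A minimal illustration of the helpers above (assuming an LP64 target where
// sizeof(size_t) == 8): Padding<40> expands to PaddingImpl<5>, i.e. five size_t slots
// occupying 40 bytes, while Padding<0> derives from the empty PaddingImpl<0> and adds
// no storage at all thanks to the empty-base optimization.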
316 
317 // Slab block is 16KB-aligned. To prevent false sharing, separate locally-accessed
318 // fields from fields commonly accessed by non-owner threads.
319 class GlobalBlockFields : public BlockI {
320 protected:
321     std::atomic<FreeObject*> publicFreeList;
322     std::atomic<Block*> nextPrivatizable;
323     MemoryPool  *poolPtr;
324 };
325 
326 class LocalBlockFields : public GlobalBlockFields, Padding<blockHeaderAlignment - sizeof(GlobalBlockFields)>  {
327 protected:
328     Block       *next;
329     Block       *previous;        /* Use double linked list to speed up removal */
330     FreeObject  *bumpPtr;         /* Bump pointer moves from the end to the beginning of a block */
331     FreeObject  *freeList;
332     /* Pointer to local data for the owner thread. Used for quickly finding the TLS
333        when releasing an object from a block that the current thread owns.
334        NULL for orphaned blocks. */
335     std::atomic<TLSData*> tlsPtr;
336     ThreadId     ownerTid;        /* the ID of the thread that owns or last owned the block */
337     BackRefIdx   backRefIdx;
338     uint16_t     allocatedCount;  /* Number of objects allocated (obviously by the owning thread) */
339     uint16_t     objectSize;
340     bool         isFull;
341 
342     friend class FreeBlockPool;
343     friend class StartupBlock;
344     friend class LifoList;
345     friend void *BootStrapBlocks::allocate(MemoryPool *, size_t);
346     friend bool OrphanedBlocks::cleanup(Backend*);
347     friend Block *MemoryPool::getEmptyBlock(size_t);
348 };
349 
350 // Use inheritance to guarantee that user data starts on the next cache line.
351 // Can't use a member for it, because when LocalBlockFields already ends on a cache line,
352 // there must be no additional memory consumption with any compiler.
353 class Block : public LocalBlockFields,
354               Padding<2*blockHeaderAlignment - sizeof(LocalBlockFields)> {
355 public:
356     bool empty() const {
357         if (allocatedCount > 0) return false;
358         MALLOC_ASSERT(!isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT);
359         return true;
360     }
361     inline FreeObject* allocate();
362     inline FreeObject *allocateFromFreeList();
363 
364     inline bool adjustFullness();
365     void adjustPositionInBin(Bin* bin = NULL);
366 #if MALLOC_DEBUG
367     bool freeListNonNull() { return freeList; }
368 #endif
369     void freePublicObject(FreeObject *objectToFree);
370     inline void freeOwnObject(void *object);
371     void reset();
372     void privatizePublicFreeList( bool reset = true );
373     void restoreBumpPtr();
374     void privatizeOrphaned(TLSData *tls, unsigned index);
375     bool readyToShare();
376     void shareOrphaned(intptr_t binTag, unsigned index);
377     unsigned int getSize() const {
378         MALLOC_ASSERT(isStartupAllocObject() || objectSize<minLargeObjectSize,
379                       "Invalid object size");
380         return isStartupAllocObject()? 0 : objectSize;
381     }
382     const BackRefIdx *getBackRefIdx() const { return &backRefIdx; }
383     inline bool isOwnedByCurrentThread() const;
384     bool isStartupAllocObject() const { return objectSize == startupAllocObjSizeMark; }
385     inline FreeObject *findObjectToFree(const void *object) const;
386     void checkFreePrecond(const void *object) const {
387 #if MALLOC_DEBUG
388         const char *msg = "Possible double free or heap corruption.";
389         // small objects are always at least sizeof(size_t) bytes aligned;
390         // check this before the dereference below, as for invalid objects
391         // the memory may be unreadable
392         MALLOC_ASSERT(isAligned(object, sizeof(size_t)), "Try to free invalid small object");
393 #if !__TBB_USE_THREAD_SANITIZER
394         // releasing to free slab
395         MALLOC_ASSERT(allocatedCount>0, msg);
396 #endif
397         // must not point to slab's header
398         MALLOC_ASSERT((uintptr_t)object - (uintptr_t)this >= sizeof(Block), msg);
399         if (startupAllocObjSizeMark == objectSize) // startup block
400             MALLOC_ASSERT(object<=bumpPtr, msg);
401         else {
402             // non-startup objects are 8-byte aligned
403             MALLOC_ASSERT(isAligned(object, 8), "Try to free invalid small object");
404             FreeObject *toFree = findObjectToFree(object);
405 #if !__TBB_USE_THREAD_SANITIZER
406             MALLOC_ASSERT(allocatedCount <= (slabSize-sizeof(Block))/objectSize
407                           && (!bumpPtr || object>bumpPtr), msg);
408             // check against head of freeList, as this is mostly
409             // expected after double free
410             MALLOC_ASSERT(toFree != freeList, msg);
411 #endif
412             // check against head of publicFreeList, to detect double free
413             // involving foreign thread
414             MALLOC_ASSERT(toFree != publicFreeList.load(std::memory_order_relaxed), msg);
415         }
416 #else
417         suppress_unused_warning(object);
418 #endif
419     }
420     void initEmptyBlock(TLSData *tls, size_t size);
421     size_t findObjectSize(void *object) const;
422     MemoryPool *getMemPool() const { return poolPtr; } // do not use on the hot path!
423 
424 protected:
425     void cleanBlockHeader();
426 
427 private:
428     static const float emptyEnoughRatio; /* Threshold on free space needed to "reactivate" a block */
429 
430     inline FreeObject *allocateFromBumpPtr();
431     inline FreeObject *findAllocatedObject(const void *address) const;
432 #if MALLOC_DEBUG
433     inline bool isProperlyPlaced(const void *object) const;
434 #endif
435     inline void markOwned(TLSData *tls) {
436         MALLOC_ASSERT(!tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
437         ownerTid = ThreadId(); /* save the ID of the current thread */
438         tlsPtr.store(tls, std::memory_order_relaxed);
439     }
440     inline void markOrphaned() {
441         MALLOC_ASSERT(tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
442         tlsPtr.store(nullptr, std::memory_order_relaxed);
443     }
444 
445     friend class Bin;
446     friend class TLSData;
447     friend bool MemoryPool::destroy();
448 };
449 
450 const float Block::emptyEnoughRatio = 1.0 / 4.0;
451 
452 static_assert(sizeof(Block) <= 2*estimatedCacheLineSize,
453     "The class Block does not fit into 2 cache lines on this platform. "
454     "Defining USE_INTERNAL_TID may help to fix it.");
455 
456 class Bin {
457 private:
458 public:
459     Block *activeBlk;
460     std::atomic<Block*> mailbox;
461     MallocMutex mailLock;
462 
463 public:
464     inline Block* getActiveBlock() const { return activeBlk; }
465     void resetActiveBlock() { activeBlk = NULL; }
466     inline void setActiveBlock(Block *block);
467     inline Block* setPreviousBlockActive();
468     Block* getPrivatizedFreeListBlock();
469     void moveBlockToFront(Block *block);
470     bool cleanPublicFreeLists();
471     void processEmptyBlock(Block *block, bool poolTheBlock);
472     void addPublicFreeListBlock(Block* block);
473 
474     void outofTLSBin(Block* block);
475     void verifyTLSBin(size_t size) const;
476     void pushTLSBin(Block* block);
477 
478 #if MALLOC_DEBUG
479     void verifyInitState() const {
480         MALLOC_ASSERT( !activeBlk, ASSERT_TEXT );
481         MALLOC_ASSERT( !mailbox.load(std::memory_order_relaxed), ASSERT_TEXT );
482     }
483 #endif
484 
485     friend void Block::freePublicObject (FreeObject *objectToFree);
486 };
487 
488 /********* End of the data structures                    **************/
489 
490 /*
491  * There are bins for all 8-byte aligned objects no larger than this segregated size; 8 bins in total
492  */
493 const uint32_t minSmallObjectIndex = 0;
494 const uint32_t numSmallObjectBins = 8;
495 const uint32_t maxSmallObjectSize = 64;
496 
497 /*
498  * There are 4 bins between each pair of powers of 2 [64-128-256-...]
499  * from maxSmallObjectSize up to this size; 16 bins in total
500  */
501 const uint32_t minSegregatedObjectIndex = minSmallObjectIndex+numSmallObjectBins;
502 const uint32_t numSegregatedObjectBins = 16;
503 const uint32_t maxSegregatedObjectSize = 1024;
504 
505 /*
506  * And there are 5 bins with allocation sizes that are multiples of estimatedCacheLineSize
507  * and selected to fit 9, 6, 4, 3, and 2 allocations in a block.
508  */
509 const uint32_t minFittingIndex = minSegregatedObjectIndex+numSegregatedObjectBins;
510 const uint32_t numFittingBins = 5;
511 
512 const uint32_t fittingAlignment = estimatedCacheLineSize;
513 
514 #define SET_FITTING_SIZE(N) ( (slabSize-sizeof(Block))/N ) & ~(fittingAlignment-1)
515 // For blockSize=16*1024, sizeof(Block)=2*estimatedCacheLineSize and fittingAlignment=estimatedCacheLineSize,
516 // the comments show the fitting sizes and the amounts left unused for estimatedCacheLineSize=64/128:
517 const uint32_t fittingSize1 = SET_FITTING_SIZE(9); // 1792/1792 128/000
518 const uint32_t fittingSize2 = SET_FITTING_SIZE(6); // 2688/2688 128/000
519 const uint32_t fittingSize3 = SET_FITTING_SIZE(4); // 4032/3968 128/256
520 const uint32_t fittingSize4 = SET_FITTING_SIZE(3); // 5376/5376 128/000
521 const uint32_t fittingSize5 = SET_FITTING_SIZE(2); // 8128/8064 000/000
522 #undef SET_FITTING_SIZE
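// A worked instance of the formula above, for the common case slabSize == 16*1024 and
// estimatedCacheLineSize == 64 (so sizeof(Block) == 128 and fittingAlignment == 64):
//     SET_FITTING_SIZE(9) == ((16384 - 128) / 9) & ~63 == 1806 & ~63 == 1792
// which matches fittingSize1; nine such objects occupy 9 * 1792 == 16128 bytes,
// leaving 16384 - 128 - 16128 == 128 bytes of the slab unused, as noted in the comments.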
523 
524 /*
525  * The total number of thread-specific Block-based bins
526  */
527 const uint32_t numBlockBins = minFittingIndex+numFittingBins;
528 
529 /*
530  * Objects of this size and larger are considered large objects.
531  */
532 const uint32_t minLargeObjectSize = fittingSize5 + 1;
533 
534 /*
535  * Per-thread pool of slab blocks. The idea behind it is to not share with other
536  * threads memory that is likely in the local cache(s) of our CPU.
537  */
538 class FreeBlockPool {
539 private:
540     std::atomic<Block*> head;
541     int         size;
542     Backend    *backend;
543     bool        lastAccessMiss;
544 public:
545     static const int POOL_HIGH_MARK = 32;
546     static const int POOL_LOW_MARK  = 8;
547 
548     class ResOfGet {
549         ResOfGet();
550     public:
551         Block* block;
552         bool   lastAccMiss;
553         ResOfGet(Block *b, bool lastMiss) : block(b), lastAccMiss(lastMiss) {}
554     };
555 
556     // allocated in zero-initialized memory
557     FreeBlockPool(Backend *bknd) : backend(bknd) {}
558     ResOfGet getBlock();
559     void returnBlock(Block *block);
560     bool externalCleanup(); // can be called by another thread
561 };
562 
563 template<int LOW_MARK, int HIGH_MARK>
564 class LocalLOCImpl {
565 private:
566     static const size_t MAX_TOTAL_SIZE = 4*1024*1024;
567     // TODO: can a singly-linked list be faster here?
568     LargeMemoryBlock *tail; // needed when releasing on overflow
569     std::atomic<LargeMemoryBlock*> head;
570     size_t            totalSize;
571     int               numOfBlocks;
572 public:
573     bool put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool);
574     LargeMemoryBlock *get(size_t size);
575     bool externalCleanup(ExtMemoryPool *extMemPool);
576 #if __TBB_MALLOC_WHITEBOX_TEST
577     LocalLOCImpl() : head(NULL), tail(NULL), totalSize(0), numOfBlocks(0) {}
578     static size_t getMaxSize() { return MAX_TOTAL_SIZE; }
579     static const int LOC_HIGH_MARK = HIGH_MARK;
580 #else
581     // no ctor, object must be created in zero-initialized memory
582 #endif
583 };
584 
585 typedef LocalLOCImpl<8,32> LocalLOC; // set production code parameters
586 
587 class TLSData : public TLSRemote {
588     MemoryPool   *memPool;
589 public:
590     Bin           bin[numBlockBinLimit];
591     FreeBlockPool freeSlabBlocks;
592     LocalLOC      lloc;
593     unsigned      currCacheIdx;
594 private:
595     std::atomic<bool> unused;
596 public:
597     TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd) {}
598     MemoryPool *getMemPool() const { return memPool; }
599     Bin* getAllocationBin(size_t size);
600     void release();
601     bool externalCleanup(bool cleanOnlyUnused, bool cleanBins) {
602         if (!unused.load(std::memory_order_relaxed) && cleanOnlyUnused) return false;
603         // Heavy operation in terms of synchronization complexity,
604         // should be called only for the current thread
605         bool released = cleanBins ? cleanupBlockBins() : false;
606         // both cleanups to be called, and the order is not important
607         return released | lloc.externalCleanup(&memPool->extMemPool) | freeSlabBlocks.externalCleanup();
608     }
609     bool cleanupBlockBins();
610     void markUsed() { unused.store(false, std::memory_order_relaxed); } // called by owner when TLS touched
611     void markUnused() { unused.store(true, std::memory_order_relaxed); } // can be called by not owner thread
612 };
613 
614 TLSData *TLSKey::createTLS(MemoryPool *memPool, Backend *backend)
615 {
616     MALLOC_ASSERT( sizeof(TLSData) >= sizeof(Bin) * numBlockBins + sizeof(FreeBlockPool), ASSERT_TEXT );
617     TLSData* tls = (TLSData*) memPool->bootStrapBlocks.allocate(memPool, sizeof(TLSData));
618     if ( !tls )
619         return NULL;
620     new(tls) TLSData(memPool, backend);
621     /* the block contains zeroes after bootStrapMalloc, so bins are initialized */
622 #if MALLOC_DEBUG
623     for (uint32_t i = 0; i < numBlockBinLimit; i++)
624         tls->bin[i].verifyInitState();
625 #endif
626     setThreadMallocTLS(tls);
627     memPool->extMemPool.allLocalCaches.registerThread(tls);
628     return tls;
629 }
630 
631 bool TLSData::cleanupBlockBins()
632 {
633     bool released = false;
634     for (uint32_t i = 0; i < numBlockBinLimit; i++) {
635         released |= bin[i].cleanPublicFreeLists();
636         // After cleaning public free lists, only the active block might be empty.
637         // Do not use processEmptyBlock because it will just restore bumpPtr.
638         Block *block = bin[i].getActiveBlock();
639         if (block && block->empty()) {
640             bin[i].outofTLSBin(block);
641             memPool->returnEmptyBlock(block, /*poolTheBlock=*/false);
642             released = true;
643         }
644     }
645     return released;
646 }
647 
648 bool ExtMemoryPool::releaseAllLocalCaches()
649 {
650     // Iterate all registered TLS data and clean LLOC and Slab pools
651     bool released = allLocalCaches.cleanup(/*cleanOnlyUnused=*/false);
652 
653     // Bins privatization is done only for the current thread
654     if (TLSData *tlsData = tlsPointerKey.getThreadMallocTLS())
655         released |= tlsData->cleanupBlockBins();
656 
657     return released;
658 }
659 
660 void AllLocalCaches::registerThread(TLSRemote *tls)
661 {
662     tls->prev = NULL;
663     MallocMutex::scoped_lock lock(listLock);
664     MALLOC_ASSERT(head!=tls, ASSERT_TEXT);
665     tls->next = head;
666     if (head)
667         head->prev = tls;
668     head = tls;
669     MALLOC_ASSERT(head->next!=head, ASSERT_TEXT);
670 }
671 
672 void AllLocalCaches::unregisterThread(TLSRemote *tls)
673 {
674     MallocMutex::scoped_lock lock(listLock);
675     MALLOC_ASSERT(head, "Can't unregister thread: no threads are registered.");
676     if (head == tls)
677         head = tls->next;
678     if (tls->next)
679         tls->next->prev = tls->prev;
680     if (tls->prev)
681         tls->prev->next = tls->next;
682     MALLOC_ASSERT(!tls->next || tls->next->next!=tls->next, ASSERT_TEXT);
683 }
684 
685 bool AllLocalCaches::cleanup(bool cleanOnlyUnused)
686 {
687     bool released = false;
688     {
689         MallocMutex::scoped_lock lock(listLock);
690         for (TLSRemote *curr=head; curr; curr=curr->next)
691             released |= static_cast<TLSData*>(curr)->externalCleanup(cleanOnlyUnused, /*cleanBins=*/false);
692     }
693     return released;
694 }
695 
696 void AllLocalCaches::markUnused()
697 {
698     bool locked;
699     MallocMutex::scoped_lock lock(listLock, /*block=*/false, &locked);
700     if (!locked) // do not wait for marking if someone is doing something with the list
701         return;
702 
703     for (TLSRemote *curr=head; curr; curr=curr->next)
704         static_cast<TLSData*>(curr)->markUnused();
705 }
706 
707 #if MALLOC_CHECK_RECURSION
708 MallocMutex RecursiveMallocCallProtector::rmc_mutex;
709 pthread_t   RecursiveMallocCallProtector::owner_thread;
710 std::atomic<void*> RecursiveMallocCallProtector::autoObjPtr;
711 bool        RecursiveMallocCallProtector::mallocRecursionDetected;
712 #if __FreeBSD__
713 bool        RecursiveMallocCallProtector::canUsePthread;
714 #endif
715 
716 #endif
717 
718 /*********** End code to provide thread ID and a TLS pointer **********/
719 
720 // Parameter for isLargeObject, expresses our expectation about the memory origin.
721 // Assertions must use unknownMem to reliably report object invalidity.
722 enum MemoryOrigin {
723     ourMem,    // allocated by TBB allocator
724     unknownMem // can be allocated by system allocator or TBB allocator
725 };
726 
727 template<MemoryOrigin>
728 #if __TBB_USE_THREAD_SANITIZER
729 // We have a real race when accessing the large object header for
730 // non large objects (e.g. small or foreign objects).
731 // Therefore, we need to hide this access from the thread sanitizer
732 __attribute__((no_sanitize("thread")))
733 #endif
734 bool isLargeObject(void *object);
735 static void *internalMalloc(size_t size);
736 static void internalFree(void *object);
737 static void *internalPoolMalloc(MemoryPool* mPool, size_t size);
738 static bool internalPoolFree(MemoryPool *mPool, void *object, size_t size);
739 
740 #if !MALLOC_DEBUG
741 #if __INTEL_COMPILER || _MSC_VER
742 #define NOINLINE(decl) __declspec(noinline) decl
743 #define ALWAYSINLINE(decl) __forceinline decl
744 #elif __GNUC__
745 #define NOINLINE(decl) decl __attribute__ ((noinline))
746 #define ALWAYSINLINE(decl) decl __attribute__ ((always_inline))
747 #else
748 #define NOINLINE(decl) decl
749 #define ALWAYSINLINE(decl) decl
750 #endif
751 
752 static NOINLINE( bool doInitialization() );
753 ALWAYSINLINE( bool isMallocInitialized() );
754 
755 #undef ALWAYSINLINE
756 #undef NOINLINE
757 #endif /* !MALLOC_DEBUG */
758 
759 
760 /********* Now some rough utility code to deal with indexing the size bins. **************/
761 
762 /*
763  * Given a number, return the position of its highest set bit. It is intended to work with 32-bit values only.
764  * Moreover, on some platforms, for the sake of simplicity and performance, it is narrowed to only serve for 64 to 1023.
765  * This is enough for the current algorithm of distributing sizes among bins.
766  * __TBB_Log2 is not used here to minimize dependencies on TBB specific sources.
767  */
768 #if _WIN64 && _MSC_VER>=1400 && !__INTEL_COMPILER
769 extern "C" unsigned char _BitScanReverse( unsigned long* i, unsigned long w );
770 #pragma intrinsic(_BitScanReverse)
771 #endif
772 static inline unsigned int highestBitPos(unsigned int n)
773 {
774     MALLOC_ASSERT( n>=64 && n<1024, ASSERT_TEXT ); // only needed for bsr array lookup, but always true
775     unsigned int pos;
776 #if __ARCH_x86_32||__ARCH_x86_64
777 
778 # if __linux__||__APPLE__||__FreeBSD__||__NetBSD__||__OpenBSD__||__sun||__MINGW32__
779     __asm__ ("bsr %1,%0" : "=r"(pos) : "r"(n));
780 # elif (_WIN32 && (!_WIN64 || __INTEL_COMPILER))
781     __asm
782     {
783         bsr eax, n
784         mov pos, eax
785     }
786 # elif _WIN64 && _MSC_VER>=1400
787     _BitScanReverse((unsigned long*)&pos, (unsigned long)n);
788 # else
789 #   error highestBitPos() not implemented for this platform
790 # endif
791 #elif __arm__
792     __asm__ __volatile__
793     (
794        "clz %0, %1\n"
795        "rsb %0, %0, %2\n"
796        :"=r" (pos) :"r" (n), "I" (31)
797     );
798 #else
799     static unsigned int bsr[16] = {0/*N/A*/,6,7,7,8,8,8,8,9,9,9,9,9,9,9,9};
800     pos = bsr[ n>>6 ];
801 #endif /* __ARCH_* */
802     return pos;
803 }
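// Quick sanity examples for the helper above: highestBitPos(64) == 6, highestBitPos(99) == 6
// (since 64 <= 99 < 128), and highestBitPos(1000) == 9 (since 512 <= 1000 < 1024) - exactly
// the 6..9 range of group orders consumed by getIndexOrObjectSize() below.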
804 
805 
806 #if __TBB_x86_32 || __aarch32__
807 unsigned int getSmallObjectIndex(unsigned int size)
808 {
809     return (size-1)>>3;
810 }
811 #elif __TBB_x86_64 || __aarch64__
812 unsigned int getSmallObjectIndex(unsigned int size)
813 {
814     // For 64-bit malloc, 16 byte alignment is needed except for bin 0.
815     unsigned int result = (size-1)>>3;
816     if (result) result |= 1; // 0,1,3,5,7; bins 2,4,6 are not aligned to 16 bytes
817     return result;
818 }
819 #endif // __TBB_x86_32 ||  __aarch32__
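// An illustration of the 64-bit mapping above (not an exhaustive table): request sizes
// 1..8 map to bin 0, 9..16 to bin 1, 17..32 to bin 3, 33..48 to bin 5, and 49..64 to bin 7;
// bins 2, 4 and 6 are skipped, so every small bin except bin 0 serves 16-byte-aligned objects.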
820 
821 /*
822  * Depending on indexRequest, for a given size return either the index into the bin
823  * for objects of this size, or the actual size of objects in this bin.
824  */
825 template<bool indexRequest>
826 static unsigned int getIndexOrObjectSize (unsigned int size)
827 {
828     if (size <= maxSmallObjectSize) { // selection from 8/16/24/32/40/48/56/64
829         unsigned int index = getSmallObjectIndex( size );
830          /* Bin 0 is for 8 bytes, bin 1 is for 16, and so forth */
831         return indexRequest ? index : (index+1)<<3;
832     }
833     else if (size <= maxSegregatedObjectSize ) { // 80/96/112/128 / 160/192/224/256 / 320/384/448/512 / 640/768/896/1024
834         unsigned int order = highestBitPos(size-1); // which group of bin sizes?
835         MALLOC_ASSERT( 6<=order && order<=9, ASSERT_TEXT );
836         if (indexRequest)
837             return minSegregatedObjectIndex - (4*6) - 4 + (4*order) + ((size-1)>>(order-2));
838         else {
839             unsigned int alignment = 128 >> (9-order); // alignment in the group
840             MALLOC_ASSERT( alignment==16 || alignment==32 || alignment==64 || alignment==128, ASSERT_TEXT );
841             return alignUp(size,alignment);
842         }
843     }
844     else {
845         if( size <= fittingSize3 ) {
846             if( size <= fittingSize2 ) {
847                 if( size <= fittingSize1 )
848                     return indexRequest ? minFittingIndex : fittingSize1;
849                 else
850                     return indexRequest ? minFittingIndex+1 : fittingSize2;
851             } else
852                 return indexRequest ? minFittingIndex+2 : fittingSize3;
853         } else {
854             if( size <= fittingSize5 ) {
855                 if( size <= fittingSize4 )
856                     return indexRequest ? minFittingIndex+3 : fittingSize4;
857                 else
858                     return indexRequest ? minFittingIndex+4 : fittingSize5;
859             } else {
860                 MALLOC_ASSERT( 0,ASSERT_TEXT ); // this should not happen
861                 return ~0U;
862             }
863         }
864     }
865 }
866 
867 static unsigned int getIndex (unsigned int size)
868 {
869     return getIndexOrObjectSize</*indexRequest=*/true>(size);
870 }
871 
872 static unsigned int getObjectSize (unsigned int size)
873 {
874     return getIndexOrObjectSize</*indexRequest=*/false>(size);
875 }
876 
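// A worked example of the two queries above: for a request of 100 bytes,
// highestBitPos(99) == 6 puts it in the 80/96/112/128 group, where the alignment is
// 128 >> (9-6) == 16; hence getObjectSize(100) == alignUp(100,16) == 112 and
// getIndex(100) == 8 - 24 - 4 + 4*6 + (99 >> 4) == 10, the third bin of the segregated range.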
877 
878 void *BootStrapBlocks::allocate(MemoryPool *memPool, size_t size)
879 {
880     FreeObject *result;
881 
882     MALLOC_ASSERT( size == sizeof(TLSData), ASSERT_TEXT );
883 
884     { // Lock with acquire
885         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
886 
887         if( bootStrapObjectList) {
888             result = bootStrapObjectList;
889             bootStrapObjectList = bootStrapObjectList->next;
890         } else {
891             if (!bootStrapBlock) {
892                 bootStrapBlock = memPool->getEmptyBlock(size);
893                 if (!bootStrapBlock) return NULL;
894             }
895             result = bootStrapBlock->bumpPtr;
896             bootStrapBlock->bumpPtr = (FreeObject *)((uintptr_t)bootStrapBlock->bumpPtr - bootStrapBlock->objectSize);
897             if ((uintptr_t)bootStrapBlock->bumpPtr < (uintptr_t)bootStrapBlock+sizeof(Block)) {
898                 bootStrapBlock->bumpPtr = NULL;
899                 bootStrapBlock->next = bootStrapBlockUsed;
900                 bootStrapBlockUsed = bootStrapBlock;
901                 bootStrapBlock = NULL;
902             }
903         }
904     } // Unlock with release
905     memset (result, 0, size);
906     return (void*)result;
907 }
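// A sketch of the bump-pointer scheme used above (illustrative only): bumpPtr moves from
// the end of the slab toward its beginning (see the LocalBlockFields comment); each
// allocation hands out the slot at bumpPtr and moves the pointer down by objectSize, and
// once it would descend into the Block header, i.e.
//     (uintptr_t)bumpPtr < (uintptr_t)bootStrapBlock + sizeof(Block)
// the block is parked on bootStrapBlockUsed and the next call fetches a new empty block.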
908 
909 void BootStrapBlocks::free(void* ptr)
910 {
911     MALLOC_ASSERT( ptr, ASSERT_TEXT );
912     { // Lock with acquire
913         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
914         ((FreeObject*)ptr)->next = bootStrapObjectList;
915         bootStrapObjectList = (FreeObject*)ptr;
916     } // Unlock with release
917 }
918 
919 void BootStrapBlocks::reset()
920 {
921     bootStrapBlock = bootStrapBlockUsed = NULL;
922     bootStrapObjectList = NULL;
923 }
924 
925 #if !(FREELIST_NONBLOCKING)
926 static MallocMutex publicFreeListLock; // lock for changes of publicFreeList
927 #endif
928 
929 /********* End rough utility code  **************/
930 
931 /* LifoList assumes zero initialization so a vector of it can be created
932  * by just allocating some space with no call to the constructor.
933  * On Linux, this seems to be necessary to avoid linking with C++ libraries.
934  *
935  * By usage convention there is no race on the initialization. */
936 LifoList::LifoList( ) : top(nullptr)
937 {
938     // MallocMutex assumes zero initialization
939     memset(&lock, 0, sizeof(MallocMutex));
940 }
941 
942 void LifoList::push(Block *block)
943 {
944     MallocMutex::scoped_lock scoped_cs(lock);
945     block->next = top.load(std::memory_order_relaxed);
946     top.store(block, std::memory_order_relaxed);
947 }
948 
949 Block *LifoList::pop()
950 {
951     Block* block = nullptr;
952     if (top.load(std::memory_order_relaxed)) {
953         MallocMutex::scoped_lock scoped_cs(lock);
954         block = top.load(std::memory_order_relaxed);
955         if (block) {
956             top.store(block->next, std::memory_order_relaxed);
957         }
958     }
959     return block;
960 }
961 
962 Block *LifoList::grab()
963 {
964     Block *block = nullptr;
965     if (top.load(std::memory_order_relaxed)) {
966         MallocMutex::scoped_lock scoped_cs(lock);
967         block = top.load(std::memory_order_relaxed);
968         top.store(nullptr, std::memory_order_relaxed);
969     }
970     return block;
971 }
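// A minimal usage sketch of LifoList (illustrative only; the real users are the
// orphaned-block lists in this allocator):
//     LifoList list;                         // must reside in zero-initialized memory
//     list.push(block);                      // links through block->next under the lock
//     Block *last = list.pop();              // retrieves the most recently pushed block
//     for (Block *b = list.grab(); b; b = b->next)
//         ;                                  // grab() privatizes the whole chain at once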
972 
973 /********* Thread and block related code      *************/
974 
975 template<bool poolDestroy> void AllLargeBlocksList::releaseAll(Backend *backend) {
976      LargeMemoryBlock *next, *lmb = loHead;
977      loHead = NULL;
978 
979      for (; lmb; lmb = next) {
980          next = lmb->gNext;
981          if (poolDestroy) {
982              // as it's pool destruction, there is no need to return objects to the backend,
983              // only remove backrefs, as they are global
984              removeBackRef(lmb->backRefIdx);
985          } else {
986              // clean g(Next|Prev) to prevent removing lmb
987              // from AllLargeBlocksList inside returnLargeObject
988              lmb->gNext = lmb->gPrev = NULL;
989              backend->returnLargeObject(lmb);
990          }
991      }
992 }
993 
994 TLSData* MemoryPool::getTLS(bool create)
995 {
996     TLSData* tls = extMemPool.tlsPointerKey.getThreadMallocTLS();
997     if (create && !tls)
998         tls = extMemPool.tlsPointerKey.createTLS(this, &extMemPool.backend);
999     return tls;
1000 }
1001 
1002 /*
1003  * Return the bin for the given size.
1004  */
1005 inline Bin* TLSData::getAllocationBin(size_t size)
1006 {
1007     return bin + getIndex(size);
1008 }
1009 
1010 /* Return an empty uninitialized block in a non-blocking fashion. */
1011 Block *MemoryPool::getEmptyBlock(size_t size)
1012 {
1013     TLSData* tls = getTLS(/*create=*/false);
1014     // try to use per-thread cache, if TLS available
1015     FreeBlockPool::ResOfGet resOfGet = tls?
1016         tls->freeSlabBlocks.getBlock() : FreeBlockPool::ResOfGet(NULL, false);
1017     Block *result = resOfGet.block;
1018 
1019     if (!result) { // not found in the local cache, ask the backend for slabs
1020         int num = resOfGet.lastAccMiss? Backend::numOfSlabAllocOnMiss : 1;
1021         BackRefIdx backRefIdx[Backend::numOfSlabAllocOnMiss];
1022 
1023         result = static_cast<Block*>(extMemPool.backend.getSlabBlock(num));
1024         if (!result) return NULL;
1025 
1026         if (!extMemPool.userPool())
1027             for (int i=0; i<num; i++) {
1028                 backRefIdx[i] = BackRefIdx::newBackRef(/*largeObj=*/false);
1029                 if (backRefIdx[i].isInvalid()) {
1030                     // roll back resource allocation
1031                     for (int j=0; j<i; j++)
1032                         removeBackRef(backRefIdx[j]);
1033                     Block *b = result;
1034                     for (int j=0; j<num; b=(Block*)((uintptr_t)b+slabSize), j++)
1035                         extMemPool.backend.putSlabBlock(b);
1036                     return NULL;
1037                 }
1038             }
1039         // resources were allocated, register blocks
1040         Block *b = result;
1041         for (int i=0; i<num; b=(Block*)((uintptr_t)b+slabSize), i++) {
1042             // slab block in user's pool must have invalid backRefIdx
1043             if (extMemPool.userPool()) {
1044                 new (&b->backRefIdx) BackRefIdx();
1045             } else {
1046                 setBackRef(backRefIdx[i], b);
1047                 b->backRefIdx = backRefIdx[i];
1048             }
1049             b->tlsPtr.store(tls, std::memory_order_relaxed);
1050             b->poolPtr = this;
1051             // all but the first one go to the per-thread pool
1052             if (i > 0) {
1053                 MALLOC_ASSERT(tls, ASSERT_TEXT);
1054                 tls->freeSlabBlocks.returnBlock(b);
1055             }
1056         }
1057     }
1058     MALLOC_ASSERT(result, ASSERT_TEXT);
1059     result->initEmptyBlock(tls, size);
1060     STAT_increment(getThreadId(), getIndex(result->objectSize), allocBlockNew);
1061     return result;
1062 }
1063 
1064 void MemoryPool::returnEmptyBlock(Block *block, bool poolTheBlock)
1065 {
1066     block->reset();
1067     if (poolTheBlock) {
1068         getTLS(/*create=*/false)->freeSlabBlocks.returnBlock(block);
1069     } else {
1070         // slab blocks in user's pools do not have valid backRefIdx
1071         if (!extMemPool.userPool())
1072             removeBackRef(*(block->getBackRefIdx()));
1073         extMemPool.backend.putSlabBlock(block);
1074     }
1075 }
1076 
1077 bool ExtMemoryPool::init(intptr_t poolId, rawAllocType rawAlloc,
1078                          rawFreeType rawFree, size_t granularity,
1079                          bool keepAllMemory, bool fixedPool)
1080 {
1081     this->poolId = poolId;
1082     this->rawAlloc = rawAlloc;
1083     this->rawFree = rawFree;
1084     this->granularity = granularity;
1085     this->keepAllMemory = keepAllMemory;
1086     this->fixedPool = fixedPool;
1087     this->delayRegsReleasing = false;
1088     if (!initTLS())
1089         return false;
1090     loc.init(this);
1091     backend.init(this);
1092     MALLOC_ASSERT(isPoolValid(), NULL);
1093     return true;
1094 }
1095 
1096 bool ExtMemoryPool::initTLS() { return tlsPointerKey.init(); }
1097 
1098 bool MemoryPool::init(intptr_t poolId, const MemPoolPolicy *policy)
1099 {
1100     if (!extMemPool.init(poolId, policy->pAlloc, policy->pFree,
1101                policy->granularity? policy->granularity : defaultGranularity,
1102                policy->keepAllMemory, policy->fixedPool))
1103         return false;
1104     {
1105         MallocMutex::scoped_lock lock(memPoolListLock);
1106         next = defaultMemPool->next;
1107         defaultMemPool->next = this;
1108         prev = defaultMemPool;
1109         if (next)
1110             next->prev = this;
1111     }
1112     return true;
1113 }
1114 
1115 bool MemoryPool::reset()
1116 {
1117     MALLOC_ASSERT(extMemPool.userPool(), "No reset for the system pool.");
1118     // memory is not released during pool reset
1119     // TODO: mark regions to release unused on next reset()
1120     extMemPool.delayRegionsReleasing(true);
1121 
1122     bootStrapBlocks.reset();
1123     extMemPool.lmbList.releaseAll</*poolDestroy=*/false>(&extMemPool.backend);
1124     if (!extMemPool.reset())
1125         return false;
1126 
1127     if (!extMemPool.initTLS())
1128         return false;
1129     extMemPool.delayRegionsReleasing(false);
1130     return true;
1131 }
1132 
1133 bool MemoryPool::destroy()
1134 {
1135 #if __TBB_MALLOC_LOCACHE_STAT
1136     extMemPool.loc.reportStat(stdout);
1137 #endif
1138 #if __TBB_MALLOC_BACKEND_STAT
1139     extMemPool.backend.reportStat(stdout);
1140 #endif
1141     {
1142         MallocMutex::scoped_lock lock(memPoolListLock);
1143         // remove itself from global pool list
1144         if (prev)
1145             prev->next = next;
1146         if (next)
1147             next->prev = prev;
1148     }
1149     // slab blocks in non-default pool do not have backreferences,
1150     // only large objects do
1151     if (extMemPool.userPool())
1152         extMemPool.lmbList.releaseAll</*poolDestroy=*/true>(&extMemPool.backend);
1153     else {
1154         // only one non-userPool() is supported now
1155         MALLOC_ASSERT(this==defaultMemPool, NULL);
1156         // Here and below in extMemPool.destroy(), do not restore the initial state
1157         // for a user pool, because it is just about to be released. But the system
1158         // pool is restored, because we do not want it zeroed again on a subsequent reload.
1159         bootStrapBlocks.reset();
1160         extMemPool.orphanedBlocks.reset();
1161     }
1162     return extMemPool.destroy();
1163 }
1164 
1165 void MemoryPool::onThreadShutdown(TLSData *tlsData)
1166 {
1167     if (tlsData) { // might be called for "empty" TLS
1168         tlsData->release();
1169         bootStrapBlocks.free(tlsData);
1170         clearTLS();
1171     }
1172 }
1173 
1174 #if MALLOC_DEBUG
1175 void Bin::verifyTLSBin (size_t size) const
1176 {
1177 /* The debug version verifies the TLSBin as needed */
1178     uint32_t objSize = getObjectSize(size);
1179 
1180     if (activeBlk) {
1181         MALLOC_ASSERT( activeBlk->isOwnedByCurrentThread(), ASSERT_TEXT );
1182         MALLOC_ASSERT( activeBlk->objectSize == objSize, ASSERT_TEXT );
1183 #if MALLOC_DEBUG>1
1184         for (Block* temp = activeBlk->next; temp; temp=temp->next) {
1185             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1186             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1187             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1188             MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1189             if (temp->next) {
1190                 MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1191             }
1192         }
1193         for (Block* temp = activeBlk->previous; temp; temp=temp->previous) {
1194             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1195             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1196             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1197             MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1198             if (temp->previous) {
1199                 MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1200             }
1201         }
1202 #endif /* MALLOC_DEBUG>1 */
1203     }
1204 }
1205 #else /* MALLOC_DEBUG */
1206 inline void Bin::verifyTLSBin (size_t) const { }
1207 #endif /* MALLOC_DEBUG */
1208 
1209 /*
1210  * Add a block to the start of this tls bin list.
1211  */
1212 void Bin::pushTLSBin(Block* block)
1213 {
1214     /* The objectSize should be defined and not a parameter
1215        because the function is applied to partially filled blocks as well */
1216     unsigned int size = block->objectSize;
1217 
1218     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1219     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1220     MALLOC_ASSERT( block->next == NULL, ASSERT_TEXT );
1221     MALLOC_ASSERT( block->previous == NULL, ASSERT_TEXT );
1222 
1223     MALLOC_ASSERT( this, ASSERT_TEXT );
1224     verifyTLSBin(size);
1225 
1226     block->next = activeBlk;
1227     if( activeBlk ) {
1228         block->previous = activeBlk->previous;
1229         activeBlk->previous = block;
1230         if( block->previous )
1231             block->previous->next = block;
1232     } else {
1233         activeBlk = block;
1234     }
1235 
1236     verifyTLSBin(size);
1237 }
1238 
1239 /*
1240  * Take a block out of its tls bin (e.g. before removal).
1241  */
1242 void Bin::outofTLSBin(Block* block)
1243 {
1244     unsigned int size = block->objectSize;
1245 
1246     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1247     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1248 
1249     MALLOC_ASSERT( this, ASSERT_TEXT );
1250     verifyTLSBin(size);
1251 
1252     if (block == activeBlk) {
1253         activeBlk = block->previous? block->previous : block->next;
1254     }
1255     /* Unlink the block */
1256     if (block->previous) {
1257         MALLOC_ASSERT( block->previous->next == block, ASSERT_TEXT );
1258         block->previous->next = block->next;
1259     }
1260     if (block->next) {
1261         MALLOC_ASSERT( block->next->previous == block, ASSERT_TEXT );
1262         block->next->previous = block->previous;
1263     }
1264     block->next = NULL;
1265     block->previous = NULL;
1266 
1267     verifyTLSBin(size);
1268 }
1269 
1270 Block* Bin::getPrivatizedFreeListBlock()
1271 {
1272     Block* block;
1273     MALLOC_ASSERT( this, ASSERT_TEXT );
1274     // if this method is called, active block usage must be unsuccessful
1275     MALLOC_ASSERT( !activeBlk && !mailbox.load(std::memory_order_relaxed) || activeBlk && activeBlk->isFull, ASSERT_TEXT );
1276 
1277     // the counter should be changed: STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1278     if (!mailbox.load(std::memory_order_acquire)) // hotpath is empty mailbox
1279         return NULL;
1280     else { // mailbox is not empty, take lock and inspect it
1281         MallocMutex::scoped_lock scoped_cs(mailLock);
1282         block = mailbox.load(std::memory_order_relaxed);
1283         if( block ) {
1284             MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1285             MALLOC_ASSERT( !isNotForUse(block->nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1286             mailbox.store(block->nextPrivatizable.load(std::memory_order_relaxed), std::memory_order_relaxed);
1287             block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1288         }
1289     }
1290     if( block ) {
1291         MALLOC_ASSERT( isSolidPtr(block->publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1292         block->privatizePublicFreeList();
1293         block->adjustPositionInBin(this);
1294     }
1295     return block;
1296 }
1297 
1298 void Bin::addPublicFreeListBlock(Block* block)
1299 {
1300     MallocMutex::scoped_lock scoped_cs(mailLock);
1301     block->nextPrivatizable.store(mailbox.load(std::memory_order_relaxed), std::memory_order_relaxed);
1302     mailbox.store(block, std::memory_order_relaxed);
1303 }
1304 
1305 // Process publicly freed objects in all blocks and return empty blocks
1306 // to the backend in order to reduce overall footprint.
1307 bool Bin::cleanPublicFreeLists()
1308 {
1309     Block* block;
1310     if (!mailbox.load(std::memory_order_acquire))
1311         return false;
1312     else {
1313         // Grab all the blocks in the mailbox
1314         MallocMutex::scoped_lock scoped_cs(mailLock);
1315         block = mailbox.load(std::memory_order_relaxed);
1316         mailbox.store(NULL, std::memory_order_relaxed);
1317     }
1318     bool released = false;
1319     while (block) {
1320         MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1321         Block* tmp = block->nextPrivatizable.load(std::memory_order_relaxed);
1322         block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1323         block->privatizePublicFreeList();
1324         if (block->empty()) {
1325             processEmptyBlock(block, /*poolTheBlock=*/false);
1326             released = true;
1327         } else
1328             block->adjustPositionInBin(this);
1329         block = tmp;
1330     }
1331     return released;
1332 }
1333 
1334 bool Block::adjustFullness()
1335 {
1336     if (bumpPtr) {
1337         /* If we are still using a bump ptr for this block it is empty enough to use. */
1338         STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1339         isFull = false;
1340     } else {
1341         const float threshold = (slabSize - sizeof(Block)) * (1 - emptyEnoughRatio);
1342         /* allocatedCount shows how many objects in the block are in use; however it still counts
1343          * objects freed by other threads, so a prior call to privatizePublicFreeList() is recommended */
1344         isFull = (allocatedCount*objectSize > threshold) ? true : false;
1345 #if COLLECT_STATISTICS
1346         if (isFull)
1347             STAT_increment(getThreadId(), getIndex(objectSize), examineNotEmpty);
1348         else
1349             STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1350 #endif
1351     }
1352     return isFull;
1353 }
1354 
1355 // This method resides in class Block, and not in class Bin, in order to avoid
1356 // calling getAllocationBin on a reasonably hot path in Block::freeOwnObject
1357 void Block::adjustPositionInBin(Bin* bin/*=NULL*/)
1358 {
1359     // If the block was full but became empty enough to use,
1360     // move it to the front of the list
1361     if (isFull && !adjustFullness()) {
1362         if (!bin)
1363             bin = tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize);
1364         bin->moveBlockToFront(this);
1365     }
1366 }
1367 
1368 /* Restore the bump pointer for an empty block that is planned for reuse */
1369 void Block::restoreBumpPtr()
1370 {
1371     MALLOC_ASSERT( allocatedCount == 0, ASSERT_TEXT );
1372     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1373     STAT_increment(getThreadId(), getIndex(objectSize), freeRestoreBumpPtr);
1374     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1375     freeList = NULL;
1376     isFull = false;
1377 }
1378 
1379 void Block::freeOwnObject(void *object)
1380 {
1381     tlsPtr.load(std::memory_order_relaxed)->markUsed();
1382     allocatedCount--;
1383     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1384 #if COLLECT_STATISTICS
1385     // Note that getAllocationBin is not called on the hottest path with statistics off.
1386     if (tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->getActiveBlock() != this)
1387         STAT_increment(getThreadId(), getIndex(objectSize), freeToInactiveBlock);
1388     else
1389         STAT_increment(getThreadId(), getIndex(objectSize), freeToActiveBlock);
1390 #endif
1391     if (empty()) {
1392         // If the last object of a slab is freed, the slab cannot be marked full
1393         MALLOC_ASSERT(!isFull, ASSERT_TEXT);
1394         tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->processEmptyBlock(this, /*poolTheBlock=*/true);
1395     } else { // hot path
1396         FreeObject *objectToFree = findObjectToFree(object);
1397         objectToFree->next = freeList;
1398         freeList = objectToFree;
1399         adjustPositionInBin();
1400     }
1401 }
1402 
1403 void Block::freePublicObject (FreeObject *objectToFree)
1404 {
1405     FreeObject* localPublicFreeList{};
1406 
1407     MALLOC_ITT_SYNC_RELEASING(&publicFreeList);
1408 #if FREELIST_NONBLOCKING
1409     // TBB_REVAMP_TODO: make it non atomic in non-blocking scenario
1410     localPublicFreeList = publicFreeList.load(std::memory_order_relaxed);
1411     do {
1412         objectToFree->next = localPublicFreeList;
1413         // no backoff necessary because we are trying to make a change, not waiting for a change
1414     } while( !publicFreeList.compare_exchange_strong(localPublicFreeList, objectToFree) );
1415 #else
1416     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1417     {
1418         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1419         localPublicFreeList = objectToFree->next = publicFreeList;
1420         publicFreeList = objectToFree;
1421     }
1422 #endif
1423 
1424     if( localPublicFreeList==NULL ) {
1425         // if the block is abandoned, its nextPrivatizable pointer should be UNUSABLE
1426         // otherwise, it should point to the bin the block belongs to.
1427         // reading nextPrivatizable is thread-safe below, because:
1428         // 1) the executing thread atomically got publicFreeList==NULL and changed it to non-NULL;
1429         // 2) only owning thread can change it back to NULL,
1430         // 3) but that cannot be done until the block is put into the mailbox
1431         // So the executing thread is now the only one that can change nextPrivatizable
1432         Block* next = nextPrivatizable.load(std::memory_order_acquire);
1433         if( !isNotForUse(next) ) {
1434             MALLOC_ASSERT( next!=nullptr, ASSERT_TEXT );
1435             Bin* theBin = (Bin*) next;
1436 #if MALLOC_DEBUG && TBB_REVAMP_TODO
1437             // FIXME: The thread that returns the block is not the block's owner.
1438             // The below assertion compares 'theBin' against the caller's local bin, thus, it always fails.
1439             // Need to find a way to get the correct remote bin for comparison.
1440             { // check that nextPrivatizable points to the bin the block belongs to
1441                 uint32_t index = getIndex( objectSize );
1442                 TLSData* tls = getThreadMallocTLS();
1443                 MALLOC_ASSERT( theBin==tls->bin+index, ASSERT_TEXT );
1444             }
1445 #endif // MALLOC_DEBUG
1446             theBin->addPublicFreeListBlock(this);
1447         }
1448     }
1449     STAT_increment(getThreadId(), ThreadCommonCounters, freeToOtherThread);
1450     STAT_increment(ownerTid.load(std::memory_order_relaxed), getIndex(objectSize), freeByOtherThread);
1451 }
1452 
1453 // Make objects freed by other threads available for use again
1454 void Block::privatizePublicFreeList( bool reset )
1455 {
1456     FreeObject *localPublicFreeList;
1457     // If reset is false, publicFreeList should not be zeroed but set to UNUSABLE
1458     // to properly synchronize with other threads freeing objects to this slab.
1459     const intptr_t endMarker = reset ? 0 : UNUSABLE;
1460 
1461     // Only the owner thread may reset the pointer to NULL
1462     MALLOC_ASSERT( isOwnedByCurrentThread() || !reset, ASSERT_TEXT );
1463 #if FREELIST_NONBLOCKING
1464     localPublicFreeList = publicFreeList.exchange((FreeObject*)endMarker);
1465 #else
1466     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1467     {
1468         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1469         localPublicFreeList = publicFreeList;
1470         publicFreeList = endMarker;
1471     }
1472 #endif
1473     MALLOC_ITT_SYNC_ACQUIRED(&publicFreeList);
1474     MALLOC_ASSERT( !(reset && isNotForUse(publicFreeList)), ASSERT_TEXT );
1475 
1476     // publicFreeList must have been UNUSABLE or valid, but not NULL
1477     MALLOC_ASSERT( localPublicFreeList!=NULL, ASSERT_TEXT );
1478     if( isSolidPtr(localPublicFreeList) ) {
1479         MALLOC_ASSERT( allocatedCount <= (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1480         /* other threads do not modify the counter when freeing objects into this block */
1481         allocatedCount--;
1482         FreeObject *temp = localPublicFreeList;
1483         while( isSolidPtr(temp->next) ){ // the list will end with either NULL or UNUSABLE
1484             temp = temp->next;
1485             allocatedCount--;
1486             MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1487         }
1488         /* merge with local freeList */
1489         temp->next = freeList;
1490         freeList = localPublicFreeList;
1491         STAT_increment(getThreadId(), getIndex(objectSize), allocPrivatized);
1492     }
1493 }
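
// --- Illustrative sketch (documentation only; not compiled) ---
// freePublicObject() and privatizePublicFreeList() above form a lock-free hand-off:
// remote threads push freed objects onto publicFreeList with a CAS loop, and the
// owner drains the whole list with a single exchange().  A minimal standalone
// version of that pattern (made-up names, without the UNUSABLE marker, the ITT
// annotations, or statistics) might look like this:
#if 0
#include <atomic>

struct Node { Node *next; };

struct PublicList {
    std::atomic<Node*> head{nullptr};

    // any thread: push one node (classic lock-free stack push)
    void push(Node *n) {
        Node *old = head.load(std::memory_order_relaxed);
        do {
            n->next = old;
        } while (!head.compare_exchange_weak(old, n,
                     std::memory_order_release, std::memory_order_relaxed));
    }

    // owner only: take the whole list at once
    Node *drainAll() {
        return head.exchange(nullptr, std::memory_order_acquire);
    }
};
#endif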
1494 
1495 void Block::privatizeOrphaned(TLSData *tls, unsigned index)
1496 {
1497     Bin* bin = tls->bin + index;
1498     STAT_increment(getThreadId(), index, allocBlockPublic);
1499     next = NULL;
1500     previous = NULL;
1501     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) != NULL, ASSERT_TEXT );
1502     /* There is no race here since no other thread owns this block */
1503     markOwned(tls);
1504     // It is safe to change nextPrivatizable, as publicFreeList is not null
1505     MALLOC_ASSERT( isNotForUse(nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1506     nextPrivatizable.store((Block*)bin, std::memory_order_relaxed);
1507     // the next call is required to change publicFreeList to 0
1508     privatizePublicFreeList();
1509     if( empty() ) {
1510         restoreBumpPtr();
1511     } else {
1512         adjustFullness(); // check the block fullness and set isFull
1513     }
1514     MALLOC_ASSERT( !isNotForUse(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1515 }
1516 
1517 
1518 bool Block::readyToShare()
1519 {
1520     FreeObject* oldVal = NULL;
1521 #if FREELIST_NONBLOCKING
1522     publicFreeList.compare_exchange_strong(oldVal, (FreeObject*)UNUSABLE);
1523 #else
1524     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1525     {
1526         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1527         if ( (oldVal=publicFreeList)==NULL )
1528             (intptr_t&)(publicFreeList) = UNUSABLE;
1529     }
1530 #endif
1531     return oldVal==NULL;
1532 }
1533 
1534 void Block::shareOrphaned(intptr_t binTag, unsigned index)
1535 {
1536     MALLOC_ASSERT( binTag, ASSERT_TEXT );
1537     // suppress the unreferenced formal parameter warning
1538     tbb::detail::suppress_unused_warning(index);
1539     STAT_increment(getThreadId(), index, freeBlockPublic);
1540     markOrphaned();
1541     bool syncOnMailbox = false;
1542     if ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1543         // First check passed: the block is not in mailbox yet.
1544         // Need to set publicFreeList to non-zero, so other threads
1545         // will not change nextPrivatizable and it can be zeroed.
1546         if ( !readyToShare() ) {
1547             // another thread freed an object; we need to wait until it finishes.
1548             // There is no need for exponential backoff, as the wait here is not for a lock;
1549             // but we need to yield, so the thread we are waiting for has a chance to run.
1550             // TODO: add a pause to also be friendly to hyperthreads
1551             int count = 256;
1552             while ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1553                 if (--count==0) {
1554                     do_yield();
1555                     count = 256;
1556                 }
1557             }
1558         }
1559     }
1560     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) !=NULL, ASSERT_TEXT );
1561     // now it is safe to change our data
1562     previous = NULL;
1563     // it is the caller's responsibility to ensure that the list of blocks
1564     // formed by nextPrivatizable pointers is kept consistent if required;
1565     // if this is only called from thread shutdown code, it does not matter.
1566     nextPrivatizable.store((Block*)UNUSABLE, std::memory_order_relaxed);
1567 }
1568 
1569 void Block::cleanBlockHeader()
1570 {
1571     next = nullptr;
1572     previous = nullptr;
1573     freeList = nullptr;
1574     allocatedCount = 0;
1575     isFull = false;
1576     tlsPtr.store(nullptr, std::memory_order_relaxed);
1577 
1578     publicFreeList.store(nullptr, std::memory_order_relaxed);
1579 }
1580 
1581 void Block::initEmptyBlock(TLSData *tls, size_t size)
1582 {
1583     // Having getIndex and getObjectSize called next to each other
1584     // allows better compiler optimization as they basically share the code.
1585     unsigned int index = getIndex(size);
1586     unsigned int objSz = getObjectSize(size);
1587 
1588     cleanBlockHeader();
1589     objectSize = objSz;
1590     markOwned(tls);
1591     // bump pointer should be prepared for the first allocation - thus move it down by objectSize
1592     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1593 
1594     // each block should have the address where the head of the list of "privatizable" blocks is kept;
1595     // the only exception is a bootstrap block, which is initialized while TLS is still NULL
1596     nextPrivatizable.store( tls? (Block*)(tls->bin + index) : nullptr, std::memory_order_relaxed);
1597     TRACEF(( "[ScalableMalloc trace] Empty block %p is initialized, owner is %ld, objectSize is %d, bumpPtr is %p\n",
1598              this, tlsPtr.load(std::memory_order_relaxed) ? getThreadId() : -1, objectSize, bumpPtr ));
1599 }
1600 
1601 Block *OrphanedBlocks::get(TLSData *tls, unsigned int size)
1602 {
1603     // TODO: try to use index from getAllocationBin
1604     unsigned int index = getIndex(size);
1605     Block *block = bins[index].pop();
1606     if (block) {
1607         MALLOC_ITT_SYNC_ACQUIRED(bins+index);
1608         block->privatizeOrphaned(tls, index);
1609     }
1610     return block;
1611 }
1612 
1613 void OrphanedBlocks::put(intptr_t binTag, Block *block)
1614 {
1615     unsigned int index = getIndex(block->getSize());
1616     block->shareOrphaned(binTag, index);
1617     MALLOC_ITT_SYNC_RELEASING(bins+index);
1618     bins[index].push(block);
1619 }
1620 
1621 void OrphanedBlocks::reset()
1622 {
1623     for (uint32_t i=0; i<numBlockBinLimit; i++)
1624         new (bins+i) LifoList();
1625 }
1626 
1627 bool OrphanedBlocks::cleanup(Backend* backend)
1628 {
1629     bool released = false;
1630     for (uint32_t i=0; i<numBlockBinLimit; i++) {
1631         Block* block = bins[i].grab();
1632         MALLOC_ITT_SYNC_ACQUIRED(bins+i);
1633         while (block) {
1634             Block* next = block->next;
1635             block->privatizePublicFreeList( /*reset=*/false ); // do not set publicFreeList to NULL
1636             if (block->empty()) {
1637                 block->reset();
1638                 // slab blocks in user's pools do not have valid backRefIdx
1639                 if (!backend->inUserPool())
1640                     removeBackRef(*(block->getBackRefIdx()));
1641                 backend->putSlabBlock(block);
1642                 released = true;
1643             } else {
1644                 MALLOC_ITT_SYNC_RELEASING(bins+i);
1645                 bins[i].push(block);
1646             }
1647             block = next;
1648         }
1649     }
1650     return released;
1651 }
1652 
1653 FreeBlockPool::ResOfGet FreeBlockPool::getBlock()
1654 {
1655     Block *b = head.exchange(NULL);
1656 
1657     if (b) {
1658         size--;
1659         Block *newHead = b->next;
1660         lastAccessMiss = false;
1661         head.store(newHead, std::memory_order_release);
1662     } else {
1663         lastAccessMiss = true;
1664     }
1665     return ResOfGet(b, lastAccessMiss);
1666 }
1667 
1668 void FreeBlockPool::returnBlock(Block *block)
1669 {
1670     MALLOC_ASSERT( size <= POOL_HIGH_MARK, ASSERT_TEXT );
1671     Block *localHead = head.exchange(NULL);
1672 
1673     if (!localHead) {
1674         size = 0; // head was stolen by externalClean, correct size accordingly
1675     } else if (size == POOL_HIGH_MARK) {
1676         // release the cold blocks and add the hot one,
1677         // i.e. keep POOL_LOW_MARK-1 blocks and add the new block to the head
1678         Block *headToFree = localHead, *helper;
1679         for (int i=0; i<POOL_LOW_MARK-2; i++)
1680             headToFree = headToFree->next;
1681         Block *last = headToFree;
1682         headToFree = headToFree->next;
1683         last->next = NULL;
1684         size = POOL_LOW_MARK-1;
1685         for (Block *currBl = headToFree; currBl; currBl = helper) {
1686             helper = currBl->next;
1687             // slab blocks in user's pools do not have valid backRefIdx
1688             if (!backend->inUserPool())
1689                 removeBackRef(currBl->backRefIdx);
1690             backend->putSlabBlock(currBl);
1691         }
1692     }
1693     size++;
1694     block->next = localHead;
1695     head.store(block, std::memory_order_release);
1696 }
1697 
1698 bool FreeBlockPool::externalCleanup()
1699 {
1700     Block *helper;
1701     bool released = false;
1702 
1703     for (Block *currBl=head.exchange(NULL); currBl; currBl=helper) {
1704         helper = currBl->next;
1705         // slab blocks in user's pools do not have valid backRefIdx
1706         if (!backend->inUserPool())
1707             removeBackRef(currBl->backRefIdx);
1708         backend->putSlabBlock(currBl);
1709         released = true;
1710     }
1711     return released;
1712 }
1713 
1714 /* Prepare the block for returning to FreeBlockPool */
1715 void Block::reset()
1716 {
1717     // it is the caller's responsibility to ensure no data is lost before calling this
1718     MALLOC_ASSERT( allocatedCount==0, ASSERT_TEXT );
1719     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1720     if (!isStartupAllocObject())
1721         STAT_increment(getThreadId(), getIndex(objectSize), freeBlockBack);
1722 
1723     cleanBlockHeader();
1724 
1725     nextPrivatizable.store(nullptr, std::memory_order_relaxed);
1726 
1727     objectSize = 0;
1728     // for an empty block, bump pointer should point right after the end of the block
1729     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize);
1730 }
1731 
1732 inline void Bin::setActiveBlock (Block *block)
1733 {
1734 //    MALLOC_ASSERT( bin, ASSERT_TEXT );
1735     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1736     // it is the caller's responsibility to keep the bin consistent (i.e. ensure this block is in the bin list)
1737     activeBlk = block;
1738 }
1739 
1740 inline Block* Bin::setPreviousBlockActive()
1741 {
1742     MALLOC_ASSERT( activeBlk, ASSERT_TEXT );
1743     Block* temp = activeBlk->previous;
1744     if( temp ) {
1745         MALLOC_ASSERT( !(temp->isFull), ASSERT_TEXT );
1746         activeBlk = temp;
1747     }
1748     return temp;
1749 }
1750 
1751 inline bool Block::isOwnedByCurrentThread() const {
1752     return tlsPtr.load(std::memory_order_relaxed) && ownerTid.isCurrentThreadId();
1753 }
1754 
1755 FreeObject *Block::findObjectToFree(const void *object) const
1756 {
1757     FreeObject *objectToFree;
1758     // Due to aligned allocations, a pointer passed to scalable_free
1759     // might differ from the address of the internally allocated object.
1760     // Small objects, however, are always fine.
1761     if (objectSize <= maxSegregatedObjectSize)
1762         objectToFree = (FreeObject*)object;
1763     // "Fitting size" allocations are suspicious if aligned higher than naturally
1764     else {
1765         if ( ! isAligned(object,2*fittingAlignment) )
1766             // TODO: the above check is questionable - it gives false negatives in ~50% of cases,
1767             //       so it might even be slower on average than unconditional use of findAllocatedObject.
1768             // here it should be a "real" object
1769             objectToFree = (FreeObject*)object;
1770         else
1771             // here object can be an aligned address, so applying additional checks
1772             objectToFree = findAllocatedObject(object);
1773         MALLOC_ASSERT( isAligned(objectToFree,fittingAlignment), ASSERT_TEXT );
1774     }
1775     MALLOC_ASSERT( isProperlyPlaced(objectToFree), ASSERT_TEXT );
1776 
1777     return objectToFree;
1778 }
1779 
1780 void TLSData::release()
1781 {
1782     memPool->extMemPool.allLocalCaches.unregisterThread(this);
1783     externalCleanup(/*cleanOnlyUnused=*/false, /*cleanBins=*/false);
1784 
1785     for (unsigned index = 0; index < numBlockBins; index++) {
1786         Block *activeBlk = bin[index].getActiveBlock();
1787         if (!activeBlk)
1788             continue;
1789         Block *threadlessBlock = activeBlk->previous;
1790         bool syncOnMailbox = false;
1791         while (threadlessBlock) {
1792             Block *threadBlock = threadlessBlock->previous;
1793             if (threadlessBlock->empty()) {
1794                 /* the thread is being destroyed, so do not use its block pool */
1795                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1796             } else {
1797                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1798                 syncOnMailbox = true;
1799             }
1800             threadlessBlock = threadBlock;
1801         }
1802         threadlessBlock = activeBlk;
1803         while (threadlessBlock) {
1804             Block *threadBlock = threadlessBlock->next;
1805             if (threadlessBlock->empty()) {
1806                 /* the thread is being destroyed, so do not use its block pool */
1807                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1808             } else {
1809                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1810                 syncOnMailbox = true;
1811             }
1812             threadlessBlock = threadBlock;
1813         }
1814         bin[index].resetActiveBlock();
1815 
1816         if (syncOnMailbox) {
1817             // Although we synchronized on nextPrivatizable inside the block, we still need to
1818             // synchronize on the bin lifetime because the thread releasing an object into the public
1819             // free list is touching the bin (mailbox and mailLock)
1820             MallocMutex::scoped_lock scoped_cs(bin[index].mailLock);
1821         }
1822     }
1823 }
1824 
1825 
1826 #if MALLOC_CHECK_RECURSION
1827 // TODO: Use dedicated heap for this
1828 
1829 /*
1830  * This is a special kind of allocation that can be used when malloc is
1831  * not available (either during startup or when malloc has already been entered
1832  * and we are, say, inside a pthread_setspecific call).
1833  * A block can contain objects of different sizes:
1834  * allocations are performed by moving the bump pointer and incrementing an object counter,
1835  * and releases are tracked via the counter of objects allocated in the block,
1836  * or by moving the bump pointer back when the released object is on the boundary.
1837  * TODO: make the bump pointer grow in the same backward direction as in all other blocks.
1838  */
1839 
1840 class StartupBlock : public Block {
1841     size_t availableSize() const {
1842         return slabSize - ((uintptr_t)bumpPtr - (uintptr_t)this);
1843     }
1844     static StartupBlock *getBlock();
1845 public:
1846     static FreeObject *allocate(size_t size);
1847     static size_t msize(void *ptr) { return *((size_t*)ptr - 1); }
1848     void free(void *ptr);
1849 };
1850 
1851 static MallocMutex startupMallocLock;
1852 static StartupBlock *firstStartupBlock;
1853 
1854 StartupBlock *StartupBlock::getBlock()
1855 {
1856     BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/false);
1857     if (backRefIdx.isInvalid()) return NULL;
1858 
1859     StartupBlock *block = static_cast<StartupBlock*>(
1860         defaultMemPool->extMemPool.backend.getSlabBlock(1));
1861     if (!block) return NULL;
1862 
1863     block->cleanBlockHeader();
1864     setBackRef(backRefIdx, block);
1865     block->backRefIdx = backRefIdx;
1866     // use startupAllocObjSizeMark to mark objects allocated from a startup block
1867     block->objectSize = startupAllocObjSizeMark;
1868     block->bumpPtr = (FreeObject *)((uintptr_t)block + sizeof(StartupBlock));
1869     return block;
1870 }
1871 
1872 FreeObject *StartupBlock::allocate(size_t size)
1873 {
1874     FreeObject *result;
1875     StartupBlock *newBlock = NULL;
1876     bool newBlockUnused = false;
1877 
1878     /* Objects must be aligned on their natural bounds,
1879        and objects bigger than a word on a word boundary. */
1880     size = alignUp(size, sizeof(size_t));
1881     // We need size of an object to implement msize.
1882     size_t reqSize = size + sizeof(size_t);
1883     {
1884         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1885         // Re-check whether we need a new block (conditions might have changed)
1886         if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) {
1887             if (!newBlock) {
1888                 newBlock = StartupBlock::getBlock();
1889                 if (!newBlock) return NULL;
1890             }
1891             newBlock->next = (Block*)firstStartupBlock;
1892             if (firstStartupBlock)
1893                 firstStartupBlock->previous = (Block*)newBlock;
1894             firstStartupBlock = newBlock;
1895         }
1896         result = firstStartupBlock->bumpPtr;
1897         firstStartupBlock->allocatedCount++;
1898         firstStartupBlock->bumpPtr =
1899             (FreeObject *)((uintptr_t)firstStartupBlock->bumpPtr + reqSize);
1900     }
1901 
1902     // keep object size at the negative offset
1903     *((size_t*)result) = size;
1904     return (FreeObject*)((size_t*)result+1);
1905 }
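
// --- Illustrative sketch (documentation only; not compiled) ---
// allocate() above stores the requested size in the word immediately preceding the
// returned pointer, which is exactly what msize() reads back.  The same layout in a
// tiny standalone form (hypothetical names; malloc stands in for the bump-pointer
// allocation under startupMallocLock):
#if 0
#include <cstddef>
#include <cstdlib>

void *demoAlloc(std::size_t size) {
    std::size_t *raw = (std::size_t*)std::malloc(size + sizeof(std::size_t));
    if (!raw) return nullptr;
    *raw = size;      // the size lives at a negative offset from the user pointer
    return raw + 1;   // user-visible pointer
}
std::size_t demoMsize(void *ptr) { return *((std::size_t*)ptr - 1); }
void demoFree(void *ptr) { std::free((std::size_t*)ptr - 1); }
#endif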
1906 
1907 void StartupBlock::free(void *ptr)
1908 {
1909     Block* blockToRelease = NULL;
1910     {
1911         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1912 
1913         MALLOC_ASSERT(firstStartupBlock, ASSERT_TEXT);
1914         MALLOC_ASSERT(startupAllocObjSizeMark==objectSize
1915                       && allocatedCount>0, ASSERT_TEXT);
1916         MALLOC_ASSERT((uintptr_t)ptr>=(uintptr_t)this+sizeof(StartupBlock)
1917                       && (uintptr_t)ptr+StartupBlock::msize(ptr)<=(uintptr_t)this+slabSize,
1918                       ASSERT_TEXT);
1919         if (0 == --allocatedCount) {
1920             if (this == firstStartupBlock)
1921                 firstStartupBlock = (StartupBlock*)firstStartupBlock->next;
1922             if (previous)
1923                 previous->next = next;
1924             if (next)
1925                 next->previous = previous;
1926             blockToRelease = this;
1927         } else if ((uintptr_t)ptr + StartupBlock::msize(ptr) == (uintptr_t)bumpPtr) {
1928             // last object in the block released
1929             FreeObject *newBump = (FreeObject*)((size_t*)ptr - 1);
1930             MALLOC_ASSERT((uintptr_t)newBump>(uintptr_t)this+sizeof(StartupBlock),
1931                           ASSERT_TEXT);
1932             bumpPtr = newBump;
1933         }
1934     }
1935     if (blockToRelease) {
1936         blockToRelease->previous = blockToRelease->next = NULL;
1937         defaultMemPool->returnEmptyBlock(blockToRelease, /*poolTheBlock=*/false);
1938     }
1939 }
1940 
1941 #endif /* MALLOC_CHECK_RECURSION */
1942 
1943 /********* End thread related code  *************/
1944 
1945 /********* Library initialization *************/
1946 
1947 //! Value indicating the state of initialization.
1948 /* 0 = initialization not started.
1949  * 1 = initialization started but not finished.
1950  * 2 = initialization finished.
1951  * In theory, we only need values 0 and 2. But value 1 is nonetheless
1952  * useful for detecting errors in the double-check pattern.
1953  */
1954 static std::atomic<intptr_t> mallocInitialized{0};   // implicitly initialized to 0
1955 static MallocMutex initMutex;
1956 
1957 /** The leading "\0" is here so that applying "strings" to the binary
1958     delivers a clean result. */
1959 static char VersionString[] = "\0" TBBMALLOC_VERSION_STRINGS;
1960 
1961 #if USE_PTHREAD && (__TBB_SOURCE_DIRECTLY_INCLUDED || __TBB_USE_DLOPEN_REENTRANCY_WORKAROUND)
1962 
1963 /* Narrow the race window between dynamic library unloading and the pthread key
1964    destructor. The protection is applied only to pthreads with supported unloading. */
1965 class ShutdownSync {
1966 /* flag is the number of threads in pthread key dtor body
1967    (i.e., between threadDtorStart() and threadDtorDone())
1968    or the signal to skip dtor, if flag < 0 */
1969     std::atomic<intptr_t> flag;
1970     static const intptr_t skipDtor = INTPTR_MIN/2;
1971 public:
1972     void init() { flag.store(0, std::memory_order_release); }
1973 /* We assume that fewer than 2*abs(skipDtor) threads ever call threadDtorStart()
1974    simultaneously, so the increments alone can never make flag negative. */
1975     bool threadDtorStart() {
1976         if (flag.load(std::memory_order_acquire) < 0)
1977             return false;
1978         if (++flag <= 0) { // note that the new value is returned
1979             flag.fetch_sub(1); // flag is spoiled by us, restore it
1980             return false;
1981         }
1982         return true;
1983     }
1984     void threadDtorDone() {
1985         flag.fetch_sub(1);
1986     }
1987     void processExit() {
1988         if (flag.fetch_add(skipDtor) != 0) {
1989             SpinWaitUntilEq(flag, skipDtor);
1990         }
1991     }
1992 };
1993 
1994 #else
1995 
1996 class ShutdownSync {
1997 public:
1998     void init() { }
1999     bool threadDtorStart() { return true; }
2000     void threadDtorDone() { }
2001     void processExit() { }
2002 };
2003 
2004 #endif // USE_PTHREAD && (__TBB_SOURCE_DIRECTLY_INCLUDED || __TBB_USE_DLOPEN_REENTRANCY_WORKAROUND)
2005 
2006 static ShutdownSync shutdownSync;
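
// --- Illustrative sketch (documentation only; not compiled) ---
// A sketch of the intended ShutdownSync protocol, as used further down in this file:
// each TLS destructor brackets its work with threadDtorStart()/threadDtorDone(), and
// process shutdown calls processExit() once to wait for destructors that are already
// running and to turn later ones into no-ops.  The function names below are
// hypothetical.
#if 0
void onThreadExit(TLSData *tls) {
    if (!shutdownSync.threadDtorStart())
        return;                      // process shutdown already in progress, skip the dtor
    // ... per-thread cleanup that may touch allocator state ...
    shutdownSync.threadDtorDone();
}

void onProcessExit() {
    shutdownSync.processExit();      // wait for in-flight destructors, block new ones
}
#endif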
2007 
2008 inline bool isMallocInitialized() {
2009     // Load must have acquire fence; otherwise thread taking "initialized" path
2010     // might perform textually later loads *before* mallocInitialized becomes 2.
2011     return 2 == mallocInitialized.load(std::memory_order_acquire);
2012 }
2013 
2014 /* Caller is responsible for ensuring this routine is called exactly once. */
2015 extern "C" void MallocInitializeITT() {
2016 #if __TBB_USE_ITT_NOTIFY
2017     if (!usedBySrcIncluded)
2018         tbb::detail::r1::__TBB_load_ittnotify();
2019 #endif
2020 }
2021 
2022 void MemoryPool::initDefaultPool() {
2023     hugePages.init();
2024 }
2025 
2026 /*
2027  * Allocator initialization routine;
2028  * it is called lazily on the very first scalable_malloc call.
2029  */
2030 static bool initMemoryManager()
2031 {
2032     TRACEF(( "[ScalableMalloc trace] sizeof(Block) is %d (expected 128); sizeof(uintptr_t) is %d\n",
2033              sizeof(Block), sizeof(uintptr_t) ));
2034     MALLOC_ASSERT( 2*blockHeaderAlignment == sizeof(Block), ASSERT_TEXT );
2035     MALLOC_ASSERT( sizeof(FreeObject) == sizeof(void*), ASSERT_TEXT );
2036     MALLOC_ASSERT( isAligned(defaultMemPool, sizeof(intptr_t)),
2037                    "Memory pool must be void*-aligned for atomic to work over aligned arguments.");
2038 
2039 #if USE_WINTHREAD
2040     const size_t granularity = 64*1024; // granularity of VirtualAlloc
2041 #else
2042     // POSIX.1-2001-compliant way to get page size
2043     const size_t granularity = sysconf(_SC_PAGESIZE);
2044 #endif
2045     if (!defaultMemPool) {
2046         // Do not rely on static constructors; do the assignment explicitly in case
2047         // the library's static data section has not been initialized yet at this call.
2048         defaultMemPool = (MemoryPool*)defaultMemPool_space;
2049     }
2050     bool initOk = defaultMemPool->
2051         extMemPool.init(0, NULL, NULL, granularity,
2052                         /*keepAllMemory=*/false, /*fixedPool=*/false);
2053 // TODO: extMemPool.init() to not allocate memory
2054     if (!initOk || !initBackRefMaster(&defaultMemPool->extMemPool.backend) || !ThreadId::init())
2055         return false;
2056     MemoryPool::initDefaultPool();
2057     // init() is required iff initMemoryManager() is called
2058     // after mallocProcessShutdownNotification()
2059     shutdownSync.init();
2060 #if COLLECT_STATISTICS
2061     initStatisticsCollection();
2062 #endif
2063     return true;
2064 }
2065 
2066 static bool GetBoolEnvironmentVariable(const char* name) {
2067     return tbb::detail::r1::GetBoolEnvironmentVariable(name);
2068 }
2069 
2070 //! Ensures that initMemoryManager() is called once and only once.
2071 /** Does not return until initMemoryManager() has been completed by a thread.
2072     There is no need to call this routine if mallocInitialized==2. */
2073 static bool doInitialization()
2074 {
2075     MallocMutex::scoped_lock lock( initMutex );
2076     if (mallocInitialized.load(std::memory_order_relaxed)!=2) {
2077         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==0, ASSERT_TEXT );
2078         mallocInitialized.store(1, std::memory_order_relaxed);
2079         RecursiveMallocCallProtector scoped;
2080         if (!initMemoryManager()) {
2081             mallocInitialized.store(0, std::memory_order_relaxed); // restore and bail out
2082             return false;
2083         }
2084 #ifdef  MALLOC_EXTRA_INITIALIZATION
2085         MALLOC_EXTRA_INITIALIZATION;
2086 #endif
2087 #if MALLOC_CHECK_RECURSION
2088         RecursiveMallocCallProtector::detectNaiveOverload();
2089 #endif
2090         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==1, ASSERT_TEXT );
2091         // Store must have release fence, otherwise mallocInitialized==2
2092         // might become remotely visible before side effects of
2093         // initMemoryManager() become remotely visible.
2094         mallocInitialized.store(2, std::memory_order_release);
2095         if( GetBoolEnvironmentVariable("TBB_VERSION") ) {
2096             fputs(VersionString+1,stderr);
2097             hugePages.printStatus();
2098         }
2099     }
2100     /* It can't be 0, or we would have initialized it above */
2101     MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==2, ASSERT_TEXT );
2102     return true;
2103 }
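
// --- Illustrative sketch (documentation only; not compiled) ---
// The generic shape of the double-checked initialization used above: an acquire load
// on the fast path, a mutex plus a release store on the slow path, so that any thread
// that observes "done" also observes all initialization side effects.
#if 0
#include <atomic>
#include <mutex>

static std::atomic<int> initState{0};   // 0 = not started, 1 = in progress, 2 = done
static std::mutex initLock;

bool ensureInitialized() {
    if (initState.load(std::memory_order_acquire) == 2)
        return true;                                    // fast path
    std::lock_guard<std::mutex> lock(initLock);
    if (initState.load(std::memory_order_relaxed) != 2) {
        initState.store(1, std::memory_order_relaxed);
        // ... one-time setup goes here; on failure store 0 and return false ...
        initState.store(2, std::memory_order_release); // publish the setup before the flag
    }
    return true;
}
#endif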
2104 
2105 /********* End library initialization *************/
2106 
2107 /********* The malloc show begins     *************/
2108 
2109 
2110 FreeObject *Block::allocateFromFreeList()
2111 {
2112     FreeObject *result;
2113 
2114     if (!freeList) return NULL;
2115 
2116     result = freeList;
2117     MALLOC_ASSERT( result, ASSERT_TEXT );
2118 
2119     freeList = result->next;
2120     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2121     allocatedCount++;
2122     STAT_increment(getThreadId(), getIndex(objectSize), allocFreeListUsed);
2123 
2124     return result;
2125 }
2126 
2127 FreeObject *Block::allocateFromBumpPtr()
2128 {
2129     FreeObject *result = bumpPtr;
2130     if (result) {
2131         bumpPtr = (FreeObject *) ((uintptr_t) bumpPtr - objectSize);
2132         if ( (uintptr_t)bumpPtr < (uintptr_t)this+sizeof(Block) ) {
2133             bumpPtr = NULL;
2134         }
2135         MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2136         allocatedCount++;
2137         STAT_increment(getThreadId(), getIndex(objectSize), allocBumpPtrUsed);
2138     }
2139     return result;
2140 }
2141 
2142 inline FreeObject* Block::allocate()
2143 {
2144     MALLOC_ASSERT( isOwnedByCurrentThread(), ASSERT_TEXT );
2145 
2146     /* for better cache locality, look in the free list first. */
2147     if ( FreeObject *result = allocateFromFreeList() ) {
2148         return result;
2149     }
2150     MALLOC_ASSERT( !freeList, ASSERT_TEXT );
2151 
2152     /* if the free list is empty, try thread-local bump-pointer allocation. */
2153     if ( FreeObject *result = allocateFromBumpPtr() ) {
2154         return result;
2155     }
2156     MALLOC_ASSERT( !bumpPtr, ASSERT_TEXT );
2157 
2158     /* the block is considered full. */
2159     isFull = true;
2160     return NULL;
2161 }
2162 
2163 size_t Block::findObjectSize(void *object) const
2164 {
2165     size_t blSize = getSize();
2166 #if MALLOC_CHECK_RECURSION
2167     // Currently, there are no aligned allocations from startup blocks,
2168     // so we can return just StartupBlock::msize().
2169     // TODO: This must be extended if we add aligned allocation from startup blocks.
2170     if (!blSize)
2171         return StartupBlock::msize(object);
2172 #endif
2173     // object can be aligned, so real size can be less than block's
2174     size_t size =
2175         blSize - ((uintptr_t)object - (uintptr_t)findObjectToFree(object));
2176     MALLOC_ASSERT(size>0 && size<minLargeObjectSize, ASSERT_TEXT);
2177     return size;
2178 }
2179 
2180 void Bin::moveBlockToFront(Block *block)
2181 {
2182     /* move the block to the front of the bin */
2183     if (block == activeBlk) return;
2184     outofTLSBin(block);
2185     pushTLSBin(block);
2186 }
2187 
2188 void Bin::processEmptyBlock(Block *block, bool poolTheBlock)
2189 {
2190     if (block != activeBlk) {
2191         /* We are not using this block; return it to the pool */
2192         outofTLSBin(block);
2193         block->getMemPool()->returnEmptyBlock(block, poolTheBlock);
2194     } else {
2195         /* all objects are free - let's restore the bump pointer */
2196         block->restoreBumpPtr();
2197     }
2198 }
2199 
2200 template<int LOW_MARK, int HIGH_MARK>
2201 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool)
2202 {
2203     const size_t size = object->unalignedSize;
2204     // do not spoil the cache with too large an object, as that can cause a total cleanup of the cache
2205     if (size > MAX_TOTAL_SIZE)
2206         return false;
2207     LargeMemoryBlock *localHead = head.exchange(NULL);
2208 
2209     object->prev = NULL;
2210     object->next = localHead;
2211     if (localHead)
2212         localHead->prev = object;
2213     else {
2214         // these might not have been reset when the local cache was stolen; correct them
2215         totalSize = 0;
2216         numOfBlocks = 0;
2217         tail = object;
2218     }
2219     localHead = object;
2220     totalSize += size;
2221     numOfBlocks++;
2222     // must meet both the size and the number-of-cached-objects constraints
2223     if (totalSize > MAX_TOTAL_SIZE || numOfBlocks >= HIGH_MARK) {
2224         // scan from the tail until both constraints are met
2225         while (totalSize > MAX_TOTAL_SIZE || numOfBlocks > LOW_MARK) {
2226             totalSize -= tail->unalignedSize;
2227             numOfBlocks--;
2228             tail = tail->prev;
2229         }
2230         LargeMemoryBlock *headToRelease = tail->next;
2231         tail->next = NULL;
2232 
2233         extMemPool->freeLargeObjectList(headToRelease);
2234     }
2235 
2236     head.store(localHead, std::memory_order_release);
2237     return true;
2238 }
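
// --- Illustrative sketch (documentation only; not compiled) ---
// The trimming policy of put() above, in a simplified single-threaded form: the newest
// entry goes to the front, and once either limit is exceeded the coldest entries are
// evicted from the tail until both the LOW_MARK and the total-size constraints hold
// again.  The limits below are hypothetical, and the real code additionally steals the
// list head atomically so that it can coexist with externalCleanup().
#if 0
#include <cstddef>
#include <list>

struct TinyLargeObjectCache {
    static const std::size_t maxTotal = 8 * 1024 * 1024;   // hypothetical limits
    static const std::size_t highMark = 32, lowMark = 8;
    std::list<std::size_t> sizes;   // cached block sizes, newest first
    std::size_t total = 0;

    void put(std::size_t s) {
        sizes.push_front(s);
        total += s;
        if (total > maxTotal || sizes.size() >= highMark) {
            while (total > maxTotal || sizes.size() > lowMark) {
                total -= sizes.back();   // evict the least recently cached block
                sizes.pop_back();
            }
        }
    }
};
#endif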
2239 
2240 template<int LOW_MARK, int HIGH_MARK>
2241 LargeMemoryBlock *LocalLOCImpl<LOW_MARK, HIGH_MARK>::get(size_t size)
2242 {
2243     LargeMemoryBlock *localHead, *res = NULL;
2244 
2245     if (size > MAX_TOTAL_SIZE)
2246         return NULL;
2247 
2248     // TBB_REVAMP_TODO: review this line
2249     if (!head.load(std::memory_order_acquire) || (localHead = head.exchange(NULL)) == NULL) {
2250         // do not restore totalSize, numOfBlocks and tail at this point,
2251         // as they are used only in put(), where they must be restored
2252         return NULL;
2253     }
2254 
2255     for (LargeMemoryBlock *curr = localHead; curr; curr=curr->next) {
2256         if (curr->unalignedSize == size) {
2257             res = curr;
2258             if (curr->next)
2259                 curr->next->prev = curr->prev;
2260             else
2261                 tail = curr->prev;
2262             if (curr != localHead)
2263                 curr->prev->next = curr->next;
2264             else
2265                 localHead = curr->next;
2266             totalSize -= size;
2267             numOfBlocks--;
2268             break;
2269         }
2270     }
2271 
2272     head.store(localHead, std::memory_order_release);
2273     return res;
2274 }
2275 
2276 template<int LOW_MARK, int HIGH_MARK>
2277 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::externalCleanup(ExtMemoryPool *extMemPool)
2278 {
2279     if (LargeMemoryBlock *localHead = head.exchange(NULL)) {
2280         extMemPool->freeLargeObjectList(localHead);
2281         return true;
2282     }
2283     return false;
2284 }
2285 
2286 void *MemoryPool::getFromLLOCache(TLSData* tls, size_t size, size_t alignment)
2287 {
2288     LargeMemoryBlock *lmb = NULL;
2289 
2290     size_t headersSize = sizeof(LargeMemoryBlock)+sizeof(LargeObjectHdr);
2291     size_t allocationSize = LargeObjectCache::alignToBin(size+headersSize+alignment);
2292     if (allocationSize < size) // allocationSize wrapped around in alignToBin
2293         return NULL;
2294     MALLOC_ASSERT(allocationSize >= alignment, "Overflow must be checked before.");
2295 
2296     if (tls) {
2297         tls->markUsed();
2298         lmb = tls->lloc.get(allocationSize);
2299     }
2300     if (!lmb)
2301         lmb = extMemPool.mallocLargeObject(this, allocationSize);
2302 
2303     if (lmb) {
2304         // when shuffling, we assume that the alignment offset guarantees
2305         // that different cache lines end up being used
2306         MALLOC_ASSERT(alignment >= estimatedCacheLineSize, ASSERT_TEXT);
2307 
2308         void *alignedArea = (void*)alignUp((uintptr_t)lmb+headersSize, alignment);
2309         uintptr_t alignedRight =
2310             alignDown((uintptr_t)lmb+lmb->unalignedSize - size, alignment);
2311         // Is there some room to shuffle the object between cache lines?
2312         // Note that alignedRight and alignedArea are aligned at alignment.
2313         unsigned ptrDelta = alignedRight - (uintptr_t)alignedArea;
2314         if (ptrDelta && tls) { // !tls is cold path
2315             // for the hot path of alignment==estimatedCacheLineSize,
2316             // allow compilers to use shift for division
2317             // (since estimatedCacheLineSize is a power-of-2 constant)
2318             unsigned numOfPossibleOffsets = alignment == estimatedCacheLineSize?
2319                   ptrDelta / estimatedCacheLineSize :
2320                   ptrDelta / alignment;
2321             unsigned myCacheIdx = ++tls->currCacheIdx;
2322             unsigned offset = myCacheIdx % numOfPossibleOffsets;
2323 
2324             // Move object to a cache line with an offset that is different from
2325             // previous allocation. This supposedly allows us to use cache
2326             // associativity more efficiently.
2327             alignedArea = (void*)((uintptr_t)alignedArea + offset*alignment);
2328         }
2329         MALLOC_ASSERT((uintptr_t)lmb+lmb->unalignedSize >=
2330                       (uintptr_t)alignedArea+size, "Object doesn't fit the block.");
2331         LargeObjectHdr *header = (LargeObjectHdr*)alignedArea-1;
2332         header->memoryBlock = lmb;
2333         header->backRefIdx = lmb->backRefIdx;
2334         setBackRef(header->backRefIdx, header);
2335 
2336         lmb->objectSize = size;
2337 
2338         MALLOC_ASSERT( isLargeObject<unknownMem>(alignedArea), ASSERT_TEXT );
2339         MALLOC_ASSERT( isAligned(alignedArea, alignment), ASSERT_TEXT );
2340 
2341         return alignedArea;
2342     }
2343     return NULL;
2344 }
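
// Worked example of the cache-line shuffle above (all numbers are illustrative):
// assume estimatedCacheLineSize == alignment == 64, headersSize == 96, size == 6000,
// lmb->unalignedSize == 8192, and lmb at a 64-byte-aligned address A.  Then
//   alignedArea  = alignUp(A + 96, 64)            = A + 128
//   alignedRight = alignDown(A + 8192 - 6000, 64) = A + 2176
//   ptrDelta     = 2176 - 128 = 2048   ->   numOfPossibleOffsets = 2048/64 = 32
// so successive allocations of this size cycle through 32 different cache-line
// offsets (A+128, A+192, ..., A+2112), and even the largest one still satisfies
// A + 2112 + 6000 <= A + 8192, i.e. the object fits in the block.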
2345 
2346 void MemoryPool::putToLLOCache(TLSData *tls, void *object)
2347 {
2348     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2349     // overwrite backRefIdx to simplify double free detection
2350     header->backRefIdx = BackRefIdx();
2351 
2352     if (tls) {
2353         tls->markUsed();
2354         if (tls->lloc.put(header->memoryBlock, &extMemPool))
2355             return;
2356     }
2357     extMemPool.freeLargeObject(header->memoryBlock);
2358 }
2359 
2360 /*
2361  * All aligned allocations fall into one of the following categories:
2362  * 1. if both the requested size and the alignment are <= maxSegregatedObjectSize,
2363  *       we just align the size up and request this amount, because for every size
2364  *       aligned to some power of 2, the allocated object is at least that aligned.
2365  * 2. for size < minLargeObjectSize, check whether the already guaranteed fittingAlignment is enough.
2366  * 3. if size+alignment < minLargeObjectSize, we take an object of fittingSizeN and align
2367  *       its address up; given such a pointer, scalable_free can find the real object.
2368  *       Wrapping of size+alignment is impossible because the maximal allowed
2369  *       alignment plus minLargeObjectSize cannot lead to wrapping.
2370  * 4. otherwise, an aligned large object is allocated.
2371  */
2372 static void *allocateAligned(MemoryPool *memPool, size_t size, size_t alignment)
2373 {
2374     MALLOC_ASSERT( isPowerOfTwo(alignment), ASSERT_TEXT );
2375 
2376     if (!isMallocInitialized())
2377         if (!doInitialization())
2378             return NULL;
2379 
2380     void *result;
2381     if (size<=maxSegregatedObjectSize && alignment<=maxSegregatedObjectSize)
2382         result = internalPoolMalloc(memPool, alignUp(size? size: sizeof(size_t), alignment));
2383     else if (size<minLargeObjectSize) {
2384         if (alignment<=fittingAlignment)
2385             result = internalPoolMalloc(memPool, size);
2386         else if (size+alignment < minLargeObjectSize) {
2387             void *unaligned = internalPoolMalloc(memPool, size+alignment);
2388             if (!unaligned) return NULL;
2389             result = alignUp(unaligned, alignment);
2390         } else
2391             goto LargeObjAlloc;
2392     } else {
2393     LargeObjAlloc:
2394         TLSData *tls = memPool->getTLS(/*create=*/true);
2395         // take into account only alignments that are higher than the natural one
2396         result =
2397             memPool->getFromLLOCache(tls, size, largeObjectAlignment>alignment?
2398                                                largeObjectAlignment: alignment);
2399     }
2400 
2401     MALLOC_ASSERT( isAligned(result, alignment), ASSERT_TEXT );
2402     return result;
2403 }
2404 
2405 static void *reallocAligned(MemoryPool *memPool, void *ptr,
2406                             size_t newSize, size_t alignment = 0)
2407 {
2408     void *result;
2409     size_t copySize;
2410 
2411     if (isLargeObject<ourMem>(ptr)) {
2412         LargeMemoryBlock* lmb = ((LargeObjectHdr *)ptr - 1)->memoryBlock;
2413         copySize = lmb->unalignedSize-((uintptr_t)ptr-(uintptr_t)lmb);
2414 
2415         // Apply different strategies if size decreases
2416         if (newSize <= copySize && (0 == alignment || isAligned(ptr, alignment))) {
2417 
2418             // For huge objects (that do not fit in the backend cache), keep the same space unless
2419             // the new size is at most half of the current one
2420             bool isMemoryBlockHuge = copySize > memPool->extMemPool.backend.getMaxBinnedSize();
2421             size_t threshold = isMemoryBlockHuge ? copySize / 2 : 0;
2422             if (newSize > threshold) {
2423                 lmb->objectSize = newSize;
2424                 return ptr;
2425             }
2426             // TODO: For large objects suitable for the backend cache,
2427             // split out the excessive part and put it to the backend.
2428         }
2429         // Reallocate for real
2430         copySize = lmb->objectSize;
2431 #if BACKEND_HAS_MREMAP
2432         if (void *r = memPool->extMemPool.remap(ptr, copySize, newSize,
2433                           alignment < largeObjectAlignment ? largeObjectAlignment : alignment))
2434             return r;
2435 #endif
2436         result = alignment ? allocateAligned(memPool, newSize, alignment) :
2437             internalPoolMalloc(memPool, newSize);
2438 
2439     } else {
2440         Block* block = (Block *)alignDown(ptr, slabSize);
2441         copySize = block->findObjectSize(ptr);
2442 
2443         // TODO: Move object to another bin if size decreases and the current bin is "empty enough".
2444         // Currently, when the size decreases, the old pointer is returned
2445         if (newSize <= copySize && (0==alignment || isAligned(ptr, alignment))) {
2446             return ptr;
2447         } else {
2448             result = alignment ? allocateAligned(memPool, newSize, alignment) :
2449                 internalPoolMalloc(memPool, newSize);
2450         }
2451     }
2452     if (result) {
2453         memcpy(result, ptr, copySize < newSize ? copySize : newSize);
2454         internalPoolFree(memPool, ptr, 0);
2455     }
2456     return result;
2457 }
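
// Worked example of the shrink policy above (sizes are illustrative): for a huge block
// with copySize == 100 MB and a satisfied alignment check, threshold == 50 MB; shrinking
// to newSize == 60 MB keeps the same memory and only updates lmb->objectSize, whereas
// shrinking to 40 MB (<= threshold) falls through to a real reallocation and copy.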
2458 
2459 #if MALLOC_DEBUG
2460 /* A predicate checks if an object is properly placed inside its block */
2461 inline bool Block::isProperlyPlaced(const void *object) const
2462 {
2463     return 0 == ((uintptr_t)this + slabSize - (uintptr_t)object) % objectSize;
2464 }
2465 #endif
2466 
2467 /* Finds the real object inside the block */
2468 FreeObject *Block::findAllocatedObject(const void *address) const
2469 {
2470     // calculate offset from the end of the block space
2471     uint16_t offset = (uintptr_t)this + slabSize - (uintptr_t)address;
2472     MALLOC_ASSERT( offset<=slabSize-sizeof(Block), ASSERT_TEXT );
2473     // find offset difference from a multiple of allocation size
2474     offset %= objectSize;
2475     // and move the address down to where the real object starts.
2476     return (FreeObject*)((uintptr_t)address - (offset? objectSize-offset: 0));
2477 }
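
// Worked example (illustrative numbers): with objectSize == 96 and an aligned allocation
// whose user pointer sits 32 bytes above the real object start, offset % 96 == 64, so the
// function returns address - (96 - 64) == address - 32, i.e. exactly the start of the
// allocated object; for an unshifted pointer, offset % 96 == 0 and the address is
// returned unchanged.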
2478 
2479 /*
2480  * A bad dereference caused by a foreign pointer is possible only here, not earlier in the call chain.
2481  * A separate function isolates the SEH code, as it has a bad influence on compiler optimization.
2482  */
2483 static inline BackRefIdx safer_dereference (const BackRefIdx *ptr)
2484 {
2485     BackRefIdx id;
2486 #if _MSC_VER
2487     __try {
2488 #endif
2489         id = dereference(ptr);
2490 #if _MSC_VER
2491     } __except( GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION?
2492                 EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH ) {
2493         id = BackRefIdx();
2494     }
2495 #endif
2496     return id;
2497 }
2498 
2499 template<MemoryOrigin memOrigin>
2500 bool isLargeObject(void *object)
2501 {
2502     if (!isAligned(object, largeObjectAlignment))
2503         return false;
2504     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2505     BackRefIdx idx = (memOrigin == unknownMem) ?
2506         safer_dereference(&header->backRefIdx) : dereference(&header->backRefIdx);
2507 
2508     return idx.isLargeObject()
2509         // in a valid LargeObjectHdr, memoryBlock is not NULL
2510         && header->memoryBlock
2511         // in a valid LargeObjectHdr, memoryBlock points somewhere before the header
2512         // TODO: more strict check
2513         && (uintptr_t)header->memoryBlock < (uintptr_t)header
2514         && getBackRef(idx) == header;
2515 }
2516 
2517 static inline bool isSmallObject (void *ptr)
2518 {
2519     Block* expectedBlock = (Block*)alignDown(ptr, slabSize);
2520     const BackRefIdx* idx = expectedBlock->getBackRefIdx();
2521 
2522     bool isSmall = expectedBlock == getBackRef(safer_dereference(idx));
2523     if (isSmall)
2524         expectedBlock->checkFreePrecond(ptr);
2525     return isSmall;
2526 }
2527 
2528 /**** Check if an object was allocated by scalable_malloc ****/
2529 static inline bool isRecognized (void* ptr)
2530 {
2531     return defaultMemPool->extMemPool.backend.ptrCanBeValid(ptr) &&
2532         (isLargeObject<unknownMem>(ptr) || isSmallObject(ptr));
2533 }
2534 
2535 static inline void freeSmallObject(void *object)
2536 {
2537     /* mask low bits to get the block */
2538     Block *block = (Block *)alignDown(object, slabSize);
2539     block->checkFreePrecond(object);
2540 
2541 #if MALLOC_CHECK_RECURSION
2542     if (block->isStartupAllocObject()) {
2543         ((StartupBlock *)block)->free(object);
2544         return;
2545     }
2546 #endif
2547     if (block->isOwnedByCurrentThread()) {
2548         block->freeOwnObject(object);
2549     } else { /* Slower path: add to the shared list; allocatedCount is updated by the owner thread in malloc. */
2550         FreeObject *objectToFree = block->findObjectToFree(object);
2551         block->freePublicObject(objectToFree);
2552     }
2553 }
2554 
2555 static void *internalPoolMalloc(MemoryPool* memPool, size_t size)
2556 {
2557     Bin* bin;
2558     Block * mallocBlock;
2559 
2560     if (!memPool) return NULL;
2561 
2562     if (!size) size = sizeof(size_t);
2563 
2564     TLSData *tls = memPool->getTLS(/*create=*/true);
2565 
2566     /* Allocate a large object */
2567     if (size >= minLargeObjectSize)
2568         return memPool->getFromLLOCache(tls, size, largeObjectAlignment);
2569 
2570     if (!tls) return NULL;
2571 
2572     tls->markUsed();
2573     /*
2574      * Get the element of the thread-local array corresponding to the given size;
2575      * it keeps a pointer to the active block for allocations of this size
2576      */
2577     bin = tls->getAllocationBin(size);
2578     if ( !bin ) return NULL;
2579 
2580     /* Get a block to try to allocate in. */
2581     for( mallocBlock = bin->getActiveBlock(); mallocBlock;
2582          mallocBlock = bin->setPreviousBlockActive() ) // the previous block should be empty enough
2583     {
2584         if( FreeObject *result = mallocBlock->allocate() )
2585             return result;
2586     }
2587 
2588     /*
2589      * else privatize publicly freed objects in some block and allocate from it
2590      */
2591     mallocBlock = bin->getPrivatizedFreeListBlock();
2592     if (mallocBlock) {
2593         MALLOC_ASSERT( mallocBlock->freeListNonNull(), ASSERT_TEXT );
2594         if ( FreeObject *result = mallocBlock->allocateFromFreeList() )
2595             return result;
2596         /* Else something strange happened; we need to retry from the beginning. */
2597         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in public free list; reentering.\n" ));
2598         return internalPoolMalloc(memPool, size);
2599     }
2600 
2601     /*
2602      * no suitable own blocks, try to get a partial block that some other thread has discarded.
2603      */
2604     mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2605     while (mallocBlock) {
2606         bin->pushTLSBin(mallocBlock);
2607         bin->setActiveBlock(mallocBlock); // TODO: move under the below condition?
2608         if( FreeObject *result = mallocBlock->allocate() )
2609             return result;
2610         mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2611     }
2612 
2613     /*
2614      * else try to get a new empty block
2615      */
2616     mallocBlock = memPool->getEmptyBlock(size);
2617     if (mallocBlock) {
2618         bin->pushTLSBin(mallocBlock);
2619         bin->setActiveBlock(mallocBlock);
2620         if( FreeObject *result = mallocBlock->allocate() )
2621             return result;
2622         /* Else something strange happened; we need to retry from the beginning. */
2623         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in empty block; reentering.\n" ));
2624         return internalPoolMalloc(memPool, size);
2625     }
2626     /*
2627      * else nothing works so return NULL
2628      */
2629     TRACEF(( "[ScalableMalloc trace] No memory found, returning NULL.\n" ));
2630     return NULL;
2631 }
2632 
2633 // When size==0 (i.e. unknown), detect here whether the object is large.
2634 // When size is known and < minLargeObjectSize, we still need to check
2635 // if the actual object is large, because large objects might be used
2636 // for aligned small allocations.
2637 static bool internalPoolFree(MemoryPool *memPool, void *object, size_t size)
2638 {
2639     if (!memPool || !object) return false;
2640 
2641     // The library is initialized on an allocation call, so a release while it is
2642     // not initialized means a foreign object is being released.
2643     MALLOC_ASSERT(isMallocInitialized(), ASSERT_TEXT);
2644     MALLOC_ASSERT(memPool->extMemPool.userPool() || isRecognized(object),
2645                   "Invalid pointer during object releasing is detected.");
2646 
2647     if (size >= minLargeObjectSize || isLargeObject<ourMem>(object))
2648         memPool->putToLLOCache(memPool->getTLS(/*create=*/false), object);
2649     else
2650         freeSmallObject(object);
2651     return true;
2652 }
2653 
2654 static void *internalMalloc(size_t size)
2655 {
2656     if (!size) size = sizeof(size_t);
2657 
2658 #if MALLOC_CHECK_RECURSION
2659     if (RecursiveMallocCallProtector::sameThreadActive())
2660         return size<minLargeObjectSize? StartupBlock::allocate(size) :
2661             // nested allocation, so skip tls
2662             (FreeObject*)defaultMemPool->getFromLLOCache(NULL, size, slabSize);
2663 #endif
2664 
2665     if (!isMallocInitialized())
2666         if (!doInitialization())
2667             return NULL;
2668     return internalPoolMalloc(defaultMemPool, size);
2669 }
2670 
2671 static void internalFree(void *object)
2672 {
2673     internalPoolFree(defaultMemPool, object, 0);
2674 }
2675 
2676 static size_t internalMsize(void* ptr)
2677 {
2678     MALLOC_ASSERT(ptr, "Invalid pointer passed to internalMsize");
2679     if (isLargeObject<ourMem>(ptr)) {
2680         // TODO: return the maximum memory size, that can be written to this object
2681         LargeMemoryBlock* lmb = ((LargeObjectHdr*)ptr - 1)->memoryBlock;
2682         return lmb->objectSize;
2683     } else {
2684         Block *block = (Block*)alignDown(ptr, slabSize);
2685         return block->findObjectSize(ptr);
2686     }
2687 }
2688 
2689 } // namespace internal
2690 
2691 using namespace rml::internal;
2692 
2693 // legacy entry point saved for compatibility with binaries compiled
2694 // with pre-6003 versions of TBB
2695 rml::MemoryPool *pool_create(intptr_t pool_id, const MemPoolPolicy *policy)
2696 {
2697     rml::MemoryPool *pool;
2698     MemPoolPolicy pol(policy->pAlloc, policy->pFree, policy->granularity);
2699 
2700     pool_create_v1(pool_id, &pol, &pool);
2701     return pool;
2702 }
2703 
2704 rml::MemPoolError pool_create_v1(intptr_t pool_id, const MemPoolPolicy *policy,
2705                                  rml::MemoryPool **pool)
2706 {
2707     if ( !policy->pAlloc || policy->version<MemPoolPolicy::TBBMALLOC_POOL_VERSION
2708          // empty pFree allowed only for fixed pools
2709          || !(policy->fixedPool || policy->pFree)) {
2710         *pool = NULL;
2711         return INVALID_POLICY;
2712     }
2713     if ( policy->version>MemPoolPolicy::TBBMALLOC_POOL_VERSION // future versions are not supported
2714          // new flags can be added in place of reserved, but default
2715          // behaviour must be supported by this version
2716          || policy->reserved ) {
2717         *pool = NULL;
2718         return UNSUPPORTED_POLICY;
2719     }
2720     if (!isMallocInitialized())
2721         if (!doInitialization()) {
2722             *pool = NULL;
2723             return NO_MEMORY;
2724         }
2725     rml::internal::MemoryPool *memPool =
2726         (rml::internal::MemoryPool*)internalMalloc((sizeof(rml::internal::MemoryPool)));
2727     if (!memPool) {
2728         *pool = NULL;
2729         return NO_MEMORY;
2730     }
2731     memset(memPool, 0, sizeof(rml::internal::MemoryPool));
2732     if (!memPool->init(pool_id, policy)) {
2733         internalFree(memPool);
2734         *pool = NULL;
2735         return NO_MEMORY;
2736     }
2737 
2738     *pool = (rml::MemoryPool*)memPool;
2739     return POOL_OK;
2740 }
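
// --- Illustrative sketch (documentation only; not compiled) ---
// A minimal client of the pool interface defined here, written as if it were a separate
// translation unit.  It assumes the MemPoolPolicy constructor and the raw callback
// signatures declared in oneapi/tbb/scalable_allocator.h; the helper names are
// hypothetical.
#if 0
#include "oneapi/tbb/scalable_allocator.h"
#include <cstdint>
#include <cstdlib>

static void *rawAlloc(intptr_t /*pool_id*/, std::size_t &bytes) {
    return std::malloc(bytes);
}
static void rawFree(intptr_t /*pool_id*/, void *raw_ptr, std::size_t /*raw_bytes*/) {
    std::free(raw_ptr);
}

void pool_usage_example() {
    rml::MemPoolPolicy policy(rawAlloc, rawFree);
    rml::MemoryPool *pool = nullptr;
    if (rml::pool_create_v1(/*pool_id=*/0, &policy, &pool) != rml::POOL_OK)
        return;
    void *p = rml::pool_malloc(pool, 1024);
    rml::pool_free(pool, p);
    rml::pool_destroy(pool);
}
#endif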
2741 
2742 bool pool_destroy(rml::MemoryPool* memPool)
2743 {
2744     if (!memPool) return false;
2745     bool ret = ((rml::internal::MemoryPool*)memPool)->destroy();
2746     internalFree(memPool);
2747 
2748     return ret;
2749 }
2750 
2751 bool pool_reset(rml::MemoryPool* memPool)
2752 {
2753     if (!memPool) return false;
2754 
2755     return ((rml::internal::MemoryPool*)memPool)->reset();
2756 }
2757 
2758 void *pool_malloc(rml::MemoryPool* mPool, size_t size)
2759 {
2760     return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2761 }
2762 
2763 void *pool_realloc(rml::MemoryPool* mPool, void *object, size_t size)
2764 {
2765     if (!object)
2766         return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2767     if (!size) {
2768         internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2769         return NULL;
2770     }
2771     return reallocAligned((rml::internal::MemoryPool*)mPool, object, size, 0);
2772 }
2773 
2774 void *pool_aligned_malloc(rml::MemoryPool* mPool, size_t size, size_t alignment)
2775 {
2776     if (!isPowerOfTwo(alignment) || 0==size)
2777         return NULL;
2778 
2779     return allocateAligned((rml::internal::MemoryPool*)mPool, size, alignment);
2780 }
2781 
2782 void *pool_aligned_realloc(rml::MemoryPool* memPool, void *ptr, size_t size, size_t alignment)
2783 {
2784     if (!isPowerOfTwo(alignment))
2785         return NULL;
2786     rml::internal::MemoryPool *mPool = (rml::internal::MemoryPool*)memPool;
2787     void *tmp;
2788 
2789     if (!ptr)
2790         tmp = allocateAligned(mPool, size, alignment);
2791     else if (!size) {
2792         internalPoolFree(mPool, ptr, 0);
2793         return NULL;
2794     } else
2795         tmp = reallocAligned(mPool, ptr, size, alignment);
2796 
2797     return tmp;
2798 }
2799 
2800 bool pool_free(rml::MemoryPool *mPool, void *object)
2801 {
2802     return internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2803 }
2804 
2805 rml::MemoryPool *pool_identify(void *object)
2806 {
2807     rml::internal::MemoryPool *pool;
2808     if (isLargeObject<ourMem>(object)) {
2809         LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2810         pool = header->memoryBlock->pool;
2811     } else {
2812         Block *block = (Block*)alignDown(object, slabSize);
2813         pool = block->getMemPool();
2814     }
2815     // do not return defaultMemPool, as it can't be used in pool_free() etc
2816     __TBB_ASSERT_RELEASE(pool!=defaultMemPool,
2817         "rml::pool_identify() can't be used for scalable_malloc() etc results.");
2818     return (rml::MemoryPool*)pool;
2819 }
2820 
2821 size_t pool_msize(rml::MemoryPool *mPool, void* object)
2822 {
2823     if (object) {
2824         // No assert for object recognition, because objects allocated from a non-default
2825         // memory pool do not participate in range checking and, for small objects, do not have
2826         // valid backreferences. Instead, check that the object belongs to the specified memory pool.
2827         MALLOC_ASSERT_EX(mPool == pool_identify(object), "Object does not belong to the specified pool");
2828         return internalMsize(object);
2829     }
2830     errno = EINVAL;
2831     // Unlike _msize, return 0 in case of a parameter error.
2832     // Returning size_t(-1) would be more likely to cause trouble.
2833     return 0;
2834 }
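
/* Illustrative sketch only (kept as a comment, not compiled): typical user-level use of
   the pool API above. Pool creation is assumed to go through the public creation entry
   point whose tail appears earlier in this file (see MemPoolPolicy in the
   scalable_allocator headers); the callback and variable names below are hypothetical:

       rml::MemoryPool *pool = NULL;
       rml::MemPoolPolicy policy(myRawAlloc, myRawFree);      // user-supplied memory callbacks
       if (rml::pool_create_v1(/*pool_id=*/0, &policy, &pool) == rml::POOL_OK) {
           void  *obj = rml::pool_malloc(pool, 64);
           size_t sz  = rml::pool_msize(pool, obj);           // sz >= 64
           // pool_identify(obj) returns `pool` here; it must not be used for
           // objects coming from scalable_malloc()
           rml::pool_free(pool, obj);
           rml::pool_destroy(pool);                           // releases the pool's memory
       }
*/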
2835 
2836 } // namespace rml
2837 
2838 using namespace rml::internal;
2839 
2840 #if MALLOC_TRACE
2841 static unsigned int threadGoingDownCount = 0;
2842 #endif
2843 
2844 /*
2845  * When a thread is shutting down, this routine should be called to remove all the thread ids
2846  * from the malloc blocks and replace them with a NULL thread id.
2847  *
2848  * For pthreads, the function is set as a callback in pthread_key_create for the TLS bin.
2849  * It will be automatically called at thread exit with the key value as the argument,
2850  * unless that value is NULL.
2851  * For Windows, it is called from DllMain( DLL_THREAD_DETACH ).
2852  *
2853  * However, neither of the above is called for the main process thread, so the routine
2854  * also needs to be called during process shutdown.
2855  *
2856 */
2857 // TODO: Consider making this function part of class MemoryPool.
2858 void doThreadShutdownNotification(TLSData* tls, bool main_thread)
2859 {
2860     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return start %d\n",
2861              getThreadId(),  threadGoingDownCount++ ));
2862 
2863 #if USE_PTHREAD
2864     if (tls) {
2865         if (!shutdownSync.threadDtorStart()) return;
2866         tls->getMemPool()->onThreadShutdown(tls);
2867         shutdownSync.threadDtorDone();
2868     } else
2869 #endif
2870     {
2871         suppress_unused_warning(tls); // not used on Windows
2872         // The default pool is safe to use at this point:
2873         //   on Linux, only the main thread can go here before destroying defaultMemPool;
2874         //   on Windows, shutdown is synchronized via loader lock and isMallocInitialized().
2875         // See also __TBB_mallocProcessShutdownNotification()
2876         defaultMemPool->onThreadShutdown(defaultMemPool->getTLS(/*create=*/false));
2877         // Take lock to walk through other pools; but waiting might be dangerous at this point
2878         // (e.g. on Windows the main thread might deadlock)
2879         bool locked;
2880         MallocMutex::scoped_lock lock(MemoryPool::memPoolListLock, /*wait=*/!main_thread, &locked);
2881         if (locked) { // the list is safe to process
2882             for (MemoryPool *memPool = defaultMemPool->next; memPool; memPool = memPool->next)
2883                 memPool->onThreadShutdown(memPool->getTLS(/*create=*/false));
2884         }
2885     }
2886 
2887     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return end\n", getThreadId() ));
2888 }
2889 
2890 #if USE_PTHREAD
2891 void mallocThreadShutdownNotification(void* arg)
2892 {
2893     // The routine is called for each pool (as TLS dtor) on each thread, except for the main thread
2894     if (!isMallocInitialized()) return;
2895     doThreadShutdownNotification((TLSData*)arg, false);
2896 }
2897 #else
2898 extern "C" void __TBB_mallocThreadShutdownNotification()
2899 {
2900     // The routine is called once per thread on Windows
2901     if (!isMallocInitialized()) return;
2902     doThreadShutdownNotification(NULL, false);
2903 }
2904 #endif
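
/* A minimal sketch (comment only) of how the pthread path above is typically wired up:
   the TLS key is created elsewhere in tbbmalloc with this routine as its destructor, so
   it runs at thread exit for every thread that stored a non-NULL value. The key and
   variable names below are illustrative only:

       static pthread_key_t tlsBinKey;
       pthread_key_create(&tlsBinKey, mallocThreadShutdownNotification);
       ...
       // on each thread, after creating its TLS data:
       pthread_setspecific(tlsBinKey, tlsData);   // non-NULL value => dtor runs at thread exit
*/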
2905 
2906 extern "C" void __TBB_mallocProcessShutdownNotification(bool windows_process_dying)
2907 {
2908     if (!isMallocInitialized()) return;
2909 
2910     // Don't clean allocator internals if the entire process is exiting
2911     if (!windows_process_dying) {
2912         doThreadShutdownNotification(NULL, /*main_thread=*/true);
2913     }
2914 #if  __TBB_MALLOC_LOCACHE_STAT
2915     printf("cache hit ratio %f, size hit %f\n",
2916            1.*cacheHits/mallocCalls, 1.*memHitKB/memAllocKB);
2917     defaultMemPool->extMemPool.loc.reportStat(stdout);
2918 #endif
2919 
2920     shutdownSync.processExit();
2921 #if __TBB_SOURCE_DIRECTLY_INCLUDED
2922 /* Pthread keys must be deleted as soon as possible, so that the key dtor is not called
2923    at thread termination when the tbbmalloc code may already have been unloaded.
2924 */
2925     defaultMemPool->destroy();
2926     destroyBackRefMaster(&defaultMemPool->extMemPool.backend);
2927     ThreadId::destroy();      // Delete key for thread id
2928     hugePages.reset();
2929     // new total malloc initialization is possible after this point
2930     // a new, full malloc initialization is possible after this point
2931 #elif __TBB_USE_DLOPEN_REENTRANCY_WORKAROUND
2932 /* In most cases we prevent unloading tbbmalloc, and don't clean up memory
2933    on process shutdown. When unloading cannot be prevented, library unload results
2934    in a shutdown notification, and it makes sense to release unused memory
2935    at that point (we can't release all memory because some of it may still
2936    be accessed after this point).
2937    TODO: better support systems where we can't prevent unloading by removing
2938    pthread destructors and releasing caches.
2939  */
2940     defaultMemPool->extMemPool.hardCachesCleanup();
2941 #endif // __TBB_SOURCE_DIRECTLY_INCLUDED
2942 
2943 #if COLLECT_STATISTICS
2944     unsigned nThreads = ThreadId::getMaxThreadId();
2945     for( int i=1; i<=nThreads && i<MAX_THREADS; ++i )
2946         STAT_print(i);
2947 #endif
2948     if (!usedBySrcIncluded)
2949         MALLOC_ITT_FINI_ITTLIB();
2950 }
2951 
2952 extern "C" void * scalable_malloc(size_t size)
2953 {
2954     void *ptr = internalMalloc(size);
2955     if (!ptr) errno = ENOMEM;
2956     return ptr;
2957 }
2958 
2959 extern "C" void scalable_free(void *object)
2960 {
2961     internalFree(object);
2962 }
2963 
2964 #if MALLOC_ZONE_OVERLOAD_ENABLED
2965 extern "C" void __TBB_malloc_free_definite_size(void *object, size_t size)
2966 {
2967     internalPoolFree(defaultMemPool, object, size);
2968 }
2969 #endif
2970 
2971 /*
2972  * A variant that provides additional memory safety, by checking whether the given address
2973  * was obtained with this allocator, and if not redirecting to the provided alternative call.
2974  */
2975 extern "C" void __TBB_malloc_safer_free(void *object, void (*original_free)(void*))
2976 {
2977     if (!object)
2978         return;
2979 
2980     // the object can have been allocated by tbbmalloc only if tbbmalloc has been initialized
2981     if (mallocInitialized.load(std::memory_order_acquire) && defaultMemPool->extMemPool.backend.ptrCanBeValid(object)) {
2982         if (isLargeObject<unknownMem>(object)) {
2983             // must check for a large object first, because the small-object check touches 4 pages
2984             // to the left, and those can be inaccessible
2985             TLSData *tls = defaultMemPool->getTLS(/*create=*/false);
2986 
2987             defaultMemPool->putToLLOCache(tls, object);
2988             return;
2989         } else if (isSmallObject(object)) {
2990             freeSmallObject(object);
2991             return;
2992         }
2993     }
2994     if (original_free)
2995         original_free(object);
2996 }
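
/* Illustrative sketch only: a proxy layer that intercepts free() would typically forward
   to the routine above, passing the original deallocation function as the fallback for
   pointers that tbbmalloc does not recognize (hook and pointer names are hypothetical):

       static void (*orig_crt_free)(void*);        // resolved when interception is installed
       static void free_hook(void *ptr) {
           __TBB_malloc_safer_free(ptr, orig_crt_free);
       }
*/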
2997 
2998 /********* End the free code        *************/
2999 
3000 /********* Code for scalable_realloc       ***********/
3001 
3002 /*
3003  * From K&R
3004  * "realloc changes the size of the object pointed to by p to size. The contents will
3005  * be unchanged up to the minimum of the old and the new sizes. If the new size is larger,
3006  * the new space is uninitialized. realloc returns a pointer to the new space, or
3007  * NULL if the request cannot be satisfied, in which case *p is unchanged."
3008  *
3009  */
3010 extern "C" void* scalable_realloc(void* ptr, size_t size)
3011 {
3012     void *tmp;
3013 
3014     if (!ptr)
3015         tmp = internalMalloc(size);
3016     else if (!size) {
3017         internalFree(ptr);
3018         return NULL;
3019     } else
3020         tmp = reallocAligned(defaultMemPool, ptr, size, 0);
3021 
3022     if (!tmp) errno = ENOMEM;
3023     return tmp;
3024 }
3025 
3026 /*
3027  * A variant that provides additional memory safety, by checking whether the given address
3028  * was obtained with this allocator, and if not redirecting to the provided alternative call.
3029  */
3030 extern "C" void* __TBB_malloc_safer_realloc(void* ptr, size_t sz, void* original_realloc)
3031 {
3032     void *tmp; // TODO: fix warnings about uninitialized use of tmp
3033 
3034     if (!ptr) {
3035         tmp = internalMalloc(sz);
3036     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3037         if (!sz) {
3038             internalFree(ptr);
3039             return NULL;
3040         } else {
3041             tmp = reallocAligned(defaultMemPool, ptr, sz, 0);
3042         }
3043     }
3044 #if USE_WINTHREAD
3045     else if (original_realloc && sz) {
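        // Foreign pointer on Windows: query the old size via the original msize, allocate
        // the new block from tbbmalloc, copy min(old, new) bytes, and free the old block
        // with the original free (if available).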
3046         orig_ptrs *original_ptrs = static_cast<orig_ptrs*>(original_realloc);
3047         if ( original_ptrs->msize ){
3048             size_t oldSize = original_ptrs->msize(ptr);
3049             tmp = internalMalloc(sz);
3050             if (tmp) {
3051                 memcpy(tmp, ptr, sz<oldSize? sz : oldSize);
3052                 if ( original_ptrs->free ){
3053                     original_ptrs->free( ptr );
3054                 }
3055             }
3056         } else
3057             tmp = NULL;
3058     }
3059 #else
3060     else if (original_realloc) {
3061         typedef void* (*realloc_ptr_t)(void*,size_t);
3062         realloc_ptr_t original_realloc_ptr;
3063         (void *&)original_realloc_ptr = original_realloc;
3064         tmp = original_realloc_ptr(ptr,sz);
3065     }
3066 #endif
3067     else tmp = NULL;
3068 
3069     if (!tmp) errno = ENOMEM;
3070     return tmp;
3071 }
3072 
3073 /********* End code for scalable_realloc   ***********/
3074 
3075 /********* Code for scalable_calloc   ***********/
3076 
3077 /*
3078  * From K&R
3079  * calloc returns a pointer to space for an array of nobj objects,
3080  * each of size size, or NULL if the request cannot be satisfied.
3081  * The space is initialized to zero bytes.
3082  *
3083  */
3084 
3085 extern "C" void * scalable_calloc(size_t nobj, size_t size)
3086 {
3087     // it's square root of maximal size_t value
3088     // this is the square root of the maximal size_t value
3089     const size_t arraySize = nobj * size;
3090 
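    // On a 64-bit platform mult_not_overflow is 2^32, so if both factors are below it
    // their product fits into size_t and cannot overflow; the exact division check is
    // only needed when this cheap heuristic cannot rule overflow out.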
3091     // check for overflow during multiplication:
3092     if (nobj>=mult_not_overflow || size>=mult_not_overflow) // 1) heuristic check
3093         if (nobj && arraySize / nobj != size) {             // 2) exact check
3094             errno = ENOMEM;
3095             return NULL;
3096         }
3097     void* result = internalMalloc(arraySize);
3098     if (result)
3099         memset(result, 0, arraySize);
3100     else
3101         errno = ENOMEM;
3102     return result;
3103 }
3104 
3105 /********* End code for scalable_calloc   ***********/
3106 
3107 /********* Code for aligned allocation API **********/
3108 
3109 extern "C" int scalable_posix_memalign(void **memptr, size_t alignment, size_t size)
3110 {
3111     if ( !isPowerOfTwoAtLeast(alignment, sizeof(void*)) )
3112         return EINVAL;
3113     void *result = allocateAligned(defaultMemPool, size, alignment);
3114     if (!result)
3115         return ENOMEM;
3116     *memptr = result;
3117     return 0;
3118 }
3119 
3120 extern "C" void * scalable_aligned_malloc(size_t size, size_t alignment)
3121 {
3122     if (!isPowerOfTwo(alignment) || 0==size) {
3123         errno = EINVAL;
3124         return NULL;
3125     }
3126     void *tmp = allocateAligned(defaultMemPool, size, alignment);
3127     if (!tmp) errno = ENOMEM;
3128     return tmp;
3129 }
3130 
3131 extern "C" void * scalable_aligned_realloc(void *ptr, size_t size, size_t alignment)
3132 {
3133     if (!isPowerOfTwo(alignment)) {
3134         errno = EINVAL;
3135         return NULL;
3136     }
3137     void *tmp;
3138 
3139     if (!ptr)
3140         tmp = allocateAligned(defaultMemPool, size, alignment);
3141     else if (!size) {
3142         scalable_free(ptr);
3143         return NULL;
3144     } else
3145         tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3146 
3147     if (!tmp) errno = ENOMEM;
3148     return tmp;
3149 }
3150 
3151 extern "C" void * __TBB_malloc_safer_aligned_realloc(void *ptr, size_t size, size_t alignment, void* orig_function)
3152 {
3153     /* corner cases are handled here rather than in reallocAligned, to keep errno handling out of that routine */
3154     if (!isPowerOfTwo(alignment)) {
3155         errno = EINVAL;
3156         return NULL;
3157     }
3158     void *tmp = NULL;
3159 
3160     if (!ptr) {
3161         tmp = allocateAligned(defaultMemPool, size, alignment);
3162     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3163         if (!size) {
3164             internalFree(ptr);
3165             return NULL;
3166         } else {
3167             tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3168         }
3169     }
3170 #if USE_WINTHREAD
3171     else {
3172         orig_aligned_ptrs *original_ptrs = static_cast<orig_aligned_ptrs*>(orig_function);
3173         if (size) {
3174             // Without the original aligned_msize, we can't do anything with this;
3175             // we just keep the old pointer.
3176             if ( original_ptrs->aligned_msize ){
3177                 // query with minimal alignment and zero offset so that oldSize is at least plausibly correct
3178                 size_t oldSize = original_ptrs->aligned_msize(ptr, sizeof(void*), 0);
3179                 tmp = allocateAligned(defaultMemPool, size, alignment);
3180                 if (tmp) {
3181                     memcpy(tmp, ptr, size<oldSize? size : oldSize);
3182                     if ( original_ptrs->aligned_free ){
3183                         original_ptrs->aligned_free( ptr );
3184                     }
3185                 }
3186             }
3187         } else {
3188             if ( original_ptrs->aligned_free ){
3189                 original_ptrs->aligned_free( ptr );
3190             }
3191             return NULL;
3192         }
3193     }
3194 #else
3195     // As original_realloc can't align the result, and there is no way to find
3196     // the size of the object being reallocated, we give up.
3197     suppress_unused_warning(orig_function);
3198 #endif
3199     if (!tmp) errno = ENOMEM;
3200     return tmp;
3201 }
3202 
3203 extern "C" void scalable_aligned_free(void *ptr)
3204 {
3205     internalFree(ptr);
3206 }
3207 
3208 /********* end code for aligned allocation API **********/
3209 
3210 /********* Code for scalable_msize       ***********/
3211 
3212 /*
3213  * Returns the size of a memory block allocated in the heap.
3214  */
3215 extern "C" size_t scalable_msize(void* ptr)
3216 {
3217     if (ptr) {
3218         MALLOC_ASSERT(isRecognized(ptr), "Invalid pointer in scalable_msize detected.");
3219         return internalMsize(ptr);
3220     }
3221     errno = EINVAL;
3222     // Unlike _msize, return 0 in case of a parameter error.
3223     // Returning size_t(-1) would be more likely to cause trouble.
3224     return 0;
3225 }
3226 
3227 /*
3228  * A variant that provides additional memory safety, by checking whether the given address
3229  * was obtained with this allocator, and if not redirecting to the provided alternative call.
3230  */
3231 extern "C" size_t __TBB_malloc_safer_msize(void *object, size_t (*original_msize)(void*))
3232 {
3233     if (object) {
3234         // Check if the memory was allocated by scalable_malloc
3235         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3236             return internalMsize(object);
3237         else if (original_msize)
3238             return original_msize(object);
3239     }
3240     // object is NULL or unknown, or foreign and no original_msize
3241 #if USE_WINTHREAD
3242     errno = EINVAL; // errno expected to be set only on this platform
3243 #endif
3244     return 0;
3245 }
3246 
3247 /*
3248  * The same as above but for _aligned_msize case
3249  */
3250 extern "C" size_t __TBB_malloc_safer_aligned_msize(void *object, size_t alignment, size_t offset, size_t (*orig_aligned_msize)(void*,size_t,size_t))
3251 {
3252     if (object) {
3253         // Check if the memory was allocated by scalable_malloc
3254         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3255             return internalMsize(object);
3256         else if (orig_aligned_msize)
3257             return orig_aligned_msize(object,alignment,offset);
3258     }
3259     // object is NULL or unknown
3260     errno = EINVAL;
3261     return 0;
3262 }
3263 
3264 /********* End code for scalable_msize   ***********/
3265 
3266 extern "C" int scalable_allocation_mode(int param, intptr_t value)
3267 {
3268     if (param == TBBMALLOC_SET_SOFT_HEAP_LIMIT) {
3269         defaultMemPool->extMemPool.backend.setRecommendedMaxSize((size_t)value);
3270         return TBBMALLOC_OK;
3271     } else if (param == USE_HUGE_PAGES) {
3272 #if __linux__
3273         switch (value) {
3274         case 0:
3275         case 1:
3276             hugePages.setMode(value);
3277             return TBBMALLOC_OK;
3278         default:
3279             return TBBMALLOC_INVALID_PARAM;
3280         }
3281 #else
3282         return TBBMALLOC_NO_EFFECT;
3283 #endif
3284 #if __TBB_SOURCE_DIRECTLY_INCLUDED
3285     } else if (param == TBBMALLOC_INTERNAL_SOURCE_INCLUDED) {
3286         switch (value) {
3287         case 0: // used by dynamic library
3288         case 1: // used by static library or directly included sources
3289             usedBySrcIncluded = value;
3290             return TBBMALLOC_OK;
3291         default:
3292             return TBBMALLOC_INVALID_PARAM;
3293         }
3294 #endif
3295     } else if (param == TBBMALLOC_SET_HUGE_SIZE_THRESHOLD) {
3296         defaultMemPool->extMemPool.loc.setHugeSizeThreshold((size_t)value);
3297         return TBBMALLOC_OK;
3298     }
3299     return TBBMALLOC_INVALID_PARAM;
3300 }
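
/* Illustrative sketch only: typical calls into the mode interface above
   (unrecognized parameters return TBBMALLOC_INVALID_PARAM):

       scalable_allocation_mode(TBBMALLOC_SET_SOFT_HEAP_LIMIT, 512*1024*1024);   // 512 MB soft limit
       scalable_allocation_mode(USE_HUGE_PAGES, 1);               // Linux only, TBBMALLOC_NO_EFFECT elsewhere
       scalable_allocation_mode(TBBMALLOC_SET_HUGE_SIZE_THRESHOLD, 4*1024*1024);
*/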
3301 
3302 extern "C" int scalable_allocation_command(int cmd, void *param)
3303 {
3304     if (param)
3305         return TBBMALLOC_INVALID_PARAM;
3306 
3307     bool released = false;
3308     switch(cmd) {
3309     case TBBMALLOC_CLEAN_THREAD_BUFFERS:
3310         if (TLSData *tls = defaultMemPool->getTLS(/*create=*/false))
3311             released = tls->externalCleanup(/*cleanOnlyUnused*/false, /*cleanBins=*/true);
3312         break;
3313     case TBBMALLOC_CLEAN_ALL_BUFFERS:
3314         released = defaultMemPool->extMemPool.hardCachesCleanup();
3315         break;
3316     default:
3317         return TBBMALLOC_INVALID_PARAM;
3318     }
3319     return released ? TBBMALLOC_OK : TBBMALLOC_NO_EFFECT;
3320 }
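
/* Illustrative sketch only: releasing cached memory via the command interface above
   (param must be NULL; TBBMALLOC_NO_EFFECT means nothing could be released):

       if (scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, NULL) == TBBMALLOC_OK) {
           // some cached memory was released back to the backend
       }
*/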
3321