xref: /oneTBB/src/tbbmalloc/frontend.cpp (revision 49e08aac)
1 /*
2     Copyright (c) 2005-2020 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include "tbbmalloc_internal.h"
18 #include <errno.h>
19 #include <new>        /* for placement new */
20 #include <string.h>   /* for memset */
21 
22 #include "oneapi/tbb/version.h"
23 #include "../tbb/environment.h"
24 #include "../tbb/itt_notify.h" // for __TBB_load_ittnotify()
25 
26 #if USE_PTHREAD
27     #define TlsSetValue_func pthread_setspecific
28     #define TlsGetValue_func pthread_getspecific
29     #define GetMyTID() pthread_self()
30     #include <sched.h>
31     inline void do_yield() {sched_yield();}
32     extern "C" { static void mallocThreadShutdownNotification(void*); }
33     #if __sun || __SUNPRO_CC
34     #define __asm__ asm
35     #endif
36     #include <unistd.h> // sysconf(_SC_PAGESIZE)
37 #elif USE_WINTHREAD
38     #define GetMyTID() GetCurrentThreadId()
39 #if __TBB_WIN8UI_SUPPORT
40     #include <thread>
41     #define TlsSetValue_func FlsSetValue
42     #define TlsGetValue_func FlsGetValue
43     #define TlsAlloc() FlsAlloc(NULL)
44     #define TLS_ALLOC_FAILURE FLS_OUT_OF_INDEXES
45     #define TlsFree FlsFree
46     inline void do_yield() {std::this_thread::yield();}
47 #else
48     #define TlsSetValue_func TlsSetValue
49     #define TlsGetValue_func TlsGetValue
50     #define TLS_ALLOC_FAILURE TLS_OUT_OF_INDEXES
51     inline void do_yield() {SwitchToThread();}
52 #endif
53 #else
54     #error Must define USE_PTHREAD or USE_WINTHREAD
55 #endif
56 
57 #define FREELIST_NONBLOCKING 1
58 
59 namespace rml {
60 class MemoryPool;
61 namespace internal {
62 
63 class Block;
64 class MemoryPool;
65 
66 #if MALLOC_CHECK_RECURSION
67 
68 inline bool isMallocInitialized();
69 
70 bool RecursiveMallocCallProtector::noRecursion() {
71     MALLOC_ASSERT(isMallocInitialized(),
72                   "Recursion status can be checked only when initialization was done.");
73     return !mallocRecursionDetected;
74 }
75 
76 #endif // MALLOC_CHECK_RECURSION
77 
78 /** Support for handling the special UNUSABLE pointer state **/
79 const intptr_t UNUSABLE = 0x1;
80 inline bool isSolidPtr( void* ptr ) {
81     return (UNUSABLE|(intptr_t)ptr)!=UNUSABLE;
82 }
83 inline bool isNotForUse( void* ptr ) {
84     return (intptr_t)ptr==UNUSABLE;
85 }
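// Illustration (derived from the two helpers above), showing the three states such a pointer can take:
//   ptr == NULL      -> isSolidPtr(ptr) == false, isNotForUse(ptr) == false  (empty list)
//   ptr == UNUSABLE  -> isSolidPtr(ptr) == false, isNotForUse(ptr) == true   (special marker, e.g. for orphaned blocks)
//   ptr == &object   -> isSolidPtr(ptr) == true,  isNotForUse(ptr) == false  (a real, usable pointer)
// i.e. "solid" means the value is neither NULL nor the UNUSABLE sentinel.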
86 
87 /*
88  * Block::objectSize value used to mark blocks allocated by startupAlloc
89  */
90 const uint16_t startupAllocObjSizeMark = ~(uint16_t)0;
91 
92 /*
93  * The following constant is used to define the size of struct Block, the block header.
94  * The intent is to have the size of a Block be a multiple of the cache line size, which allows us to
95  * get good alignment at the cost of some overhead equal to the amount of padding included in the Block.
96  */
97 const int blockHeaderAlignment = estimatedCacheLineSize;
98 
99 /********* The data structures and global objects        **************/
100 
101 /*
102  * The malloc routines themselves need to be able to occasionally malloc some space,
103  * in order to set up the data used by the thread-local structures. This
104  * class provides that functionality.
105  */
106 class BootStrapBlocks {
107     MallocMutex bootStrapLock;
108     Block      *bootStrapBlock;
109     Block      *bootStrapBlockUsed;
110     FreeObject *bootStrapObjectList;
111 public:
112     void *allocate(MemoryPool *memPool, size_t size);
113     void free(void* ptr);
114     void reset();
115 };
116 
117 #if USE_INTERNAL_TID
118 class ThreadId {
119     static tls_key_t Tid_key;
120     static std::atomic<intptr_t> ThreadCount;
121 
122     unsigned int id;
123 
124     static unsigned int tlsNumber() {
125         unsigned int result = reinterpret_cast<intptr_t>(TlsGetValue_func(Tid_key));
126         if( !result ) {
127             RecursiveMallocCallProtector scoped;
128             // Thread-local value is zero -> first call from this thread,
129             // need to initialize with next ID value (IDs start from 1)
130             result = ++ThreadCount; // returned new value!
131             TlsSetValue_func( Tid_key, reinterpret_cast<void*>(result) );
132         }
133         return result;
134     }
135 public:
136     static bool init() {
137 #if USE_WINTHREAD
138         Tid_key = TlsAlloc();
139         if (Tid_key == TLS_ALLOC_FAILURE)
140             return false;
141 #else
142         int status = pthread_key_create( &Tid_key, NULL );
143         if ( status ) {
144             fprintf (stderr, "The memory manager cannot create tls key during initialization\n");
145             return false;
146         }
147 #endif /* USE_WINTHREAD */
148         return true;
149     }
150     static void destroy() {
151         if( Tid_key ) {
152 #if USE_WINTHREAD
153             BOOL status = !(TlsFree( Tid_key ));  // fail is zero
154 #else
155             int status = pthread_key_delete( Tid_key );
156 #endif /* USE_WINTHREAD */
157             if ( status )
158                 fprintf (stderr, "The memory manager cannot delete tls key\n");
159             Tid_key = 0;
160         }
161     }
162 
163     ThreadId() : id(ThreadId::tlsNumber()) {}
164     bool isCurrentThreadId() const { return id == ThreadId::tlsNumber(); }
165 
166 #if COLLECT_STATISTICS || MALLOC_TRACE
167     friend unsigned int getThreadId() { return ThreadId::tlsNumber(); }
168 #endif
169 #if COLLECT_STATISTICS
170     static unsigned getMaxThreadId() { return ThreadCount.load(std::memory_order_relaxed); }
171 
172     friend int STAT_increment(ThreadId tid, int bin, int ctr);
173 #endif
174 };
175 
176 tls_key_t ThreadId::Tid_key;
177 std::atomic<intptr_t> ThreadId::ThreadCount;
178 
179 #if COLLECT_STATISTICS
180 int STAT_increment(ThreadId tid, int bin, int ctr)
181 {
182     return ::STAT_increment(tid.id, bin, ctr);
183 }
184 #endif
185 
186 #else // USE_INTERNAL_TID
187 
188 class ThreadId {
189 #if USE_PTHREAD
190     pthread_t tid;
191 #else
192     DWORD     tid;
193 #endif
194 public:
195     ThreadId() : tid(GetMyTID()) {}
196 #if USE_PTHREAD
197     bool isCurrentThreadId() const { return pthread_equal(pthread_self(), tid); }
198 #else
199     bool isCurrentThreadId() const { return GetCurrentThreadId() == tid; }
200 #endif
201     static bool init() { return true; }
202     static void destroy() {}
203 };
204 
205 #endif // USE_INTERNAL_TID
206 
207 /*********** Code to provide thread ID and a thread-local void pointer **********/
208 
209 bool TLSKey::init()
210 {
211 #if USE_WINTHREAD
212     TLS_pointer_key = TlsAlloc();
213     if (TLS_pointer_key == TLS_ALLOC_FAILURE)
214         return false;
215 #else
216     int status = pthread_key_create( &TLS_pointer_key, mallocThreadShutdownNotification );
217     if ( status )
218         return false;
219 #endif /* USE_WINTHREAD */
220     return true;
221 }
222 
223 bool TLSKey::destroy()
224 {
225 #if USE_WINTHREAD
226     BOOL status1 = !(TlsFree(TLS_pointer_key)); // fail is zero
227 #else
228     int status1 = pthread_key_delete(TLS_pointer_key);
229 #endif /* USE_WINTHREAD */
230     MALLOC_ASSERT(!status1, "The memory manager cannot delete tls key.");
231     return status1==0;
232 }
233 
234 inline TLSData* TLSKey::getThreadMallocTLS() const
235 {
236     return (TLSData *)TlsGetValue_func( TLS_pointer_key );
237 }
238 
239 inline void TLSKey::setThreadMallocTLS( TLSData * newvalue ) {
240     RecursiveMallocCallProtector scoped;
241     TlsSetValue_func( TLS_pointer_key, newvalue );
242 }
243 
244 /* The 'next' field in the block header has to maintain some invariants:
245  *   it needs to be on a 16K boundary and the first field in the block.
246  *   Any value stored there needs to have the lower 14 bits set to 0
247  *   so that various asserts work. This means that if you want to smash this memory
248  *   for debugging purposes you will need to obey this invariant.
249  * The total size of the header needs to be a power of 2 to simplify
250  * the alignment requirements. For now it is a 128 byte structure.
251  * To avoid false sharing, the fields changed only locally are separated
252  * from the fields changed by foreign threads.
253  * Changing the size of the block header would require changing
254  * some bin allocation sizes, in particular the "fitting" sizes (see below).
255  */
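/* Illustrative sketch of the 16K-boundary invariant mentioned above (an assumption
 * drawn from that comment, not an authoritative statement of the implementation):
 * because every slab Block starts on a slabSize (16K = 2^14) boundary, the owning
 * block of a small object can in principle be recovered by masking the low 14 bits
 * of the object's address, e.g.
 *     Block *owner = (Block*)((uintptr_t)object & ~((uintptr_t)slabSize - 1));
 * which is also why values stored in 'next' must keep those low bits zero for the
 * asserts to hold. */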
256 class Bin;
257 class StartupBlock;
258 
259 class MemoryPool {
260     // if no explicit grainsize, expect to see malloc in user's pAlloc
261     // and set a reasonably low granularity
262     static const size_t defaultGranularity = estimatedCacheLineSize;
263 
264     MemoryPool();                  // deny
265 public:
266     static MallocMutex  memPoolListLock;
267 
268     // list of all active pools is used to release
269     // all TLS data on thread termination or library unload
270     MemoryPool    *next,
271                   *prev;
272     ExtMemoryPool  extMemPool;
273     BootStrapBlocks bootStrapBlocks;
274 
275     static void initDefaultPool();
276 
277     bool init(intptr_t poolId, const MemPoolPolicy* memPoolPolicy);
278     bool reset();
279     bool destroy();
280     void onThreadShutdown(TLSData *tlsData);
281 
282     inline TLSData *getTLS(bool create);
283     void clearTLS() { extMemPool.tlsPointerKey.setThreadMallocTLS(NULL); }
284 
285     Block *getEmptyBlock(size_t size);
286     void returnEmptyBlock(Block *block, bool poolTheBlock);
287 
288     // get/put large object to/from local large object cache
289     void *getFromLLOCache(TLSData *tls, size_t size, size_t alignment);
290     void putToLLOCache(TLSData *tls, void *object);
291 };
292 
293 static intptr_t defaultMemPool_space[sizeof(MemoryPool)/sizeof(intptr_t) +
294                                      (sizeof(MemoryPool)%sizeof(intptr_t)? 1 : 0)];
295 static MemoryPool *defaultMemPool = (MemoryPool*)defaultMemPool_space;
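// Note (explanatory, not from the original source): defaultMemPool_space simply
// reserves ceil(sizeof(MemoryPool)/sizeof(intptr_t)) intptr_t's of zero-initialized
// static storage, presumably so the default pool can be set up in place later
// without running a static constructor at library load time.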
296 const size_t MemoryPool::defaultGranularity;
297 // zero-initialized
298 MallocMutex  MemoryPool::memPoolListLock;
299 // TODO: move huge page status to the default pool, because that's its state
300 HugePagesStatus hugePages;
301 static bool usedBySrcIncluded = false;
302 
303 // Padding helpers
304 template<size_t padd>
305 struct PaddingImpl {
306     size_t       __padding[padd];
307 };
308 
309 template<>
310 struct PaddingImpl<0> {};
311 
312 template<int N>
313 struct Padding : PaddingImpl<N/sizeof(size_t)> {};
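// Explanatory sketch (numbers are illustrative assumptions, not taken from the source):
// Padding<N> contributes an array of N/sizeof(size_t) size_t's, i.e. roughly N bytes
// rounded down to a multiple of sizeof(size_t), while Padding<0> resolves to the empty
// PaddingImpl<0> and adds nothing. For example, if sizeof(GlobalBlockFields) were 24 on
// a 64-bit build with a 64-byte cache line, LocalBlockFields below would inherit
// Padding<64 - 24>, i.e. 40 bytes of padding, so its own fields start on the next cache line.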
314 
315 // Slab block is 16KB-aligned. To prevent false sharing, separate locally-accessed
316 // fields from the fields commonly accessed by non-owner threads.
317 class GlobalBlockFields : public BlockI {
318 protected:
319     std::atomic<FreeObject*> publicFreeList;
320     Block       *nextPrivatizable;
321     MemoryPool  *poolPtr;
322 };
323 
324 class LocalBlockFields : public GlobalBlockFields, Padding<blockHeaderAlignment - sizeof(GlobalBlockFields)>  {
325 protected:
326     Block       *next;
327     Block       *previous;        /* Use double linked list to speed up removal */
328     FreeObject  *bumpPtr;         /* Bump pointer moves from the end to the beginning of a block */
329     FreeObject  *freeList;
330     /* Pointer to local data for the owner thread. Used to quickly find the TLS
331        when releasing an object from a block that the current thread owns.
332        NULL for orphaned blocks. */
333     TLSData     *tlsPtr;
334     ThreadId     ownerTid;        /* the ID of the thread that owns or last owned the block */
335     BackRefIdx   backRefIdx;
336     uint16_t     allocatedCount;  /* Number of objects allocated (obviously by the owning thread) */
337     uint16_t     objectSize;
338     bool         isFull;
339 
340     friend class FreeBlockPool;
341     friend class StartupBlock;
342     friend class LifoList;
343     friend void *BootStrapBlocks::allocate(MemoryPool *, size_t);
344     friend bool OrphanedBlocks::cleanup(Backend*);
345     friend Block *MemoryPool::getEmptyBlock(size_t);
346 };
347 
348 // Use inheritance to guarantee that user data starts on the next cache line.
349 // Can't use a member for this, because when LocalBlockFields already ends on a cache line boundary,
350 // there must be no additional memory consumption with any compiler.
351 class Block : public LocalBlockFields,
352               Padding<2*blockHeaderAlignment - sizeof(LocalBlockFields)> {
353 public:
354     bool empty() const {
355         if (allocatedCount > 0) return false;
356         MALLOC_ASSERT(!isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT);
357         return true;
358     }
359     inline FreeObject* allocate();
360     inline FreeObject *allocateFromFreeList();
361 
362     inline bool adjustFullness();
363     void adjustPositionInBin(Bin* bin = NULL);
364 
365     bool freeListNonNull() { return freeList; }
366     void freePublicObject(FreeObject *objectToFree);
367     inline void freeOwnObject(void *object);
368     void reset();
369     void privatizePublicFreeList( bool reset = true );
370     void restoreBumpPtr();
371     void privatizeOrphaned(TLSData *tls, unsigned index);
372     bool readyToShare();
373     void shareOrphaned(intptr_t binTag, unsigned index);
374     unsigned int getSize() const {
375         MALLOC_ASSERT(isStartupAllocObject() || objectSize<minLargeObjectSize,
376                       "Invalid object size");
377         return isStartupAllocObject()? 0 : objectSize;
378     }
379     const BackRefIdx *getBackRefIdx() const { return &backRefIdx; }
380     inline bool isOwnedByCurrentThread() const;
381     bool isStartupAllocObject() const { return objectSize == startupAllocObjSizeMark; }
382     inline FreeObject *findObjectToFree(const void *object) const;
383     void checkFreePrecond(const void *object) const {
384 #if MALLOC_DEBUG
385         const char *msg = "Possible double free or heap corruption.";
386         // small objects are always at least sizeof(size_t) bytes aligned;
387         // check this before any dereference, because for invalid objects
388         // the memory may be unreadable
389         MALLOC_ASSERT(isAligned(object, sizeof(size_t)), "Try to free invalid small object");
390         // releasing to free slab
391         MALLOC_ASSERT(allocatedCount>0, msg);
392         // must not point to slab's header
393         MALLOC_ASSERT((uintptr_t)object - (uintptr_t)this >= sizeof(Block), msg);
394         if (startupAllocObjSizeMark == objectSize) // startup block
395             MALLOC_ASSERT(object<=bumpPtr, msg);
396         else {
397             // non-startup objects are 8-byte aligned
398             MALLOC_ASSERT(isAligned(object, 8), "Try to free invalid small object");
399             MALLOC_ASSERT(allocatedCount <= (slabSize-sizeof(Block))/objectSize
400                           && (!bumpPtr || object>bumpPtr), msg);
401             FreeObject *toFree = findObjectToFree(object);
402             // check against head of freeList, as this is mostly
403             // expected after double free
404             MALLOC_ASSERT(toFree != freeList, msg);
405             // check against head of publicFreeList, to detect double free
406             // involving foreign thread
407             MALLOC_ASSERT(toFree != publicFreeList.load(std::memory_order_relaxed), msg);
408         }
409 #else
410         suppress_unused_warning(object);
411 #endif
412     }
413     void initEmptyBlock(TLSData *tls, size_t size);
414     size_t findObjectSize(void *object) const;
415     MemoryPool *getMemPool() const { return poolPtr; } // do not use on the hot path!
416 
417 protected:
418     void cleanBlockHeader();
419 
420 private:
421     static const float emptyEnoughRatio; /* Threshold on free space needed to "reactivate" a block */
422 
423     inline FreeObject *allocateFromBumpPtr();
424     inline FreeObject *findAllocatedObject(const void *address) const;
425     inline bool isProperlyPlaced(const void *object) const;
426     inline void markOwned(TLSData *tls) {
427         MALLOC_ASSERT(!tlsPtr, ASSERT_TEXT);
428         ownerTid = ThreadId(); /* save the ID of the current thread */
429         tlsPtr = tls;
430     }
431     inline void markOrphaned() {
432         MALLOC_ASSERT(tlsPtr, ASSERT_TEXT);
433         tlsPtr = NULL;
434     }
435 
436     friend class Bin;
437     friend class TLSData;
438     friend bool MemoryPool::destroy();
439 };
440 
441 const float Block::emptyEnoughRatio = 1.0 / 4.0;
442 
443 static_assert(sizeof(Block) <= 2*estimatedCacheLineSize,
444     "The class Block does not fit into 2 cache lines on this platform. "
445     "Defining USE_INTERNAL_TID may help to fix it.");
446 
447 class Bin {
448 private:
449     Block *activeBlk;
450     std::atomic<Block*> mailbox;
451     MallocMutex mailLock;
452 
453 public:
454     inline Block* getActiveBlock() const { return activeBlk; }
455     void resetActiveBlock() { activeBlk = NULL; }
456     bool activeBlockUnused() const { return activeBlk && !activeBlk->allocatedCount; }
457     inline void setActiveBlock(Block *block);
458     inline Block* setPreviousBlockActive();
459     Block* getPrivatizedFreeListBlock();
460     void moveBlockToFront(Block *block);
461     bool cleanPublicFreeLists();
462     void processEmptyBlock(Block *block, bool poolTheBlock);
463     void addPublicFreeListBlock(Block* block);
464 
465     void outofTLSBin(Block* block);
466     void verifyTLSBin(size_t size) const;
467     void pushTLSBin(Block* block);
468 
469     void verifyInitState() const {
470         MALLOC_ASSERT( !activeBlk, ASSERT_TEXT );
471         MALLOC_ASSERT( !mailbox.load(std::memory_order_relaxed), ASSERT_TEXT );
472     }
473 
474     friend void Block::freePublicObject (FreeObject *objectToFree);
475 };
476 
477 /********* End of the data structures                    **************/
478 
479 /*
480  * There are bins for all 8 byte aligned objects less than this segregated size; 8 bins in total
481  */
482 const uint32_t minSmallObjectIndex = 0;
483 const uint32_t numSmallObjectBins = 8;
484 const uint32_t maxSmallObjectSize = 64;
485 
486 /*
487  * There are 4 bins between each pair of adjacent powers of 2 [64-128-256-...]
488  * from maxSmallObjectSize up to this size; 16 bins in total
489  */
490 const uint32_t minSegregatedObjectIndex = minSmallObjectIndex+numSmallObjectBins;
491 const uint32_t numSegregatedObjectBins = 16;
492 const uint32_t maxSegregatedObjectSize = 1024;
493 
494 /*
495  * And there are 5 bins with allocation sizes that are multiples of estimatedCacheLineSize
496  * and selected to fit 9, 6, 4, 3, and 2 allocations in a block.
497  */
498 const uint32_t minFittingIndex = minSegregatedObjectIndex+numSegregatedObjectBins;
499 const uint32_t numFittingBins = 5;
500 
501 const uint32_t fittingAlignment = estimatedCacheLineSize;
502 
503 #define SET_FITTING_SIZE(N) ( (slabSize-sizeof(Block))/N ) & ~(fittingAlignment-1)
504 // For slabSize=16*1024, sizeof(Block)=2*estimatedCacheLineSize and fittingAlignment=estimatedCacheLineSize,
505 // the comments show the fitting sizes and the amounts left unused for estimatedCacheLineSize=64/128:
506 const uint32_t fittingSize1 = SET_FITTING_SIZE(9); // 1792/1792 128/000
507 const uint32_t fittingSize2 = SET_FITTING_SIZE(6); // 2688/2688 128/000
508 const uint32_t fittingSize3 = SET_FITTING_SIZE(4); // 4032/3968 128/256
509 const uint32_t fittingSize4 = SET_FITTING_SIZE(3); // 5376/5376 128/000
510 const uint32_t fittingSize5 = SET_FITTING_SIZE(2); // 8128/8064 000/000
511 #undef SET_FITTING_SIZE
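// Worked example of SET_FITTING_SIZE above, assuming slabSize == 16*1024 and
// sizeof(Block) == 128 (the estimatedCacheLineSize == 64 case from the comment):
//   SET_FITTING_SIZE(9) == ((16384 - 128) / 9) & ~63 == 1806 & ~63 == 1792,
// so fittingSize1 packs 9 objects of 1792 bytes (9*1792 == 16128) into a slab,
// leaving 16384 - 128 - 16128 == 128 bytes unused, matching the "1792 ... 128" annotation above.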
512 
513 /*
514  * The total number of thread-specific Block-based bins
515  */
516 const uint32_t numBlockBins = minFittingIndex+numFittingBins;
517 
518 /*
519  * Objects of this size and larger are considered large objects.
520  */
521 const uint32_t minLargeObjectSize = fittingSize5 + 1;
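// With the 64-byte cache line numbers shown above, fittingSize5 == 8128, so requests of
// 8129 bytes and more bypass the slab bins and go through the large-object path.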
522 
523 /*
524  * Per-thread pool of slab blocks. The idea behind it is to not share with other
525  * threads memory that is likely in the local cache(s) of our CPU.
526  */
527 class FreeBlockPool {
528 private:
529     std::atomic<Block*> head;
530     int         size;
531     Backend    *backend;
532     bool        lastAccessMiss;
533 public:
534     static const int POOL_HIGH_MARK = 32;
535     static const int POOL_LOW_MARK  = 8;
536 
537     class ResOfGet {
538         ResOfGet();
539     public:
540         Block* block;
541         bool   lastAccMiss;
542         ResOfGet(Block *b, bool lastMiss) : block(b), lastAccMiss(lastMiss) {}
543     };
544 
545     // allocated in zero-initialized memory
546     FreeBlockPool(Backend *bknd) : backend(bknd) {}
547     ResOfGet getBlock();
548     void returnBlock(Block *block);
549     bool externalCleanup(); // can be called by another thread
550 };
551 
552 template<int LOW_MARK, int HIGH_MARK>
553 class LocalLOCImpl {
554 private:
555     static const size_t MAX_TOTAL_SIZE = 4*1024*1024;
556     // TODO: can a singly-linked list be faster here?
557     LargeMemoryBlock *tail; // needed when releasing on overflow
558     std::atomic<LargeMemoryBlock*> head;
559     size_t            totalSize;
560     int               numOfBlocks;
561 public:
562     bool put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool);
563     LargeMemoryBlock *get(size_t size);
564     bool externalCleanup(ExtMemoryPool *extMemPool);
565 #if __TBB_MALLOC_WHITEBOX_TEST
566     LocalLOCImpl() : head(NULL), tail(NULL), totalSize(0), numOfBlocks(0) {}
567     static size_t getMaxSize() { return MAX_TOTAL_SIZE; }
568     static const int LOC_HIGH_MARK = HIGH_MARK;
569 #else
570     // no ctor, object must be created in zero-initialized memory
571 #endif
572 };
573 
574 typedef LocalLOCImpl<8,32> LocalLOC; // set production code parameters
575 
576 class TLSData : public TLSRemote {
577     MemoryPool   *memPool;
578 public:
579     Bin           bin[numBlockBinLimit];
580     FreeBlockPool freeSlabBlocks;
581     LocalLOC      lloc;
582     unsigned      currCacheIdx;
583 private:
584     bool unused;
585 public:
586     TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd) {}
587     MemoryPool *getMemPool() const { return memPool; }
588     Bin* getAllocationBin(size_t size);
589     void release();
590     bool externalCleanup(bool cleanOnlyUnused, bool cleanBins) {
591         if (!unused && cleanOnlyUnused) return false;
592         // Heavy operation in terms of synchronization complexity,
593         // should be called only for the current thread
594         bool released = cleanBins ? cleanupBlockBins() : false;
595         // both cleanups to be called, and the order is not important
596         return released | lloc.externalCleanup(&memPool->extMemPool) | freeSlabBlocks.externalCleanup();
597     }
598     bool cleanupBlockBins();
599     void markUsed() { unused = false; } // called by owner when TLS touched
600     void markUnused() { unused =  true; } // can be called by not owner thread
601 };
602 
603 TLSData *TLSKey::createTLS(MemoryPool *memPool, Backend *backend)
604 {
605     MALLOC_ASSERT( sizeof(TLSData) >= sizeof(Bin) * numBlockBins + sizeof(FreeBlockPool), ASSERT_TEXT );
606     TLSData* tls = (TLSData*) memPool->bootStrapBlocks.allocate(memPool, sizeof(TLSData));
607     if ( !tls )
608         return NULL;
609     new(tls) TLSData(memPool, backend);
610     /* the block contains zeroes after bootStrapMalloc, so bins are initialized */
611 #if MALLOC_DEBUG
612     for (uint32_t i = 0; i < numBlockBinLimit; i++)
613         tls->bin[i].verifyInitState();
614 #endif
615     setThreadMallocTLS(tls);
616     memPool->extMemPool.allLocalCaches.registerThread(tls);
617     return tls;
618 }
619 
620 bool TLSData::cleanupBlockBins()
621 {
622     bool released = false;
623     for (uint32_t i = 0; i < numBlockBinLimit; i++) {
624         released |= bin[i].cleanPublicFreeLists();
625         // After cleaning public free lists, only the active block might be empty.
626         // Do not use processEmptyBlock because it will just restore bumpPtr.
627         Block *block = bin[i].getActiveBlock();
628         if (block && block->empty()) {
629             bin[i].outofTLSBin(block);
630             memPool->returnEmptyBlock(block, /*poolTheBlock=*/false);
631             released = true;
632         }
633     }
634     return released;
635 }
636 
637 bool ExtMemoryPool::releaseAllLocalCaches()
638 {
639     // Iterate all registered TLS data and clean LLOC and Slab pools
640     bool released = allLocalCaches.cleanup(/*cleanOnlyUnused=*/false);
641 
642     // Bins privatization is done only for the current thread
643     if (TLSData *tlsData = tlsPointerKey.getThreadMallocTLS())
644         released |= tlsData->cleanupBlockBins();
645 
646     return released;
647 }
648 
649 void AllLocalCaches::registerThread(TLSRemote *tls)
650 {
651     tls->prev = NULL;
652     MallocMutex::scoped_lock lock(listLock);
653     MALLOC_ASSERT(head!=tls, ASSERT_TEXT);
654     tls->next = head;
655     if (head)
656         head->prev = tls;
657     head = tls;
658     MALLOC_ASSERT(head->next!=head, ASSERT_TEXT);
659 }
660 
661 void AllLocalCaches::unregisterThread(TLSRemote *tls)
662 {
663     MallocMutex::scoped_lock lock(listLock);
664     MALLOC_ASSERT(head, "Can't unregister thread: no threads are registered.");
665     if (head == tls)
666         head = tls->next;
667     if (tls->next)
668         tls->next->prev = tls->prev;
669     if (tls->prev)
670         tls->prev->next = tls->next;
671     MALLOC_ASSERT(!tls->next || tls->next->next!=tls->next, ASSERT_TEXT);
672 }
673 
674 bool AllLocalCaches::cleanup(bool cleanOnlyUnused)
675 {
676     bool released = false;
677     {
678         MallocMutex::scoped_lock lock(listLock);
679         for (TLSRemote *curr=head; curr; curr=curr->next)
680             released |= static_cast<TLSData*>(curr)->externalCleanup(cleanOnlyUnused, /*cleanBins=*/false);
681     }
682     return released;
683 }
684 
685 void AllLocalCaches::markUnused()
686 {
687     bool locked;
688     MallocMutex::scoped_lock lock(listLock, /*block=*/false, &locked);
689     if (!locked) // do not wait to mark if another thread is already working with the list
690         return;
691 
692     for (TLSRemote *curr=head; curr; curr=curr->next)
693         static_cast<TLSData*>(curr)->markUnused();
694 }
695 
696 #if MALLOC_CHECK_RECURSION
697 MallocMutex RecursiveMallocCallProtector::rmc_mutex;
698 pthread_t   RecursiveMallocCallProtector::owner_thread;
699 void       *RecursiveMallocCallProtector::autoObjPtr;
700 bool        RecursiveMallocCallProtector::mallocRecursionDetected;
701 #if __FreeBSD__
702 bool        RecursiveMallocCallProtector::canUsePthread;
703 #endif
704 
705 #endif
706 
707 /*********** End code to provide thread ID and a TLS pointer **********/
708 
709 // Parameter for isLargeObject, encodes our expectation about the memory's origin.
710 // Assertions must use unknownMem to reliably report object invalidity.
711 enum MemoryOrigin {
712     ourMem,    // allocated by TBB allocator
713     unknownMem // can be allocated by system allocator or TBB allocator
714 };
715 
716 template<MemoryOrigin> bool isLargeObject(void *object);
717 static void *internalMalloc(size_t size);
718 static void internalFree(void *object);
719 static void *internalPoolMalloc(MemoryPool* mPool, size_t size);
720 static bool internalPoolFree(MemoryPool *mPool, void *object, size_t size);
721 
722 #if !MALLOC_DEBUG
723 #if __INTEL_COMPILER || _MSC_VER
724 #define NOINLINE(decl) __declspec(noinline) decl
725 #define ALWAYSINLINE(decl) __forceinline decl
726 #elif __GNUC__
727 #define NOINLINE(decl) decl __attribute__ ((noinline))
728 #define ALWAYSINLINE(decl) decl __attribute__ ((always_inline))
729 #else
730 #define NOINLINE(decl) decl
731 #define ALWAYSINLINE(decl) decl
732 #endif
733 
734 static NOINLINE( bool doInitialization() );
735 ALWAYSINLINE( bool isMallocInitialized() );
736 
737 #undef ALWAYSINLINE
738 #undef NOINLINE
739 #endif /* !MALLOC_DEBUG */
740 
741 
742 /********* Now some rough utility code to deal with indexing the size bins. **************/
743 
744 /*
745  * Given a number, return the position of its highest set bit. It is intended to work with 32-bit values only.
746  * Moreover, on IPF, for the sake of simplicity and performance, it is narrowed to serve only the range 64 to 1023.
747  * This is enough for the current algorithm of distributing sizes among bins.
748  * __TBB_Log2 is not used here to minimize dependencies on TBB specific sources.
749  */
750 #if _WIN64 && _MSC_VER>=1400 && !__INTEL_COMPILER
751 extern "C" unsigned char _BitScanReverse( unsigned long* i, unsigned long w );
752 #pragma intrinsic(_BitScanReverse)
753 #endif
754 static inline unsigned int highestBitPos(unsigned int n)
755 {
756     MALLOC_ASSERT( n>=64 && n<1024, ASSERT_TEXT ); // only needed for bsr array lookup, but always true
757     unsigned int pos;
758 #if __ARCH_x86_32||__ARCH_x86_64
759 
760 # if __linux__||__APPLE__||__FreeBSD__||__NetBSD__||__OpenBSD__||__sun||__MINGW32__
761     __asm__ ("bsr %1,%0" : "=r"(pos) : "r"(n));
762 # elif (_WIN32 && (!_WIN64 || __INTEL_COMPILER))
763     __asm
764     {
765         bsr eax, n
766         mov pos, eax
767     }
768 # elif _WIN64 && _MSC_VER>=1400
769     _BitScanReverse((unsigned long*)&pos, (unsigned long)n);
770 # else
771 #   error highestBitPos() not implemented for this platform
772 # endif
773 #elif __arm__
774     __asm__ __volatile__
775     (
776        "clz %0, %1\n"
777        "rsb %0, %0, %2\n"
778        :"=r" (pos) :"r" (n), "I" (31)
779     );
780 #else
781     static unsigned int bsr[16] = {0/*N/A*/,6,7,7,8,8,8,8,9,9,9,9,9,9,9,9};
782     pos = bsr[ n>>6 ];
783 #endif /* __ARCH_* */
784     return pos;
785 }
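// For illustration: highestBitPos(199) == 7, since 2^7 == 128 <= 199 < 256 == 2^8;
// the table fallback agrees, as bsr[199 >> 6] == bsr[3] == 7.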
786 
787 template<bool Is32Bit>
788 unsigned int getSmallObjectIndex(unsigned int size)
789 {
790     return (size-1)>>3;
791 }
792 template<>
793 unsigned int getSmallObjectIndex</*Is32Bit=*/false>(unsigned int size)
794 {
795     // For 64-bit malloc, 16 byte alignment is needed except for bin 0.
796     unsigned int result = (size-1)>>3;
797     if (result) result |= 1; // 0,1,3,5,7; bins 2,4,6 are not aligned to 16 bytes
798     return result;
799 }
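// Worked example: a 24-byte request gives (24-1)>>3 == 2; the 64-bit specialization
// then sets the low bit, 2|1 == 3, so the request is served from the 32-byte bin
// (index 3), which keeps 16-byte alignment. A 16-byte request gives (16-1)>>3 == 1
// and is served exactly from the 16-byte bin.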
800 /*
801  * Depending on indexRequest, for a given size return either the index into the bin
802  * for objects of this size, or the actual size of objects in this bin.
803  */
804 template<bool indexRequest>
805 static unsigned int getIndexOrObjectSize (unsigned int size)
806 {
807     if (size <= maxSmallObjectSize) { // selection from 8/16/24/32/40/48/56/64
808         unsigned int index = getSmallObjectIndex</*Is32Bit=*/(sizeof(size_t)<=4)>( size );
809          /* Bin 0 is for 8 bytes, bin 1 is for 16, and so forth */
810         return indexRequest ? index : (index+1)<<3;
811     }
812     else if (size <= maxSegregatedObjectSize ) { // 80/96/112/128 / 160/192/224/256 / 320/384/448/512 / 640/768/896/1024
813         unsigned int order = highestBitPos(size-1); // which group of bin sizes?
814         MALLOC_ASSERT( 6<=order && order<=9, ASSERT_TEXT );
815         if (indexRequest)
816             return minSegregatedObjectIndex - (4*6) - 4 + (4*order) + ((size-1)>>(order-2));
817         else {
818             unsigned int alignment = 128 >> (9-order); // alignment in the group
819             MALLOC_ASSERT( alignment==16 || alignment==32 || alignment==64 || alignment==128, ASSERT_TEXT );
820             return alignUp(size,alignment);
821         }
822     }
823     else {
824         if( size <= fittingSize3 ) {
825             if( size <= fittingSize2 ) {
826                 if( size <= fittingSize1 )
827                     return indexRequest ? minFittingIndex : fittingSize1;
828                 else
829                     return indexRequest ? minFittingIndex+1 : fittingSize2;
830             } else
831                 return indexRequest ? minFittingIndex+2 : fittingSize3;
832         } else {
833             if( size <= fittingSize5 ) {
834                 if( size <= fittingSize4 )
835                     return indexRequest ? minFittingIndex+3 : fittingSize4;
836                 else
837                     return indexRequest ? minFittingIndex+4 : fittingSize5;
838             } else {
839                 MALLOC_ASSERT( 0,ASSERT_TEXT ); // this should not happen
840                 return ~0U;
841             }
842         }
843     }
844 }
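// Worked example for the segregated range (derived from the formulas above):
// for size == 200, highestBitPos(199) == 7, hence
//   getIndex(200)      == 8 - 4*6 - 4 + 4*7 + (199 >> 5) == 8 - 24 - 4 + 28 + 6 == 14
//   getObjectSize(200) == alignUp(200, 128 >> (9-7)) == alignUp(200, 32) == 224,
// i.e. a 200-byte request lands in the 224-byte bin of the 128..256 group.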
845 
846 static unsigned int getIndex (unsigned int size)
847 {
848     return getIndexOrObjectSize</*indexRequest=*/true>(size);
849 }
850 
851 static unsigned int getObjectSize (unsigned int size)
852 {
853     return getIndexOrObjectSize</*indexRequest=*/false>(size);
854 }
855 
856 
857 void *BootStrapBlocks::allocate(MemoryPool *memPool, size_t size)
858 {
859     FreeObject *result;
860 
861     MALLOC_ASSERT( size == sizeof(TLSData), ASSERT_TEXT );
862 
863     { // Lock with acquire
864         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
865 
866         if( bootStrapObjectList) {
867             result = bootStrapObjectList;
868             bootStrapObjectList = bootStrapObjectList->next;
869         } else {
870             if (!bootStrapBlock) {
871                 bootStrapBlock = memPool->getEmptyBlock(size);
872                 if (!bootStrapBlock) return NULL;
873             }
874             result = bootStrapBlock->bumpPtr;
875             bootStrapBlock->bumpPtr = (FreeObject *)((uintptr_t)bootStrapBlock->bumpPtr - bootStrapBlock->objectSize);
876             if ((uintptr_t)bootStrapBlock->bumpPtr < (uintptr_t)bootStrapBlock+sizeof(Block)) {
877                 bootStrapBlock->bumpPtr = NULL;
878                 bootStrapBlock->next = bootStrapBlockUsed;
879                 bootStrapBlockUsed = bootStrapBlock;
880                 bootStrapBlock = NULL;
881             }
882         }
883     } // Unlock with release
884 
885     memset (result, 0, size);
886     return (void*)result;
887 }
888 
889 void BootStrapBlocks::free(void* ptr)
890 {
891     MALLOC_ASSERT( ptr, ASSERT_TEXT );
892     { // Lock with acquire
893         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
894         ((FreeObject*)ptr)->next = bootStrapObjectList;
895         bootStrapObjectList = (FreeObject*)ptr;
896     } // Unlock with release
897 }
898 
899 void BootStrapBlocks::reset()
900 {
901     bootStrapBlock = bootStrapBlockUsed = NULL;
902     bootStrapObjectList = NULL;
903 }
904 
905 #if !(FREELIST_NONBLOCKING)
906 static MallocMutex publicFreeListLock; // lock for changes of publicFreeList
907 #endif
908 
909 /********* End rough utility code  **************/
910 
911 /* LifoList assumes zero initialization so a vector of it can be created
912  * by just allocating some space with no call to constructor.
913  * On Linux, it seems to be necessary to avoid linking with C++ libraries.
914  *
915  * By usage convention there is no race on the initialization. */
916 LifoList::LifoList( ) : top(NULL)
917 {
918     // MallocMutex assumes zero initialization
919     memset(&lock, 0, sizeof(MallocMutex));
920 }
921 
922 void LifoList::push(Block *block)
923 {
924     MallocMutex::scoped_lock scoped_cs(lock);
925     block->next = top;
926     top = block;
927 }
928 
929 Block *LifoList::pop()
930 {
931     Block *block=NULL;
932     if (top) {
933         MallocMutex::scoped_lock scoped_cs(lock);
934         if (top) {
935             block = top;
936             top = block->next;
937         }
938     }
939     return block;
940 }
941 
942 Block *LifoList::grab()
943 {
944     Block *block = NULL;
945     if (top) {
946         MallocMutex::scoped_lock scoped_cs(lock);
947         block = top;
948         top = NULL;
949     }
950     return block;
951 }
952 
953 /********* Thread and block related code      *************/
954 
955 template<bool poolDestroy> void AllLargeBlocksList::releaseAll(Backend *backend) {
956      LargeMemoryBlock *next, *lmb = loHead;
957      loHead = NULL;
958 
959      for (; lmb; lmb = next) {
960          next = lmb->gNext;
961          if (poolDestroy) {
962              // as this is pool destruction, there is no need to return the object to the backend;
963              // only remove backrefs, as they are global
964              removeBackRef(lmb->backRefIdx);
965          } else {
966              // clean g(Next|Prev) to prevent removing lmb
967              // from AllLargeBlocksList inside returnLargeObject
968              lmb->gNext = lmb->gPrev = NULL;
969              backend->returnLargeObject(lmb);
970          }
971      }
972 }
973 
974 TLSData* MemoryPool::getTLS(bool create)
975 {
976     TLSData* tls = extMemPool.tlsPointerKey.getThreadMallocTLS();
977     if (create && !tls)
978         tls = extMemPool.tlsPointerKey.createTLS(this, &extMemPool.backend);
979     return tls;
980 }
981 
982 /*
983  * Return the bin for the given size.
984  */
985 inline Bin* TLSData::getAllocationBin(size_t size)
986 {
987     return bin + getIndex(size);
988 }
989 
990 /* Return an empty uninitialized block in a non-blocking fashion. */
991 Block *MemoryPool::getEmptyBlock(size_t size)
992 {
993     TLSData* tls = getTLS(/*create=*/false);
994     // try to use per-thread cache, if TLS available
995     FreeBlockPool::ResOfGet resOfGet = tls?
996         tls->freeSlabBlocks.getBlock() : FreeBlockPool::ResOfGet(NULL, false);
997     Block *result = resOfGet.block;
998 
999     if (!result) { // not found in the local cache, ask the backend for slabs
1000         int num = resOfGet.lastAccMiss? Backend::numOfSlabAllocOnMiss : 1;
1001         BackRefIdx backRefIdx[Backend::numOfSlabAllocOnMiss];
1002 
1003         result = static_cast<Block*>(extMemPool.backend.getSlabBlock(num));
1004         if (!result) return NULL;
1005 
1006         if (!extMemPool.userPool())
1007             for (int i=0; i<num; i++) {
1008                 backRefIdx[i] = BackRefIdx::newBackRef(/*largeObj=*/false);
1009                 if (backRefIdx[i].isInvalid()) {
1010                     // roll back resource allocation
1011                     for (int j=0; j<i; j++)
1012                         removeBackRef(backRefIdx[j]);
1013                     Block *b = result;
1014                     for (int j=0; j<num; b=(Block*)((uintptr_t)b+slabSize), j++)
1015                         extMemPool.backend.putSlabBlock(b);
1016                     return NULL;
1017                 }
1018             }
1019         // resources were allocated, register blocks
1020         Block *b = result;
1021         for (int i=0; i<num; b=(Block*)((uintptr_t)b+slabSize), i++) {
1022             // slab block in user's pool must have invalid backRefIdx
1023             if (extMemPool.userPool()) {
1024                 new (&b->backRefIdx) BackRefIdx();
1025             } else {
1026                 setBackRef(backRefIdx[i], b);
1027                 b->backRefIdx = backRefIdx[i];
1028             }
1029             b->tlsPtr = tls;
1030             b->poolPtr = this;
1031             // all but first one go to per-thread pool
1032             if (i > 0) {
1033                 MALLOC_ASSERT(tls, ASSERT_TEXT);
1034                 tls->freeSlabBlocks.returnBlock(b);
1035             }
1036         }
1037     }
1038     MALLOC_ASSERT(result, ASSERT_TEXT);
1039     result->initEmptyBlock(tls, size);
1040     STAT_increment(getThreadId(), getIndex(result->objectSize), allocBlockNew);
1041     return result;
1042 }
1043 
1044 void MemoryPool::returnEmptyBlock(Block *block, bool poolTheBlock)
1045 {
1046     block->reset();
1047     if (poolTheBlock) {
1048         getTLS(/*create=*/false)->freeSlabBlocks.returnBlock(block);
1049     } else {
1050         // slab blocks in user's pools do not have valid backRefIdx
1051         if (!extMemPool.userPool())
1052             removeBackRef(*(block->getBackRefIdx()));
1053         extMemPool.backend.putSlabBlock(block);
1054     }
1055 }
1056 
1057 bool ExtMemoryPool::init(intptr_t poolId, rawAllocType rawAlloc,
1058                          rawFreeType rawFree, size_t granularity,
1059                          bool keepAllMemory, bool fixedPool)
1060 {
1061     this->poolId = poolId;
1062     this->rawAlloc = rawAlloc;
1063     this->rawFree = rawFree;
1064     this->granularity = granularity;
1065     this->keepAllMemory = keepAllMemory;
1066     this->fixedPool = fixedPool;
1067     this->delayRegsReleasing = false;
1068     if (!initTLS())
1069         return false;
1070     loc.init(this);
1071     backend.init(this);
1072     MALLOC_ASSERT(isPoolValid(), NULL);
1073     return true;
1074 }
1075 
1076 bool ExtMemoryPool::initTLS() { return tlsPointerKey.init(); }
1077 
1078 bool MemoryPool::init(intptr_t poolId, const MemPoolPolicy *policy)
1079 {
1080     if (!extMemPool.init(poolId, policy->pAlloc, policy->pFree,
1081                policy->granularity? policy->granularity : defaultGranularity,
1082                policy->keepAllMemory, policy->fixedPool))
1083         return false;
1084     {
1085         MallocMutex::scoped_lock lock(memPoolListLock);
1086         next = defaultMemPool->next;
1087         defaultMemPool->next = this;
1088         prev = defaultMemPool;
1089         if (next)
1090             next->prev = this;
1091     }
1092     return true;
1093 }
1094 
1095 bool MemoryPool::reset()
1096 {
1097     MALLOC_ASSERT(extMemPool.userPool(), "No reset for the system pool.");
1098     // memory is not released during pool reset
1099     // TODO: mark regions to release unused on next reset()
1100     extMemPool.delayRegionsReleasing(true);
1101 
1102     bootStrapBlocks.reset();
1103     extMemPool.lmbList.releaseAll</*poolDestroy=*/false>(&extMemPool.backend);
1104     if (!extMemPool.reset())
1105         return false;
1106 
1107     if (!extMemPool.initTLS())
1108         return false;
1109     extMemPool.delayRegionsReleasing(false);
1110     return true;
1111 }
1112 
1113 bool MemoryPool::destroy()
1114 {
1115 #if __TBB_MALLOC_LOCACHE_STAT
1116     extMemPool.loc.reportStat(stdout);
1117 #endif
1118 #if __TBB_MALLOC_BACKEND_STAT
1119     extMemPool.backend.reportStat(stdout);
1120 #endif
1121     {
1122         MallocMutex::scoped_lock lock(memPoolListLock);
1123         // remove itself from global pool list
1124         if (prev)
1125             prev->next = next;
1126         if (next)
1127             next->prev = prev;
1128     }
1129     // slab blocks in non-default pool do not have backreferences,
1130     // only large objects do
1131     if (extMemPool.userPool())
1132         extMemPool.lmbList.releaseAll</*poolDestroy=*/true>(&extMemPool.backend);
1133     else {
1134         // only one non-userPool() is supported now
1135         MALLOC_ASSERT(this==defaultMemPool, NULL);
1136         // Here and below in extMemPool.destroy(), do not restore the initial state
1137         // for a user pool, because it's just about to be released. But the system
1138         // pool is restored, because we do not want to rely on zeroing it on a subsequent reload.
1139         bootStrapBlocks.reset();
1140         extMemPool.orphanedBlocks.reset();
1141     }
1142     return extMemPool.destroy();
1143 }
1144 
1145 void MemoryPool::onThreadShutdown(TLSData *tlsData)
1146 {
1147     if (tlsData) { // might be called for "empty" TLS
1148         tlsData->release();
1149         bootStrapBlocks.free(tlsData);
1150         clearTLS();
1151     }
1152 }
1153 
1154 #if MALLOC_DEBUG
1155 void Bin::verifyTLSBin (size_t size) const
1156 {
1157 /* The debug version verifies the TLSBin as needed */
1158     uint32_t objSize = getObjectSize(size);
1159 
1160     if (activeBlk) {
1161         MALLOC_ASSERT( activeBlk->isOwnedByCurrentThread(), ASSERT_TEXT );
1162         MALLOC_ASSERT( activeBlk->objectSize == objSize, ASSERT_TEXT );
1163 #if MALLOC_DEBUG>1
1164         for (Block* temp = activeBlk->next; temp; temp=temp->next) {
1165             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1166             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1167             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1168             MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1169             if (temp->next) {
1170                 MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1171             }
1172         }
1173         for (Block* temp = activeBlk->previous; temp; temp=temp->previous) {
1174             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1175             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1176             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1177             MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1178             if (temp->previous) {
1179                 MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1180             }
1181         }
1182 #endif /* MALLOC_DEBUG>1 */
1183     }
1184 }
1185 #else /* MALLOC_DEBUG */
1186 inline void Bin::verifyTLSBin (size_t) const { }
1187 #endif /* MALLOC_DEBUG */
1188 
1189 /*
1190  * Add a block to the start of this tls bin list.
1191  */
1192 void Bin::pushTLSBin(Block* block)
1193 {
1194     /* The objectSize is taken from the block rather than passed as a parameter
1195        because the function is applied to partially filled blocks as well */
1196     unsigned int size = block->objectSize;
1197 
1198     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1199     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1200     MALLOC_ASSERT( block->next == NULL, ASSERT_TEXT );
1201     MALLOC_ASSERT( block->previous == NULL, ASSERT_TEXT );
1202 
1203     MALLOC_ASSERT( this, ASSERT_TEXT );
1204     verifyTLSBin(size);
1205 
1206     block->next = activeBlk;
1207     if( activeBlk ) {
1208         block->previous = activeBlk->previous;
1209         activeBlk->previous = block;
1210         if( block->previous )
1211             block->previous->next = block;
1212     } else {
1213         activeBlk = block;
1214     }
1215 
1216     verifyTLSBin(size);
1217 }
1218 
1219 /*
1220  * Take a block out of its tls bin (e.g. before removal).
1221  */
1222 void Bin::outofTLSBin(Block* block)
1223 {
1224     unsigned int size = block->objectSize;
1225 
1226     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1227     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1228 
1229     MALLOC_ASSERT( this, ASSERT_TEXT );
1230     verifyTLSBin(size);
1231 
1232     if (block == activeBlk) {
1233         activeBlk = block->previous? block->previous : block->next;
1234     }
1235     /* Unlink the block */
1236     if (block->previous) {
1237         MALLOC_ASSERT( block->previous->next == block, ASSERT_TEXT );
1238         block->previous->next = block->next;
1239     }
1240     if (block->next) {
1241         MALLOC_ASSERT( block->next->previous == block, ASSERT_TEXT );
1242         block->next->previous = block->previous;
1243     }
1244     block->next = NULL;
1245     block->previous = NULL;
1246 
1247     verifyTLSBin(size);
1248 }
1249 
1250 Block* Bin::getPrivatizedFreeListBlock()
1251 {
1252     Block* block;
1253     MALLOC_ASSERT( this, ASSERT_TEXT );
1254     // if this method is called, active block usage must be unsuccessful
1255     MALLOC_ASSERT( !activeBlk && !mailbox.load(std::memory_order_relaxed) || activeBlk && activeBlk->isFull, ASSERT_TEXT );
1256 
1257 // the counter should be changed    STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1258     if (!mailbox.load(std::memory_order_acquire)) // hotpath is empty mailbox
1259         return NULL;
1260     else { // mailbox is not empty, take lock and inspect it
1261         MallocMutex::scoped_lock scoped_cs(mailLock);
1262         block = mailbox.load(std::memory_order_relaxed);
1263         if( block ) {
1264             MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1265             MALLOC_ASSERT( !isNotForUse(block->nextPrivatizable), ASSERT_TEXT );
1266             mailbox.store(block->nextPrivatizable, std::memory_order_relaxed);
1267             block->nextPrivatizable = (Block*) this;
1268         }
1269     }
1270     if( block ) {
1271         MALLOC_ASSERT( isSolidPtr(block->publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1272         block->privatizePublicFreeList();
1273         block->adjustPositionInBin(this);
1274     }
1275     return block;
1276 }
1277 
1278 void Bin::addPublicFreeListBlock(Block* block)
1279 {
1280     MallocMutex::scoped_lock scoped_cs(mailLock);
1281     block->nextPrivatizable = mailbox.load(std::memory_order_relaxed);
1282     mailbox.store(block, std::memory_order_relaxed);
1283 }
1284 
1285 // Process publicly freed objects in all blocks and return empty blocks
1286 // to the backend in order to reduce overall footprint.
1287 bool Bin::cleanPublicFreeLists()
1288 {
1289     Block* block;
1290     if (!mailbox.load(std::memory_order_acquire))
1291         return false;
1292     else {
1293         // Grab all the blocks in the mailbox
1294         MallocMutex::scoped_lock scoped_cs(mailLock);
1295         block = mailbox.load(std::memory_order_relaxed);
1296         mailbox.store(NULL, std::memory_order_relaxed);
1297     }
1298     bool released = false;
1299     while (block) {
1300         MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1301         Block* tmp = block->nextPrivatizable;
1302         block->nextPrivatizable = (Block*) this;
1303         block->privatizePublicFreeList();
1304         if (block->empty()) {
1305             processEmptyBlock(block, /*poolTheBlock=*/false);
1306             released = true;
1307         } else
1308             block->adjustPositionInBin(this);
1309         block = tmp;
1310     }
1311     return released;
1312 }
1313 
1314 bool Block::adjustFullness()
1315 {
1316     if (bumpPtr) {
1317         /* If we are still using a bump ptr for this block it is empty enough to use. */
1318         STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1319         isFull = false;
1320     } else {
1321         const float threshold = (slabSize - sizeof(Block)) * (1 - emptyEnoughRatio);
1322         /* allocatedCount shows how many objects in the block are in use; however, it still counts
1323          * objects freed by other threads; so a prior call to privatizePublicFreeList() is recommended */
1324         isFull = (allocatedCount*objectSize > threshold) ? true : false;
1325 #if COLLECT_STATISTICS
1326         if (isFull)
1327             STAT_increment(getThreadId(), getIndex(objectSize), examineNotEmpty);
1328         else
1329             STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1330 #endif
1331     }
1332     return isFull;
1333 }
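// Worked example of the threshold above, assuming slabSize == 16*1024 and
// sizeof(Block) == 128 (as in the fitting-size comments earlier): for
// objectSize == 64, threshold == (16384 - 128) * (1 - 1/4) == 12192, so a block
// without a bump pointer is considered full while allocatedCount*64 > 12192,
// i.e. while more than 190 of its ~254 slots are still in use.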
1334 
1335 // This method resides in class Block, and not in class Bin, in order to avoid
1336 // calling getAllocationBin on a reasonably hot path in Block::freeOwnObject
1337 void Block::adjustPositionInBin(Bin* bin/*=NULL*/)
1338 {
1339     // If the block were full, but became empty enough to use,
1340     // move it to the front of the list
1341     if (isFull && !adjustFullness()) {
1342         if (!bin)
1343             bin = tlsPtr->getAllocationBin(objectSize);
1344         bin->moveBlockToFront(this);
1345     }
1346 }
1347 
1348 /* Restore the bump pointer for an empty block that is planned to be reused */
1349 void Block::restoreBumpPtr()
1350 {
1351     MALLOC_ASSERT( allocatedCount == 0, ASSERT_TEXT );
1352     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1353     STAT_increment(getThreadId(), getIndex(objectSize), freeRestoreBumpPtr);
1354     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1355     freeList = NULL;
1356     isFull = false;
1357 }
1358 
1359 void Block::freeOwnObject(void *object)
1360 {
1361     tlsPtr->markUsed();
1362     allocatedCount--;
1363     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1364 #if COLLECT_STATISTICS
1365     // Note that getAllocationBin is not called on the hottest path with statistics off.
1366     if (tlsPtr->getAllocationBin(objectSize)->getActiveBlock() != this)
1367         STAT_increment(getThreadId(), getIndex(objectSize), freeToInactiveBlock);
1368     else
1369         STAT_increment(getThreadId(), getIndex(objectSize), freeToActiveBlock);
1370 #endif
1371     if (empty()) {
1372         // If the last object of a slab is freed, the slab cannot be marked full
1373         MALLOC_ASSERT(!isFull, ASSERT_TEXT);
1374         tlsPtr->getAllocationBin(objectSize)->processEmptyBlock(this, /*poolTheBlock=*/true);
1375     } else { // hot path
1376         FreeObject *objectToFree = findObjectToFree(object);
1377         objectToFree->next = freeList;
1378         freeList = objectToFree;
1379         adjustPositionInBin();
1380     }
1381 }
1382 
1383 void Block::freePublicObject (FreeObject *objectToFree)
1384 {
1385     FreeObject *localPublicFreeList;
1386 
1387     MALLOC_ITT_SYNC_RELEASING(&publicFreeList);
1388 #if FREELIST_NONBLOCKING
1389     // TBB_REVAMP_TODO: make it non atomic in non-blocking scenario
1390     FreeObject *temp = publicFreeList.load(std::memory_order_acquire);
1391     do {
1392         localPublicFreeList = objectToFree->next = temp;
1393         // no backoff necessary because we are trying to make a change, not waiting for a change
1394     } while( !publicFreeList.compare_exchange_strong(temp, objectToFree) );
1395 #else
1396     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1397     {
1398         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1399         localPublicFreeList = objectToFree->next = publicFreeList;
1400         publicFreeList = objectToFree;
1401     }
1402 #endif
1403 
1404     if( localPublicFreeList==NULL ) {
1405         // if the block is abandoned, its nextPrivatizable pointer should be UNUSABLE
1406         // otherwise, it should point to the bin the block belongs to.
1407         // reading nextPrivatizable is thread-safe below, because:
1408         // 1) the executing thread atomically got publicFreeList==NULL and changed it to non-NULL;
1409         // 2) only the owning thread can change it back to NULL,
1410         // 3) but that cannot be done until the block is put into the mailbox
1411         // So the executing thread is now the only one that can change nextPrivatizable
1412         if( !isNotForUse(nextPrivatizable) ) {
1413             MALLOC_ASSERT( nextPrivatizable!=NULL, ASSERT_TEXT );
1414             Bin* theBin = (Bin*) nextPrivatizable;
1415 #if MALLOC_DEBUG && TBB_REVAMP_TODO
1416             // FIXME: The thread that returns the block is not the block's owner.
1417             // The below assertion compares 'theBin' against the caller's local bin, thus, it always fails.
1418             // Need to find a way to get the correct remote bin for comparison.
1419             { // check that nextPrivatizable points to the bin the block belongs to
1420                 uint32_t index = getIndex( objectSize );
1421                 TLSData* tls = getThreadMallocTLS();
1422                 MALLOC_ASSERT( theBin==tls->bin+index, ASSERT_TEXT );
1423             }
1424 #endif // MALLOC_DEBUG
1425             theBin->addPublicFreeListBlock(this);
1426         }
1427     }
1428     STAT_increment(getThreadId(), ThreadCommonCounters, freeToOtherThread);
1429     STAT_increment(ownerTid, getIndex(objectSize), freeByOtherThread);
1430 }
1431 
1432 // Make objects freed by other threads available for use again
1433 void Block::privatizePublicFreeList( bool reset )
1434 {
1435     FreeObject *localPublicFreeList;
1436     // If reset is false, publicFreeList should not be zeroed but set to UNUSABLE
1437     // to properly synchronize with other threads freeing objects to this slab.
1438     const intptr_t endMarker = reset ? 0 : UNUSABLE;
1439 
1440     // Only the owner thread may reset the pointer to NULL
1441     MALLOC_ASSERT( isOwnedByCurrentThread() || !reset, ASSERT_TEXT );
1442 #if FREELIST_NONBLOCKING
1443     localPublicFreeList = publicFreeList.exchange((FreeObject*)endMarker);
1444 #else
1445     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1446     {
1447         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1448         localPublicFreeList = publicFreeList;
1449         publicFreeList = endMarker;
1450     }
1451 #endif
1452     MALLOC_ITT_SYNC_ACQUIRED(&publicFreeList);
1453     MALLOC_ASSERT( !(reset && isNotForUse(publicFreeList)), ASSERT_TEXT );
1454 
1455     // publicFreeList must have been UNUSABLE or valid, but not NULL
1456     MALLOC_ASSERT( localPublicFreeList!=NULL, ASSERT_TEXT );
1457     if( isSolidPtr(localPublicFreeList) ) {
1458         MALLOC_ASSERT( allocatedCount <= (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1459         /* other threads did not change the counter while freeing our objects */
1460         allocatedCount--;
1461         FreeObject *temp = localPublicFreeList;
1462         while( isSolidPtr(temp->next) ){ // the list will end with either NULL or UNUSABLE
1463             temp = temp->next;
1464             allocatedCount--;
1465             MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1466         }
1467         /* merge with local freeList */
1468         temp->next = freeList;
1469         freeList = localPublicFreeList;
1470         STAT_increment(getThreadId(), getIndex(objectSize), allocPrivatized);
1471     }
1472 }
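
// Illustrative sketch of the publicFreeList protocol used above (a simplified view, for
// orientation only):
//   NULL       - no pending foreign frees; the foreign thread that installs the first object
//                (CAS NULL -> object in freePublicObject) also mails the block to the
//                owner's bin via nextPrivatizable.
//   solid ptr  - a list of objects freed by other threads, waiting to be privatized.
//   UNUSABLE   - the block is orphaned; privatizePublicFreeList(/*reset=*/false) leaves this
//                marker so that later foreign frees still synchronize correctly.
// E.g. a foreign thread CASes NULL -> obj and mails the block; the owner later exchanges
// publicFreeList back to NULL (reset==true) and splices the grabbed list into freeList.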
1473 
1474 void Block::privatizeOrphaned(TLSData *tls, unsigned index)
1475 {
1476     Bin* bin = tls->bin + index;
1477     STAT_increment(getThreadId(), index, allocBlockPublic);
1478     next = NULL;
1479     previous = NULL;
1480     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) != NULL, ASSERT_TEXT );
1481     /* There is no race here since no other thread owns this block */
1482     markOwned(tls);
1483     // It is safe to change nextPrivatizable, as publicFreeList is not null
1484     MALLOC_ASSERT( isNotForUse(nextPrivatizable), ASSERT_TEXT );
1485     nextPrivatizable = (Block*)bin;
1486     // the next call is required to change publicFreeList to 0
1487     privatizePublicFreeList();
1488     if( empty() ) {
1489         restoreBumpPtr();
1490     } else {
1491         adjustFullness(); // check the block fullness and set isFull
1492     }
1493     MALLOC_ASSERT( !isNotForUse(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1494 }
1495 
1496 
1497 bool Block::readyToShare()
1498 {
1499     FreeObject* oldVal = NULL;
1500 #if FREELIST_NONBLOCKING
1501     publicFreeList.compare_exchange_strong(oldVal, (FreeObject*)UNUSABLE);
1502 #else
1503     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1504     {
1505         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1506         if ( (oldVal=publicFreeList)==NULL )
1507             (intptr_t&)(publicFreeList) = UNUSABLE;
1508     }
1509 #endif
1510     return oldVal==NULL;
1511 }
1512 
1513 void Block::shareOrphaned(intptr_t binTag, unsigned index)
1514 {
1515     MALLOC_ASSERT( binTag, ASSERT_TEXT );
1516     // suppress the unreferenced formal parameter warning
1517     tbb::detail::suppress_unused_warning(index);
1518     STAT_increment(getThreadId(), index, freeBlockPublic);
1519     markOrphaned();
1520     if ((intptr_t)nextPrivatizable==binTag) {
1521         // First check passed: the block is not in the mailbox yet.
1522         // Need to set publicFreeList to non-zero, so other threads
1523         // will not change nextPrivatizable and it can be zeroed.
1524         if ( !readyToShare() ) {
1525             // another thread freed an object; we need to wait until it finishes.
1526             // There is no need for exponential backoff, as the wait here is not for a lock;
1527             // but we need to yield, so that the thread we are waiting for has a chance to run.
1528             // TODO: add a pause to also be friendly to hyperthreads
1529             int count = 256;
1530             while( (intptr_t)const_cast<Block* volatile &>(nextPrivatizable)==binTag ) {
1531                 if (--count==0) {
1532                     do_yield();
1533                     count = 256;
1534                 }
1535             }
1536         }
1537     }
1538     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) !=NULL, ASSERT_TEXT );
1539     // now it is safe to change our data
1540     previous = NULL;
1541     // it is the caller's responsibility to ensure that the list of blocks
1542     // formed by nextPrivatizable pointers is kept consistent if required.
1543     // if this is only called from thread shutdown code, it does not matter.
1544     (intptr_t&)(nextPrivatizable) = UNUSABLE;
1545 }
1546 
1547 void Block::cleanBlockHeader()
1548 {
1549     next = NULL;
1550     previous = NULL;
1551     freeList = NULL;
1552     allocatedCount = 0;
1553     isFull = false;
1554     tlsPtr = NULL;
1555 
1556     publicFreeList.store(NULL, std::memory_order_relaxed);
1557 }
1558 
1559 void Block::initEmptyBlock(TLSData *tls, size_t size)
1560 {
1561     // Having getIndex and getObjectSize called next to each other
1562     // allows better compiler optimization as they basically share the code.
1563     unsigned int index = getIndex(size);
1564     unsigned int objSz = getObjectSize(size);
1565 
1566     cleanBlockHeader();
1567     objectSize = objSz;
1568     markOwned(tls);
1569     // the bump pointer should be prepared for the first allocation - thus move it down by objectSize
1570     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1571 
1572     // each block should have the address where the head of the list of "privatizable" blocks is kept
1573     // the only exception is the bootstrap block, which is initialized while TLS is still NULL
1574     nextPrivatizable = tls? (Block*)(tls->bin + index) : NULL;
1575     TRACEF(( "[ScalableMalloc trace] Empty block %p is initialized, owner is %ld, objectSize is %d, bumpPtr is %p\n",
1576              this, tlsPtr ? getThreadId() : -1, objectSize, bumpPtr ));
1577 }
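
// Rough picture of a freshly initialized slab (proportions are illustrative only):
//
//   [ Block header | ... unused space ...            | highest object slot ]
//   ^this          ^this+sizeof(Block)    bumpPtr -> ^this+slabSize-objectSize
//
// The bump pointer starts at the highest object slot and moves down by objectSize on each
// allocation (see allocateFromBumpPtr) until it would cross the header; after that it
// becomes NULL and the free list / privatized public free list are the only sources.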
1578 
1579 Block *OrphanedBlocks::get(TLSData *tls, unsigned int size)
1580 {
1581     // TODO: try to use index from getAllocationBin
1582     unsigned int index = getIndex(size);
1583     Block *block = bins[index].pop();
1584     if (block) {
1585         MALLOC_ITT_SYNC_ACQUIRED(bins+index);
1586         block->privatizeOrphaned(tls, index);
1587     }
1588     return block;
1589 }
1590 
1591 void OrphanedBlocks::put(intptr_t binTag, Block *block)
1592 {
1593     unsigned int index = getIndex(block->getSize());
1594     block->shareOrphaned(binTag, index);
1595     MALLOC_ITT_SYNC_RELEASING(bins+index);
1596     bins[index].push(block);
1597 }
1598 
1599 void OrphanedBlocks::reset()
1600 {
1601     for (uint32_t i=0; i<numBlockBinLimit; i++)
1602         new (bins+i) LifoList();
1603 }
1604 
1605 bool OrphanedBlocks::cleanup(Backend* backend)
1606 {
1607     bool released = false;
1608     for (uint32_t i=0; i<numBlockBinLimit; i++) {
1609         Block* block = bins[i].grab();
1610         MALLOC_ITT_SYNC_ACQUIRED(bins+i);
1611         while (block) {
1612             Block* next = block->next;
1613             block->privatizePublicFreeList( /*reset=*/false ); // do not set publicFreeList to NULL
1614             if (block->empty()) {
1615                 block->reset();
1616                 // slab blocks in user's pools do not have valid backRefIdx
1617                 if (!backend->inUserPool())
1618                     removeBackRef(*(block->getBackRefIdx()));
1619                 backend->putSlabBlock(block);
1620                 released = true;
1621             } else {
1622                 MALLOC_ITT_SYNC_RELEASING(bins+i);
1623                 bins[i].push(block);
1624             }
1625             block = next;
1626         }
1627     }
1628     return released;
1629 }
1630 
1631 FreeBlockPool::ResOfGet FreeBlockPool::getBlock()
1632 {
1633     Block *b = head.exchange(NULL);
1634 
1635     if (b) {
1636         size--;
1637         Block *newHead = b->next;
1638         lastAccessMiss = false;
1639         head.store(newHead, std::memory_order_release);
1640     } else {
1641         lastAccessMiss = true;
1642     }
1643     return ResOfGet(b, lastAccessMiss);
1644 }
1645 
1646 void FreeBlockPool::returnBlock(Block *block)
1647 {
1648     MALLOC_ASSERT( size <= POOL_HIGH_MARK, ASSERT_TEXT );
1649     Block *localHead = head.exchange(NULL);
1650 
1651     if (!localHead) {
1652         size = 0; // head was stolen by externalCleanup, correct size accordingly
1653     } else if (size == POOL_HIGH_MARK) {
1654         // release cold blocks and add hot one,
1655         // so keep POOL_LOW_MARK-1 blocks and add new block to head
1656         Block *headToFree = localHead, *helper;
1657         for (int i=0; i<POOL_LOW_MARK-2; i++)
1658             headToFree = headToFree->next;
1659         Block *last = headToFree;
1660         headToFree = headToFree->next;
1661         last->next = NULL;
1662         size = POOL_LOW_MARK-1;
1663         for (Block *currBl = headToFree; currBl; currBl = helper) {
1664             helper = currBl->next;
1665             // slab blocks in user's pools do not have valid backRefIdx
1666             if (!backend->inUserPool())
1667                 removeBackRef(currBl->backRefIdx);
1668             backend->putSlabBlock(currBl);
1669         }
1670     }
1671     size++;
1672     block->next = localHead;
1673     head.store(block, std::memory_order_release);
1674 }
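
// Worked example of the trimming in returnBlock (the concrete POOL_HIGH_MARK/POOL_LOW_MARK
// values are defined elsewhere; 32 and 8 are assumed here purely for illustration):
// when a block is returned while 32 blocks are already cached, the first POOL_LOW_MARK-1 == 7
// (hottest) blocks are kept, the remaining colder ones are released to the backend, and the
// newly returned block is pushed on top, so the pool ends up holding POOL_LOW_MARK == 8 blocks.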
1675 
1676 bool FreeBlockPool::externalCleanup()
1677 {
1678     Block *helper;
1679     bool released = false;
1680 
1681     for (Block *currBl=head.exchange(NULL); currBl; currBl=helper) {
1682         helper = currBl->next;
1683         // slab blocks in user's pools do not have valid backRefIdx
1684         if (!backend->inUserPool())
1685             removeBackRef(currBl->backRefIdx);
1686         backend->putSlabBlock(currBl);
1687         released = true;
1688     }
1689     return released;
1690 }
1691 
1692 /* Prepare the block for returning to FreeBlockPool */
1693 void Block::reset()
1694 {
1695     // it is the caller's responsibility to ensure no data is lost before calling this
1696     MALLOC_ASSERT( allocatedCount==0, ASSERT_TEXT );
1697     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1698     if (!isStartupAllocObject())
1699         STAT_increment(getThreadId(), getIndex(objectSize), freeBlockBack);
1700 
1701     cleanBlockHeader();
1702 
1703     nextPrivatizable = NULL;
1704 
1705     objectSize = 0;
1706     // for an empty block, bump pointer should point right after the end of the block
1707     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize);
1708 }
1709 
1710 inline void Bin::setActiveBlock (Block *block)
1711 {
1712 //    MALLOC_ASSERT( bin, ASSERT_TEXT );
1713     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1714     // it is the caller's responsibility to keep the bin consistent (i.e. ensure this block is in the bin's list)
1715     activeBlk = block;
1716 }
1717 
1718 inline Block* Bin::setPreviousBlockActive()
1719 {
1720     MALLOC_ASSERT( activeBlk, ASSERT_TEXT );
1721     Block* temp = activeBlk->previous;
1722     if( temp ) {
1723         MALLOC_ASSERT( !(temp->isFull), ASSERT_TEXT );
1724         activeBlk = temp;
1725     }
1726     return temp;
1727 }
1728 
1729 inline bool Block::isOwnedByCurrentThread() const {
1730     return tlsPtr && ownerTid.isCurrentThreadId();
1731 }
1732 
1733 FreeObject *Block::findObjectToFree(const void *object) const
1734 {
1735     FreeObject *objectToFree;
1736     // Due to aligned allocations, a pointer passed to scalable_free
1737     // might differ from the address of internally allocated object.
1738     // Small objects however should always be fine.
1739     if (objectSize <= maxSegregatedObjectSize)
1740         objectToFree = (FreeObject*)object;
1741     // "Fitting size" allocations are suspicious if aligned higher than naturally
1742     else {
1743         if ( ! isAligned(object,2*fittingAlignment) )
1744             // TODO: the above check is questionable - it gives false negatives in ~50% of cases,
1745             //       so it might even be slower on average than unconditional use of findAllocatedObject.
1746             // here it should be a "real" object
1747             objectToFree = (FreeObject*)object;
1748         else
1749             // here object can be an aligned address, so applying additional checks
1750             objectToFree = findAllocatedObject(object);
1751         MALLOC_ASSERT( isAligned(objectToFree,fittingAlignment), ASSERT_TEXT );
1752     }
1753     MALLOC_ASSERT( isProperlyPlaced(objectToFree), ASSERT_TEXT );
1754 
1755     return objectToFree;
1756 }
1757 
1758 void TLSData::release()
1759 {
1760     memPool->extMemPool.allLocalCaches.unregisterThread(this);
1761     externalCleanup(/*cleanOnlyUnused=*/false, /*cleanBins=*/false);
1762 
1763     for (unsigned index = 0; index < numBlockBins; index++) {
1764         Block *activeBlk = bin[index].getActiveBlock();
1765         if (!activeBlk)
1766             continue;
1767         Block *threadlessBlock = activeBlk->previous;
1768         while (threadlessBlock) {
1769             Block *threadBlock = threadlessBlock->previous;
1770             if (threadlessBlock->empty()) {
1771                 /* the thread is being destroyed, so do not use its block pool */
1772                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1773             } else {
1774                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1775             }
1776             threadlessBlock = threadBlock;
1777         }
1778         threadlessBlock = activeBlk;
1779         while (threadlessBlock) {
1780             Block *threadBlock = threadlessBlock->next;
1781             if (threadlessBlock->empty()) {
1782                 /* the thread is being destroyed, so do not use its block pool */
1783                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1784             } else {
1785                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1786             }
1787             threadlessBlock = threadBlock;
1788         }
1789         bin[index].resetActiveBlock();
1790     }
1791 }
1792 
1793 
1794 #if MALLOC_CHECK_RECURSION
1795 // TODO: Use dedicated heap for this
1796 
1797 /*
1798  * It's a special kind of allocation that can be used when malloc is
1799  * not available (either during startup or when malloc was already called and
1800  * we are, say, inside pthread_setspecific's call).
1801  * A block can contain objects of different sizes;
1802  * allocations are performed by moving the bump pointer and incrementing an object counter,
1803  * releasing is done via the counter of objects allocated in the block,
1804  * or by moving the bump pointer back if the released object is at the boundary.
1805  * TODO: make the bump pointer grow backward, in the same direction as all the others.
1806  */
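
/*
 * Illustrative layout of a single startup allocation of `size` bytes (see allocate()/msize()):
 *
 *     [ size_t size ][ user data: size bytes ]
 *                    ^ pointer returned to the caller
 *
 * Each allocation advances the bump pointer upward by sizeof(size_t)+size (aligned), and
 * msize(ptr) simply reads the size_t stored immediately before the returned pointer.
 */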
1807 
1808 class StartupBlock : public Block {
1809     size_t availableSize() const {
1810         return slabSize - ((uintptr_t)bumpPtr - (uintptr_t)this);
1811     }
1812     static StartupBlock *getBlock();
1813 public:
1814     static FreeObject *allocate(size_t size);
1815     static size_t msize(void *ptr) { return *((size_t*)ptr - 1); }
1816     void free(void *ptr);
1817 };
1818 
1819 static MallocMutex startupMallocLock;
1820 static StartupBlock *firstStartupBlock;
1821 
1822 StartupBlock *StartupBlock::getBlock()
1823 {
1824     BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/false);
1825     if (backRefIdx.isInvalid()) return NULL;
1826 
1827     StartupBlock *block = static_cast<StartupBlock*>(
1828         defaultMemPool->extMemPool.backend.getSlabBlock(1));
1829     if (!block) return NULL;
1830 
1831     block->cleanBlockHeader();
1832     setBackRef(backRefIdx, block);
1833     block->backRefIdx = backRefIdx;
1834     // use the startupAllocObjSizeMark marker to denote objects from a startup block
1835     block->objectSize = startupAllocObjSizeMark;
1836     block->bumpPtr = (FreeObject *)((uintptr_t)block + sizeof(StartupBlock));
1837     return block;
1838 }
1839 
1840 FreeObject *StartupBlock::allocate(size_t size)
1841 {
1842     FreeObject *result;
1843     StartupBlock *newBlock = NULL;
1844     bool newBlockUnused = false;
1845 
1846     /* Objects must be aligned on their natural bounds,
1847        and objects bigger than a word on a word boundary. */
1848     size = alignUp(size, sizeof(size_t));
1849     // We need size of an object to implement msize.
1850     size_t reqSize = size + sizeof(size_t);
1851     // speculatively allocate newBlock to try to avoid allocating while holding the lock
1852     /* TODO: The function is called when a nested malloc call is detected,
1853              so simultaneous usage from different threads seems unlikely.
1854              If pre-allocation is found useless, the code might be simplified. */
1855     if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) {
1856         newBlock = StartupBlock::getBlock();
1857         if (!newBlock) return NULL;
1858     }
1859     {
1860         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1861         // Re-check whether we need a new block (conditions might have changed)
1862         if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) {
1863             if (!newBlock) {
1864                 newBlock = StartupBlock::getBlock();
1865                 if (!newBlock) return NULL;
1866             }
1867             newBlock->next = (Block*)firstStartupBlock;
1868             if (firstStartupBlock)
1869                 firstStartupBlock->previous = (Block*)newBlock;
1870             firstStartupBlock = newBlock;
1871         } else
1872             newBlockUnused = true;
1873         result = firstStartupBlock->bumpPtr;
1874         firstStartupBlock->allocatedCount++;
1875         firstStartupBlock->bumpPtr =
1876             (FreeObject *)((uintptr_t)firstStartupBlock->bumpPtr + reqSize);
1877     }
1878     if (newBlock && newBlockUnused)
1879         defaultMemPool->returnEmptyBlock(newBlock, /*poolTheBlock=*/false);
1880 
1881     // keep object size at the negative offset
1882     *((size_t*)result) = size;
1883     return (FreeObject*)((size_t*)result+1);
1884 }
1885 
1886 void StartupBlock::free(void *ptr)
1887 {
1888     Block* blockToRelease = NULL;
1889     {
1890         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1891 
1892         MALLOC_ASSERT(firstStartupBlock, ASSERT_TEXT);
1893         MALLOC_ASSERT(startupAllocObjSizeMark==objectSize
1894                       && allocatedCount>0, ASSERT_TEXT);
1895         MALLOC_ASSERT((uintptr_t)ptr>=(uintptr_t)this+sizeof(StartupBlock)
1896                       && (uintptr_t)ptr+StartupBlock::msize(ptr)<=(uintptr_t)this+slabSize,
1897                       ASSERT_TEXT);
1898         if (0 == --allocatedCount) {
1899             if (this == firstStartupBlock)
1900                 firstStartupBlock = (StartupBlock*)firstStartupBlock->next;
1901             if (previous)
1902                 previous->next = next;
1903             if (next)
1904                 next->previous = previous;
1905             blockToRelease = this;
1906         } else if ((uintptr_t)ptr + StartupBlock::msize(ptr) == (uintptr_t)bumpPtr) {
1907             // last object in the block released
1908             FreeObject *newBump = (FreeObject*)((size_t*)ptr - 1);
1909             MALLOC_ASSERT((uintptr_t)newBump>(uintptr_t)this+sizeof(StartupBlock),
1910                           ASSERT_TEXT);
1911             bumpPtr = newBump;
1912         }
1913     }
1914     if (blockToRelease) {
1915         blockToRelease->previous = blockToRelease->next = NULL;
1916         defaultMemPool->returnEmptyBlock(blockToRelease, /*poolTheBlock=*/false);
1917     }
1918 }
1919 
1920 #endif /* MALLOC_CHECK_RECURSION */
1921 
1922 /********* End thread related code  *************/
1923 
1924 /********* Library initialization *************/
1925 
1926 //! Value indicating the state of initialization.
1927 /* 0 = initialization not started.
1928  * 1 = initialization started but not finished.
1929  * 2 = initialization finished.
1930  * In theory, we only need values 0 and 2. But value 1 is nonetheless
1931  * useful for detecting errors in the double-check pattern.
1932  */
1933 static std::atomic<intptr_t> mallocInitialized{0};   // implicitly initialized to 0
1934 static MallocMutex initMutex;
1935 
1936 /** The leading "\0" is here so that applying "strings" to the binary
1937     delivers a clean result. */
1938 static char VersionString[] = "\0" TBBMALLOC_VERSION_STRINGS;
1939 
1940 #if USE_PTHREAD && (__TBB_SOURCE_DIRECTLY_INCLUDED || __TBB_USE_DLOPEN_REENTRANCY_WORKAROUND)
1941 
1942 /* Decrease race interval between dynamic library unloading and pthread key
1943    destructor. Protect only Pthreads with supported unloading. */
1944 class ShutdownSync {
1945 /* flag is the number of threads in pthread key dtor body
1946    (i.e., between threadDtorStart() and threadDtorDone())
1947    or the signal to skip dtor, if flag < 0 */
1948     std::atomic<intptr_t> flag;
1949     static const intptr_t skipDtor = INTPTR_MIN/2;
1950 public:
1951     void init() { flag.store(0, std::memory_order_release); }
1952 /* Assume that fewer than 2*abs(skipDtor) threads ever call threadDtorStart()
1953    simultaneously, so flag never becomes negative because of that. */
1954     bool threadDtorStart() {
1955         if (flag.load(std::memory_order_acquire) < 0)
1956             return false;
1957         if (++flag <= 0) { // note that the incremented value is returned
1958             flag.fetch_sub(1); // we have spoiled flag, restore it
1959             return false;
1960         }
1961         return true;
1962     }
1963     void threadDtorDone() {
1964         flag.fetch_sub(1);
1965     }
1966     void processExit() {
1967         if (flag.fetch_add(skipDtor) != 0) {
1968             SpinWaitUntilEq(flag, skipDtor);
1969         }
1970     }
1971 };
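
// Intended usage sketch (see doThreadShutdownNotification() below for the real call sites):
//
//   if (shutdownSync.threadDtorStart()) {   // returns false once processExit() has started
//       ... run the per-thread TLS destructor body ...
//       shutdownSync.threadDtorDone();
//   }
//   // at process shutdown:
//   shutdownSync.processExit();             // waits for in-flight dtors, then blocks new ones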
1972 
1973 #else
1974 
1975 class ShutdownSync {
1976 public:
1977     void init() { }
1978     bool threadDtorStart() { return true; }
1979     void threadDtorDone() { }
1980     void processExit() { }
1981 };
1982 
1983 #endif // USE_PTHREAD && (__TBB_SOURCE_DIRECTLY_INCLUDED || __TBB_USE_DLOPEN_REENTRANCY_WORKAROUND)
1984 
1985 static ShutdownSync shutdownSync;
1986 
1987 inline bool isMallocInitialized() {
1988     // Load must have acquire fence; otherwise thread taking "initialized" path
1989     // might perform textually later loads *before* mallocInitialized becomes 2.
1990     return 2 == mallocInitialized.load(std::memory_order_acquire);
1991 }
1992 
1993 bool isMallocInitializedExt() {
1994     return isMallocInitialized();
1995 }
1996 
1997 /* Caller is responsible for ensuring this routine is called exactly once. */
1998 extern "C" void MallocInitializeITT() {
1999 #if __TBB_USE_ITT_NOTIFY
2000     if (!usedBySrcIncluded)
2001         tbb::detail::r1::__TBB_load_ittnotify();
2002 #endif
2003 }
2004 
2005 void MemoryPool::initDefaultPool() {
2006     hugePages.init();
2007 }
2008 
2009 /*
2010  * Allocator initialization routine;
2011  * it is called lazily on the very first scalable_malloc call.
2012  */
2013 static bool initMemoryManager()
2014 {
2015     TRACEF(( "[ScalableMalloc trace] sizeof(Block) is %d (expected 128); sizeof(uintptr_t) is %d\n",
2016              sizeof(Block), sizeof(uintptr_t) ));
2017     MALLOC_ASSERT( 2*blockHeaderAlignment == sizeof(Block), ASSERT_TEXT );
2018     MALLOC_ASSERT( sizeof(FreeObject) == sizeof(void*), ASSERT_TEXT );
2019     MALLOC_ASSERT( isAligned(defaultMemPool, sizeof(intptr_t)),
2020                    "Memory pool must be void*-aligned for atomic to work over aligned arguments.");
2021 
2022 #if USE_WINTHREAD
2023     const size_t granularity = 64*1024; // granularity of VirtualAlloc
2024 #else
2025     // POSIX.1-2001-compliant way to get page size
2026     const size_t granularity = sysconf(_SC_PAGESIZE);
2027 #endif
2028     if (!defaultMemPool) {
2029         // Do not rely on static constructors; do the assignment explicitly in case
2030         // the library's static data section has not been initialized yet at this call.
2031         defaultMemPool = (MemoryPool*)defaultMemPool_space;
2032     }
2033     bool initOk = defaultMemPool->
2034         extMemPool.init(0, NULL, NULL, granularity,
2035                         /*keepAllMemory=*/false, /*fixedPool=*/false);
2036 // TODO: extMemPool.init() to not allocate memory
2037     if (!initOk || !initBackRefMaster(&defaultMemPool->extMemPool.backend) || !ThreadId::init())
2038         return false;
2039     MemoryPool::initDefaultPool();
2040     // init() is required iff initMemoryManager() is called
2041     // after mallocProcessShutdownNotification()
2042     shutdownSync.init();
2043 #if COLLECT_STATISTICS
2044     initStatisticsCollection();
2045 #endif
2046     return true;
2047 }
2048 
2049 static bool GetBoolEnvironmentVariable(const char* name) {
2050     return tbb::detail::r1::GetBoolEnvironmentVariable(name);
2051 }
2052 
2053 //! Ensures that initMemoryManager() is called once and only once.
2054 /** Does not return until initMemoryManager() has been completed by a thread.
2055     There is no need to call this routine if mallocInitialized==2. */
2056 static bool doInitialization()
2057 {
2058     MallocMutex::scoped_lock lock( initMutex );
2059     if (mallocInitialized.load(std::memory_order_relaxed)!=2) {
2060         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==0, ASSERT_TEXT );
2061         mallocInitialized.store(1, std::memory_order_relaxed);
2062         RecursiveMallocCallProtector scoped;
2063         if (!initMemoryManager()) {
2064             mallocInitialized.store(0, std::memory_order_relaxed); // restore the initial state and bail out
2065             return false;
2066         }
2067 #ifdef  MALLOC_EXTRA_INITIALIZATION
2068         MALLOC_EXTRA_INITIALIZATION;
2069 #endif
2070 #if MALLOC_CHECK_RECURSION
2071         RecursiveMallocCallProtector::detectNaiveOverload();
2072 #endif
2073         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==1, ASSERT_TEXT );
2074         // Store must have release fence, otherwise mallocInitialized==2
2075         // might become remotely visible before side effects of
2076         // initMemoryManager() become remotely visible.
2077         mallocInitialized.store(2, std::memory_order_release);
2078         if( GetBoolEnvironmentVariable("TBB_VERSION") ) {
2079             fputs(VersionString+1,stderr);
2080             hugePages.printStatus();
2081         }
2082     }
2083     /* It can't be 0, or we would have initialized it above */
2084     MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==2, ASSERT_TEXT );
2085     return true;
2086 }
2087 
2088 /********* End library initialization *************/
2089 
2090 /********* The malloc show begins     *************/
2091 
2092 
2093 FreeObject *Block::allocateFromFreeList()
2094 {
2095     FreeObject *result;
2096 
2097     if (!freeList) return NULL;
2098 
2099     result = freeList;
2100     MALLOC_ASSERT( result, ASSERT_TEXT );
2101 
2102     freeList = result->next;
2103     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2104     allocatedCount++;
2105     STAT_increment(getThreadId(), getIndex(objectSize), allocFreeListUsed);
2106 
2107     return result;
2108 }
2109 
2110 FreeObject *Block::allocateFromBumpPtr()
2111 {
2112     FreeObject *result = bumpPtr;
2113     if (result) {
2114         bumpPtr = (FreeObject *) ((uintptr_t) bumpPtr - objectSize);
2115         if ( (uintptr_t)bumpPtr < (uintptr_t)this+sizeof(Block) ) {
2116             bumpPtr = NULL;
2117         }
2118         MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2119         allocatedCount++;
2120         STAT_increment(getThreadId(), getIndex(objectSize), allocBumpPtrUsed);
2121     }
2122     return result;
2123 }
2124 
2125 inline FreeObject* Block::allocate()
2126 {
2127     MALLOC_ASSERT( isOwnedByCurrentThread(), ASSERT_TEXT );
2128 
2129     /* for better cache locality, first look in the free list. */
2130     if ( FreeObject *result = allocateFromFreeList() ) {
2131         return result;
2132     }
2133     MALLOC_ASSERT( !freeList, ASSERT_TEXT );
2134 
2135     /* if free list is empty, try thread local bump pointer allocation. */
2136     if ( FreeObject *result = allocateFromBumpPtr() ) {
2137         return result;
2138     }
2139     MALLOC_ASSERT( !bumpPtr, ASSERT_TEXT );
2140 
2141     /* the block is considered full. */
2142     isFull = true;
2143     return NULL;
2144 }
2145 
2146 size_t Block::findObjectSize(void *object) const
2147 {
2148     size_t blSize = getSize();
2149 #if MALLOC_CHECK_RECURSION
2150     // Currently, there are no aligned allocations from startup blocks,
2151     // so we can return just StartupBlock::msize().
2152     // TODO: This must be extended if we add aligned allocation from startup blocks.
2153     if (!blSize)
2154         return StartupBlock::msize(object);
2155 #endif
2156     // the object can be aligned, so its real size can be less than the block's object size
2157     size_t size =
2158         blSize - ((uintptr_t)object - (uintptr_t)findObjectToFree(object));
2159     MALLOC_ASSERT(size>0 && size<minLargeObjectSize, ASSERT_TEXT);
2160     return size;
2161 }
2162 
2163 void Bin::moveBlockToFront(Block *block)
2164 {
2165     /* move the block to the front of the bin */
2166     if (block == activeBlk) return;
2167     outofTLSBin(block);
2168     pushTLSBin(block);
2169 }
2170 
2171 void Bin::processEmptyBlock(Block *block, bool poolTheBlock)
2172 {
2173     if (block != activeBlk) {
2174         /* We are not using this block; return it to the pool */
2175         outofTLSBin(block);
2176         block->getMemPool()->returnEmptyBlock(block, poolTheBlock);
2177     } else {
2178         /* all objects are free - let's restore the bump pointer */
2179         block->restoreBumpPtr();
2180     }
2181 }
2182 
2183 template<int LOW_MARK, int HIGH_MARK>
2184 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool)
2185 {
2186     const size_t size = object->unalignedSize;
2187     // do not spoil the cache with a too-large object, as that can cause a complete cache cleanup
2188     if (size > MAX_TOTAL_SIZE)
2189         return false;
2190     LargeMemoryBlock *localHead = head.exchange(NULL);
2191 
2192     object->prev = NULL;
2193     object->next = localHead;
2194     if (localHead)
2195         localHead->prev = object;
2196     else {
2197         // these might not have been reset when the local cache was stolen, so correct them
2198         totalSize = 0;
2199         numOfBlocks = 0;
2200         tail = object;
2201     }
2202     localHead = object;
2203     totalSize += size;
2204     numOfBlocks++;
2205     // must meet both the size and the number-of-cached-objects constraints
2206     if (totalSize > MAX_TOTAL_SIZE || numOfBlocks >= HIGH_MARK) {
2207         // scan from the tail until the constraints are met
2208         while (totalSize > MAX_TOTAL_SIZE || numOfBlocks > LOW_MARK) {
2209             totalSize -= tail->unalignedSize;
2210             numOfBlocks--;
2211             tail = tail->prev;
2212         }
2213         LargeMemoryBlock *headToRelease = tail->next;
2214         tail->next = NULL;
2215 
2216         extMemPool->freeLargeObjectList(headToRelease);
2217     }
2218 
2219     head.store(localHead, std::memory_order_release);
2220     return true;
2221 }
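
// Worked example for the eviction above (LOW_MARK and HIGH_MARK are the template arguments
// of LocalLOCImpl; the numbers here are illustrative only): with LOW_MARK==4 and HIGH_MARK==8,
// an insertion that brings numOfBlocks to 8 triggers the scan from the tail, which drops
// blocks until no more than LOW_MARK==4 remain and totalSize fits under MAX_TOTAL_SIZE;
// the dropped tail sublist is then returned to the pool via freeLargeObjectList().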
2222 
2223 template<int LOW_MARK, int HIGH_MARK>
2224 LargeMemoryBlock *LocalLOCImpl<LOW_MARK, HIGH_MARK>::get(size_t size)
2225 {
2226     LargeMemoryBlock *localHead, *res = NULL;
2227 
2228     if (size > MAX_TOTAL_SIZE)
2229         return NULL;
2230 
2231     // TBB_REVAMP_TODO: review this line
2232     if (!head.load(std::memory_order_acquire) || (localHead = head.exchange(NULL)) == NULL) {
2233         // do not restore totalSize, numOfBlocks and tail at this point,
2234         // as they are used only in put(), where they must be restored
2235         return NULL;
2236     }
2237 
2238     for (LargeMemoryBlock *curr = localHead; curr; curr=curr->next) {
2239         if (curr->unalignedSize == size) {
2240             res = curr;
2241             if (curr->next)
2242                 curr->next->prev = curr->prev;
2243             else
2244                 tail = curr->prev;
2245             if (curr != localHead)
2246                 curr->prev->next = curr->next;
2247             else
2248                 localHead = curr->next;
2249             totalSize -= size;
2250             numOfBlocks--;
2251             break;
2252         }
2253     }
2254 
2255     head.store(localHead, std::memory_order_release);
2256     return res;
2257 }
2258 
2259 template<int LOW_MARK, int HIGH_MARK>
2260 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::externalCleanup(ExtMemoryPool *extMemPool)
2261 {
2262     if (LargeMemoryBlock *localHead = head.exchange(NULL)) {
2263         extMemPool->freeLargeObjectList(localHead);
2264         return true;
2265     }
2266     return false;
2267 }
2268 
2269 void *MemoryPool::getFromLLOCache(TLSData* tls, size_t size, size_t alignment)
2270 {
2271     LargeMemoryBlock *lmb = NULL;
2272 
2273     size_t headersSize = sizeof(LargeMemoryBlock)+sizeof(LargeObjectHdr);
2274     size_t allocationSize = LargeObjectCache::alignToBin(size+headersSize+alignment);
2275     if (allocationSize < size) // allocationSize is wrapped around after alignToBin
2276         return NULL;
2277     MALLOC_ASSERT(allocationSize >= alignment, "Overflow must be checked before.");
2278 
2279     if (tls) {
2280         tls->markUsed();
2281         lmb = tls->lloc.get(allocationSize);
2282     }
2283     if (!lmb)
2284         lmb = extMemPool.mallocLargeObject(this, allocationSize);
2285 
2286     if (lmb) {
2287         // when doing the shuffle we assume that the alignment offset guarantees
2288         // that different cache lines are used
2289         MALLOC_ASSERT(alignment >= estimatedCacheLineSize, ASSERT_TEXT);
2290 
2291         void *alignedArea = (void*)alignUp((uintptr_t)lmb+headersSize, alignment);
2292         uintptr_t alignedRight =
2293             alignDown((uintptr_t)lmb+lmb->unalignedSize - size, alignment);
2294         // Is there room to shuffle the object between cache lines?
2295         // Note that alignedRight and alignedArea are aligned at alignment.
2296         unsigned ptrDelta = alignedRight - (uintptr_t)alignedArea;
2297         if (ptrDelta && tls) { // !tls is cold path
2298             // for the hot path of alignment==estimatedCacheLineSize,
2299             // allow compilers to use shift for division
2300             // (since estimatedCacheLineSize is a power-of-2 constant)
2301             unsigned numOfPossibleOffsets = alignment == estimatedCacheLineSize?
2302                   ptrDelta / estimatedCacheLineSize :
2303                   ptrDelta / alignment;
2304             unsigned myCacheIdx = ++tls->currCacheIdx;
2305             unsigned offset = myCacheIdx % numOfPossibleOffsets;
2306 
2307             // Move object to a cache line with an offset that is different from
2308             // previous allocation. This supposedly allows us to use cache
2309             // associativity more efficiently.
2310             alignedArea = (void*)((uintptr_t)alignedArea + offset*alignment);
2311         }
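        // Illustrative numbers: with alignment == estimatedCacheLineSize (e.g. 64) and
        // ptrDelta == 256 there are 4 possible offsets, so successive allocations cycle
        // through byte offsets 0, 64, 128 and 192 via currCacheIdx % 4.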
2312         MALLOC_ASSERT((uintptr_t)lmb+lmb->unalignedSize >=
2313                       (uintptr_t)alignedArea+size, "Object doesn't fit the block.");
2314         LargeObjectHdr *header = (LargeObjectHdr*)alignedArea-1;
2315         header->memoryBlock = lmb;
2316         header->backRefIdx = lmb->backRefIdx;
2317         setBackRef(header->backRefIdx, header);
2318 
2319         lmb->objectSize = size;
2320 
2321         MALLOC_ASSERT( isLargeObject<unknownMem>(alignedArea), ASSERT_TEXT );
2322         MALLOC_ASSERT( isAligned(alignedArea, alignment), ASSERT_TEXT );
2323 
2324         return alignedArea;
2325     }
2326     return NULL;
2327 }
2328 
2329 void MemoryPool::putToLLOCache(TLSData *tls, void *object)
2330 {
2331     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2332     // overwrite backRefIdx to simplify double free detection
2333     header->backRefIdx = BackRefIdx();
2334 
2335     if (tls) {
2336         tls->markUsed();
2337         if (tls->lloc.put(header->memoryBlock, &extMemPool))
2338             return;
2339     }
2340     extMemPool.freeLargeObject(header->memoryBlock);
2341 }
2342 
2343 /*
2344  * All aligned allocations fall into one of the following categories:
2345  *  1. if both the requested size and alignment are <= maxSegregatedObjectSize,
2346  *       we just align the size up and request this amount, because for every size
2347  *       aligned to some power of 2, the allocated object is at least that aligned.
2348  *  2. for size<minLargeObjectSize, check if the already guaranteed fittingAlignment is enough.
2349  *  3. if size+alignment<minLargeObjectSize, we take an object of fittingSizeN and align
2350  *       its address up; given such a pointer, scalable_free can find the real object.
2351  *       Wrapping of size+alignment is impossible because the maximal allowed
2352  *       alignment plus minLargeObjectSize can't lead to wrapping.
2353  *  4. otherwise, an aligned large object is allocated.
2354  */
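/*
 * For orientation, a rough mapping of requests to the categories above (the concrete
 * thresholds are constants defined elsewhere):
 *   - small size and small alignment                     -> category 1: plain bin allocation of alignUp(size, alignment)
 *   - small size, alignment <= fittingAlignment          -> category 2: an ordinary allocation is already aligned enough
 *   - small size, larger alignment, size+alignment small -> category 3: allocate size+alignment and return an aligned
 *                                                           address inside that object
 *   - everything else                                    -> category 4: aligned large-object path (getFromLLOCache)
 */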
2355 static void *allocateAligned(MemoryPool *memPool, size_t size, size_t alignment)
2356 {
2357     MALLOC_ASSERT( isPowerOfTwo(alignment), ASSERT_TEXT );
2358 
2359     if (!isMallocInitialized())
2360         if (!doInitialization())
2361             return NULL;
2362 
2363     void *result;
2364     if (size<=maxSegregatedObjectSize && alignment<=maxSegregatedObjectSize)
2365         result = internalPoolMalloc(memPool, alignUp(size? size: sizeof(size_t), alignment));
2366     else if (size<minLargeObjectSize) {
2367         if (alignment<=fittingAlignment)
2368             result = internalPoolMalloc(memPool, size);
2369         else if (size+alignment < minLargeObjectSize) {
2370             void *unaligned = internalPoolMalloc(memPool, size+alignment);
2371             if (!unaligned) return NULL;
2372             result = alignUp(unaligned, alignment);
2373         } else
2374             goto LargeObjAlloc;
2375     } else {
2376     LargeObjAlloc:
2377         TLSData *tls = memPool->getTLS(/*create=*/true);
2378         // take into account only alignments that are higher than the natural one
2379         result =
2380             memPool->getFromLLOCache(tls, size, largeObjectAlignment>alignment?
2381                                                largeObjectAlignment: alignment);
2382     }
2383 
2384     MALLOC_ASSERT( isAligned(result, alignment), ASSERT_TEXT );
2385     return result;
2386 }
2387 
2388 static void *reallocAligned(MemoryPool *memPool, void *ptr,
2389                             size_t newSize, size_t alignment = 0)
2390 {
2391     void *result;
2392     size_t copySize;
2393 
2394     if (isLargeObject<ourMem>(ptr)) {
2395         LargeMemoryBlock* lmb = ((LargeObjectHdr *)ptr - 1)->memoryBlock;
2396         copySize = lmb->unalignedSize-((uintptr_t)ptr-(uintptr_t)lmb);
2397 
2398         // Apply different strategies if size decreases
2399         if (newSize <= copySize && (0 == alignment || isAligned(ptr, alignment))) {
2400 
2401             // For huge objects (that do not fit in backend cache), keep the same space unless
2402             // the new size is at most half of the old one
2403             bool isMemoryBlockHuge = copySize > memPool->extMemPool.backend.getMaxBinnedSize();
2404             size_t threshold = isMemoryBlockHuge ? copySize / 2 : 0;
2405             if (newSize > threshold) {
2406                 lmb->objectSize = newSize;
2407                 return ptr;
2408             }
2409             // TODO: For large objects suitable for the backend cache,
2410             // split out the excessive part and put it to the backend.
2411         }
2412         // Reallocate for real
2413         copySize = lmb->objectSize;
2414 #if BACKEND_HAS_MREMAP
2415         if (void *r = memPool->extMemPool.remap(ptr, copySize, newSize,
2416                           alignment < largeObjectAlignment ? largeObjectAlignment : alignment))
2417             return r;
2418 #endif
2419         result = alignment ? allocateAligned(memPool, newSize, alignment) :
2420             internalPoolMalloc(memPool, newSize);
2421 
2422     } else {
2423         Block* block = (Block *)alignDown(ptr, slabSize);
2424         copySize = block->findObjectSize(ptr);
2425 
2426         // TODO: Move object to another bin if size decreases and the current bin is "empty enough".
2427         // Currently, if the size decreases, the old pointer is returned
2428         if (newSize <= copySize && (0==alignment || isAligned(ptr, alignment))) {
2429             return ptr;
2430         } else {
2431             result = alignment ? allocateAligned(memPool, newSize, alignment) :
2432                 internalPoolMalloc(memPool, newSize);
2433         }
2434     }
2435     if (result) {
2436         memcpy(result, ptr, copySize < newSize ? copySize : newSize);
2437         internalPoolFree(memPool, ptr, 0);
2438     }
2439     return result;
2440 }
2441 
2442 /* A predicate checks if an object is properly placed inside its block */
2443 inline bool Block::isProperlyPlaced(const void *object) const
2444 {
2445     return 0 == ((uintptr_t)this + slabSize - (uintptr_t)object) % objectSize;
2446 }
2447 
2448 /* Finds the real object inside the block */
2449 FreeObject *Block::findAllocatedObject(const void *address) const
2450 {
2451     // calculate offset from the end of the block space
2452     uint16_t offset = (uintptr_t)this + slabSize - (uintptr_t)address;
2453     MALLOC_ASSERT( offset<=slabSize-sizeof(Block), ASSERT_TEXT );
2454     // find offset difference from a multiple of allocation size
2455     offset %= objectSize;
2456     // and move the address down to where the real object starts.
2457     return (FreeObject*)((uintptr_t)address - (offset? objectSize-offset: 0));
2458 }
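
/*
 * Worked example with illustrative numbers: for objectSize == 80 and an aligned address that
 * is 100 bytes before the end of the slab, offset % objectSize == 20, so the real object
 * starts objectSize - offset == 60 bytes below the passed address, i.e. at the slot boundary
 * 160 bytes from the end of the slab.
 */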
2459 
2460 /*
2461  * A bad dereference caused by a foreign pointer is possible only here, not earlier in the call chain.
2462  * A separate function isolates the SEH code, as it has a bad influence on compiler optimization.
2463  */
2464 static inline BackRefIdx safer_dereference (const BackRefIdx *ptr)
2465 {
2466     BackRefIdx id;
2467 #if _MSC_VER
2468     __try {
2469 #endif
2470         id = *ptr;
2471 #if _MSC_VER
2472     } __except( GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION?
2473                 EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH ) {
2474         id = BackRefIdx();
2475     }
2476 #endif
2477     return id;
2478 }
2479 
2480 template<MemoryOrigin memOrigin>
2481 bool isLargeObject(void *object)
2482 {
2483     if (!isAligned(object, largeObjectAlignment))
2484         return false;
2485     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2486     BackRefIdx idx = (memOrigin == unknownMem) ?
2487         safer_dereference(&header->backRefIdx) : header->backRefIdx;
2488 
2489     return idx.isLargeObject()
2490         // in a valid LargeObjectHdr, memoryBlock is not NULL
2491         && header->memoryBlock
2492         // in a valid LargeObjectHdr, memoryBlock points somewhere before the header
2493         // TODO: more strict check
2494         && (uintptr_t)header->memoryBlock < (uintptr_t)header
2495         && getBackRef(idx) == header;
2496 }
2497 
2498 static inline bool isSmallObject (void *ptr)
2499 {
2500     Block* expectedBlock = (Block*)alignDown(ptr, slabSize);
2501     const BackRefIdx* idx = expectedBlock->getBackRefIdx();
2502 
2503     bool isSmall = expectedBlock == getBackRef(safer_dereference(idx));
2504     if (isSmall)
2505         expectedBlock->checkFreePrecond(ptr);
2506     return isSmall;
2507 }
2508 
2509 /**** Check if an object was allocated by scalable_malloc ****/
2510 static inline bool isRecognized (void* ptr)
2511 {
2512     return defaultMemPool->extMemPool.backend.ptrCanBeValid(ptr) &&
2513         (isLargeObject<unknownMem>(ptr) || isSmallObject(ptr));
2514 }
2515 
2516 static inline void freeSmallObject(void *object)
2517 {
2518     /* mask low bits to get the block */
2519     Block *block = (Block *)alignDown(object, slabSize);
2520     block->checkFreePrecond(object);
2521 
2522 #if MALLOC_CHECK_RECURSION
2523     if (block->isStartupAllocObject()) {
2524         ((StartupBlock *)block)->free(object);
2525         return;
2526     }
2527 #endif
2528     if (block->isOwnedByCurrentThread()) {
2529         block->freeOwnObject(object);
2530     } else { /* Slower path: add to the shared list; allocatedCount is updated by the owner thread in malloc. */
2531         FreeObject *objectToFree = block->findObjectToFree(object);
2532         block->freePublicObject(objectToFree);
2533     }
2534 }
2535 
2536 static void *internalPoolMalloc(MemoryPool* memPool, size_t size)
2537 {
2538     Bin* bin;
2539     Block * mallocBlock;
2540 
2541     if (!memPool) return NULL;
2542 
2543     if (!size) size = sizeof(size_t);
2544 
2545     TLSData *tls = memPool->getTLS(/*create=*/true);
2546 
2547     /* Allocate a large object */
2548     if (size >= minLargeObjectSize)
2549         return memPool->getFromLLOCache(tls, size, largeObjectAlignment);
2550 
2551     if (!tls) return NULL;
2552 
2553     tls->markUsed();
2554     /*
2555      * Get the element in the thread-local array corresponding to the given size;
2556      * it keeps a pointer to the active block for allocations of this size
2557      */
2558     bin = tls->getAllocationBin(size);
2559     if ( !bin ) return NULL;
2560 
2561     /* Get a block to try to allocate in. */
2562     for( mallocBlock = bin->getActiveBlock(); mallocBlock;
2563          mallocBlock = bin->setPreviousBlockActive() ) // the previous block should be empty enough
2564     {
2565         if( FreeObject *result = mallocBlock->allocate() )
2566             return result;
2567     }
2568 
2569     /*
2570      * else privatize publicly freed objects in some block and allocate from it
2571      */
2572     mallocBlock = bin->getPrivatizedFreeListBlock();
2573     if (mallocBlock) {
2574         MALLOC_ASSERT( mallocBlock->freeListNonNull(), ASSERT_TEXT );
2575         if ( FreeObject *result = mallocBlock->allocateFromFreeList() )
2576             return result;
2577         /* Else something strange happened, need to retry from the beginning; */
2578         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in public free list; reentering.\n" ));
2579         return internalPoolMalloc(memPool, size);
2580     }
2581 
2582     /*
2583      * no suitable own blocks, try to get a partial block that some other thread has discarded.
2584      */
2585     mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2586     while (mallocBlock) {
2587         bin->pushTLSBin(mallocBlock);
2588         bin->setActiveBlock(mallocBlock); // TODO: move under the below condition?
2589         if( FreeObject *result = mallocBlock->allocate() )
2590             return result;
2591         mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2592     }
2593 
2594     /*
2595      * else try to get a new empty block
2596      */
2597     mallocBlock = memPool->getEmptyBlock(size);
2598     if (mallocBlock) {
2599         bin->pushTLSBin(mallocBlock);
2600         bin->setActiveBlock(mallocBlock);
2601         if( FreeObject *result = mallocBlock->allocate() )
2602             return result;
2603         /* Else something strange happened, need to retry from the beginning; */
2604         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in empty block; reentering.\n" ));
2605         return internalPoolMalloc(memPool, size);
2606     }
2607     /*
2608      * else nothing works so return NULL
2609      */
2610     TRACEF(( "[ScalableMalloc trace] No memory found, returning NULL.\n" ));
2611     return NULL;
2612 }
2613 
2614 // When size==0 (i.e. unknown), detect here whether the object is large.
2615 // When the size is known and < minLargeObjectSize, we still need to check
2616 // if the actual object is large, because large objects might be used
2617 // for aligned small allocations.
2618 static bool internalPoolFree(MemoryPool *memPool, void *object, size_t size)
2619 {
2620     if (!memPool || !object) return false;
2621 
2622     // The library is initialized on an allocation call, so a release while
2623     // not initialized means a foreign object is being released.
2624     MALLOC_ASSERT(isMallocInitialized(), ASSERT_TEXT);
2625     MALLOC_ASSERT(memPool->extMemPool.userPool() || isRecognized(object),
2626                   "Invalid pointer during object releasing is detected.");
2627 
2628     if (size >= minLargeObjectSize || isLargeObject<ourMem>(object))
2629         memPool->putToLLOCache(memPool->getTLS(/*create=*/false), object);
2630     else
2631         freeSmallObject(object);
2632     return true;
2633 }
2634 
2635 static void *internalMalloc(size_t size)
2636 {
2637     if (!size) size = sizeof(size_t);
2638 
2639 #if MALLOC_CHECK_RECURSION
2640     if (RecursiveMallocCallProtector::sameThreadActive())
2641         return size<minLargeObjectSize? StartupBlock::allocate(size) :
2642             // nested allocation, so skip tls
2643             (FreeObject*)defaultMemPool->getFromLLOCache(NULL, size, slabSize);
2644 #endif
2645 
2646     if (!isMallocInitialized())
2647         if (!doInitialization())
2648             return NULL;
2649     return internalPoolMalloc(defaultMemPool, size);
2650 }
2651 
2652 static void internalFree(void *object)
2653 {
2654     internalPoolFree(defaultMemPool, object, 0);
2655 }
2656 
2657 static size_t internalMsize(void* ptr)
2658 {
2659     MALLOC_ASSERT(ptr, "Invalid pointer passed to internalMsize");
2660     if (isLargeObject<ourMem>(ptr)) {
2661         // TODO: return the maximum memory size, that can be written to this object
2662         LargeMemoryBlock* lmb = ((LargeObjectHdr*)ptr - 1)->memoryBlock;
2663         return lmb->objectSize;
2664     } else {
2665         Block *block = (Block*)alignDown(ptr, slabSize);
2666         return block->findObjectSize(ptr);
2667     }
2668 }
2669 
2670 } // namespace internal
2671 
2672 using namespace rml::internal;
2673 
2674 // legacy entry point saved for compatibility with binaries compiled
2675 // with pre-6003 versions of TBB
2676 rml::MemoryPool *pool_create(intptr_t pool_id, const MemPoolPolicy *policy)
2677 {
2678     rml::MemoryPool *pool;
2679     MemPoolPolicy pol(policy->pAlloc, policy->pFree, policy->granularity);
2680 
2681     pool_create_v1(pool_id, &pol, &pool);
2682     return pool;
2683 }
2684 
2685 rml::MemPoolError pool_create_v1(intptr_t pool_id, const MemPoolPolicy *policy,
2686                                  rml::MemoryPool **pool)
2687 {
2688     if ( !policy->pAlloc || policy->version<MemPoolPolicy::TBBMALLOC_POOL_VERSION
2689          // empty pFree allowed only for fixed pools
2690          || !(policy->fixedPool || policy->pFree)) {
2691         *pool = NULL;
2692         return INVALID_POLICY;
2693     }
2694     if ( policy->version>MemPoolPolicy::TBBMALLOC_POOL_VERSION // future versions are not supported
2695          // new flags can be added in place of reserved, but default
2696          // behaviour must be supported by this version
2697          || policy->reserved ) {
2698         *pool = NULL;
2699         return UNSUPPORTED_POLICY;
2700     }
2701     if (!isMallocInitialized())
2702         if (!doInitialization()) {
2703             *pool = NULL;
2704             return NO_MEMORY;
2705         }
2706     rml::internal::MemoryPool *memPool =
2707         (rml::internal::MemoryPool*)internalMalloc((sizeof(rml::internal::MemoryPool)));
2708     if (!memPool) {
2709         *pool = NULL;
2710         return NO_MEMORY;
2711     }
2712     memset(memPool, 0, sizeof(rml::internal::MemoryPool));
2713     if (!memPool->init(pool_id, policy)) {
2714         internalFree(memPool);
2715         *pool = NULL;
2716         return NO_MEMORY;
2717     }
2718 
2719     *pool = (rml::MemoryPool*)memPool;
2720     return POOL_OK;
2721 }
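
// Hedged usage sketch of the pool API defined in this file. The exact MemPoolPolicy layout
// and raw-callback typedefs live in the public headers (see scalable_allocator.h), so treat
// the signatures below as an approximation:
//
//   static void *rawAlloc(intptr_t /*pool_id*/, size_t &bytes) { return malloc(bytes); }
//   static int   rawFree (intptr_t /*pool_id*/, void *ptr, size_t /*bytes*/) { free(ptr); return 0; }
//
//   MemPoolPolicy policy(rawAlloc, rawFree);
//   rml::MemoryPool *pool;
//   if (pool_create_v1(/*pool_id=*/0, &policy, &pool) == POOL_OK) {
//       void *p = pool_malloc(pool, 100);
//       pool_free(pool, p);
//       pool_destroy(pool);
//   }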
2722 
2723 bool pool_destroy(rml::MemoryPool* memPool)
2724 {
2725     if (!memPool) return false;
2726     bool ret = ((rml::internal::MemoryPool*)memPool)->destroy();
2727     internalFree(memPool);
2728 
2729     return ret;
2730 }
2731 
2732 bool pool_reset(rml::MemoryPool* memPool)
2733 {
2734     if (!memPool) return false;
2735 
2736     return ((rml::internal::MemoryPool*)memPool)->reset();
2737 }
2738 
2739 void *pool_malloc(rml::MemoryPool* mPool, size_t size)
2740 {
2741     return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2742 }
2743 
2744 void *pool_realloc(rml::MemoryPool* mPool, void *object, size_t size)
2745 {
2746     if (!object)
2747         return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2748     if (!size) {
2749         internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2750         return NULL;
2751     }
2752     return reallocAligned((rml::internal::MemoryPool*)mPool, object, size, 0);
2753 }
2754 
2755 void *pool_aligned_malloc(rml::MemoryPool* mPool, size_t size, size_t alignment)
2756 {
2757     if (!isPowerOfTwo(alignment) || 0==size)
2758         return NULL;
2759 
2760     return allocateAligned((rml::internal::MemoryPool*)mPool, size, alignment);
2761 }
2762 
2763 void *pool_aligned_realloc(rml::MemoryPool* memPool, void *ptr, size_t size, size_t alignment)
2764 {
2765     if (!isPowerOfTwo(alignment))
2766         return NULL;
2767     rml::internal::MemoryPool *mPool = (rml::internal::MemoryPool*)memPool;
2768     void *tmp;
2769 
2770     if (!ptr)
2771         tmp = allocateAligned(mPool, size, alignment);
2772     else if (!size) {
2773         internalPoolFree(mPool, ptr, 0);
2774         return NULL;
2775     } else
2776         tmp = reallocAligned(mPool, ptr, size, alignment);
2777 
2778     return tmp;
2779 }
2780 
2781 bool pool_free(rml::MemoryPool *mPool, void *object)
2782 {
2783     return internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2784 }
2785 
2786 rml::MemoryPool *pool_identify(void *object)
2787 {
2788     rml::internal::MemoryPool *pool;
2789     if (isLargeObject<ourMem>(object)) {
2790         LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2791         pool = header->memoryBlock->pool;
2792     } else {
2793         Block *block = (Block*)alignDown(object, slabSize);
2794         pool = block->getMemPool();
2795     }
2796     // do not return defaultMemPool, as it can't be used in pool_free() etc
2797     __TBB_ASSERT_RELEASE(pool!=defaultMemPool,
2798         "rml::pool_identify() can't be used for scalable_malloc() etc results.");
2799     return (rml::MemoryPool*)pool;
2800 }
2801 
2802 size_t pool_msize(rml::MemoryPool *mPool, void* object)
2803 {
2804     if (object) {
2805         // No assert for object recognition, because objects allocated from a non-default
2806         // memory pool do not participate in range checking and do not have valid backreferences for
2807         // small objects. Instead, check that the object belongs to the specified memory pool.
2808         MALLOC_ASSERT_EX(mPool == pool_identify(object), "Object does not belong to the specified pool");
2809         return internalMsize(object);
2810     }
2811     errno = EINVAL;
2812     // Unlike _msize, return 0 in case of parameter error.
2813     // Returning size_t(-1) looks more like a way to get into trouble.
2814     return 0;
2815 }
2816 
2817 } // namespace rml
2818 
2819 using namespace rml::internal;
2820 
2821 #if MALLOC_TRACE
2822 static unsigned int threadGoingDownCount = 0;
2823 #endif
2824 
2825 /*
2826  * When a thread is shutting down this routine should be called to remove all the thread ids
2827  * from the malloc blocks and replace them with a NULL thread id.
2828  *
2829  * For pthreads, the function is set as a callback in pthread_key_create for TLS bin.
2830  * It will be automatically called at thread exit with the key value as the argument,
2831  * unless that value is NULL.
2832  * For Windows, it is called from DllMain( DLL_THREAD_DETACH ).
2833  *
2834  * However, neither of the above is called for the main process thread, so the routine
2835  * also needs to be called during the process shutdown.
2836  *
2837 */
2838 // TODO: Consider making this function part of class MemoryPool.
2839 void doThreadShutdownNotification(TLSData* tls, bool main_thread)
2840 {
2841     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return start %d\n",
2842              getThreadId(),  threadGoingDownCount++ ));
2843 
2844 #if USE_PTHREAD
2845     if (tls) {
2846         if (!shutdownSync.threadDtorStart()) return;
2847         tls->getMemPool()->onThreadShutdown(tls);
2848         shutdownSync.threadDtorDone();
2849     } else
2850 #endif
2851     {
2852         suppress_unused_warning(tls); // not used on Windows
2853         // The default pool is safe to use at this point:
2854         //   on Linux, only the main thread can go here before destroying defaultMemPool;
2855         //   on Windows, shutdown is synchronized via loader lock and isMallocInitialized().
2856         // See also __TBB_mallocProcessShutdownNotification()
2857         defaultMemPool->onThreadShutdown(defaultMemPool->getTLS(/*create=*/false));
2858         // Take the lock to walk through the other pools, but waiting might be dangerous at this point
2859         // (e.g. on Windows the main thread might deadlock)
2860         bool locked;
2861         MallocMutex::scoped_lock lock(MemoryPool::memPoolListLock, /*wait=*/!main_thread, &locked);
2862         if (locked) { // the list is safe to process
2863             for (MemoryPool *memPool = defaultMemPool->next; memPool; memPool = memPool->next)
2864                 memPool->onThreadShutdown(memPool->getTLS(/*create=*/false));
2865         }
2866     }
2867 
2868     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return end\n", getThreadId() ));
2869 }
2870 
2871 #if USE_PTHREAD
2872 void mallocThreadShutdownNotification(void* arg)
2873 {
2874     // The routine is called for each pool (as TLS dtor) on each thread, except for the main thread
2875     if (!isMallocInitialized()) return;
2876     doThreadShutdownNotification((TLSData*)arg, false);
2877 }
2878 #else
2879 extern "C" void __TBB_mallocThreadShutdownNotification()
2880 {
2881     // The routine is called once per thread on Windows
2882     if (!isMallocInitialized()) return;
2883     doThreadShutdownNotification(NULL, false);
2884 }
2885 #endif
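/* Sketch of the pthread registration mechanism described in the comment above
   doThreadShutdownNotification (illustrative; the actual key creation lives in the TLS
   support code of this library, and the key and helper names below are hypothetical):

       static pthread_key_t TLS_pointer_key;

       static bool initTlsKey()
       {
           // The destructor runs at thread exit for every thread that stored a non-NULL
           // value under the key; that stored value is passed as the argument.
           return pthread_key_create(&TLS_pointer_key, mallocThreadShutdownNotification) == 0;
       }
*/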
2886 
2887 extern "C" void __TBB_mallocProcessShutdownNotification(bool windows_process_dying)
2888 {
2889     if (!isMallocInitialized()) return;
2890 
2891     // Don't clean allocator internals if the entire process is exiting
2892     if (!windows_process_dying) {
2893         doThreadShutdownNotification(NULL, /*main_thread=*/true);
2894     }
2895 #if  __TBB_MALLOC_LOCACHE_STAT
2896     printf("cache hit ratio %f, size hit %f\n",
2897            1.*cacheHits/mallocCalls, 1.*memHitKB/memAllocKB);
2898     defaultMemPool->extMemPool.loc.reportStat(stdout);
2899 #endif
2900 
2901     shutdownSync.processExit();
2902 #if __TBB_SOURCE_DIRECTLY_INCLUDED
2903 /* Pthread keys must be deleted as soon as possible, so that the key dtor is not called
2904    on thread termination after the tbbmalloc code has already been unloaded.
2905 */
2906     defaultMemPool->destroy();
2907     destroyBackRefMaster(&defaultMemPool->extMemPool.backend);
2908     ThreadId::destroy();      // Delete key for thread id
2909     hugePages.reset();
2910     // a complete new malloc initialization is possible after this point
2911     mallocInitialized.store(0, std::memory_order_release);
2912 #elif __TBB_USE_DLOPEN_REENTRANCY_WORKAROUND
2913 /* In most cases we prevent unloading tbbmalloc and don't clean up memory
2914    on process shutdown. When unloading cannot be prevented, the library unload
2915    results in a shutdown notification, and it makes sense to release unused memory
2916    at that point (we can't release all memory because it may still be accessed
2917    after this point).
2918    TODO: better support systems where unloading can't be prevented, by removing
2919    pthread destructors and releasing caches.
2920  */
2921     defaultMemPool->extMemPool.hardCachesCleanup();
2922 #endif // __TBB_SOURCE_DIRECTLY_INCLUDED
2923 
2924 #if COLLECT_STATISTICS
2925     unsigned nThreads = ThreadId::getMaxThreadId();
2926     for( unsigned i=1; i<=nThreads && i<MAX_THREADS; ++i )
2927         STAT_print(i);
2928 #endif
2929     if (!usedBySrcIncluded)
2930         MALLOC_ITT_FINI_ITTLIB();
2931 }
2932 
2933 extern "C" void * scalable_malloc(size_t size)
2934 {
2935     void *ptr = internalMalloc(size);
2936     if (!ptr) errno = ENOMEM;
2937     return ptr;
2938 }
2939 
2940 extern "C" void scalable_free(void *object)
2941 {
2942     internalFree(object);
2943 }
2944 
2945 #if MALLOC_ZONE_OVERLOAD_ENABLED
2946 extern "C" void __TBB_malloc_free_definite_size(void *object, size_t size)
2947 {
2948     internalPoolFree(defaultMemPool, object, size);
2949 }
2950 #endif
2951 
2952 /*
2953  * A variant that provides additional memory safety by checking whether the given address
2954  * was obtained from this allocator and, if not, redirecting to the provided alternative call.
2955  */
2956 extern "C" void __TBB_malloc_safer_free(void *object, void (*original_free)(void*))
2957 {
2958     if (!object)
2959         return;
2960 
2961     // tbbmalloc could have allocated the object only if tbbmalloc has been initialized
2962     if (mallocInitialized.load(std::memory_order_acquire) && defaultMemPool->extMemPool.backend.ptrCanBeValid(object)) {
2963         if (isLargeObject<unknownMem>(object)) {
2964             // must check for a large object first, because the small-object check touches 4 pages
2965             // to the left, which can be inaccessible
2966             TLSData *tls = defaultMemPool->getTLS(/*create=*/false);
2967 
2968             defaultMemPool->putToLLOCache(tls, object);
2969             return;
2970         } else if (isSmallObject(object)) {
2971             freeSmallObject(object);
2972             return;
2973         }
2974     }
2975     if (original_free)
2976         original_free(object);
2977 }
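/* Usage sketch (illustrative; orig_free and replaced_free are hypothetical names): a malloc
   replacement layer can install a free() proxy that returns pointers not owned by tbbmalloc
   to the original allocator.

       static void (*orig_free)(void*);        // e.g. resolved via dlsym(RTLD_NEXT, "free")

       static void replaced_free(void *ptr)
       {
           __TBB_malloc_safer_free(ptr, orig_free);
       }
*/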
2978 
2979 /********* End the free code        *************/
2980 
2981 /********* Code for scalable_realloc       ***********/
2982 
2983 /*
2984  * From K&R
2985  * "realloc changes the size of the object pointed to by p to size. The contents will
2986  * be unchanged up to the minimum of the old and the new sizes. If the new size is larger,
2987  * the new space is uninitialized. realloc returns a pointer to the new space, or
2988  * NULL if the request cannot be satisfied, in which case *p is unchanged."
2989  *
2990  */
2991 extern "C" void* scalable_realloc(void* ptr, size_t size)
2992 {
2993     void *tmp;
2994 
2995     if (!ptr)
2996         tmp = internalMalloc(size);
2997     else if (!size) {
2998         internalFree(ptr);
2999         return NULL;
3000     } else
3001         tmp = reallocAligned(defaultMemPool, ptr, size, 0);
3002 
3003     if (!tmp) errno = ENOMEM;
3004     return tmp;
3005 }
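/* Usage sketch (illustrative): growing a buffer with the semantics quoted above; on failure
   the original block is left untouched, so the caller must still free it.

       void *buf = scalable_malloc(64);
       void *grown = scalable_realloc(buf, 256);  // the first 64 bytes are preserved
       if (grown)
           buf = grown;
       scalable_free(buf);
*/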
3006 
3007 /*
3008  * A variant that provides additional memory safety by checking whether the given address
3009  * was obtained from this allocator and, if not, redirecting to the provided alternative call.
3010  */
3011 extern "C" void* __TBB_malloc_safer_realloc(void* ptr, size_t sz, void* original_realloc)
3012 {
3013     void *tmp = NULL; // initialized to avoid "may be used uninitialized" warnings
3014 
3015     if (!ptr) {
3016         tmp = internalMalloc(sz);
3017     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3018         if (!sz) {
3019             internalFree(ptr);
3020             return NULL;
3021         } else {
3022             tmp = reallocAligned(defaultMemPool, ptr, sz, 0);
3023         }
3024     }
3025 #if USE_WINTHREAD
3026     else if (original_realloc && sz) {
3027         orig_ptrs *original_ptrs = static_cast<orig_ptrs*>(original_realloc);
3028         if ( original_ptrs->msize ){
3029             size_t oldSize = original_ptrs->msize(ptr);
3030             tmp = internalMalloc(sz);
3031             if (tmp) {
3032                 memcpy(tmp, ptr, sz<oldSize? sz : oldSize);
3033                 if ( original_ptrs->free ){
3034                     original_ptrs->free( ptr );
3035                 }
3036             }
3037         } else
3038             tmp = NULL;
3039     }
3040 #else
3041     else if (original_realloc) {
3042         typedef void* (*realloc_ptr_t)(void*,size_t);
3043         realloc_ptr_t original_realloc_ptr;
3044         (void *&)original_realloc_ptr = original_realloc;
3045         tmp = original_realloc_ptr(ptr,sz);
3046     }
3047 #endif
3048     else tmp = NULL;
3049 
3050     if (!tmp) errno = ENOMEM;
3051     return tmp;
3052 }
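/* Usage sketch (illustrative, non-Windows path; orig_realloc and replaced_realloc are
   hypothetical names): a realloc() proxy that falls back to the original implementation
   for pointers tbbmalloc does not recognize.

       static void *(*orig_realloc)(void*, size_t);   // e.g. resolved via dlsym(RTLD_NEXT, "realloc")

       static void *replaced_realloc(void *ptr, size_t sz)
       {
           return __TBB_malloc_safer_realloc(ptr, sz, (void*)orig_realloc);
       }
*/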
3053 
3054 /********* End code for scalable_realloc   ***********/
3055 
3056 /********* Code for scalable_calloc   ***********/
3057 
3058 /*
3059  * From K&R
3060  * calloc returns a pointer to space for an array of nobj objects,
3061  * each of size size, or NULL if the request cannot be satisfied.
3062  * The space is initialized to zero bytes.
3063  *
3064  */
3065 
3066 extern "C" void * scalable_calloc(size_t nobj, size_t size)
3067 {
3068     // square root of (maximal size_t value + 1): if both factors are below it, the product can't overflow
3069     const size_t mult_not_overflow = size_t(1) << (sizeof(size_t)*CHAR_BIT/2);
3070     const size_t arraySize = nobj * size;
3071 
3072     // check for overflow during multiplication:
3073     if (nobj>=mult_not_overflow || size>=mult_not_overflow) // 1) heuristic check
3074         if (nobj && arraySize / nobj != size) {             // 2) exact check
3075             errno = ENOMEM;
3076             return NULL;
3077         }
3078     void* result = internalMalloc(arraySize);
3079     if (result)
3080         memset(result, 0, arraySize);
3081     else
3082         errno = ENOMEM;
3083     return result;
3084 }
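/* Worked example of the overflow check above, assuming a 64-bit size_t (mult_not_overflow == 2^32):
   - nobj == 2^32, size == 3: the heuristic check fires, but arraySize/nobj == 3 == size,
     so there is no overflow and the allocation proceeds;
   - nobj == 2^32, size == 2^32 + 1: arraySize wraps around to 2^32 and arraySize/nobj == 1 != size,
     so NULL is returned and errno is set to ENOMEM. */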
3085 
3086 /********* End code for scalable_calloc   ***********/
3087 
3088 /********* Code for aligned allocation API **********/
3089 
3090 extern "C" int scalable_posix_memalign(void **memptr, size_t alignment, size_t size)
3091 {
3092     if ( !isPowerOfTwoAtLeast(alignment, sizeof(void*)) )
3093         return EINVAL;
3094     void *result = allocateAligned(defaultMemPool, size, alignment);
3095     if (!result)
3096         return ENOMEM;
3097     *memptr = result;
3098     return 0;
3099 }
3100 
3101 extern "C" void * scalable_aligned_malloc(size_t size, size_t alignment)
3102 {
3103     if (!isPowerOfTwo(alignment) || 0==size) {
3104         errno = EINVAL;
3105         return NULL;
3106     }
3107     void *tmp = allocateAligned(defaultMemPool, size, alignment);
3108     if (!tmp) errno = ENOMEM;
3109     return tmp;
3110 }
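/* Usage sketch (illustrative): requesting a cache-line-aligned block through either entry point.

       void *p = scalable_aligned_malloc(4096, 64);       // NULL and errno set on failure
       scalable_aligned_free(p);                          // freeing NULL is a no-op

       void *q = NULL;
       int err = scalable_posix_memalign(&q, 64, 4096);   // returns 0, EINVAL, or ENOMEM
       if (!err)
           scalable_free(q);
*/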
3111 
3112 extern "C" void * scalable_aligned_realloc(void *ptr, size_t size, size_t alignment)
3113 {
3114     if (!isPowerOfTwo(alignment)) {
3115         errno = EINVAL;
3116         return NULL;
3117     }
3118     void *tmp;
3119 
3120     if (!ptr)
3121         tmp = allocateAligned(defaultMemPool, size, alignment);
3122     else if (!size) {
3123         scalable_free(ptr);
3124         return NULL;
3125     } else
3126         tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3127 
3128     if (!tmp) errno = ENOMEM;
3129     return tmp;
3130 }
3131 
3132 extern "C" void * __TBB_malloc_safer_aligned_realloc(void *ptr, size_t size, size_t alignment, void* orig_function)
3133 {
3134     /* corner cases left out of reallocAligned to not deal with errno there */
3135     if (!isPowerOfTwo(alignment)) {
3136         errno = EINVAL;
3137         return NULL;
3138     }
3139     void *tmp = NULL;
3140 
3141     if (!ptr) {
3142         tmp = allocateAligned(defaultMemPool, size, alignment);
3143     } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
3144         if (!size) {
3145             internalFree(ptr);
3146             return NULL;
3147         } else {
3148             tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3149         }
3150     }
3151 #if USE_WINTHREAD
3152     else {
3153         orig_aligned_ptrs *original_ptrs = static_cast<orig_aligned_ptrs*>(orig_function);
3154         if (size) {
3155             // Without aligned_msize we can't do anything with this;
3156             // just keep the old pointer.
3157             if ( original_ptrs->aligned_msize ){
3158                 // use a minimal alignment and zero offset to get a hopefully correct oldSize
3159                 size_t oldSize = original_ptrs->aligned_msize(ptr, sizeof(void*), 0);
3160                 tmp = allocateAligned(defaultMemPool, size, alignment);
3161                 if (tmp) {
3162                     memcpy(tmp, ptr, size<oldSize? size : oldSize);
3163                     if ( original_ptrs->aligned_free ){
3164                         original_ptrs->aligned_free( ptr );
3165                     }
3166                 }
3167             }
3168         } else {
3169             if ( original_ptrs->aligned_free ){
3170                 original_ptrs->aligned_free( ptr );
3171             }
3172             return NULL;
3173         }
3174     }
3175 #else
3176     // As the original realloc can't align the result, and there is no way to find
3177     // the size of the object being reallocated, we give up.
3178     suppress_unused_warning(orig_function);
3179 #endif
3180     if (!tmp) errno = ENOMEM;
3181     return tmp;
3182 }
3183 
3184 extern "C" void scalable_aligned_free(void *ptr)
3185 {
3186     internalFree(ptr);
3187 }
3188 
3189 /********* end code for aligned allocation API **********/
3190 
3191 /********* Code for scalable_msize       ***********/
3192 
3193 /*
3194  * Returns the size of a memory block allocated in the heap.
3195  */
3196 extern "C" size_t scalable_msize(void* ptr)
3197 {
3198     if (ptr) {
3199         MALLOC_ASSERT(isRecognized(ptr), "Invalid pointer in scalable_msize detected.");
3200         return internalMsize(ptr);
3201     }
3202     errno = EINVAL;
3203     // Unlike _msize, return 0 in case of a parameter error.
3204     // Returning size_t(-1) would be more likely to cause trouble.
3205     return 0;
3206 }
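/* Usage sketch (illustrative): the reported size is the usable size of the block, which may
   be larger than the requested size because of size-class rounding.

       void *p = scalable_malloc(100);
       size_t usable = scalable_msize(p);   // >= 100 for a successful allocation
       (void)usable;
       scalable_free(p);
*/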
3207 
3208 /*
3209  * A variant that provides additional memory safety by checking whether the given address
3210  * was obtained from this allocator and, if not, redirecting to the provided alternative call.
3211  */
3212 extern "C" size_t __TBB_malloc_safer_msize(void *object, size_t (*original_msize)(void*))
3213 {
3214     if (object) {
3215         // Check if the memory was allocated by scalable_malloc
3216         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3217             return internalMsize(object);
3218         else if (original_msize)
3219             return original_msize(object);
3220     }
3221     // object is NULL or unknown, or foreign and no original_msize
3222 #if USE_WINTHREAD
3223     errno = EINVAL; // errno expected to be set only on this platform
3224 #endif
3225     return 0;
3226 }
3227 
3228 /*
3229  * The same as above but for _aligned_msize case
3230  */
3231 extern "C" size_t __TBB_malloc_safer_aligned_msize(void *object, size_t alignment, size_t offset, size_t (*orig_aligned_msize)(void*,size_t,size_t))
3232 {
3233     if (object) {
3234         // Check if the memory was allocated by scalable_malloc
3235         if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
3236             return internalMsize(object);
3237         else if (orig_aligned_msize)
3238             return orig_aligned_msize(object,alignment,offset);
3239     }
3240     // object is NULL or unknown
3241     errno = EINVAL;
3242     return 0;
3243 }
3244 
3245 /********* End code for scalable_msize   ***********/
3246 
3247 extern "C" int scalable_allocation_mode(int param, intptr_t value)
3248 {
3249     if (param == TBBMALLOC_SET_SOFT_HEAP_LIMIT) {
3250         defaultMemPool->extMemPool.backend.setRecommendedMaxSize((size_t)value);
3251         return TBBMALLOC_OK;
3252     } else if (param == USE_HUGE_PAGES) {
3253 #if __linux__
3254         switch (value) {
3255         case 0:
3256         case 1:
3257             hugePages.setMode(value);
3258             return TBBMALLOC_OK;
3259         default:
3260             return TBBMALLOC_INVALID_PARAM;
3261         }
3262 #else
3263         return TBBMALLOC_NO_EFFECT;
3264 #endif
3265 #if __TBB_SOURCE_DIRECTLY_INCLUDED
3266     } else if (param == TBBMALLOC_INTERNAL_SOURCE_INCLUDED) {
3267         switch (value) {
3268         case 0: // used by dynamic library
3269         case 1: // used by static library or directly included sources
3270             usedBySrcIncluded = value;
3271             return TBBMALLOC_OK;
3272         default:
3273             return TBBMALLOC_INVALID_PARAM;
3274         }
3275 #endif
3276     } else if (param == TBBMALLOC_SET_HUGE_SIZE_THRESHOLD) {
3277         defaultMemPool->extMemPool.loc.setHugeSizeThreshold((size_t)value);
3278         return TBBMALLOC_OK;
3279     }
3280     return TBBMALLOC_INVALID_PARAM;
3281 }
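/* Usage sketch (illustrative): tuning the allocator at startup.

       scalable_allocation_mode(TBBMALLOC_SET_SOFT_HEAP_LIMIT, 512*1024*1024); // soft limit, in bytes
       scalable_allocation_mode(USE_HUGE_PAGES, 1);   // returns TBBMALLOC_NO_EFFECT on non-Linux builds
*/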
3282 
3283 extern "C" int scalable_allocation_command(int cmd, void *param)
3284 {
3285     if (param)
3286         return TBBMALLOC_INVALID_PARAM;
3287 
3288     bool released = false;
3289     switch(cmd) {
3290     case TBBMALLOC_CLEAN_THREAD_BUFFERS:
3291         if (TLSData *tls = defaultMemPool->getTLS(/*create=*/false))
3292             released = tls->externalCleanup(/*cleanOnlyUnused*/false, /*cleanBins=*/true);
3293         break;
3294     case TBBMALLOC_CLEAN_ALL_BUFFERS:
3295         released = defaultMemPool->extMemPool.hardCachesCleanup();
3296         break;
3297     default:
3298         return TBBMALLOC_INVALID_PARAM;
3299     }
3300     return released ? TBBMALLOC_OK : TBBMALLOC_NO_EFFECT;
3301 }
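/* Usage sketch (illustrative): returning cached memory to the OS at an idle point of the program.

       int res = scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, NULL);
       // res is TBBMALLOC_OK if any memory was released, TBBMALLOC_NO_EFFECT otherwise
*/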
3302