1 /*
2  * kmp_alloc.cpp -- private/shared dynamic memory allocation and management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_io.h"
18 #include "kmp_wrapper_malloc.h"
19 
20 // Disable bget when it is not used
21 #if KMP_USE_BGET
22 
23 /* Thread private buffer management code */
24 
25 typedef int (*bget_compact_t)(size_t, int);
26 typedef void *(*bget_acquire_t)(size_t);
27 typedef void (*bget_release_t)(void *);
28 
29 /* NOTE: bufsize must be a signed datatype */
30 
31 #if KMP_OS_WINDOWS
32 #if KMP_ARCH_X86 || KMP_ARCH_ARM
33 typedef kmp_int32 bufsize;
34 #else
35 typedef kmp_int64 bufsize;
36 #endif
37 #else
38 typedef ssize_t bufsize;
39 #endif
40 
41 /* The three modes of operation are, fifo search, lifo search, and best-fit */
42 
43 typedef enum bget_mode {
44   bget_mode_fifo = 0,
45   bget_mode_lifo = 1,
46   bget_mode_best = 2
47 } bget_mode_t;
48 
49 static void bpool(kmp_info_t *th, void *buffer, bufsize len);
50 static void *bget(kmp_info_t *th, bufsize size);
51 static void *bgetz(kmp_info_t *th, bufsize size);
52 static void *bgetr(kmp_info_t *th, void *buffer, bufsize newsize);
53 static void brel(kmp_info_t *th, void *buf);
54 static void bectl(kmp_info_t *th, bget_compact_t compact,
55                   bget_acquire_t acquire, bget_release_t release,
56                   bufsize pool_incr);
57 
58 #ifdef KMP_DEBUG
59 static void bstats(kmp_info_t *th, bufsize *curalloc, bufsize *totfree,
60                    bufsize *maxfree, long *nget, long *nrel);
61 static void bstatse(kmp_info_t *th, bufsize *pool_incr, long *npool,
62                     long *npget, long *nprel, long *ndget, long *ndrel);
63 static void bufdump(kmp_info_t *th, void *buf);
64 static void bpoold(kmp_info_t *th, void *pool, int dumpalloc, int dumpfree);
65 static int bpoolv(kmp_info_t *th, void *pool);
66 #endif
67 
68 /* BGET CONFIGURATION */
69 /* Buffer allocation size quantum: all buffers allocated are a
70    multiple of this size.  This MUST be a power of two. */
71 
72 /* On IA-32 architecture with  Linux* OS, malloc() does not
73    ensure 16 byte alignmnent */
74 
75 #if KMP_ARCH_X86 || !KMP_HAVE_QUAD
76 
77 #define SizeQuant 8
78 #define AlignType double
79 
80 #else
81 
82 #define SizeQuant 16
83 #define AlignType _Quad
84 
85 #endif
86 
87 // Define this symbol to enable the bstats() function which calculates the
88 // total free space in the buffer pool, the largest available buffer, and the
89 // total space currently allocated.
90 #define BufStats 1
91 
92 #ifdef KMP_DEBUG
93 
94 // Define this symbol to enable the bpoold() function which dumps the buffers
95 // in a buffer pool.
96 #define BufDump 1
97 
98 // Define this symbol to enable the bpoolv() function for validating a buffer
99 // pool.
100 #define BufValid 1
101 
102 // Define this symbol to enable the bufdump() function which allows dumping the
103 // contents of an allocated or free buffer.
104 #define DumpData 1
105 
106 #ifdef NOT_USED_NOW
107 
108 // Wipe free buffers to a guaranteed pattern of garbage to trip up miscreants
109 // who attempt to use pointers into released buffers.
110 #define FreeWipe 1
111 
112 // Use a best fit algorithm when searching for space for an allocation request.
113 // This uses memory more efficiently, but allocation will be much slower.
114 #define BestFit 1
115 
116 #endif /* NOT_USED_NOW */
117 #endif /* KMP_DEBUG */
118 
119 static bufsize bget_bin_size[] = {
120     0,
121     //    1 << 6,    /* .5 Cache line */
122     1 << 7, /* 1 Cache line, new */
123     1 << 8, /* 2 Cache lines */
124     1 << 9, /* 4 Cache lines, new */
125     1 << 10, /* 8 Cache lines */
126     1 << 11, /* 16 Cache lines, new */
127     1 << 12, 1 << 13, /* new */
128     1 << 14, 1 << 15, /* new */
129     1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, /*  1MB */
130     1 << 21, /*  2MB */
131     1 << 22, /*  4MB */
132     1 << 23, /*  8MB */
133     1 << 24, /* 16MB */
134     1 << 25, /* 32MB */
135 };
136 
137 #define MAX_BGET_BINS (int)(sizeof(bget_bin_size) / sizeof(bufsize))
138 
139 struct bfhead;
140 
141 //  Declare the interface, including the requested buffer size type, bufsize.
142 
143 /* Queue links */
144 typedef struct qlinks {
145   struct bfhead *flink; /* Forward link */
146   struct bfhead *blink; /* Backward link */
147 } qlinks_t;
148 
149 /* Header in allocated and free buffers */
150 typedef struct bhead2 {
151   kmp_info_t *bthr; /* The thread which owns the buffer pool */
152   bufsize prevfree; /* Relative link back to previous free buffer in memory or
153                        0 if previous buffer is allocated.  */
154   bufsize bsize; /* Buffer size: positive if free, negative if allocated. */
155 } bhead2_t;
156 
157 /* Make sure the bhead structure is a multiple of SizeQuant in size. */
158 typedef union bhead {
159   KMP_ALIGN(SizeQuant)
160   AlignType b_align;
161   char b_pad[sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant))];
162   bhead2_t bb;
163 } bhead_t;
164 #define BH(p) ((bhead_t *)(p))
165 
166 /*  Header in directly allocated buffers (by acqfcn) */
167 typedef struct bdhead {
168   bufsize tsize; /* Total size, including overhead */
169   bhead_t bh; /* Common header */
170 } bdhead_t;
171 #define BDH(p) ((bdhead_t *)(p))
172 
173 /* Header in free buffers */
174 typedef struct bfhead {
175   bhead_t bh; /* Common allocated/free header */
176   qlinks_t ql; /* Links on free list */
177 } bfhead_t;
178 #define BFH(p) ((bfhead_t *)(p))
179 
180 typedef struct thr_data {
181   bfhead_t freelist[MAX_BGET_BINS];
182 #if BufStats
183   size_t totalloc; /* Total space currently allocated */
184   long numget, numrel; /* Number of bget() and brel() calls */
185   long numpblk; /* Number of pool blocks */
186   long numpget, numprel; /* Number of block gets and rels */
187   long numdget, numdrel; /* Number of direct gets and rels */
188 #endif /* BufStats */
189 
190   /* Automatic expansion block management functions */
191   bget_compact_t compfcn;
192   bget_acquire_t acqfcn;
193   bget_release_t relfcn;
194 
195   bget_mode_t mode; /* what allocation mode to use? */
196 
197   bufsize exp_incr; /* Expansion block size */
198   bufsize pool_len; /* 0: no bpool calls have been made
199                        -1: not all pool blocks are the same size
200                        >0: (common) block size for all bpool calls made so far
201                     */
202   bfhead_t *last_pool; /* Last pool owned by this thread (delay dealocation) */
203 } thr_data_t;
204 
205 /*  Minimum allocation quantum: */
206 #define QLSize (sizeof(qlinks_t))
207 #define SizeQ ((SizeQuant > QLSize) ? SizeQuant : QLSize)
208 #define MaxSize                                                                \
209   (bufsize)(                                                                   \
210       ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1)))
211 // Maximun for the requested size.
212 
213 /* End sentinel: value placed in bsize field of dummy block delimiting
214    end of pool block.  The most negative number which will  fit  in  a
215    bufsize, defined in a way that the compiler will accept. */
216 
217 #define ESent                                                                  \
218   ((bufsize)(-(((((bufsize)1) << ((int)sizeof(bufsize) * 8 - 2)) - 1) * 2) - 2))
219 
220 /* Thread Data management routines */
221 static int bget_get_bin(bufsize size) {
222   // binary chop bins
223   int lo = 0, hi = MAX_BGET_BINS - 1;
224 
225   KMP_DEBUG_ASSERT(size > 0);
226 
227   while ((hi - lo) > 1) {
228     int mid = (lo + hi) >> 1;
229     if (size < bget_bin_size[mid])
230       hi = mid - 1;
231     else
232       lo = mid;
233   }
234 
235   KMP_DEBUG_ASSERT((lo >= 0) && (lo < MAX_BGET_BINS));
236 
237   return lo;
238 }
239 
240 static void set_thr_data(kmp_info_t *th) {
241   int i;
242   thr_data_t *data;
243 
244   data = (thr_data_t *)((!th->th.th_local.bget_data)
245                             ? __kmp_allocate(sizeof(*data))
246                             : th->th.th_local.bget_data);
247 
248   memset(data, '\0', sizeof(*data));
249 
250   for (i = 0; i < MAX_BGET_BINS; ++i) {
251     data->freelist[i].ql.flink = &data->freelist[i];
252     data->freelist[i].ql.blink = &data->freelist[i];
253   }
254 
255   th->th.th_local.bget_data = data;
256   th->th.th_local.bget_list = 0;
257 #if !USE_CMP_XCHG_FOR_BGET
258 #ifdef USE_QUEUING_LOCK_FOR_BGET
259   __kmp_init_lock(&th->th.th_local.bget_lock);
260 #else
261   __kmp_init_bootstrap_lock(&th->th.th_local.bget_lock);
262 #endif /* USE_LOCK_FOR_BGET */
263 #endif /* ! USE_CMP_XCHG_FOR_BGET */
264 }
265 
266 static thr_data_t *get_thr_data(kmp_info_t *th) {
267   thr_data_t *data;
268 
269   data = (thr_data_t *)th->th.th_local.bget_data;
270 
271   KMP_DEBUG_ASSERT(data != 0);
272 
273   return data;
274 }
275 
276 #ifdef KMP_DEBUG
277 
278 static void __kmp_bget_validate_queue(kmp_info_t *th) {
279   /* NOTE: assume that the global_lock is held */
280 
281   void *p = (void *)th->th.th_local.bget_list;
282 
283   while (p != 0) {
284     bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));
285 
286     KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
287     p = (void *)b->ql.flink;
288   }
289 }
290 
291 #endif
292 
293 /* Walk the free list and release the enqueued buffers */
294 static void __kmp_bget_dequeue(kmp_info_t *th) {
295   void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);
296 
297   if (p != 0) {
298 #if USE_CMP_XCHG_FOR_BGET
299     {
300       volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
301       while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
302                                         CCAST(void *, old_value), nullptr)) {
303         KMP_CPU_PAUSE();
304         old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
305       }
306       p = CCAST(void *, old_value);
307     }
308 #else /* ! USE_CMP_XCHG_FOR_BGET */
309 #ifdef USE_QUEUING_LOCK_FOR_BGET
310     __kmp_acquire_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
311 #else
312     __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
313 #endif /* USE_QUEUING_LOCK_FOR_BGET */
314 
315     p = (void *)th->th.th_local.bget_list;
316     th->th.th_local.bget_list = 0;
317 
318 #ifdef USE_QUEUING_LOCK_FOR_BGET
319     __kmp_release_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
320 #else
321     __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
322 #endif
323 #endif /* USE_CMP_XCHG_FOR_BGET */
324 
325     /* Check again to make sure the list is not empty */
326     while (p != 0) {
327       void *buf = p;
328       bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));
329 
330       KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
331       KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
332                        (kmp_uintptr_t)th); // clear possible mark
333       KMP_DEBUG_ASSERT(b->ql.blink == 0);
334 
335       p = (void *)b->ql.flink;
336 
337       brel(th, buf);
338     }
339   }
340 }
341 
342 /* Chain together the free buffers by using the thread owner field */
343 static void __kmp_bget_enqueue(kmp_info_t *th, void *buf
344 #ifdef USE_QUEUING_LOCK_FOR_BGET
345                                ,
346                                kmp_int32 rel_gtid
347 #endif
348                                ) {
349   bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t));
350 
351   KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
352   KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
353                    (kmp_uintptr_t)th); // clear possible mark
354 
355   b->ql.blink = 0;
356 
357   KC_TRACE(10, ("__kmp_bget_enqueue: moving buffer to T#%d list\n",
358                 __kmp_gtid_from_thread(th)));
359 
360 #if USE_CMP_XCHG_FOR_BGET
361   {
362     volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);
363     /* the next pointer must be set before setting bget_list to buf to avoid
364        exposing a broken list to other threads, even for an instant. */
365     b->ql.flink = BFH(CCAST(void *, old_value));
366 
367     while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
368                                       CCAST(void *, old_value), buf)) {
369       KMP_CPU_PAUSE();
370       old_value = TCR_PTR(th->th.th_local.bget_list);
371       /* the next pointer must be set before setting bget_list to buf to avoid
372          exposing a broken list to other threads, even for an instant. */
373       b->ql.flink = BFH(CCAST(void *, old_value));
374     }
375   }
376 #else /* ! USE_CMP_XCHG_FOR_BGET */
377 #ifdef USE_QUEUING_LOCK_FOR_BGET
378   __kmp_acquire_lock(&th->th.th_local.bget_lock, rel_gtid);
379 #else
380   __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
381 #endif
382 
383   b->ql.flink = BFH(th->th.th_local.bget_list);
384   th->th.th_local.bget_list = (void *)buf;
385 
386 #ifdef USE_QUEUING_LOCK_FOR_BGET
387   __kmp_release_lock(&th->th.th_local.bget_lock, rel_gtid);
388 #else
389   __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
390 #endif
391 #endif /* USE_CMP_XCHG_FOR_BGET */
392 }
393 
394 /* insert buffer back onto a new freelist */
395 static void __kmp_bget_insert_into_freelist(thr_data_t *thr, bfhead_t *b) {
396   int bin;
397 
398   KMP_DEBUG_ASSERT(((size_t)b) % SizeQuant == 0);
399   KMP_DEBUG_ASSERT(b->bh.bb.bsize % SizeQuant == 0);
400 
401   bin = bget_get_bin(b->bh.bb.bsize);
402 
403   KMP_DEBUG_ASSERT(thr->freelist[bin].ql.blink->ql.flink ==
404                    &thr->freelist[bin]);
405   KMP_DEBUG_ASSERT(thr->freelist[bin].ql.flink->ql.blink ==
406                    &thr->freelist[bin]);
407 
408   b->ql.flink = &thr->freelist[bin];
409   b->ql.blink = thr->freelist[bin].ql.blink;
410 
411   thr->freelist[bin].ql.blink = b;
412   b->ql.blink->ql.flink = b;
413 }
414 
415 /* unlink the buffer from the old freelist */
416 static void __kmp_bget_remove_from_freelist(bfhead_t *b) {
417   KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
418   KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
419 
420   b->ql.blink->ql.flink = b->ql.flink;
421   b->ql.flink->ql.blink = b->ql.blink;
422 }
423 
424 /*  GET STATS -- check info on free list */
425 static void bcheck(kmp_info_t *th, bufsize *max_free, bufsize *total_free) {
426   thr_data_t *thr = get_thr_data(th);
427   int bin;
428 
429   *total_free = *max_free = 0;
430 
431   for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
432     bfhead_t *b, *best;
433 
434     best = &thr->freelist[bin];
435     b = best->ql.flink;
436 
437     while (b != &thr->freelist[bin]) {
438       *total_free += (b->bh.bb.bsize - sizeof(bhead_t));
439       if ((best == &thr->freelist[bin]) || (b->bh.bb.bsize < best->bh.bb.bsize))
440         best = b;
441 
442       /* Link to next buffer */
443       b = b->ql.flink;
444     }
445 
446     if (*max_free < best->bh.bb.bsize)
447       *max_free = best->bh.bb.bsize;
448   }
449 
450   if (*max_free > (bufsize)sizeof(bhead_t))
451     *max_free -= sizeof(bhead_t);
452 }
453 
454 /*  BGET  --  Allocate a buffer.  */
455 static void *bget(kmp_info_t *th, bufsize requested_size) {
456   thr_data_t *thr = get_thr_data(th);
457   bufsize size = requested_size;
458   bfhead_t *b;
459   void *buf;
460   int compactseq = 0;
461   int use_blink = 0;
462   /* For BestFit */
463   bfhead_t *best;
464 
465   if (size < 0 || size + sizeof(bhead_t) > MaxSize) {
466     return NULL;
467   }; // if
468 
469   __kmp_bget_dequeue(th); /* Release any queued buffers */
470 
471   if (size < (bufsize)SizeQ) { // Need at least room for the queue links.
472     size = SizeQ;
473   }
474 #if defined(SizeQuant) && (SizeQuant > 1)
475   size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
476 #endif
477 
478   size += sizeof(bhead_t); // Add overhead in allocated buffer to size required.
479   KMP_DEBUG_ASSERT(size >= 0);
480   KMP_DEBUG_ASSERT(size % SizeQuant == 0);
481 
482   use_blink = (thr->mode == bget_mode_lifo);
483 
484   /* If a compact function was provided in the call to bectl(), wrap
485      a loop around the allocation process  to  allow  compaction  to
486      intervene in case we don't find a suitable buffer in the chain. */
487 
488   for (;;) {
489     int bin;
490 
491     for (bin = bget_get_bin(size); bin < MAX_BGET_BINS; ++bin) {
492       /* Link to next buffer */
493       b = (use_blink ? thr->freelist[bin].ql.blink
494                      : thr->freelist[bin].ql.flink);
495 
496       if (thr->mode == bget_mode_best) {
497         best = &thr->freelist[bin];
498 
499         /* Scan the free list searching for the first buffer big enough
500            to hold the requested size buffer. */
501         while (b != &thr->freelist[bin]) {
502           if (b->bh.bb.bsize >= (bufsize)size) {
503             if ((best == &thr->freelist[bin]) ||
504                 (b->bh.bb.bsize < best->bh.bb.bsize)) {
505               best = b;
506             }
507           }
508 
509           /* Link to next buffer */
510           b = (use_blink ? b->ql.blink : b->ql.flink);
511         }
512         b = best;
513       }
514 
515       while (b != &thr->freelist[bin]) {
516         if ((bufsize)b->bh.bb.bsize >= (bufsize)size) {
517 
518           // Buffer is big enough to satisfy the request. Allocate it to the
519           // caller. We must decide whether the buffer is large enough to split
520           // into the part given to the caller and a free buffer that remains
521           // on the free list, or whether the entire buffer should be removed
522           // from the free list and given to the caller in its entirety. We
523           // only split the buffer if enough room remains for a header plus the
524           // minimum quantum of allocation.
525           if ((b->bh.bb.bsize - (bufsize)size) >
526               (bufsize)(SizeQ + (sizeof(bhead_t)))) {
527             bhead_t *ba, *bn;
528 
529             ba = BH(((char *)b) + (b->bh.bb.bsize - (bufsize)size));
530             bn = BH(((char *)ba) + size);
531 
532             KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);
533 
534             /* Subtract size from length of free block. */
535             b->bh.bb.bsize -= (bufsize)size;
536 
537             /* Link allocated buffer to the previous free buffer. */
538             ba->bb.prevfree = b->bh.bb.bsize;
539 
540             /* Plug negative size into user buffer. */
541             ba->bb.bsize = -size;
542 
543             /* Mark this buffer as owned by this thread. */
544             TCW_PTR(ba->bb.bthr,
545                     th); // not an allocated address (do not mark it)
546             /* Mark buffer after this one not preceded by free block. */
547             bn->bb.prevfree = 0;
548 
549             // unlink buffer from old freelist, and reinsert into new freelist
550             __kmp_bget_remove_from_freelist(b);
551             __kmp_bget_insert_into_freelist(thr, b);
552 #if BufStats
553             thr->totalloc += (size_t)size;
554             thr->numget++; /* Increment number of bget() calls */
555 #endif
556             buf = (void *)((((char *)ba) + sizeof(bhead_t)));
557             KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
558             return buf;
559           } else {
560             bhead_t *ba;
561 
562             ba = BH(((char *)b) + b->bh.bb.bsize);
563 
564             KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);
565 
566             /* The buffer isn't big enough to split.  Give  the  whole
567                shebang to the caller and remove it from the free list. */
568 
569             __kmp_bget_remove_from_freelist(b);
570 #if BufStats
571             thr->totalloc += (size_t)b->bh.bb.bsize;
572             thr->numget++; /* Increment number of bget() calls */
573 #endif
574             /* Negate size to mark buffer allocated. */
575             b->bh.bb.bsize = -(b->bh.bb.bsize);
576 
577             /* Mark this buffer as owned by this thread. */
578             TCW_PTR(ba->bb.bthr, th); // not an allocated address (do not mark)
579             /* Zero the back pointer in the next buffer in memory
580                to indicate that this buffer is allocated. */
581             ba->bb.prevfree = 0;
582 
583             /* Give user buffer starting at queue links. */
584             buf = (void *)&(b->ql);
585             KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
586             return buf;
587           }
588         }
589 
590         /* Link to next buffer */
591         b = (use_blink ? b->ql.blink : b->ql.flink);
592       }
593     }
594 
595     /* We failed to find a buffer. If there's a compact function defined,
596        notify it of the size requested. If it returns TRUE, try the allocation
597        again. */
598 
599     if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {
600       break;
601     }
602   }
603 
604   /* No buffer available with requested size free. */
605 
606   /* Don't give up yet -- look in the reserve supply. */
607   if (thr->acqfcn != 0) {
608     if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) {
609       /* Request is too large to fit in a single expansion block.
610          Try to satisy it by a direct buffer acquisition. */
611       bdhead_t *bdh;
612 
613       size += sizeof(bdhead_t) - sizeof(bhead_t);
614 
615       KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", (int)size));
616 
617       /* richryan */
618       bdh = BDH((*thr->acqfcn)((bufsize)size));
619       if (bdh != NULL) {
620 
621         // Mark the buffer special by setting size field of its header to zero.
622         bdh->bh.bb.bsize = 0;
623 
624         /* Mark this buffer as owned by this thread. */
625         TCW_PTR(bdh->bh.bb.bthr, th); // don't mark buffer as allocated,
626         // because direct buffer never goes to free list
627         bdh->bh.bb.prevfree = 0;
628         bdh->tsize = size;
629 #if BufStats
630         thr->totalloc += (size_t)size;
631         thr->numget++; /* Increment number of bget() calls */
632         thr->numdget++; /* Direct bget() call count */
633 #endif
634         buf = (void *)(bdh + 1);
635         KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
636         return buf;
637       }
638 
639     } else {
640 
641       /*  Try to obtain a new expansion block */
642       void *newpool;
643 
644       KE_TRACE(10, ("%%%%%% MALLOCB( %d )\n", (int)thr->exp_incr));
645 
646       /* richryan */
647       newpool = (*thr->acqfcn)((bufsize)thr->exp_incr);
648       KMP_DEBUG_ASSERT(((size_t)newpool) % SizeQuant == 0);
649       if (newpool != NULL) {
650         bpool(th, newpool, thr->exp_incr);
651         buf = bget(
652             th, requested_size); /* This can't, I say, can't get into a loop. */
653         return buf;
654       }
655     }
656   }
657 
658   /*  Still no buffer available */
659 
660   return NULL;
661 }
662 
663 /*  BGETZ  --  Allocate a buffer and clear its contents to zero.  We clear
664                the  entire  contents  of  the buffer to zero, not just the
665                region requested by the caller. */
666 
667 static void *bgetz(kmp_info_t *th, bufsize size) {
668   char *buf = (char *)bget(th, size);
669 
670   if (buf != NULL) {
671     bhead_t *b;
672     bufsize rsize;
673 
674     b = BH(buf - sizeof(bhead_t));
675     rsize = -(b->bb.bsize);
676     if (rsize == 0) {
677       bdhead_t *bd;
678 
679       bd = BDH(buf - sizeof(bdhead_t));
680       rsize = bd->tsize - (bufsize)sizeof(bdhead_t);
681     } else {
682       rsize -= sizeof(bhead_t);
683     }
684 
685     KMP_DEBUG_ASSERT(rsize >= size);
686 
687     (void)memset(buf, 0, (bufsize)rsize);
688   }
689   return ((void *)buf);
690 }
691 
692 /*  BGETR  --  Reallocate a buffer.  This is a minimal implementation,
693                simply in terms of brel()  and  bget().   It  could  be
694                enhanced to allow the buffer to grow into adjacent free
695                blocks and to avoid moving data unnecessarily.  */
696 
697 static void *bgetr(kmp_info_t *th, void *buf, bufsize size) {
698   void *nbuf;
699   bufsize osize; /* Old size of buffer */
700   bhead_t *b;
701 
702   nbuf = bget(th, size);
703   if (nbuf == NULL) { /* Acquire new buffer */
704     return NULL;
705   }
706   if (buf == NULL) {
707     return nbuf;
708   }
709   b = BH(((char *)buf) - sizeof(bhead_t));
710   osize = -b->bb.bsize;
711   if (osize == 0) {
712     /*  Buffer acquired directly through acqfcn. */
713     bdhead_t *bd;
714 
715     bd = BDH(((char *)buf) - sizeof(bdhead_t));
716     osize = bd->tsize - (bufsize)sizeof(bdhead_t);
717   } else {
718     osize -= sizeof(bhead_t);
719   };
720 
721   KMP_DEBUG_ASSERT(osize > 0);
722 
723   (void)KMP_MEMCPY((char *)nbuf, (char *)buf, /* Copy the data */
724                    (size_t)((size < osize) ? size : osize));
725   brel(th, buf);
726 
727   return nbuf;
728 }
729 
730 /*  BREL  --  Release a buffer.  */
731 static void brel(kmp_info_t *th, void *buf) {
732   thr_data_t *thr = get_thr_data(th);
733   bfhead_t *b, *bn;
734   kmp_info_t *bth;
735 
736   KMP_DEBUG_ASSERT(buf != NULL);
737   KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
738 
739   b = BFH(((char *)buf) - sizeof(bhead_t));
740 
741   if (b->bh.bb.bsize == 0) { /* Directly-acquired buffer? */
742     bdhead_t *bdh;
743 
744     bdh = BDH(((char *)buf) - sizeof(bdhead_t));
745     KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
746 #if BufStats
747     thr->totalloc -= (size_t)bdh->tsize;
748     thr->numdrel++; /* Number of direct releases */
749     thr->numrel++; /* Increment number of brel() calls */
750 #endif /* BufStats */
751 #ifdef FreeWipe
752     (void)memset((char *)buf, 0x55, (size_t)(bdh->tsize - sizeof(bdhead_t)));
753 #endif /* FreeWipe */
754 
755     KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)bdh));
756 
757     KMP_DEBUG_ASSERT(thr->relfcn != 0);
758     (*thr->relfcn)((void *)bdh); /* Release it directly. */
759     return;
760   }
761 
762   bth = (kmp_info_t *)((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) &
763                        ~1); // clear possible mark before comparison
764   if (bth != th) {
765     /* Add this buffer to be released by the owning thread later */
766     __kmp_bget_enqueue(bth, buf
767 #ifdef USE_QUEUING_LOCK_FOR_BGET
768                        ,
769                        __kmp_gtid_from_thread(th)
770 #endif
771                            );
772     return;
773   }
774 
775   /* Buffer size must be negative, indicating that the buffer is allocated. */
776   if (b->bh.bb.bsize >= 0) {
777     bn = NULL;
778   }
779   KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);
780 
781   /*  Back pointer in next buffer must be zero, indicating the same thing: */
782 
783   KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.bsize)->bb.prevfree == 0);
784 
785 #if BufStats
786   thr->numrel++; /* Increment number of brel() calls */
787   thr->totalloc += (size_t)b->bh.bb.bsize;
788 #endif
789 
790   /* If the back link is nonzero, the previous buffer is free.  */
791 
792   if (b->bh.bb.prevfree != 0) {
793     /* The previous buffer is free. Consolidate this buffer with it by adding
794        the length of this buffer to the previous free buffer. Note that we
795        subtract the size in the buffer being released, since it's negative to
796        indicate that the buffer is allocated. */
797     bufsize size = b->bh.bb.bsize;
798 
799     /* Make the previous buffer the one we're working on. */
800     KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.prevfree)->bb.bsize ==
801                      b->bh.bb.prevfree);
802     b = BFH(((char *)b) - b->bh.bb.prevfree);
803     b->bh.bb.bsize -= size;
804 
805     /* unlink the buffer from the old freelist */
806     __kmp_bget_remove_from_freelist(b);
807   } else {
808     /* The previous buffer isn't allocated. Mark this buffer size as positive
809        (i.e. free) and fall through to place the buffer on the free list as an
810        isolated free block. */
811     b->bh.bb.bsize = -b->bh.bb.bsize;
812   }
813 
814   /* insert buffer back onto a new freelist */
815   __kmp_bget_insert_into_freelist(thr, b);
816 
817   /* Now we look at the next buffer in memory, located by advancing from
818      the  start  of  this  buffer  by its size, to see if that buffer is
819      free.  If it is, we combine  this  buffer  with  the  next  one  in
820      memory, dechaining the second buffer from the free list. */
821   bn = BFH(((char *)b) + b->bh.bb.bsize);
822   if (bn->bh.bb.bsize > 0) {
823 
824     /* The buffer is free.  Remove it from the free list and add
825        its size to that of our buffer. */
826     KMP_DEBUG_ASSERT(BH((char *)bn + bn->bh.bb.bsize)->bb.prevfree ==
827                      bn->bh.bb.bsize);
828 
829     __kmp_bget_remove_from_freelist(bn);
830 
831     b->bh.bb.bsize += bn->bh.bb.bsize;
832 
833     /* unlink the buffer from the old freelist, and reinsert it into the new
834      * freelist */
835     __kmp_bget_remove_from_freelist(b);
836     __kmp_bget_insert_into_freelist(thr, b);
837 
838     /* Finally,  advance  to   the  buffer  that   follows  the  newly
839        consolidated free block.  We must set its  backpointer  to  the
840        head  of  the  consolidated free block.  We know the next block
841        must be an allocated block because the process of recombination
842        guarantees  that  two  free  blocks will never be contiguous in
843        memory.  */
844     bn = BFH(((char *)b) + b->bh.bb.bsize);
845   }
846 #ifdef FreeWipe
847   (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
848                (size_t)(b->bh.bb.bsize - sizeof(bfhead_t)));
849 #endif
850   KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);
851 
852   /* The next buffer is allocated.  Set the backpointer in it  to  point
853      to this buffer; the previous free buffer in memory. */
854 
855   bn->bh.bb.prevfree = b->bh.bb.bsize;
856 
857   /*  If  a  block-release function is defined, and this free buffer
858       constitutes the entire block, release it.  Note that  pool_len
859       is  defined  in  such a way that the test will fail unless all
860       pool blocks are the same size.  */
861   if (thr->relfcn != 0 &&
862       b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
863 #if BufStats
864     if (thr->numpblk !=
865         1) { /* Do not release the last buffer until finalization time */
866 #endif
867 
868       KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
869       KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
870       KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
871                        b->bh.bb.bsize);
872 
873       /*  Unlink the buffer from the free list  */
874       __kmp_bget_remove_from_freelist(b);
875 
876       KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
877 
878       (*thr->relfcn)(b);
879 #if BufStats
880       thr->numprel++; /* Nr of expansion block releases */
881       thr->numpblk--; /* Total number of blocks */
882       KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
883 
884       // avoid leaving stale last_pool pointer around if it is being dealloced
885       if (thr->last_pool == b)
886         thr->last_pool = 0;
887     } else {
888       thr->last_pool = b;
889     }
890 #endif /* BufStats */
891   }
892 }
893 
894 /*  BECTL  --  Establish automatic pool expansion control  */
895 static void bectl(kmp_info_t *th, bget_compact_t compact,
896                   bget_acquire_t acquire, bget_release_t release,
897                   bufsize pool_incr) {
898   thr_data_t *thr = get_thr_data(th);
899 
900   thr->compfcn = compact;
901   thr->acqfcn = acquire;
902   thr->relfcn = release;
903   thr->exp_incr = pool_incr;
904 }
905 
906 /*  BPOOL  --  Add a region of memory to the buffer pool.  */
907 static void bpool(kmp_info_t *th, void *buf, bufsize len) {
908   /*    int bin = 0; */
909   thr_data_t *thr = get_thr_data(th);
910   bfhead_t *b = BFH(buf);
911   bhead_t *bn;
912 
913   __kmp_bget_dequeue(th); /* Release any queued buffers */
914 
915 #ifdef SizeQuant
916   len &= ~(SizeQuant - 1);
917 #endif
918   if (thr->pool_len == 0) {
919     thr->pool_len = len;
920   } else if (len != thr->pool_len) {
921     thr->pool_len = -1;
922   }
923 #if BufStats
924   thr->numpget++; /* Number of block acquisitions */
925   thr->numpblk++; /* Number of blocks total */
926   KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
927 #endif /* BufStats */
928 
929   /* Since the block is initially occupied by a single free  buffer,
930      it  had  better  not  be  (much) larger than the largest buffer
931      whose size we can store in bhead.bb.bsize. */
932   KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize)ESent + 1));
933 
934   /* Clear  the  backpointer at  the start of the block to indicate that
935      there  is  no  free  block  prior  to  this   one.    That   blocks
936      recombination when the first block in memory is released. */
937   b->bh.bb.prevfree = 0;
938 
939   /* Create a dummy allocated buffer at the end of the pool.  This dummy
940      buffer is seen when a buffer at the end of the pool is released and
941      blocks  recombination  of  the last buffer with the dummy buffer at
942      the end.  The length in the dummy buffer  is  set  to  the  largest
943      negative  number  to  denote  the  end  of  the pool for diagnostic
944      routines (this specific value is  not  counted  on  by  the  actual
945      allocation and release functions). */
946   len -= sizeof(bhead_t);
947   b->bh.bb.bsize = (bufsize)len;
948   /* Set the owner of this buffer */
949   TCW_PTR(b->bh.bb.bthr,
950           (kmp_info_t *)((kmp_uintptr_t)th |
951                          1)); // mark the buffer as allocated address
952 
953   /* Chain the new block to the free list. */
954   __kmp_bget_insert_into_freelist(thr, b);
955 
956 #ifdef FreeWipe
957   (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
958                (size_t)(len - sizeof(bfhead_t)));
959 #endif
960   bn = BH(((char *)b) + len);
961   bn->bb.prevfree = (bufsize)len;
962   /* Definition of ESent assumes two's complement! */
963   KMP_DEBUG_ASSERT((~0) == -1 && (bn != 0));
964 
965   bn->bb.bsize = ESent;
966 }
967 
968 /*  BFREED  --  Dump the free lists for this thread. */
969 static void bfreed(kmp_info_t *th) {
970   int bin = 0, count = 0;
971   int gtid = __kmp_gtid_from_thread(th);
972   thr_data_t *thr = get_thr_data(th);
973 
974 #if BufStats
975   __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC
976                        " get=%" KMP_INT64_SPEC " rel=%" KMP_INT64_SPEC
977                        " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC
978                        " prel=%" KMP_INT64_SPEC " dget=%" KMP_INT64_SPEC
979                        " drel=%" KMP_INT64_SPEC "\n",
980                        gtid, (kmp_uint64)thr->totalloc, (kmp_int64)thr->numget,
981                        (kmp_int64)thr->numrel, (kmp_int64)thr->numpblk,
982                        (kmp_int64)thr->numpget, (kmp_int64)thr->numprel,
983                        (kmp_int64)thr->numdget, (kmp_int64)thr->numdrel);
984 #endif
985 
986   for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
987     bfhead_t *b;
988 
989     for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin];
990          b = b->ql.flink) {
991       bufsize bs = b->bh.bb.bsize;
992 
993       KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
994       KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
995       KMP_DEBUG_ASSERT(bs > 0);
996 
997       count += 1;
998 
999       __kmp_printf_no_lock(
1000           "__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b,
1001           (long)bs);
1002 #ifdef FreeWipe
1003       {
1004         char *lerr = ((char *)b) + sizeof(bfhead_t);
1005         if ((bs > sizeof(bfhead_t)) &&
1006             ((*lerr != 0x55) ||
1007              (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
1008               0))) {
1009           __kmp_printf_no_lock("__kmp_printpool: T#%d     (Contents of above "
1010                                "free block have been overstored.)\n",
1011                                gtid);
1012         }
1013       }
1014 #endif
1015     }
1016   }
1017 
1018   if (count == 0)
1019     __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid);
1020 }
1021 
1022 #ifdef KMP_DEBUG
1023 
1024 #if BufStats
1025 
1026 /*  BSTATS  --  Return buffer allocation free space statistics.  */
1027 static void bstats(kmp_info_t *th, bufsize *curalloc, bufsize *totfree,
1028                    bufsize *maxfree, long *nget, long *nrel) {
1029   int bin = 0;
1030   thr_data_t *thr = get_thr_data(th);
1031 
1032   *nget = thr->numget;
1033   *nrel = thr->numrel;
1034   *curalloc = (bufsize)thr->totalloc;
1035   *totfree = 0;
1036   *maxfree = -1;
1037 
1038   for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
1039     bfhead_t *b = thr->freelist[bin].ql.flink;
1040 
1041     while (b != &thr->freelist[bin]) {
1042       KMP_DEBUG_ASSERT(b->bh.bb.bsize > 0);
1043       *totfree += b->bh.bb.bsize;
1044       if (b->bh.bb.bsize > *maxfree) {
1045         *maxfree = b->bh.bb.bsize;
1046       }
1047       b = b->ql.flink; /* Link to next buffer */
1048     }
1049   }
1050 }
1051 
1052 /*  BSTATSE  --  Return extended statistics  */
1053 static void bstatse(kmp_info_t *th, bufsize *pool_incr, long *npool,
1054                     long *npget, long *nprel, long *ndget, long *ndrel) {
1055   thr_data_t *thr = get_thr_data(th);
1056 
1057   *pool_incr = (thr->pool_len < 0) ? -thr->exp_incr : thr->exp_incr;
1058   *npool = thr->numpblk;
1059   *npget = thr->numpget;
1060   *nprel = thr->numprel;
1061   *ndget = thr->numdget;
1062   *ndrel = thr->numdrel;
1063 }
1064 
1065 #endif /* BufStats */
1066 
1067 /*  BUFDUMP  --  Dump the data in a buffer.  This is called with the  user
1068                  data pointer, and backs up to the buffer header.  It will
1069                  dump either a free block or an allocated one.  */
1070 static void bufdump(kmp_info_t *th, void *buf) {
1071   bfhead_t *b;
1072   unsigned char *bdump;
1073   bufsize bdlen;
1074 
1075   b = BFH(((char *)buf) - sizeof(bhead_t));
1076   KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
1077   if (b->bh.bb.bsize < 0) {
1078     bdump = (unsigned char *)buf;
1079     bdlen = (-b->bh.bb.bsize) - (bufsize)sizeof(bhead_t);
1080   } else {
1081     bdump = (unsigned char *)(((char *)b) + sizeof(bfhead_t));
1082     bdlen = b->bh.bb.bsize - (bufsize)sizeof(bfhead_t);
1083   }
1084 
1085   while (bdlen > 0) {
1086     int i, dupes = 0;
1087     bufsize l = bdlen;
1088     char bhex[50], bascii[20];
1089 
1090     if (l > 16) {
1091       l = 16;
1092     }
1093 
1094     for (i = 0; i < l; i++) {
1095       (void)KMP_SNPRINTF(bhex + i * 3, sizeof(bhex) - i * 3, "%02X ", bdump[i]);
1096       if (bdump[i] > 0x20 && bdump[i] < 0x7F)
1097         bascii[i] = bdump[i];
1098       else
1099         bascii[i] = ' ';
1100     }
1101     bascii[i] = 0;
1102     (void)__kmp_printf_no_lock("%-48s   %s\n", bhex, bascii);
1103     bdump += l;
1104     bdlen -= l;
1105     while ((bdlen > 16) &&
1106            (memcmp((char *)(bdump - 16), (char *)bdump, 16) == 0)) {
1107       dupes++;
1108       bdump += 16;
1109       bdlen -= 16;
1110     }
1111     if (dupes > 1) {
1112       (void)__kmp_printf_no_lock(
1113           "     (%d lines [%d bytes] identical to above line skipped)\n", dupes,
1114           dupes * 16);
1115     } else if (dupes == 1) {
1116       bdump -= 16;
1117       bdlen += 16;
1118     }
1119   }
1120 }
1121 
1122 /*  BPOOLD  --  Dump a buffer pool.  The buffer headers are always listed.
1123                 If DUMPALLOC is nonzero, the contents of allocated buffers
1124                 are  dumped.   If  DUMPFREE  is  nonzero,  free blocks are
1125                 dumped as well.  If FreeWipe  checking  is  enabled,  free
1126                 blocks  which  have  been clobbered will always be dumped. */
1127 static void bpoold(kmp_info_t *th, void *buf, int dumpalloc, int dumpfree) {
1128   bfhead_t *b = BFH((char *)buf - sizeof(bhead_t));
1129 
1130   while (b->bh.bb.bsize != ESent) {
1131     bufsize bs = b->bh.bb.bsize;
1132 
1133     if (bs < 0) {
1134       bs = -bs;
1135       (void)__kmp_printf_no_lock("Allocated buffer: size %6ld bytes.\n",
1136                                  (long)bs);
1137       if (dumpalloc) {
1138         bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
1139       }
1140     } else {
1141       const char *lerr = "";
1142 
1143       KMP_DEBUG_ASSERT(bs > 0);
1144       if ((b->ql.blink->ql.flink != b) || (b->ql.flink->ql.blink != b)) {
1145         lerr = "  (Bad free list links)";
1146       }
1147       (void)__kmp_printf_no_lock("Free block:       size %6ld bytes.%s\n",
1148                                  (long)bs, lerr);
1149 #ifdef FreeWipe
1150       lerr = ((char *)b) + sizeof(bfhead_t);
1151       if ((bs > sizeof(bfhead_t)) &&
1152           ((*lerr != 0x55) ||
1153            (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
1154             0))) {
1155         (void)__kmp_printf_no_lock(
1156             "(Contents of above free block have been overstored.)\n");
1157         bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
1158       } else
1159 #endif
1160           if (dumpfree) {
1161         bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
1162       }
1163     }
1164     b = BFH(((char *)b) + bs);
1165   }
1166 }
1167 
1168 /*  BPOOLV  --  Validate a buffer pool. */
1169 static int bpoolv(kmp_info_t *th, void *buf) {
1170   bfhead_t *b = BFH(buf);
1171 
1172   while (b->bh.bb.bsize != ESent) {
1173     bufsize bs = b->bh.bb.bsize;
1174 
1175     if (bs < 0) {
1176       bs = -bs;
1177     } else {
1178 #ifdef FreeWipe
1179       char *lerr = "";
1180 #endif
1181 
1182       KMP_DEBUG_ASSERT(bs > 0);
1183       if (bs <= 0) {
1184         return 0;
1185       }
1186       if ((b->ql.blink->ql.flink != b) || (b->ql.flink->ql.blink != b)) {
1187         (void)__kmp_printf_no_lock(
1188             "Free block: size %6ld bytes.  (Bad free list links)\n", (long)bs);
1189         KMP_DEBUG_ASSERT(0);
1190         return 0;
1191       }
1192 #ifdef FreeWipe
1193       lerr = ((char *)b) + sizeof(bfhead_t);
1194       if ((bs > sizeof(bfhead_t)) &&
1195           ((*lerr != 0x55) ||
1196            (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
1197             0))) {
1198         (void)__kmp_printf_no_lock(
1199             "(Contents of above free block have been overstored.)\n");
1200         bufdump(th, (void *)(((char *)b) + sizeof(bhead_t)));
1201         KMP_DEBUG_ASSERT(0);
1202         return 0;
1203       }
1204 #endif /* FreeWipe */
1205     }
1206     b = BFH(((char *)b) + bs);
1207   }
1208   return 1;
1209 }
1210 
1211 #endif /* KMP_DEBUG */
1212 
1213 void __kmp_initialize_bget(kmp_info_t *th) {
1214   KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0));
1215 
1216   set_thr_data(th);
1217 
1218   bectl(th, (bget_compact_t)0, (bget_acquire_t)malloc, (bget_release_t)free,
1219         (bufsize)__kmp_malloc_pool_incr);
1220 }
1221 
1222 void __kmp_finalize_bget(kmp_info_t *th) {
1223   thr_data_t *thr;
1224   bfhead_t *b;
1225 
1226   KMP_DEBUG_ASSERT(th != 0);
1227 
1228 #if BufStats
1229   thr = (thr_data_t *)th->th.th_local.bget_data;
1230   KMP_DEBUG_ASSERT(thr != NULL);
1231   b = thr->last_pool;
1232 
1233   /*  If a block-release function is defined, and this free buffer constitutes
1234       the entire block, release it. Note that pool_len is defined in such a way
1235       that the test will fail unless all pool blocks are the same size.  */
1236 
1237   // Deallocate the last pool if one exists because we no longer do it in brel()
1238   if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&
1239       b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
1240     KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
1241     KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
1242     KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
1243                      b->bh.bb.bsize);
1244 
1245     /*  Unlink the buffer from the free list  */
1246     __kmp_bget_remove_from_freelist(b);
1247 
1248     KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
1249 
1250     (*thr->relfcn)(b);
1251     thr->numprel++; /* Nr of expansion block releases */
1252     thr->numpblk--; /* Total number of blocks */
1253     KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
1254   }
1255 #endif /* BufStats */
1256 
1257   /* Deallocate bget_data */
1258   if (th->th.th_local.bget_data != NULL) {
1259     __kmp_free(th->th.th_local.bget_data);
1260     th->th.th_local.bget_data = NULL;
1261   }; // if
1262 }
1263 
1264 void kmpc_set_poolsize(size_t size) {
1265   bectl(__kmp_get_thread(), (bget_compact_t)0, (bget_acquire_t)malloc,
1266         (bget_release_t)free, (bufsize)size);
1267 }
1268 
1269 size_t kmpc_get_poolsize(void) {
1270   thr_data_t *p;
1271 
1272   p = get_thr_data(__kmp_get_thread());
1273 
1274   return p->exp_incr;
1275 }
1276 
1277 void kmpc_set_poolmode(int mode) {
1278   thr_data_t *p;
1279 
1280   if (mode == bget_mode_fifo || mode == bget_mode_lifo ||
1281       mode == bget_mode_best) {
1282     p = get_thr_data(__kmp_get_thread());
1283     p->mode = (bget_mode_t)mode;
1284   }
1285 }
1286 
1287 int kmpc_get_poolmode(void) {
1288   thr_data_t *p;
1289 
1290   p = get_thr_data(__kmp_get_thread());
1291 
1292   return p->mode;
1293 }
1294 
1295 void kmpc_get_poolstat(size_t *maxmem, size_t *allmem) {
1296   kmp_info_t *th = __kmp_get_thread();
1297   bufsize a, b;
1298 
1299   __kmp_bget_dequeue(th); /* Release any queued buffers */
1300 
1301   bcheck(th, &a, &b);
1302 
1303   *maxmem = a;
1304   *allmem = b;
1305 }
1306 
1307 void kmpc_poolprint(void) {
1308   kmp_info_t *th = __kmp_get_thread();
1309 
1310   __kmp_bget_dequeue(th); /* Release any queued buffers */
1311 
1312   bfreed(th);
1313 }
1314 
1315 #endif // #if KMP_USE_BGET
1316 
1317 void *kmpc_malloc(size_t size) {
1318   void *ptr;
1319   ptr = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
1320   if (ptr != NULL) {
1321     // save allocated pointer just before one returned to user
1322     *(void **)ptr = ptr;
1323     ptr = (void **)ptr + 1;
1324   }
1325   return ptr;
1326 }
1327 
1328 #define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)
1329 
1330 void *kmpc_aligned_malloc(size_t size, size_t alignment) {
1331   void *ptr;
1332   void *ptr_allocated;
1333   KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big
1334   if (!IS_POWER_OF_TWO(alignment)) {
1335     // AC: do we need to issue a warning here?
1336     errno = EINVAL;
1337     return NULL;
1338   }
1339   size = size + sizeof(void *) + alignment;
1340   ptr_allocated = bget(__kmp_entry_thread(), (bufsize)size);
1341   if (ptr_allocated != NULL) {
1342     // save allocated pointer just before one returned to user
1343     ptr = (void *)(((kmp_uintptr_t)ptr_allocated + sizeof(void *) + alignment) &
1344                    ~(alignment - 1));
1345     *((void **)ptr - 1) = ptr_allocated;
1346   } else {
1347     ptr = NULL;
1348   }
1349   return ptr;
1350 }
1351 
1352 void *kmpc_calloc(size_t nelem, size_t elsize) {
1353   void *ptr;
1354   ptr = bgetz(__kmp_entry_thread(), (bufsize)(nelem * elsize + sizeof(ptr)));
1355   if (ptr != NULL) {
1356     // save allocated pointer just before one returned to user
1357     *(void **)ptr = ptr;
1358     ptr = (void **)ptr + 1;
1359   }
1360   return ptr;
1361 }
1362 
1363 void *kmpc_realloc(void *ptr, size_t size) {
1364   void *result = NULL;
1365   if (ptr == NULL) {
1366     // If pointer is NULL, realloc behaves like malloc.
1367     result = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
1368     // save allocated pointer just before one returned to user
1369     if (result != NULL) {
1370       *(void **)result = result;
1371       result = (void **)result + 1;
1372     }
1373   } else if (size == 0) {
1374     // If size is 0, realloc behaves like free.
1375     // The thread must be registered by the call to kmpc_malloc() or
1376     // kmpc_calloc() before.
1377     // So it should be safe to call __kmp_get_thread(), not
1378     // __kmp_entry_thread().
1379     KMP_ASSERT(*((void **)ptr - 1));
1380     brel(__kmp_get_thread(), *((void **)ptr - 1));
1381   } else {
1382     result = bgetr(__kmp_entry_thread(), *((void **)ptr - 1),
1383                    (bufsize)(size + sizeof(ptr)));
1384     if (result != NULL) {
1385       *(void **)result = result;
1386       result = (void **)result + 1;
1387     }
1388   }; // if
1389   return result;
1390 }
1391 
1392 // NOTE: the library must have already been initialized by a previous allocate
1393 void kmpc_free(void *ptr) {
1394   if (!__kmp_init_serial) {
1395     return;
1396   }; // if
1397   if (ptr != NULL) {
1398     kmp_info_t *th = __kmp_get_thread();
1399     __kmp_bget_dequeue(th); /* Release any queued buffers */
1400     // extract allocated pointer and free it
1401     KMP_ASSERT(*((void **)ptr - 1));
1402     brel(th, *((void **)ptr - 1));
1403   };
1404 }
1405 
1406 void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL) {
1407   void *ptr;
1408   KE_TRACE(30, ("-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n", th,
1409                 (int)size KMP_SRC_LOC_PARM));
1410   ptr = bget(th, (bufsize)size);
1411   KE_TRACE(30, ("<- __kmp_thread_malloc() returns %p\n", ptr));
1412   return ptr;
1413 }
1414 
1415 void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,
1416                            size_t elsize KMP_SRC_LOC_DECL) {
1417   void *ptr;
1418   KE_TRACE(30, ("-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n", th,
1419                 (int)nelem, (int)elsize KMP_SRC_LOC_PARM));
1420   ptr = bgetz(th, (bufsize)(nelem * elsize));
1421   KE_TRACE(30, ("<- __kmp_thread_calloc() returns %p\n", ptr));
1422   return ptr;
1423 }
1424 
1425 void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,
1426                             size_t size KMP_SRC_LOC_DECL) {
1427   KE_TRACE(30, ("-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n", th,
1428                 ptr, (int)size KMP_SRC_LOC_PARM));
1429   ptr = bgetr(th, ptr, (bufsize)size);
1430   KE_TRACE(30, ("<- __kmp_thread_realloc() returns %p\n", ptr));
1431   return ptr;
1432 }
1433 
1434 void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL) {
1435   KE_TRACE(30, ("-> __kmp_thread_free( %p, %p ) called from %s:%d\n", th,
1436                 ptr KMP_SRC_LOC_PARM));
1437   if (ptr != NULL) {
1438     __kmp_bget_dequeue(th); /* Release any queued buffers */
1439     brel(th, ptr);
1440   }
1441   KE_TRACE(30, ("<- __kmp_thread_free()\n"));
1442 }
1443 
1444 /* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
1445    memory leaks, but it may be useful for debugging memory corruptions, used
1446    freed pointers, etc. */
1447 /* #define LEAK_MEMORY */
1448 struct kmp_mem_descr { // Memory block descriptor.
1449   void *ptr_allocated; // Pointer returned by malloc(), subject for free().
1450   size_t size_allocated; // Size of allocated memory block.
1451   void *ptr_aligned; // Pointer to aligned memory, to be used by client code.
1452   size_t size_aligned; // Size of aligned memory block.
1453 };
1454 typedef struct kmp_mem_descr kmp_mem_descr_t;
1455 
1456 /* Allocate memory on requested boundary, fill allocated memory with 0x00.
1457    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
1458    error. Must use __kmp_free when freeing memory allocated by this routine! */
1459 static void *___kmp_allocate_align(size_t size,
1460                                    size_t alignment KMP_SRC_LOC_DECL) {
1461   /* __kmp_allocate() allocates (by call to malloc()) bigger memory block than
1462      requested to return properly aligned pointer. Original pointer returned
1463      by malloc() and size of allocated block is saved in descriptor just
1464      before the aligned pointer. This information used by __kmp_free() -- it
1465      has to pass to free() original pointer, not aligned one.
1466 
1467           +---------+------------+-----------------------------------+---------+
1468           | padding | descriptor |           aligned block           | padding |
1469           +---------+------------+-----------------------------------+---------+
1470           ^                      ^
1471           |                      |
1472           |                      +- Aligned pointer returned to caller
1473           +- Pointer returned by malloc()
1474 
1475       Aligned block is filled with zeros, paddings are filled with 0xEF. */
1476 
1477   kmp_mem_descr_t descr;
1478   kmp_uintptr_t addr_allocated; // Address returned by malloc().
1479   kmp_uintptr_t addr_aligned; // Aligned address to return to caller.
1480   kmp_uintptr_t addr_descr; // Address of memory block descriptor.
1481 
1482   KE_TRACE(25, ("-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
1483                 (int)size, (int)alignment KMP_SRC_LOC_PARM));
1484 
1485   KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too
1486   KMP_DEBUG_ASSERT(sizeof(void *) <= sizeof(kmp_uintptr_t));
1487   // Make sure kmp_uintptr_t is enough to store addresses.
1488 
1489   descr.size_aligned = size;
1490   descr.size_allocated =
1491       descr.size_aligned + sizeof(kmp_mem_descr_t) + alignment;
1492 
1493 #if KMP_DEBUG
1494   descr.ptr_allocated = _malloc_src_loc(descr.size_allocated, _file_, _line_);
1495 #else
1496   descr.ptr_allocated = malloc_src_loc(descr.size_allocated KMP_SRC_LOC_PARM);
1497 #endif
1498   KE_TRACE(10, ("   malloc( %d ) returned %p\n", (int)descr.size_allocated,
1499                 descr.ptr_allocated));
1500   if (descr.ptr_allocated == NULL) {
1501     KMP_FATAL(OutOfHeapMemory);
1502   };
1503 
1504   addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
1505   addr_aligned =
1506       (addr_allocated + sizeof(kmp_mem_descr_t) + alignment) & ~(alignment - 1);
1507   addr_descr = addr_aligned - sizeof(kmp_mem_descr_t);
1508 
1509   descr.ptr_aligned = (void *)addr_aligned;
1510 
1511   KE_TRACE(26, ("   ___kmp_allocate_align: "
1512                 "ptr_allocated=%p, size_allocated=%d, "
1513                 "ptr_aligned=%p, size_aligned=%d\n",
1514                 descr.ptr_allocated, (int)descr.size_allocated,
1515                 descr.ptr_aligned, (int)descr.size_aligned));
1516 
1517   KMP_DEBUG_ASSERT(addr_allocated <= addr_descr);
1518   KMP_DEBUG_ASSERT(addr_descr + sizeof(kmp_mem_descr_t) == addr_aligned);
1519   KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
1520                    addr_allocated + descr.size_allocated);
1521   KMP_DEBUG_ASSERT(addr_aligned % alignment == 0);
1522 #ifdef KMP_DEBUG
1523   memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
1524 // Fill allocated memory block with 0xEF.
1525 #endif
1526   memset(descr.ptr_aligned, 0x00, descr.size_aligned);
1527   // Fill the aligned memory block (which is intended for using by caller) with
1528   // 0x00. Do not
1529   // put this filling under KMP_DEBUG condition! Many callers expect zeroed
1530   // memory. (Padding
1531   // bytes remain filled with 0xEF in debugging library.)
1532   *((kmp_mem_descr_t *)addr_descr) = descr;
1533 
1534   KMP_MB();
1535 
1536   KE_TRACE(25, ("<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned));
1537   return descr.ptr_aligned;
1538 } // func ___kmp_allocate_align
1539 
1540 /* Allocate memory on cache line boundary, fill allocated memory with 0x00.
1541    Do not call this func directly! Use __kmp_allocate macro instead.
1542    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
1543    error. Must use __kmp_free when freeing memory allocated by this routine! */
1544 void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL) {
1545   void *ptr;
1546   KE_TRACE(25, ("-> __kmp_allocate( %d ) called from %s:%d\n",
1547                 (int)size KMP_SRC_LOC_PARM));
1548   ptr = ___kmp_allocate_align(size, __kmp_align_alloc KMP_SRC_LOC_PARM);
1549   KE_TRACE(25, ("<- __kmp_allocate() returns %p\n", ptr));
1550   return ptr;
1551 } // func ___kmp_allocate
1552 
1553 #if (BUILD_MEMORY == FIRST_TOUCH)
1554 void *__kmp_ft_page_allocate(size_t size) {
1555   void *adr, *aadr;
1556 
1557   const int page_size = KMP_GET_PAGE_SIZE();
1558 
1559   adr = (void *)__kmp_thread_malloc(__kmp_get_thread(),
1560                                     size + page_size + KMP_PTR_SKIP);
1561   if (adr == 0)
1562     KMP_FATAL(OutOfHeapMemory);
1563 
1564   /* check to see if adr is on a page boundary. */
1565   if (((kmp_uintptr_t)adr & (page_size - 1)) == 0)
1566     /* nothing to do if adr is already on a page boundary. */
1567     aadr = adr;
1568   else
1569     /* else set aadr to the first page boundary in the allocated memory. */
1570     aadr = (void *)(((kmp_uintptr_t)adr + page_size) & ~(page_size - 1));
1571 
1572   /* the first touch by the owner thread. */
1573   *((void **)aadr) = adr;
1574 
1575   /* skip the memory space used for storing adr above. */
1576   return (void *)((char *)aadr + KMP_PTR_SKIP);
1577 }
1578 #endif
1579 
1580 /* Allocate memory on page boundary, fill allocated memory with 0x00.
1581    Does not call this func directly! Use __kmp_page_allocate macro instead.
1582    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
1583    error. Must use __kmp_free when freeing memory allocated by this routine! */
1584 void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) {
1585   int page_size = 8 * 1024;
1586   void *ptr;
1587 
1588   KE_TRACE(25, ("-> __kmp_page_allocate( %d ) called from %s:%d\n",
1589                 (int)size KMP_SRC_LOC_PARM));
1590   ptr = ___kmp_allocate_align(size, page_size KMP_SRC_LOC_PARM);
1591   KE_TRACE(25, ("<- __kmp_page_allocate( %d ) returns %p\n", (int)size, ptr));
1592   return ptr;
1593 } // ___kmp_page_allocate
1594 
1595 /* Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
1596    In debug mode, fill the memory block with 0xEF before call to free(). */
1597 void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
1598   kmp_mem_descr_t descr;
1599   kmp_uintptr_t addr_allocated; // Address returned by malloc().
1600   kmp_uintptr_t addr_aligned; // Aligned address passed by caller.
1601 
1602   KE_TRACE(25,
1603            ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM));
1604   KMP_ASSERT(ptr != NULL);
1605 
1606   descr = *(kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t));
1607 
1608   KE_TRACE(26, ("   __kmp_free:     "
1609                 "ptr_allocated=%p, size_allocated=%d, "
1610                 "ptr_aligned=%p, size_aligned=%d\n",
1611                 descr.ptr_allocated, (int)descr.size_allocated,
1612                 descr.ptr_aligned, (int)descr.size_aligned));
1613 
1614   addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
1615   addr_aligned = (kmp_uintptr_t)descr.ptr_aligned;
1616 
1617   KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0);
1618   KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr);
1619   KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned);
1620   KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated);
1621   KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
1622                    addr_allocated + descr.size_allocated);
1623 
1624 #ifdef KMP_DEBUG
1625   memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
1626 // Fill memory block with 0xEF, it helps catch using freed memory.
1627 #endif
1628 
1629 #ifndef LEAK_MEMORY
1630   KE_TRACE(10, ("   free( %p )\n", descr.ptr_allocated));
1631 #ifdef KMP_DEBUG
1632   _free_src_loc(descr.ptr_allocated, _file_, _line_);
1633 #else
1634   free_src_loc(descr.ptr_allocated KMP_SRC_LOC_PARM);
1635 #endif
1636 #endif
1637   KMP_MB();
1638   KE_TRACE(25, ("<- __kmp_free() returns\n"));
1639 } // func ___kmp_free
1640 
1641 #if USE_FAST_MEMORY == 3
1642 // Allocate fast memory by first scanning the thread's free lists
1643 // If a chunk the right size exists, grab it off the free list.
1644 // Otherwise allocate normally using kmp_thread_malloc.
1645 
1646 // AC: How to choose the limit? Just get 16 for now...
1647 #define KMP_FREE_LIST_LIMIT 16
1648 
1649 // Always use 128 bytes for determining buckets for caching memory blocks
1650 #define DCACHE_LINE 128
1651 
1652 void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
1653   void *ptr;
1654   int num_lines;
1655   int idx;
1656   int index;
1657   void *alloc_ptr;
1658   size_t alloc_size;
1659   kmp_mem_descr_t *descr;
1660 
1661   KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
1662                 __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM));
1663 
1664   num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE;
1665   idx = num_lines - 1;
1666   KMP_DEBUG_ASSERT(idx >= 0);
1667   if (idx < 2) {
1668     index = 0; // idx is [ 0, 1 ], use first free list
1669     num_lines = 2; // 1, 2 cache lines or less than cache line
1670   } else if ((idx >>= 2) == 0) {
1671     index = 1; // idx is [ 2, 3 ], use second free list
1672     num_lines = 4; // 3, 4 cache lines
1673   } else if ((idx >>= 2) == 0) {
1674     index = 2; // idx is [ 4, 15 ], use third free list
1675     num_lines = 16; // 5, 6, ..., 16 cache lines
1676   } else if ((idx >>= 2) == 0) {
1677     index = 3; // idx is [ 16, 63 ], use fourth free list
1678     num_lines = 64; // 17, 18, ..., 64 cache lines
1679   } else {
1680     goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
1681   }
1682 
1683   ptr = this_thr->th.th_free_lists[index].th_free_list_self;
1684   if (ptr != NULL) {
1685     // pop the head of no-sync free list
1686     this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
1687     KMP_DEBUG_ASSERT(
1688         this_thr ==
1689         ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
1690             ->ptr_aligned);
1691     goto end;
1692   };
1693   ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
1694   if (ptr != NULL) {
1695     // no-sync free list is empty, use sync free list (filled in by other
1696     // threads only)
1697     // pop the head of the sync free list, push NULL instead
1698     while (!KMP_COMPARE_AND_STORE_PTR(
1699         &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) {
1700       KMP_CPU_PAUSE();
1701       ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
1702     }
1703     // push the rest of chain into no-sync free list (can be NULL if there was
1704     // the only block)
1705     this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
1706     KMP_DEBUG_ASSERT(
1707         this_thr ==
1708         ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
1709             ->ptr_aligned);
1710     goto end;
1711   }
1712 
1713 alloc_call:
1714   // haven't found block in the free lists, thus allocate it
1715   size = num_lines * DCACHE_LINE;
1716 
1717   alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE;
1718   KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with "
1719                 "alloc_size %d\n",
1720                 __kmp_gtid_from_thread(this_thr), alloc_size));
1721   alloc_ptr = bget(this_thr, (bufsize)alloc_size);
1722 
1723   // align ptr to DCACHE_LINE
1724   ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) +
1725                   DCACHE_LINE) &
1726                  ~(DCACHE_LINE - 1));
1727   descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
1728 
1729   descr->ptr_allocated = alloc_ptr; // remember allocated pointer
1730   // we don't need size_allocated
1731   descr->ptr_aligned = (void *)this_thr; // remember allocating thread
1732   // (it is already saved in bget buffer,
1733   // but we may want to use another allocator in future)
1734   descr->size_aligned = size;
1735 
1736 end:
1737   KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n",
1738                 __kmp_gtid_from_thread(this_thr), ptr));
1739   return ptr;
1740 } // func __kmp_fast_allocate
1741 
1742 // Free fast memory and place it on the thread's free list if it is of
1743 // the correct size.
1744 void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) {
1745   kmp_mem_descr_t *descr;
1746   kmp_info_t *alloc_thr;
1747   size_t size;
1748   size_t idx;
1749   int index;
1750 
1751   KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
1752                 __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM));
1753   KMP_ASSERT(ptr != NULL);
1754 
1755   descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
1756 
1757   KE_TRACE(26, ("   __kmp_fast_free:     size_aligned=%d\n",
1758                 (int)descr->size_aligned));
1759 
1760   size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines
1761 
1762   idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block
1763   if (idx == size) {
1764     index = 0; // 2 cache lines
1765   } else if ((idx <<= 1) == size) {
1766     index = 1; // 4 cache lines
1767   } else if ((idx <<= 2) == size) {
1768     index = 2; // 16 cache lines
1769   } else if ((idx <<= 2) == size) {
1770     index = 3; // 64 cache lines
1771   } else {
1772     KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64);
1773     goto free_call; // 65 or more cache lines ( > 8KB )
1774   }
1775 
1776   alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
1777   if (alloc_thr == this_thr) {
1778     // push block to self no-sync free list, linking previous head (LIFO)
1779     *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
1780     this_thr->th.th_free_lists[index].th_free_list_self = ptr;
1781   } else {
1782     void *head = this_thr->th.th_free_lists[index].th_free_list_other;
1783     if (head == NULL) {
1784       // Create new free list
1785       this_thr->th.th_free_lists[index].th_free_list_other = ptr;
1786       *((void **)ptr) = NULL; // mark the tail of the list
1787       descr->size_allocated = (size_t)1; // head of the list keeps its length
1788     } else {
1789       // need to check existed "other" list's owner thread and size of queue
1790       kmp_mem_descr_t *dsc =
1791           (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t));
1792       // allocating thread, same for all queue nodes
1793       kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned);
1794       size_t q_sz =
1795           dsc->size_allocated + 1; // new size in case we add current task
1796       if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) {
1797         // we can add current task to "other" list, no sync needed
1798         *((void **)ptr) = head;
1799         descr->size_allocated = q_sz;
1800         this_thr->th.th_free_lists[index].th_free_list_other = ptr;
1801       } else {
1802         // either queue blocks owner is changing or size limit exceeded
1803         // return old queue to allocating thread (q_th) synchroneously,
1804         // and start new list for alloc_thr's tasks
1805         void *old_ptr;
1806         void *tail = head;
1807         void *next = *((void **)head);
1808         while (next != NULL) {
1809           KMP_DEBUG_ASSERT(
1810               // queue size should decrease by 1 each step through the list
1811               ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t)))
1812                       ->size_allocated +
1813                   1 ==
1814               ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t)))
1815                   ->size_allocated);
1816           tail = next; // remember tail node
1817           next = *((void **)next);
1818         }
1819         KMP_DEBUG_ASSERT(q_th != NULL);
1820         // push block to owner's sync free list
1821         old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
1822         /* the next pointer must be set before setting free_list to ptr to avoid
1823            exposing a broken list to other threads, even for an instant. */
1824         *((void **)tail) = old_ptr;
1825 
1826         while (!KMP_COMPARE_AND_STORE_PTR(
1827             &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) {
1828           KMP_CPU_PAUSE();
1829           old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
1830           *((void **)tail) = old_ptr;
1831         }
1832 
1833         // start new list of not-selt tasks
1834         this_thr->th.th_free_lists[index].th_free_list_other = ptr;
1835         *((void **)ptr) = NULL;
1836         descr->size_allocated = (size_t)1; // head of queue keeps its length
1837       }
1838     }
1839   }
1840   goto end;
1841 
1842 free_call:
1843   KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n",
1844                 __kmp_gtid_from_thread(this_thr), size));
1845   __kmp_bget_dequeue(this_thr); /* Release any queued buffers */
1846   brel(this_thr, descr->ptr_allocated);
1847 
1848 end:
1849   KE_TRACE(25, ("<- __kmp_fast_free() returns\n"));
1850 
1851 } // func __kmp_fast_free
1852 
1853 // Initialize the thread free lists related to fast memory
1854 // Only do this when a thread is initially created.
1855 void __kmp_initialize_fast_memory(kmp_info_t *this_thr) {
1856   KE_TRACE(10, ("__kmp_initialize_fast_memory: Called from th %p\n", this_thr));
1857 
1858   memset(this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof(kmp_free_list_t));
1859 }
1860 
1861 // Free the memory in the thread free lists related to fast memory
1862 // Only do this when a thread is being reaped (destroyed).
1863 void __kmp_free_fast_memory(kmp_info_t *th) {
1864   // Suppose we use BGET underlying allocator, walk through its structures...
1865   int bin;
1866   thr_data_t *thr = get_thr_data(th);
1867   void **lst = NULL;
1868 
1869   KE_TRACE(
1870       5, ("__kmp_free_fast_memory: Called T#%d\n", __kmp_gtid_from_thread(th)));
1871 
1872   __kmp_bget_dequeue(th); // Release any queued buffers
1873 
1874   // Dig through free lists and extract all allocated blocks
1875   for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
1876     bfhead_t *b = thr->freelist[bin].ql.flink;
1877     while (b != &thr->freelist[bin]) {
1878       if ((kmp_uintptr_t)b->bh.bb.bthr & 1) { // the buffer is allocated address
1879         *((void **)b) =
1880             lst; // link the list (override bthr, but keep flink yet)
1881         lst = (void **)b; // push b into lst
1882       }
1883       b = b->ql.flink; // get next buffer
1884     }
1885   }
1886   while (lst != NULL) {
1887     void *next = *lst;
1888     KE_TRACE(10, ("__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
1889                   lst, next, th, __kmp_gtid_from_thread(th)));
1890     (*thr->relfcn)(lst);
1891 #if BufStats
1892     // count blocks to prevent problems in __kmp_finalize_bget()
1893     thr->numprel++; /* Nr of expansion block releases */
1894     thr->numpblk--; /* Total number of blocks */
1895 #endif
1896     lst = (void **)next;
1897   }
1898 
1899   KE_TRACE(
1900       5, ("__kmp_free_fast_memory: Freed T#%d\n", __kmp_gtid_from_thread(th)));
1901 }
1902 
1903 #endif // USE_FAST_MEMORY
1904