1 /*
2  * kmp_alloc.cpp -- private/shared dynamic memory allocation and management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_io.h"
16 #include "kmp_wrapper_malloc.h"
17 
18 // Disable bget when it is not used
19 #if KMP_USE_BGET
20 
21 /* Thread private buffer management code */
22 
23 typedef int (*bget_compact_t)(size_t, int);
24 typedef void *(*bget_acquire_t)(size_t);
25 typedef void (*bget_release_t)(void *);
26 
27 /* NOTE: bufsize must be a signed datatype */
28 
29 #if KMP_OS_WINDOWS
30 #if KMP_ARCH_X86 || KMP_ARCH_ARM
31 typedef kmp_int32 bufsize;
32 #else
33 typedef kmp_int64 bufsize;
34 #endif
35 #else
36 typedef ssize_t bufsize;
37 #endif
38 
/* The three modes of operation are fifo search, lifo search, and best-fit */
40 
41 typedef enum bget_mode {
42   bget_mode_fifo = 0,
43   bget_mode_lifo = 1,
44   bget_mode_best = 2
45 } bget_mode_t;
46 
47 static void bpool(kmp_info_t *th, void *buffer, bufsize len);
48 static void *bget(kmp_info_t *th, bufsize size);
49 static void *bgetz(kmp_info_t *th, bufsize size);
50 static void *bgetr(kmp_info_t *th, void *buffer, bufsize newsize);
51 static void brel(kmp_info_t *th, void *buf);
52 static void bectl(kmp_info_t *th, bget_compact_t compact,
53                   bget_acquire_t acquire, bget_release_t release,
54                   bufsize pool_incr);
55 
56 /* BGET CONFIGURATION */
57 /* Buffer allocation size quantum: all buffers allocated are a
58    multiple of this size.  This MUST be a power of two. */
59 
/* On IA-32 architecture with Linux* OS, malloc() does not
   ensure 16 byte alignment */
62 
63 #if KMP_ARCH_X86 || !KMP_HAVE_QUAD
64 
65 #define SizeQuant 8
66 #define AlignType double
67 
68 #else
69 
70 #define SizeQuant 16
71 #define AlignType _Quad
72 
73 #endif
74 
75 // Define this symbol to enable the bstats() function which calculates the
76 // total free space in the buffer pool, the largest available buffer, and the
77 // total space currently allocated.
78 #define BufStats 1
79 
80 #ifdef KMP_DEBUG
81 
82 // Define this symbol to enable the bpoold() function which dumps the buffers
83 // in a buffer pool.
84 #define BufDump 1
85 
86 // Define this symbol to enable the bpoolv() function for validating a buffer
87 // pool.
88 #define BufValid 1
89 
90 // Define this symbol to enable the bufdump() function which allows dumping the
91 // contents of an allocated or free buffer.
92 #define DumpData 1
93 
94 #ifdef NOT_USED_NOW
95 
96 // Wipe free buffers to a guaranteed pattern of garbage to trip up miscreants
97 // who attempt to use pointers into released buffers.
98 #define FreeWipe 1
99 
100 // Use a best fit algorithm when searching for space for an allocation request.
101 // This uses memory more efficiently, but allocation will be much slower.
102 #define BestFit 1
103 
104 #endif /* NOT_USED_NOW */
105 #endif /* KMP_DEBUG */
106 
107 static bufsize bget_bin_size[] = {
108     0,
109     //    1 << 6,    /* .5 Cache line */
110     1 << 7, /* 1 Cache line, new */
111     1 << 8, /* 2 Cache lines */
112     1 << 9, /* 4 Cache lines, new */
113     1 << 10, /* 8 Cache lines */
114     1 << 11, /* 16 Cache lines, new */
115     1 << 12, 1 << 13, /* new */
116     1 << 14, 1 << 15, /* new */
117     1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, /*  1MB */
118     1 << 21, /*  2MB */
119     1 << 22, /*  4MB */
120     1 << 23, /*  8MB */
121     1 << 24, /* 16MB */
122     1 << 25, /* 32MB */
123 };
124 
125 #define MAX_BGET_BINS (int)(sizeof(bget_bin_size) / sizeof(bufsize))
126 
127 struct bfhead;
128 
129 //  Declare the interface, including the requested buffer size type, bufsize.
130 
131 /* Queue links */
132 typedef struct qlinks {
133   struct bfhead *flink; /* Forward link */
134   struct bfhead *blink; /* Backward link */
135 } qlinks_t;
136 
137 /* Header in allocated and free buffers */
138 typedef struct bhead2 {
139   kmp_info_t *bthr; /* The thread which owns the buffer pool */
140   bufsize prevfree; /* Relative link back to previous free buffer in memory or
141                        0 if previous buffer is allocated.  */
142   bufsize bsize; /* Buffer size: positive if free, negative if allocated. */
143 } bhead2_t;
144 
145 /* Make sure the bhead structure is a multiple of SizeQuant in size. */
146 typedef union bhead {
147   KMP_ALIGN(SizeQuant)
148   AlignType b_align;
149   char b_pad[sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant))];
150   bhead2_t bb;
151 } bhead_t;
152 #define BH(p) ((bhead_t *)(p))
153 
154 /*  Header in directly allocated buffers (by acqfcn) */
155 typedef struct bdhead {
156   bufsize tsize; /* Total size, including overhead */
157   bhead_t bh; /* Common header */
158 } bdhead_t;
159 #define BDH(p) ((bdhead_t *)(p))
160 
161 /* Header in free buffers */
162 typedef struct bfhead {
163   bhead_t bh; /* Common allocated/free header */
164   qlinks_t ql; /* Links on free list */
165 } bfhead_t;
166 #define BFH(p) ((bfhead_t *)(p))
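
/* Illustration (not part of the allocator): how the three headers relate to
   the pointer handed back to the caller.
     regular allocated buffer:  user ptr == (char *)hdr + sizeof(bhead_t),
                                recognized by bsize < 0
     free buffer:               the same storage viewed as a bfhead_t; the
                                qlinks_t links overlay the first bytes of what
                                was the user area
     directly acquired buffer:  user ptr == (char *)hdr + sizeof(bdhead_t),
                                recognized later by bsize == 0 */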
167 
168 typedef struct thr_data {
169   bfhead_t freelist[MAX_BGET_BINS];
170 #if BufStats
171   size_t totalloc; /* Total space currently allocated */
172   long numget, numrel; /* Number of bget() and brel() calls */
173   long numpblk; /* Number of pool blocks */
174   long numpget, numprel; /* Number of block gets and rels */
175   long numdget, numdrel; /* Number of direct gets and rels */
176 #endif /* BufStats */
177 
178   /* Automatic expansion block management functions */
179   bget_compact_t compfcn;
180   bget_acquire_t acqfcn;
181   bget_release_t relfcn;
182 
183   bget_mode_t mode; /* what allocation mode to use? */
184 
185   bufsize exp_incr; /* Expansion block size */
186   bufsize pool_len; /* 0: no bpool calls have been made
187                        -1: not all pool blocks are the same size
188                        >0: (common) block size for all bpool calls made so far
189                     */
  bfhead_t *last_pool; /* Last pool owned by this thread
                          (delayed deallocation) */
191 } thr_data_t;
192 
193 /*  Minimum allocation quantum: */
194 #define QLSize (sizeof(qlinks_t))
195 #define SizeQ ((SizeQuant > QLSize) ? SizeQuant : QLSize)
196 #define MaxSize                                                                \
197   (bufsize)(                                                                   \
198       ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1)))
// Maximum for the requested size.
200 
201 /* End sentinel: value placed in bsize field of dummy block delimiting
202    end of pool block.  The most negative number which will  fit  in  a
203    bufsize, defined in a way that the compiler will accept. */
204 
205 #define ESent                                                                  \
206   ((bufsize)(-(((((bufsize)1) << ((int)sizeof(bufsize) * 8 - 2)) - 1) * 2) - 2))
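
/* Worked example (illustrative): for a 64-bit bufsize the expression expands
   to -(((1 << 62) - 1) * 2) - 2 == -(2^63 - 2) - 2 == -2^63, i.e. the most
   negative bufsize value, written without ever forming the overflowing
   constant +2^63. */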
207 
208 /* Thread Data management routines */
209 static int bget_get_bin(bufsize size) {
210   // binary chop bins
211   int lo = 0, hi = MAX_BGET_BINS - 1;
212 
213   KMP_DEBUG_ASSERT(size > 0);
214 
215   while ((hi - lo) > 1) {
216     int mid = (lo + hi) >> 1;
217     if (size < bget_bin_size[mid])
218       hi = mid - 1;
219     else
220       lo = mid;
221   }
222 
223   KMP_DEBUG_ASSERT((lo >= 0) && (lo < MAX_BGET_BINS));
224 
225   return lo;
226 }
227 
228 static void set_thr_data(kmp_info_t *th) {
229   int i;
230   thr_data_t *data;
231 
232   data = (thr_data_t *)((!th->th.th_local.bget_data)
233                             ? __kmp_allocate(sizeof(*data))
234                             : th->th.th_local.bget_data);
235 
236   memset(data, '\0', sizeof(*data));
237 
238   for (i = 0; i < MAX_BGET_BINS; ++i) {
239     data->freelist[i].ql.flink = &data->freelist[i];
240     data->freelist[i].ql.blink = &data->freelist[i];
241   }
242 
243   th->th.th_local.bget_data = data;
244   th->th.th_local.bget_list = 0;
245 #if !USE_CMP_XCHG_FOR_BGET
246 #ifdef USE_QUEUING_LOCK_FOR_BGET
247   __kmp_init_lock(&th->th.th_local.bget_lock);
248 #else
249   __kmp_init_bootstrap_lock(&th->th.th_local.bget_lock);
#endif /* USE_QUEUING_LOCK_FOR_BGET */
251 #endif /* ! USE_CMP_XCHG_FOR_BGET */
252 }
253 
254 static thr_data_t *get_thr_data(kmp_info_t *th) {
255   thr_data_t *data;
256 
257   data = (thr_data_t *)th->th.th_local.bget_data;
258 
259   KMP_DEBUG_ASSERT(data != 0);
260 
261   return data;
262 }
263 
264 /* Walk the free list and release the enqueued buffers */
265 static void __kmp_bget_dequeue(kmp_info_t *th) {
266   void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);
267 
268   if (p != 0) {
269 #if USE_CMP_XCHG_FOR_BGET
270     {
271       volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
272       while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
273                                         CCAST(void *, old_value), nullptr)) {
274         KMP_CPU_PAUSE();
275         old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
276       }
277       p = CCAST(void *, old_value);
278     }
279 #else /* ! USE_CMP_XCHG_FOR_BGET */
280 #ifdef USE_QUEUING_LOCK_FOR_BGET
281     __kmp_acquire_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
282 #else
283     __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
284 #endif /* USE_QUEUING_LOCK_FOR_BGET */
285 
286     p = (void *)th->th.th_local.bget_list;
287     th->th.th_local.bget_list = 0;
288 
289 #ifdef USE_QUEUING_LOCK_FOR_BGET
290     __kmp_release_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
291 #else
292     __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
293 #endif
294 #endif /* USE_CMP_XCHG_FOR_BGET */
295 
296     /* Check again to make sure the list is not empty */
297     while (p != 0) {
298       void *buf = p;
299       bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));
300 
301       KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
302       KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
303                        (kmp_uintptr_t)th); // clear possible mark
304       KMP_DEBUG_ASSERT(b->ql.blink == 0);
305 
306       p = (void *)b->ql.flink;
307 
308       brel(th, buf);
309     }
310   }
311 }
312 
313 /* Chain together the free buffers by using the thread owner field */
314 static void __kmp_bget_enqueue(kmp_info_t *th, void *buf
315 #ifdef USE_QUEUING_LOCK_FOR_BGET
316                                ,
317                                kmp_int32 rel_gtid
318 #endif
319                                ) {
320   bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t));
321 
322   KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
323   KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
324                    (kmp_uintptr_t)th); // clear possible mark
325 
326   b->ql.blink = 0;
327 
328   KC_TRACE(10, ("__kmp_bget_enqueue: moving buffer to T#%d list\n",
329                 __kmp_gtid_from_thread(th)));
330 
331 #if USE_CMP_XCHG_FOR_BGET
332   {
333     volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);
334     /* the next pointer must be set before setting bget_list to buf to avoid
335        exposing a broken list to other threads, even for an instant. */
336     b->ql.flink = BFH(CCAST(void *, old_value));
337 
338     while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
339                                       CCAST(void *, old_value), buf)) {
340       KMP_CPU_PAUSE();
341       old_value = TCR_PTR(th->th.th_local.bget_list);
342       /* the next pointer must be set before setting bget_list to buf to avoid
343          exposing a broken list to other threads, even for an instant. */
344       b->ql.flink = BFH(CCAST(void *, old_value));
345     }
346   }
347 #else /* ! USE_CMP_XCHG_FOR_BGET */
348 #ifdef USE_QUEUING_LOCK_FOR_BGET
349   __kmp_acquire_lock(&th->th.th_local.bget_lock, rel_gtid);
350 #else
351   __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
352 #endif
353 
354   b->ql.flink = BFH(th->th.th_local.bget_list);
355   th->th.th_local.bget_list = (void *)buf;
356 
357 #ifdef USE_QUEUING_LOCK_FOR_BGET
358   __kmp_release_lock(&th->th.th_local.bget_lock, rel_gtid);
359 #else
360   __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
361 #endif
362 #endif /* USE_CMP_XCHG_FOR_BGET */
363 }
364 
365 /* insert buffer back onto a new freelist */
366 static void __kmp_bget_insert_into_freelist(thr_data_t *thr, bfhead_t *b) {
367   int bin;
368 
369   KMP_DEBUG_ASSERT(((size_t)b) % SizeQuant == 0);
370   KMP_DEBUG_ASSERT(b->bh.bb.bsize % SizeQuant == 0);
371 
372   bin = bget_get_bin(b->bh.bb.bsize);
373 
374   KMP_DEBUG_ASSERT(thr->freelist[bin].ql.blink->ql.flink ==
375                    &thr->freelist[bin]);
376   KMP_DEBUG_ASSERT(thr->freelist[bin].ql.flink->ql.blink ==
377                    &thr->freelist[bin]);
378 
379   b->ql.flink = &thr->freelist[bin];
380   b->ql.blink = thr->freelist[bin].ql.blink;
381 
382   thr->freelist[bin].ql.blink = b;
383   b->ql.blink->ql.flink = b;
384 }
385 
386 /* unlink the buffer from the old freelist */
387 static void __kmp_bget_remove_from_freelist(bfhead_t *b) {
388   KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
389   KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
390 
391   b->ql.blink->ql.flink = b->ql.flink;
392   b->ql.flink->ql.blink = b->ql.blink;
393 }
394 
395 /*  GET STATS -- check info on free list */
396 static void bcheck(kmp_info_t *th, bufsize *max_free, bufsize *total_free) {
397   thr_data_t *thr = get_thr_data(th);
398   int bin;
399 
400   *total_free = *max_free = 0;
401 
402   for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
403     bfhead_t *b, *best;
404 
405     best = &thr->freelist[bin];
406     b = best->ql.flink;
407 
408     while (b != &thr->freelist[bin]) {
409       *total_free += (b->bh.bb.bsize - sizeof(bhead_t));
410       if ((best == &thr->freelist[bin]) || (b->bh.bb.bsize < best->bh.bb.bsize))
411         best = b;
412 
413       /* Link to next buffer */
414       b = b->ql.flink;
415     }
416 
417     if (*max_free < best->bh.bb.bsize)
418       *max_free = best->bh.bb.bsize;
419   }
420 
421   if (*max_free > (bufsize)sizeof(bhead_t))
422     *max_free -= sizeof(bhead_t);
423 }
424 
425 /*  BGET  --  Allocate a buffer.  */
426 static void *bget(kmp_info_t *th, bufsize requested_size) {
427   thr_data_t *thr = get_thr_data(th);
428   bufsize size = requested_size;
429   bfhead_t *b;
430   void *buf;
431   int compactseq = 0;
432   int use_blink = 0;
433   /* For BestFit */
434   bfhead_t *best;
435 
436   if (size < 0 || size + sizeof(bhead_t) > MaxSize) {
437     return NULL;
438   }
439 
440   __kmp_bget_dequeue(th); /* Release any queued buffers */
441 
442   if (size < (bufsize)SizeQ) { // Need at least room for the queue links.
443     size = SizeQ;
444   }
445 #if defined(SizeQuant) && (SizeQuant > 1)
446   size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
447 #endif
448 
449   size += sizeof(bhead_t); // Add overhead in allocated buffer to size required.
450   KMP_DEBUG_ASSERT(size >= 0);
451   KMP_DEBUG_ASSERT(size % SizeQuant == 0);
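
  /* Worked example (illustrative, assuming a 64-bit build with SizeQuant == 16
     and sizeof(bhead_t) == 32): a request for 21 bytes is rounded up to
     (21 + 15) & ~15 == 32, then becomes 32 + 32 == 64 once the bhead_t
     overhead is added; 64 is the size the free-list search below looks for. */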
452 
453   use_blink = (thr->mode == bget_mode_lifo);
454 
455   /* If a compact function was provided in the call to bectl(), wrap
456      a loop around the allocation process  to  allow  compaction  to
457      intervene in case we don't find a suitable buffer in the chain. */
458 
459   for (;;) {
460     int bin;
461 
462     for (bin = bget_get_bin(size); bin < MAX_BGET_BINS; ++bin) {
463       /* Link to next buffer */
464       b = (use_blink ? thr->freelist[bin].ql.blink
465                      : thr->freelist[bin].ql.flink);
466 
467       if (thr->mode == bget_mode_best) {
468         best = &thr->freelist[bin];
469 
470         /* Scan the free list searching for the first buffer big enough
471            to hold the requested size buffer. */
472         while (b != &thr->freelist[bin]) {
473           if (b->bh.bb.bsize >= (bufsize)size) {
474             if ((best == &thr->freelist[bin]) ||
475                 (b->bh.bb.bsize < best->bh.bb.bsize)) {
476               best = b;
477             }
478           }
479 
480           /* Link to next buffer */
481           b = (use_blink ? b->ql.blink : b->ql.flink);
482         }
483         b = best;
484       }
485 
486       while (b != &thr->freelist[bin]) {
487         if ((bufsize)b->bh.bb.bsize >= (bufsize)size) {
488 
489           // Buffer is big enough to satisfy the request. Allocate it to the
490           // caller. We must decide whether the buffer is large enough to split
491           // into the part given to the caller and a free buffer that remains
492           // on the free list, or whether the entire buffer should be removed
493           // from the free list and given to the caller in its entirety. We
494           // only split the buffer if enough room remains for a header plus the
495           // minimum quantum of allocation.
496           if ((b->bh.bb.bsize - (bufsize)size) >
497               (bufsize)(SizeQ + (sizeof(bhead_t)))) {
498             bhead_t *ba, *bn;
499 
500             ba = BH(((char *)b) + (b->bh.bb.bsize - (bufsize)size));
501             bn = BH(((char *)ba) + size);
502 
503             KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);
504 
505             /* Subtract size from length of free block. */
506             b->bh.bb.bsize -= (bufsize)size;
507 
508             /* Link allocated buffer to the previous free buffer. */
509             ba->bb.prevfree = b->bh.bb.bsize;
510 
511             /* Plug negative size into user buffer. */
512             ba->bb.bsize = -size;
513 
514             /* Mark this buffer as owned by this thread. */
515             TCW_PTR(ba->bb.bthr,
516                     th); // not an allocated address (do not mark it)
517             /* Mark buffer after this one not preceded by free block. */
518             bn->bb.prevfree = 0;
519 
520             // unlink buffer from old freelist, and reinsert into new freelist
521             __kmp_bget_remove_from_freelist(b);
522             __kmp_bget_insert_into_freelist(thr, b);
523 #if BufStats
524             thr->totalloc += (size_t)size;
525             thr->numget++; /* Increment number of bget() calls */
526 #endif
527             buf = (void *)((((char *)ba) + sizeof(bhead_t)));
528             KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
529             return buf;
530           } else {
531             bhead_t *ba;
532 
533             ba = BH(((char *)b) + b->bh.bb.bsize);
534 
535             KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);
536 
537             /* The buffer isn't big enough to split.  Give  the  whole
538                shebang to the caller and remove it from the free list. */
539 
540             __kmp_bget_remove_from_freelist(b);
541 #if BufStats
542             thr->totalloc += (size_t)b->bh.bb.bsize;
543             thr->numget++; /* Increment number of bget() calls */
544 #endif
545             /* Negate size to mark buffer allocated. */
546             b->bh.bb.bsize = -(b->bh.bb.bsize);
547 
548             /* Mark this buffer as owned by this thread. */
549             TCW_PTR(ba->bb.bthr, th); // not an allocated address (do not mark)
550             /* Zero the back pointer in the next buffer in memory
551                to indicate that this buffer is allocated. */
552             ba->bb.prevfree = 0;
553 
554             /* Give user buffer starting at queue links. */
555             buf = (void *)&(b->ql);
556             KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
557             return buf;
558           }
559         }
560 
561         /* Link to next buffer */
562         b = (use_blink ? b->ql.blink : b->ql.flink);
563       }
564     }
565 
566     /* We failed to find a buffer. If there's a compact function defined,
567        notify it of the size requested. If it returns TRUE, try the allocation
568        again. */
569 
570     if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {
571       break;
572     }
573   }
574 
575   /* No buffer available with requested size free. */
576 
577   /* Don't give up yet -- look in the reserve supply. */
578   if (thr->acqfcn != 0) {
579     if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) {
      /* Request is too large to fit in a single expansion block.
         Try to satisfy it by a direct buffer acquisition. */
582       bdhead_t *bdh;
583 
584       size += sizeof(bdhead_t) - sizeof(bhead_t);
585 
586       KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", (int)size));
587 
588       /* richryan */
589       bdh = BDH((*thr->acqfcn)((bufsize)size));
590       if (bdh != NULL) {
591 
        // Mark the buffer special by setting the size field of its header to
        // zero.
593         bdh->bh.bb.bsize = 0;
594 
595         /* Mark this buffer as owned by this thread. */
596         TCW_PTR(bdh->bh.bb.bthr, th); // don't mark buffer as allocated,
597         // because direct buffer never goes to free list
598         bdh->bh.bb.prevfree = 0;
599         bdh->tsize = size;
600 #if BufStats
601         thr->totalloc += (size_t)size;
602         thr->numget++; /* Increment number of bget() calls */
603         thr->numdget++; /* Direct bget() call count */
604 #endif
605         buf = (void *)(bdh + 1);
606         KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
607         return buf;
608       }
609 
610     } else {
611 
612       /*  Try to obtain a new expansion block */
613       void *newpool;
614 
615       KE_TRACE(10, ("%%%%%% MALLOCB( %d )\n", (int)thr->exp_incr));
616 
617       /* richryan */
618       newpool = (*thr->acqfcn)((bufsize)thr->exp_incr);
619       KMP_DEBUG_ASSERT(((size_t)newpool) % SizeQuant == 0);
620       if (newpool != NULL) {
621         bpool(th, newpool, thr->exp_incr);
622         buf = bget(
623             th, requested_size); /* This can't, I say, can't get into a loop. */
624         return buf;
625       }
626     }
627   }
628 
629   /*  Still no buffer available */
630 
631   return NULL;
632 }
633 
634 /*  BGETZ  --  Allocate a buffer and clear its contents to zero.  We clear
635                the  entire  contents  of  the buffer to zero, not just the
636                region requested by the caller. */
637 
638 static void *bgetz(kmp_info_t *th, bufsize size) {
639   char *buf = (char *)bget(th, size);
640 
641   if (buf != NULL) {
642     bhead_t *b;
643     bufsize rsize;
644 
645     b = BH(buf - sizeof(bhead_t));
646     rsize = -(b->bb.bsize);
647     if (rsize == 0) {
648       bdhead_t *bd;
649 
650       bd = BDH(buf - sizeof(bdhead_t));
651       rsize = bd->tsize - (bufsize)sizeof(bdhead_t);
652     } else {
653       rsize -= sizeof(bhead_t);
654     }
655 
656     KMP_DEBUG_ASSERT(rsize >= size);
657 
658     (void)memset(buf, 0, (bufsize)rsize);
659   }
660   return ((void *)buf);
661 }
662 
663 /*  BGETR  --  Reallocate a buffer.  This is a minimal implementation,
664                simply in terms of brel()  and  bget().   It  could  be
665                enhanced to allow the buffer to grow into adjacent free
666                blocks and to avoid moving data unnecessarily.  */
667 
668 static void *bgetr(kmp_info_t *th, void *buf, bufsize size) {
669   void *nbuf;
670   bufsize osize; /* Old size of buffer */
671   bhead_t *b;
672 
673   nbuf = bget(th, size);
674   if (nbuf == NULL) { /* Acquire new buffer */
675     return NULL;
676   }
677   if (buf == NULL) {
678     return nbuf;
679   }
680   b = BH(((char *)buf) - sizeof(bhead_t));
681   osize = -b->bb.bsize;
682   if (osize == 0) {
683     /*  Buffer acquired directly through acqfcn. */
684     bdhead_t *bd;
685 
686     bd = BDH(((char *)buf) - sizeof(bdhead_t));
687     osize = bd->tsize - (bufsize)sizeof(bdhead_t);
688   } else {
689     osize -= sizeof(bhead_t);
690   }
691 
692   KMP_DEBUG_ASSERT(osize > 0);
693 
694   (void)KMP_MEMCPY((char *)nbuf, (char *)buf, /* Copy the data */
695                    (size_t)((size < osize) ? size : osize));
696   brel(th, buf);
697 
698   return nbuf;
699 }
700 
701 /*  BREL  --  Release a buffer.  */
702 static void brel(kmp_info_t *th, void *buf) {
703   thr_data_t *thr = get_thr_data(th);
704   bfhead_t *b, *bn;
705   kmp_info_t *bth;
706 
707   KMP_DEBUG_ASSERT(buf != NULL);
708   KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
709 
710   b = BFH(((char *)buf) - sizeof(bhead_t));
711 
712   if (b->bh.bb.bsize == 0) { /* Directly-acquired buffer? */
713     bdhead_t *bdh;
714 
715     bdh = BDH(((char *)buf) - sizeof(bdhead_t));
716     KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
717 #if BufStats
718     thr->totalloc -= (size_t)bdh->tsize;
719     thr->numdrel++; /* Number of direct releases */
720     thr->numrel++; /* Increment number of brel() calls */
721 #endif /* BufStats */
722 #ifdef FreeWipe
723     (void)memset((char *)buf, 0x55, (size_t)(bdh->tsize - sizeof(bdhead_t)));
724 #endif /* FreeWipe */
725 
726     KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)bdh));
727 
728     KMP_DEBUG_ASSERT(thr->relfcn != 0);
729     (*thr->relfcn)((void *)bdh); /* Release it directly. */
730     return;
731   }
732 
733   bth = (kmp_info_t *)((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) &
734                        ~1); // clear possible mark before comparison
735   if (bth != th) {
736     /* Add this buffer to be released by the owning thread later */
737     __kmp_bget_enqueue(bth, buf
738 #ifdef USE_QUEUING_LOCK_FOR_BGET
739                        ,
740                        __kmp_gtid_from_thread(th)
741 #endif
742                            );
743     return;
744   }
745 
746   /* Buffer size must be negative, indicating that the buffer is allocated. */
747   if (b->bh.bb.bsize >= 0) {
748     bn = NULL;
749   }
750   KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);
751 
752   /*  Back pointer in next buffer must be zero, indicating the same thing: */
753 
754   KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.bsize)->bb.prevfree == 0);
755 
756 #if BufStats
757   thr->numrel++; /* Increment number of brel() calls */
758   thr->totalloc += (size_t)b->bh.bb.bsize;
759 #endif
760 
761   /* If the back link is nonzero, the previous buffer is free.  */
762 
763   if (b->bh.bb.prevfree != 0) {
764     /* The previous buffer is free. Consolidate this buffer with it by adding
765        the length of this buffer to the previous free buffer. Note that we
766        subtract the size in the buffer being released, since it's negative to
767        indicate that the buffer is allocated. */
768     bufsize size = b->bh.bb.bsize;
769 
770     /* Make the previous buffer the one we're working on. */
771     KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.prevfree)->bb.bsize ==
772                      b->bh.bb.prevfree);
773     b = BFH(((char *)b) - b->bh.bb.prevfree);
774     b->bh.bb.bsize -= size;
775 
776     /* unlink the buffer from the old freelist */
777     __kmp_bget_remove_from_freelist(b);
778   } else {
    /* The previous buffer is allocated. Mark this buffer's size as positive
       (i.e. free) and fall through to place the buffer on the free list as an
       isolated free block. */
782     b->bh.bb.bsize = -b->bh.bb.bsize;
783   }
784 
785   /* insert buffer back onto a new freelist */
786   __kmp_bget_insert_into_freelist(thr, b);
787 
788   /* Now we look at the next buffer in memory, located by advancing from
789      the  start  of  this  buffer  by its size, to see if that buffer is
790      free.  If it is, we combine  this  buffer  with  the  next  one  in
791      memory, dechaining the second buffer from the free list. */
792   bn = BFH(((char *)b) + b->bh.bb.bsize);
793   if (bn->bh.bb.bsize > 0) {
794 
795     /* The buffer is free.  Remove it from the free list and add
796        its size to that of our buffer. */
797     KMP_DEBUG_ASSERT(BH((char *)bn + bn->bh.bb.bsize)->bb.prevfree ==
798                      bn->bh.bb.bsize);
799 
800     __kmp_bget_remove_from_freelist(bn);
801 
802     b->bh.bb.bsize += bn->bh.bb.bsize;
803 
804     /* unlink the buffer from the old freelist, and reinsert it into the new
805      * freelist */
806     __kmp_bget_remove_from_freelist(b);
807     __kmp_bget_insert_into_freelist(thr, b);
808 
809     /* Finally,  advance  to   the  buffer  that   follows  the  newly
810        consolidated free block.  We must set its  backpointer  to  the
811        head  of  the  consolidated free block.  We know the next block
812        must be an allocated block because the process of recombination
813        guarantees  that  two  free  blocks will never be contiguous in
814        memory.  */
815     bn = BFH(((char *)b) + b->bh.bb.bsize);
816   }
817 #ifdef FreeWipe
818   (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
819                (size_t)(b->bh.bb.bsize - sizeof(bfhead_t)));
820 #endif
821   KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);
822 
823   /* The next buffer is allocated.  Set the backpointer in it  to  point
824      to this buffer; the previous free buffer in memory. */
825 
826   bn->bh.bb.prevfree = b->bh.bb.bsize;
827 
828   /*  If  a  block-release function is defined, and this free buffer
829       constitutes the entire block, release it.  Note that  pool_len
830       is  defined  in  such a way that the test will fail unless all
831       pool blocks are the same size.  */
832   if (thr->relfcn != 0 &&
833       b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
834 #if BufStats
835     if (thr->numpblk !=
836         1) { /* Do not release the last buffer until finalization time */
837 #endif
838 
839       KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
840       KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
841       KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
842                        b->bh.bb.bsize);
843 
844       /*  Unlink the buffer from the free list  */
845       __kmp_bget_remove_from_freelist(b);
846 
847       KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
848 
849       (*thr->relfcn)(b);
850 #if BufStats
851       thr->numprel++; /* Nr of expansion block releases */
852       thr->numpblk--; /* Total number of blocks */
853       KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
854 
      // avoid leaving a stale last_pool pointer around if it is being
      // deallocated
856       if (thr->last_pool == b)
857         thr->last_pool = 0;
858     } else {
859       thr->last_pool = b;
860     }
861 #endif /* BufStats */
862   }
863 }
864 
865 /*  BECTL  --  Establish automatic pool expansion control  */
866 static void bectl(kmp_info_t *th, bget_compact_t compact,
867                   bget_acquire_t acquire, bget_release_t release,
868                   bufsize pool_incr) {
869   thr_data_t *thr = get_thr_data(th);
870 
871   thr->compfcn = compact;
872   thr->acqfcn = acquire;
873   thr->relfcn = release;
874   thr->exp_incr = pool_incr;
875 }
876 
877 /*  BPOOL  --  Add a region of memory to the buffer pool.  */
878 static void bpool(kmp_info_t *th, void *buf, bufsize len) {
879   /*    int bin = 0; */
880   thr_data_t *thr = get_thr_data(th);
881   bfhead_t *b = BFH(buf);
882   bhead_t *bn;
883 
884   __kmp_bget_dequeue(th); /* Release any queued buffers */
885 
886 #ifdef SizeQuant
887   len &= ~(SizeQuant - 1);
888 #endif
889   if (thr->pool_len == 0) {
890     thr->pool_len = len;
891   } else if (len != thr->pool_len) {
892     thr->pool_len = -1;
893   }
894 #if BufStats
895   thr->numpget++; /* Number of block acquisitions */
896   thr->numpblk++; /* Number of blocks total */
897   KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
898 #endif /* BufStats */
899 
900   /* Since the block is initially occupied by a single free  buffer,
901      it  had  better  not  be  (much) larger than the largest buffer
902      whose size we can store in bhead.bb.bsize. */
903   KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize)ESent + 1));
904 
905   /* Clear  the  backpointer at  the start of the block to indicate that
906      there  is  no  free  block  prior  to  this   one.    That   blocks
907      recombination when the first block in memory is released. */
908   b->bh.bb.prevfree = 0;
909 
910   /* Create a dummy allocated buffer at the end of the pool.  This dummy
911      buffer is seen when a buffer at the end of the pool is released and
912      blocks  recombination  of  the last buffer with the dummy buffer at
913      the end.  The length in the dummy buffer  is  set  to  the  largest
914      negative  number  to  denote  the  end  of  the pool for diagnostic
915      routines (this specific value is  not  counted  on  by  the  actual
916      allocation and release functions). */
917   len -= sizeof(bhead_t);
918   b->bh.bb.bsize = (bufsize)len;
919   /* Set the owner of this buffer */
920   TCW_PTR(b->bh.bb.bthr,
921           (kmp_info_t *)((kmp_uintptr_t)th |
922                          1)); // mark the buffer as allocated address
923 
924   /* Chain the new block to the free list. */
925   __kmp_bget_insert_into_freelist(thr, b);
926 
927 #ifdef FreeWipe
928   (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
929                (size_t)(len - sizeof(bfhead_t)));
930 #endif
931   bn = BH(((char *)b) + len);
932   bn->bb.prevfree = (bufsize)len;
933   /* Definition of ESent assumes two's complement! */
934   KMP_DEBUG_ASSERT((~0) == -1 && (bn != 0));
935 
936   bn->bb.bsize = ESent;
937 }
938 
939 /*  BFREED  --  Dump the free lists for this thread. */
940 static void bfreed(kmp_info_t *th) {
941   int bin = 0, count = 0;
942   int gtid = __kmp_gtid_from_thread(th);
943   thr_data_t *thr = get_thr_data(th);
944 
945 #if BufStats
946   __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC
947                        " get=%" KMP_INT64_SPEC " rel=%" KMP_INT64_SPEC
948                        " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC
949                        " prel=%" KMP_INT64_SPEC " dget=%" KMP_INT64_SPEC
950                        " drel=%" KMP_INT64_SPEC "\n",
951                        gtid, (kmp_uint64)thr->totalloc, (kmp_int64)thr->numget,
952                        (kmp_int64)thr->numrel, (kmp_int64)thr->numpblk,
953                        (kmp_int64)thr->numpget, (kmp_int64)thr->numprel,
954                        (kmp_int64)thr->numdget, (kmp_int64)thr->numdrel);
955 #endif
956 
957   for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
958     bfhead_t *b;
959 
960     for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin];
961          b = b->ql.flink) {
962       bufsize bs = b->bh.bb.bsize;
963 
964       KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
965       KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
966       KMP_DEBUG_ASSERT(bs > 0);
967 
968       count += 1;
969 
970       __kmp_printf_no_lock(
971           "__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b,
972           (long)bs);
973 #ifdef FreeWipe
974       {
975         char *lerr = ((char *)b) + sizeof(bfhead_t);
976         if ((bs > sizeof(bfhead_t)) &&
977             ((*lerr != 0x55) ||
978              (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
979               0))) {
980           __kmp_printf_no_lock("__kmp_printpool: T#%d     (Contents of above "
981                                "free block have been overstored.)\n",
982                                gtid);
983         }
984       }
985 #endif
986     }
987   }
988 
989   if (count == 0)
990     __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid);
991 }
992 
993 void __kmp_initialize_bget(kmp_info_t *th) {
994   KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0));
995 
996   set_thr_data(th);
997 
998   bectl(th, (bget_compact_t)0, (bget_acquire_t)malloc, (bget_release_t)free,
999         (bufsize)__kmp_malloc_pool_incr);
1000 }
1001 
1002 void __kmp_finalize_bget(kmp_info_t *th) {
1003   thr_data_t *thr;
1004   bfhead_t *b;
1005 
1006   KMP_DEBUG_ASSERT(th != 0);
1007 
1008 #if BufStats
1009   thr = (thr_data_t *)th->th.th_local.bget_data;
1010   KMP_DEBUG_ASSERT(thr != NULL);
1011   b = thr->last_pool;
1012 
1013   /*  If a block-release function is defined, and this free buffer constitutes
1014       the entire block, release it. Note that pool_len is defined in such a way
1015       that the test will fail unless all pool blocks are the same size.  */
1016 
1017   // Deallocate the last pool if one exists because we no longer do it in brel()
1018   if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&
1019       b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
1020     KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
1021     KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
1022     KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
1023                      b->bh.bb.bsize);
1024 
1025     /*  Unlink the buffer from the free list  */
1026     __kmp_bget_remove_from_freelist(b);
1027 
1028     KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
1029 
1030     (*thr->relfcn)(b);
1031     thr->numprel++; /* Nr of expansion block releases */
1032     thr->numpblk--; /* Total number of blocks */
1033     KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
1034   }
1035 #endif /* BufStats */
1036 
1037   /* Deallocate bget_data */
1038   if (th->th.th_local.bget_data != NULL) {
1039     __kmp_free(th->th.th_local.bget_data);
1040     th->th.th_local.bget_data = NULL;
1041   }
1042 }
1043 
1044 void kmpc_set_poolsize(size_t size) {
1045   bectl(__kmp_get_thread(), (bget_compact_t)0, (bget_acquire_t)malloc,
1046         (bget_release_t)free, (bufsize)size);
1047 }
1048 
1049 size_t kmpc_get_poolsize(void) {
1050   thr_data_t *p;
1051 
1052   p = get_thr_data(__kmp_get_thread());
1053 
1054   return p->exp_incr;
1055 }
1056 
1057 void kmpc_set_poolmode(int mode) {
1058   thr_data_t *p;
1059 
1060   if (mode == bget_mode_fifo || mode == bget_mode_lifo ||
1061       mode == bget_mode_best) {
1062     p = get_thr_data(__kmp_get_thread());
1063     p->mode = (bget_mode_t)mode;
1064   }
1065 }
1066 
1067 int kmpc_get_poolmode(void) {
1068   thr_data_t *p;
1069 
1070   p = get_thr_data(__kmp_get_thread());
1071 
1072   return p->mode;
1073 }
1074 
1075 void kmpc_get_poolstat(size_t *maxmem, size_t *allmem) {
1076   kmp_info_t *th = __kmp_get_thread();
1077   bufsize a, b;
1078 
1079   __kmp_bget_dequeue(th); /* Release any queued buffers */
1080 
1081   bcheck(th, &a, &b);
1082 
1083   *maxmem = a;
1084   *allmem = b;
1085 }
1086 
1087 void kmpc_poolprint(void) {
1088   kmp_info_t *th = __kmp_get_thread();
1089 
1090   __kmp_bget_dequeue(th); /* Release any queued buffers */
1091 
1092   bfreed(th);
1093 }
1094 
1095 #endif // #if KMP_USE_BGET
1096 
1097 void *kmpc_malloc(size_t size) {
1098   void *ptr;
1099   ptr = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
1100   if (ptr != NULL) {
1101     // save allocated pointer just before one returned to user
1102     *(void **)ptr = ptr;
1103     ptr = (void **)ptr + 1;
1104   }
1105   return ptr;
1106 }
1107 
1108 #define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)
1109 
1110 void *kmpc_aligned_malloc(size_t size, size_t alignment) {
1111   void *ptr;
1112   void *ptr_allocated;
1113   KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big
1114   if (!IS_POWER_OF_TWO(alignment)) {
1115     // AC: do we need to issue a warning here?
1116     errno = EINVAL;
1117     return NULL;
1118   }
1119   size = size + sizeof(void *) + alignment;
1120   ptr_allocated = bget(__kmp_entry_thread(), (bufsize)size);
1121   if (ptr_allocated != NULL) {
1122     // save allocated pointer just before one returned to user
1123     ptr = (void *)(((kmp_uintptr_t)ptr_allocated + sizeof(void *) + alignment) &
1124                    ~(alignment - 1));
1125     *((void **)ptr - 1) = ptr_allocated;
1126   } else {
1127     ptr = NULL;
1128   }
1129   return ptr;
1130 }
1131 
1132 void *kmpc_calloc(size_t nelem, size_t elsize) {
1133   void *ptr;
1134   ptr = bgetz(__kmp_entry_thread(), (bufsize)(nelem * elsize + sizeof(ptr)));
1135   if (ptr != NULL) {
1136     // save allocated pointer just before one returned to user
1137     *(void **)ptr = ptr;
1138     ptr = (void **)ptr + 1;
1139   }
1140   return ptr;
1141 }
1142 
1143 void *kmpc_realloc(void *ptr, size_t size) {
1144   void *result = NULL;
1145   if (ptr == NULL) {
1146     // If pointer is NULL, realloc behaves like malloc.
1147     result = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
1148     // save allocated pointer just before one returned to user
1149     if (result != NULL) {
1150       *(void **)result = result;
1151       result = (void **)result + 1;
1152     }
1153   } else if (size == 0) {
1154     // If size is 0, realloc behaves like free.
1155     // The thread must be registered by the call to kmpc_malloc() or
1156     // kmpc_calloc() before.
1157     // So it should be safe to call __kmp_get_thread(), not
1158     // __kmp_entry_thread().
1159     KMP_ASSERT(*((void **)ptr - 1));
1160     brel(__kmp_get_thread(), *((void **)ptr - 1));
1161   } else {
1162     result = bgetr(__kmp_entry_thread(), *((void **)ptr - 1),
1163                    (bufsize)(size + sizeof(ptr)));
1164     if (result != NULL) {
1165       *(void **)result = result;
1166       result = (void **)result + 1;
1167     }
1168   }
1169   return result;
1170 }
1171 
1172 // NOTE: the library must have already been initialized by a previous allocate
1173 void kmpc_free(void *ptr) {
1174   if (!__kmp_init_serial) {
1175     return;
1176   }
1177   if (ptr != NULL) {
1178     kmp_info_t *th = __kmp_get_thread();
1179     __kmp_bget_dequeue(th); /* Release any queued buffers */
1180     // extract allocated pointer and free it
1181     KMP_ASSERT(*((void **)ptr - 1));
1182     brel(th, *((void **)ptr - 1));
1183   }
1184 }
1185 
1186 void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL) {
1187   void *ptr;
1188   KE_TRACE(30, ("-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n", th,
1189                 (int)size KMP_SRC_LOC_PARM));
1190   ptr = bget(th, (bufsize)size);
1191   KE_TRACE(30, ("<- __kmp_thread_malloc() returns %p\n", ptr));
1192   return ptr;
1193 }
1194 
1195 void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,
1196                            size_t elsize KMP_SRC_LOC_DECL) {
1197   void *ptr;
1198   KE_TRACE(30, ("-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n", th,
1199                 (int)nelem, (int)elsize KMP_SRC_LOC_PARM));
1200   ptr = bgetz(th, (bufsize)(nelem * elsize));
1201   KE_TRACE(30, ("<- __kmp_thread_calloc() returns %p\n", ptr));
1202   return ptr;
1203 }
1204 
1205 void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,
1206                             size_t size KMP_SRC_LOC_DECL) {
1207   KE_TRACE(30, ("-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n", th,
1208                 ptr, (int)size KMP_SRC_LOC_PARM));
1209   ptr = bgetr(th, ptr, (bufsize)size);
1210   KE_TRACE(30, ("<- __kmp_thread_realloc() returns %p\n", ptr));
1211   return ptr;
1212 }
1213 
1214 void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL) {
1215   KE_TRACE(30, ("-> __kmp_thread_free( %p, %p ) called from %s:%d\n", th,
1216                 ptr KMP_SRC_LOC_PARM));
1217   if (ptr != NULL) {
1218     __kmp_bget_dequeue(th); /* Release any queued buffers */
1219     brel(th, ptr);
1220   }
1221   KE_TRACE(30, ("<- __kmp_thread_free()\n"));
1222 }
1223 
1224 /* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
1225    memory leaks, but it may be useful for debugging memory corruptions, used
1226    freed pointers, etc. */
1227 /* #define LEAK_MEMORY */
1228 struct kmp_mem_descr { // Memory block descriptor.
1229   void *ptr_allocated; // Pointer returned by malloc(), subject for free().
1230   size_t size_allocated; // Size of allocated memory block.
1231   void *ptr_aligned; // Pointer to aligned memory, to be used by client code.
1232   size_t size_aligned; // Size of aligned memory block.
1233 };
1234 typedef struct kmp_mem_descr kmp_mem_descr_t;
1235 
1236 /* Allocate memory on requested boundary, fill allocated memory with 0x00.
1237    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
1238    error. Must use __kmp_free when freeing memory allocated by this routine! */
1239 static void *___kmp_allocate_align(size_t size,
1240                                    size_t alignment KMP_SRC_LOC_DECL) {
  /* __kmp_allocate() allocates (by a call to malloc()) a bigger memory block
     than requested so that it can return a properly aligned pointer. The
     original pointer returned by malloc() and the size of the allocated block
     are saved in a descriptor just before the aligned pointer. This
     information is used by __kmp_free() -- it has to pass the original
     pointer, not the aligned one, to free().
1246 
1247           +---------+------------+-----------------------------------+---------+
1248           | padding | descriptor |           aligned block           | padding |
1249           +---------+------------+-----------------------------------+---------+
1250           ^                      ^
1251           |                      |
1252           |                      +- Aligned pointer returned to caller
1253           +- Pointer returned by malloc()
1254 
1255       Aligned block is filled with zeros, paddings are filled with 0xEF. */
1256 
1257   kmp_mem_descr_t descr;
1258   kmp_uintptr_t addr_allocated; // Address returned by malloc().
1259   kmp_uintptr_t addr_aligned; // Aligned address to return to caller.
1260   kmp_uintptr_t addr_descr; // Address of memory block descriptor.
1261 
1262   KE_TRACE(25, ("-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
1263                 (int)size, (int)alignment KMP_SRC_LOC_PARM));
1264 
  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big
1266   KMP_DEBUG_ASSERT(sizeof(void *) <= sizeof(kmp_uintptr_t));
1267   // Make sure kmp_uintptr_t is enough to store addresses.
1268 
1269   descr.size_aligned = size;
1270   descr.size_allocated =
1271       descr.size_aligned + sizeof(kmp_mem_descr_t) + alignment;
1272 
1273 #if KMP_DEBUG
1274   descr.ptr_allocated = _malloc_src_loc(descr.size_allocated, _file_, _line_);
1275 #else
1276   descr.ptr_allocated = malloc_src_loc(descr.size_allocated KMP_SRC_LOC_PARM);
1277 #endif
1278   KE_TRACE(10, ("   malloc( %d ) returned %p\n", (int)descr.size_allocated,
1279                 descr.ptr_allocated));
1280   if (descr.ptr_allocated == NULL) {
1281     KMP_FATAL(OutOfHeapMemory);
1282   }
1283 
1284   addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
1285   addr_aligned =
1286       (addr_allocated + sizeof(kmp_mem_descr_t) + alignment) & ~(alignment - 1);
1287   addr_descr = addr_aligned - sizeof(kmp_mem_descr_t);
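
  /* Worked example (illustrative, assuming 64-bit pointers and
     sizeof(kmp_mem_descr_t) == 32): if malloc() returns 0x1000 and alignment
     is 64, then addr_aligned == (0x1000 + 32 + 64) & ~63 == 0x1040 and
     addr_descr == 0x1020, so the descriptor sits immediately below the
     aligned pointer and both fit inside the over-allocated block. */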
1288 
1289   descr.ptr_aligned = (void *)addr_aligned;
1290 
1291   KE_TRACE(26, ("   ___kmp_allocate_align: "
1292                 "ptr_allocated=%p, size_allocated=%d, "
1293                 "ptr_aligned=%p, size_aligned=%d\n",
1294                 descr.ptr_allocated, (int)descr.size_allocated,
1295                 descr.ptr_aligned, (int)descr.size_aligned));
1296 
1297   KMP_DEBUG_ASSERT(addr_allocated <= addr_descr);
1298   KMP_DEBUG_ASSERT(addr_descr + sizeof(kmp_mem_descr_t) == addr_aligned);
1299   KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
1300                    addr_allocated + descr.size_allocated);
1301   KMP_DEBUG_ASSERT(addr_aligned % alignment == 0);
1302 #ifdef KMP_DEBUG
1303   memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
1304 // Fill allocated memory block with 0xEF.
1305 #endif
1306   memset(descr.ptr_aligned, 0x00, descr.size_aligned);
  // Fill the aligned memory block (which is intended for use by the caller)
  // with 0x00. Do not put this filling under KMP_DEBUG condition! Many
  // callers expect zeroed memory. (Padding bytes remain filled with 0xEF in
  // the debugging library.)
1312   *((kmp_mem_descr_t *)addr_descr) = descr;
1313 
1314   KMP_MB();
1315 
1316   KE_TRACE(25, ("<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned));
1317   return descr.ptr_aligned;
1318 } // func ___kmp_allocate_align
1319 
1320 /* Allocate memory on cache line boundary, fill allocated memory with 0x00.
1321    Do not call this func directly! Use __kmp_allocate macro instead.
1322    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
1323    error. Must use __kmp_free when freeing memory allocated by this routine! */
1324 void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL) {
1325   void *ptr;
1326   KE_TRACE(25, ("-> __kmp_allocate( %d ) called from %s:%d\n",
1327                 (int)size KMP_SRC_LOC_PARM));
1328   ptr = ___kmp_allocate_align(size, __kmp_align_alloc KMP_SRC_LOC_PARM);
1329   KE_TRACE(25, ("<- __kmp_allocate() returns %p\n", ptr));
1330   return ptr;
1331 } // func ___kmp_allocate
1332 
1333 /* Allocate memory on page boundary, fill allocated memory with 0x00.
   Do not call this func directly! Use __kmp_page_allocate macro instead.
1335    NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
1336    error. Must use __kmp_free when freeing memory allocated by this routine! */
1337 void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) {
1338   int page_size = 8 * 1024;
1339   void *ptr;
1340 
1341   KE_TRACE(25, ("-> __kmp_page_allocate( %d ) called from %s:%d\n",
1342                 (int)size KMP_SRC_LOC_PARM));
1343   ptr = ___kmp_allocate_align(size, page_size KMP_SRC_LOC_PARM);
1344   KE_TRACE(25, ("<- __kmp_page_allocate( %d ) returns %p\n", (int)size, ptr));
1345   return ptr;
1346 } // ___kmp_page_allocate
1347 
1348 /* Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
1349    In debug mode, fill the memory block with 0xEF before call to free(). */
1350 void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
1351   kmp_mem_descr_t descr;
1352   kmp_uintptr_t addr_allocated; // Address returned by malloc().
1353   kmp_uintptr_t addr_aligned; // Aligned address passed by caller.
1354 
1355   KE_TRACE(25,
1356            ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM));
1357   KMP_ASSERT(ptr != NULL);
1358 
1359   descr = *(kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t));
1360 
1361   KE_TRACE(26, ("   __kmp_free:     "
1362                 "ptr_allocated=%p, size_allocated=%d, "
1363                 "ptr_aligned=%p, size_aligned=%d\n",
1364                 descr.ptr_allocated, (int)descr.size_allocated,
1365                 descr.ptr_aligned, (int)descr.size_aligned));
1366 
1367   addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
1368   addr_aligned = (kmp_uintptr_t)descr.ptr_aligned;
1369 
1370   KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0);
1371   KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr);
1372   KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned);
1373   KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated);
1374   KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
1375                    addr_allocated + descr.size_allocated);
1376 
1377 #ifdef KMP_DEBUG
1378   memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
1379 // Fill memory block with 0xEF, it helps catch using freed memory.
1380 #endif
1381 
1382 #ifndef LEAK_MEMORY
1383   KE_TRACE(10, ("   free( %p )\n", descr.ptr_allocated));
1384 #ifdef KMP_DEBUG
1385   _free_src_loc(descr.ptr_allocated, _file_, _line_);
1386 #else
1387   free_src_loc(descr.ptr_allocated KMP_SRC_LOC_PARM);
1388 #endif
1389 #endif
1390   KMP_MB();
1391   KE_TRACE(25, ("<- __kmp_free() returns\n"));
1392 } // func ___kmp_free
1393 
1394 #if USE_FAST_MEMORY == 3
1395 // Allocate fast memory by first scanning the thread's free lists
1396 // If a chunk the right size exists, grab it off the free list.
1397 // Otherwise allocate normally using kmp_thread_malloc.
1398 
1399 // AC: How to choose the limit? Just get 16 for now...
1400 #define KMP_FREE_LIST_LIMIT 16
1401 
1402 // Always use 128 bytes for determining buckets for caching memory blocks
1403 #define DCACHE_LINE 128
1404 
1405 void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
1406   void *ptr;
1407   int num_lines;
1408   int idx;
1409   int index;
1410   void *alloc_ptr;
1411   size_t alloc_size;
1412   kmp_mem_descr_t *descr;
1413 
1414   KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
1415                 __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM));
1416 
1417   num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE;
1418   idx = num_lines - 1;
1419   KMP_DEBUG_ASSERT(idx >= 0);
1420   if (idx < 2) {
1421     index = 0; // idx is [ 0, 1 ], use first free list
1422     num_lines = 2; // 1, 2 cache lines or less than cache line
1423   } else if ((idx >>= 2) == 0) {
1424     index = 1; // idx is [ 2, 3 ], use second free list
1425     num_lines = 4; // 3, 4 cache lines
1426   } else if ((idx >>= 2) == 0) {
1427     index = 2; // idx is [ 4, 15 ], use third free list
1428     num_lines = 16; // 5, 6, ..., 16 cache lines
1429   } else if ((idx >>= 2) == 0) {
1430     index = 3; // idx is [ 16, 63 ], use fourth free list
1431     num_lines = 64; // 17, 18, ..., 64 cache lines
1432   } else {
1433     goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
1434   }
1435 
1436   ptr = this_thr->th.th_free_lists[index].th_free_list_self;
1437   if (ptr != NULL) {
1438     // pop the head of no-sync free list
1439     this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
1440     KMP_DEBUG_ASSERT(
1441         this_thr ==
1442         ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
1443             ->ptr_aligned);
1444     goto end;
1445   }
1446   ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
1447   if (ptr != NULL) {
1448     // no-sync free list is empty, use sync free list (filled in by other
1449     // threads only)
1450     // pop the head of the sync free list, push NULL instead
1451     while (!KMP_COMPARE_AND_STORE_PTR(
1452         &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) {
1453       KMP_CPU_PAUSE();
1454       ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
1455     }
    // push the rest of the chain onto the no-sync free list (can be NULL if
    // there was only one block)
1458     this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
1459     KMP_DEBUG_ASSERT(
1460         this_thr ==
1461         ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
1462             ->ptr_aligned);
1463     goto end;
1464   }
1465 
1466 alloc_call:
1467   // haven't found block in the free lists, thus allocate it
1468   size = num_lines * DCACHE_LINE;
1469 
1470   alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE;
1471   KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with "
1472                 "alloc_size %d\n",
1473                 __kmp_gtid_from_thread(this_thr), alloc_size));
1474   alloc_ptr = bget(this_thr, (bufsize)alloc_size);
1475 
1476   // align ptr to DCACHE_LINE
1477   ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) +
1478                   DCACHE_LINE) &
1479                  ~(DCACHE_LINE - 1));
1480   descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
1481 
1482   descr->ptr_allocated = alloc_ptr; // remember allocated pointer
1483   // we don't need size_allocated
1484   descr->ptr_aligned = (void *)this_thr; // remember allocating thread
1485   // (it is already saved in bget buffer,
1486   // but we may want to use another allocator in future)
1487   descr->size_aligned = size;
1488 
1489 end:
1490   KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n",
1491                 __kmp_gtid_from_thread(this_thr), ptr));
1492   return ptr;
1493 } // func __kmp_fast_allocate
1494 
1495 // Free fast memory and place it on the thread's free list if it is of
1496 // the correct size.
1497 void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) {
1498   kmp_mem_descr_t *descr;
1499   kmp_info_t *alloc_thr;
1500   size_t size;
1501   size_t idx;
1502   int index;
1503 
1504   KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
1505                 __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM));
1506   KMP_ASSERT(ptr != NULL);
1507 
1508   descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
1509 
1510   KE_TRACE(26, ("   __kmp_fast_free:     size_aligned=%d\n",
1511                 (int)descr->size_aligned));
1512 
1513   size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines
1514 
1515   idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block
1516   if (idx == size) {
1517     index = 0; // 2 cache lines
1518   } else if ((idx <<= 1) == size) {
1519     index = 1; // 4 cache lines
1520   } else if ((idx <<= 2) == size) {
1521     index = 2; // 16 cache lines
1522   } else if ((idx <<= 2) == size) {
1523     index = 3; // 64 cache lines
1524   } else {
1525     KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64);
1526     goto free_call; // 65 or more cache lines ( > 8KB )
1527   }
1528 
1529   alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
1530   if (alloc_thr == this_thr) {
1531     // push block to self no-sync free list, linking previous head (LIFO)
1532     *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
1533     this_thr->th.th_free_lists[index].th_free_list_self = ptr;
1534   } else {
1535     void *head = this_thr->th.th_free_lists[index].th_free_list_other;
1536     if (head == NULL) {
1537       // Create new free list
1538       this_thr->th.th_free_lists[index].th_free_list_other = ptr;
1539       *((void **)ptr) = NULL; // mark the tail of the list
1540       descr->size_allocated = (size_t)1; // head of the list keeps its length
1541     } else {
      // need to check the existing "other" list's owner thread and queue size
1543       kmp_mem_descr_t *dsc =
1544           (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t));
1545       // allocating thread, same for all queue nodes
1546       kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned);
1547       size_t q_sz =
          dsc->size_allocated + 1; // new size in case we add the current block
1549       if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) {
        // we can add the current block to the "other" list, no sync needed
1551         *((void **)ptr) = head;
1552         descr->size_allocated = q_sz;
1553         this_thr->th.th_free_lists[index].th_free_list_other = ptr;
1554       } else {
        // either the queue blocks' owner is changing or the size limit was
        // exceeded: return the old queue to the allocating thread (q_th)
        // synchronously, and start a new list for alloc_thr's blocks
1558         void *old_ptr;
1559         void *tail = head;
1560         void *next = *((void **)head);
1561         while (next != NULL) {
1562           KMP_DEBUG_ASSERT(
1563               // queue size should decrease by 1 each step through the list
1564               ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t)))
1565                       ->size_allocated +
1566                   1 ==
1567               ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t)))
1568                   ->size_allocated);
1569           tail = next; // remember tail node
1570           next = *((void **)next);
1571         }
1572         KMP_DEBUG_ASSERT(q_th != NULL);
1573         // push block to owner's sync free list
1574         old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
1575         /* the next pointer must be set before setting free_list to ptr to avoid
1576            exposing a broken list to other threads, even for an instant. */
1577         *((void **)tail) = old_ptr;
1578 
1579         while (!KMP_COMPARE_AND_STORE_PTR(
1580             &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) {
1581           KMP_CPU_PAUSE();
1582           old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
1583           *((void **)tail) = old_ptr;
1584         }
1585 
        // start a new list of not-self blocks
1587         this_thr->th.th_free_lists[index].th_free_list_other = ptr;
1588         *((void **)ptr) = NULL;
1589         descr->size_allocated = (size_t)1; // head of queue keeps its length
1590       }
1591     }
1592   }
1593   goto end;
1594 
1595 free_call:
1596   KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n",
1597                 __kmp_gtid_from_thread(this_thr), size));
1598   __kmp_bget_dequeue(this_thr); /* Release any queued buffers */
1599   brel(this_thr, descr->ptr_allocated);
1600 
1601 end:
1602   KE_TRACE(25, ("<- __kmp_fast_free() returns\n"));
1603 
1604 } // func __kmp_fast_free
1605 
1606 // Initialize the thread free lists related to fast memory
1607 // Only do this when a thread is initially created.
1608 void __kmp_initialize_fast_memory(kmp_info_t *this_thr) {
1609   KE_TRACE(10, ("__kmp_initialize_fast_memory: Called from th %p\n", this_thr));
1610 
1611   memset(this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof(kmp_free_list_t));
1612 }
1613 
1614 // Free the memory in the thread free lists related to fast memory
1615 // Only do this when a thread is being reaped (destroyed).
1616 void __kmp_free_fast_memory(kmp_info_t *th) {
  // Assuming BGET is the underlying allocator, walk through its structures...
1618   int bin;
1619   thr_data_t *thr = get_thr_data(th);
1620   void **lst = NULL;
1621 
1622   KE_TRACE(
1623       5, ("__kmp_free_fast_memory: Called T#%d\n", __kmp_gtid_from_thread(th)));
1624 
1625   __kmp_bget_dequeue(th); // Release any queued buffers
1626 
1627   // Dig through free lists and extract all allocated blocks
1628   for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
1629     bfhead_t *b = thr->freelist[bin].ql.flink;
1630     while (b != &thr->freelist[bin]) {
      // the mark bit set by bpool() flags b as the start of a pool block
      if ((kmp_uintptr_t)b->bh.bb.bthr & 1) {
        *((void **)b) = lst; // link the list (overwrites bthr, keeps flink)
        lst = (void **)b; // push b onto lst
1635       }
1636       b = b->ql.flink; // get next buffer
1637     }
1638   }
1639   while (lst != NULL) {
1640     void *next = *lst;
1641     KE_TRACE(10, ("__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
1642                   lst, next, th, __kmp_gtid_from_thread(th)));
1643     (*thr->relfcn)(lst);
1644 #if BufStats
1645     // count blocks to prevent problems in __kmp_finalize_bget()
1646     thr->numprel++; /* Nr of expansion block releases */
1647     thr->numpblk--; /* Total number of blocks */
1648 #endif
1649     lst = (void **)next;
1650   }
1651 
1652   KE_TRACE(
1653       5, ("__kmp_free_fast_memory: Freed T#%d\n", __kmp_gtid_from_thread(th)));
1654 }
1655 
1656 #endif // USE_FAST_MEMORY
1657