1 /*
2  * kmp_lock.cpp -- lock-related functions
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include <stddef.h>
15 #include <atomic>
16 
17 #include "kmp.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_lock.h"
22 
23 #include "tsan_annotations.h"
24 
25 #if KMP_USE_FUTEX
26 #include <sys/syscall.h>
27 #include <unistd.h>
// We should really include <futex.h>, but that causes compatibility problems
// on different Linux* OS distributions, which either require that you include
// (or break when you try to include) <pci/types.h>. Since all we need is the
// two macros below (which are part of the kernel ABI, so they can't change),
// we just define the constants here and don't include <futex.h>.
33 #ifndef FUTEX_WAIT
34 #define FUTEX_WAIT 0
35 #endif
36 #ifndef FUTEX_WAKE
37 #define FUTEX_WAKE 1
38 #endif
39 #endif
40 
41 /* Implement spin locks for internal library use.             */
42 /* The algorithm implemented is Lamport's bakery lock [1974]. */
43 
44 void __kmp_validate_locks(void) {
45   int i;
46   kmp_uint32 x, y;
47 
  /* Check to make sure unsigned arithmetic wraps properly */
49   x = ~((kmp_uint32)0) - 2;
50   y = x - 2;
51 
52   for (i = 0; i < 8; ++i, ++x, ++y) {
53     kmp_uint32 z = (x - y);
54     KMP_ASSERT(z == 2);
55   }
56 
57   KMP_ASSERT(offsetof(kmp_base_queuing_lock, tail_id) % 8 == 0);
58 }
59 
60 /* ------------------------------------------------------------------------ */
61 /* test and set locks */
62 
63 // For the non-nested locks, we can only assume that the first 4 bytes were
64 // allocated, since gcc only allocates 4 bytes for omp_lock_t, and the Intel
// compiler only allocates a 4-byte pointer on IA-32 architecture.  On
66 // Windows* OS on Intel(R) 64, we can assume that all 8 bytes were allocated.
67 //
68 // gcc reserves >= 8 bytes for nested locks, so we can assume that the
69 // entire 8 bytes were allocated for nested locks on all 64-bit platforms.
70 
71 static kmp_int32 __kmp_get_tas_lock_owner(kmp_tas_lock_t *lck) {
72   return KMP_LOCK_STRIP(TCR_4(lck->lk.poll)) - 1;
73 }
74 
75 static inline bool __kmp_is_tas_lock_nestable(kmp_tas_lock_t *lck) {
76   return lck->lk.depth_locked != -1;
77 }
78 
79 __forceinline static int
80 __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) {
81   KMP_MB();
82 
83 #ifdef USE_LOCK_PROFILE
84   kmp_uint32 curr = KMP_LOCK_STRIP(TCR_4(lck->lk.poll));
85   if ((curr != 0) && (curr != gtid + 1))
86     __kmp_printf("LOCK CONTENTION: %p\n", lck);
87 /* else __kmp_printf( "." );*/
88 #endif /* USE_LOCK_PROFILE */
89 
90   if ((lck->lk.poll == KMP_LOCK_FREE(tas)) &&
91       KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
92                                   KMP_LOCK_BUSY(gtid + 1, tas))) {
93     KMP_FSYNC_ACQUIRED(lck);
94     return KMP_LOCK_ACQUIRED_FIRST;
95   }
96 
97   kmp_uint32 spins;
98   KMP_FSYNC_PREPARE(lck);
99   KMP_INIT_YIELD(spins);
100   if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
101     KMP_YIELD(TRUE);
102   } else {
103     KMP_YIELD_SPIN(spins);
104   }
105 
106   kmp_backoff_t backoff = __kmp_spin_backoff_params;
107   while ((lck->lk.poll != KMP_LOCK_FREE(tas)) ||
108          (!KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
109                                        KMP_LOCK_BUSY(gtid + 1, tas)))) {
110 
111     __kmp_spin_backoff(&backoff);
112     if (TCR_4(__kmp_nth) >
113         (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
114       KMP_YIELD(TRUE);
115     } else {
116       KMP_YIELD_SPIN(spins);
117     }
118   }
119   KMP_FSYNC_ACQUIRED(lck);
120   return KMP_LOCK_ACQUIRED_FIRST;
121 }
122 
123 int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
124   int retval = __kmp_acquire_tas_lock_timed_template(lck, gtid);
125   ANNOTATE_TAS_ACQUIRED(lck);
126   return retval;
127 }
128 
129 static int __kmp_acquire_tas_lock_with_checks(kmp_tas_lock_t *lck,
130                                               kmp_int32 gtid) {
131   char const *const func = "omp_set_lock";
132   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
133       __kmp_is_tas_lock_nestable(lck)) {
134     KMP_FATAL(LockNestableUsedAsSimple, func);
135   }
136   if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) == gtid)) {
137     KMP_FATAL(LockIsAlreadyOwned, func);
138   }
139   return __kmp_acquire_tas_lock(lck, gtid);
140 }
141 
142 int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
143   if ((lck->lk.poll == KMP_LOCK_FREE(tas)) &&
144       KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
145                                   KMP_LOCK_BUSY(gtid + 1, tas))) {
146     KMP_FSYNC_ACQUIRED(lck);
147     return TRUE;
148   }
149   return FALSE;
150 }
151 
152 static int __kmp_test_tas_lock_with_checks(kmp_tas_lock_t *lck,
153                                            kmp_int32 gtid) {
154   char const *const func = "omp_test_lock";
155   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
156       __kmp_is_tas_lock_nestable(lck)) {
157     KMP_FATAL(LockNestableUsedAsSimple, func);
158   }
159   return __kmp_test_tas_lock(lck, gtid);
160 }
161 
162 int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
163   KMP_MB(); /* Flush all pending memory write invalidates.  */
164 
165   KMP_FSYNC_RELEASING(lck);
166   ANNOTATE_TAS_RELEASED(lck);
167   KMP_ST_REL32(&(lck->lk.poll), KMP_LOCK_FREE(tas));
168   KMP_MB(); /* Flush all pending memory write invalidates.  */
169 
170   KMP_YIELD(TCR_4(__kmp_nth) >
171             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
172   return KMP_LOCK_RELEASED;
173 }
174 
175 static int __kmp_release_tas_lock_with_checks(kmp_tas_lock_t *lck,
176                                               kmp_int32 gtid) {
177   char const *const func = "omp_unset_lock";
178   KMP_MB(); /* in case another processor initialized lock */
179   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
180       __kmp_is_tas_lock_nestable(lck)) {
181     KMP_FATAL(LockNestableUsedAsSimple, func);
182   }
183   if (__kmp_get_tas_lock_owner(lck) == -1) {
184     KMP_FATAL(LockUnsettingFree, func);
185   }
186   if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) >= 0) &&
187       (__kmp_get_tas_lock_owner(lck) != gtid)) {
188     KMP_FATAL(LockUnsettingSetByAnother, func);
189   }
190   return __kmp_release_tas_lock(lck, gtid);
191 }
192 
193 void __kmp_init_tas_lock(kmp_tas_lock_t *lck) {
194   TCW_4(lck->lk.poll, KMP_LOCK_FREE(tas));
195 }
196 
197 static void __kmp_init_tas_lock_with_checks(kmp_tas_lock_t *lck) {
198   __kmp_init_tas_lock(lck);
199 }
200 
201 void __kmp_destroy_tas_lock(kmp_tas_lock_t *lck) { lck->lk.poll = 0; }
202 
203 static void __kmp_destroy_tas_lock_with_checks(kmp_tas_lock_t *lck) {
204   char const *const func = "omp_destroy_lock";
205   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
206       __kmp_is_tas_lock_nestable(lck)) {
207     KMP_FATAL(LockNestableUsedAsSimple, func);
208   }
209   if (__kmp_get_tas_lock_owner(lck) != -1) {
210     KMP_FATAL(LockStillOwned, func);
211   }
212   __kmp_destroy_tas_lock(lck);
213 }
214 
215 // nested test and set locks
216 
217 int __kmp_acquire_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
218   KMP_DEBUG_ASSERT(gtid >= 0);
219 
220   if (__kmp_get_tas_lock_owner(lck) == gtid) {
221     lck->lk.depth_locked += 1;
222     return KMP_LOCK_ACQUIRED_NEXT;
223   } else {
224     __kmp_acquire_tas_lock_timed_template(lck, gtid);
225     ANNOTATE_TAS_ACQUIRED(lck);
226     lck->lk.depth_locked = 1;
227     return KMP_LOCK_ACQUIRED_FIRST;
228   }
229 }
230 
231 static int __kmp_acquire_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
232                                                      kmp_int32 gtid) {
233   char const *const func = "omp_set_nest_lock";
234   if (!__kmp_is_tas_lock_nestable(lck)) {
235     KMP_FATAL(LockSimpleUsedAsNestable, func);
236   }
237   return __kmp_acquire_nested_tas_lock(lck, gtid);
238 }
239 
240 int __kmp_test_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
241   int retval;
242 
243   KMP_DEBUG_ASSERT(gtid >= 0);
244 
245   if (__kmp_get_tas_lock_owner(lck) == gtid) {
246     retval = ++lck->lk.depth_locked;
247   } else if (!__kmp_test_tas_lock(lck, gtid)) {
248     retval = 0;
249   } else {
250     KMP_MB();
251     retval = lck->lk.depth_locked = 1;
252   }
253   return retval;
254 }
255 
256 static int __kmp_test_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
257                                                   kmp_int32 gtid) {
258   char const *const func = "omp_test_nest_lock";
259   if (!__kmp_is_tas_lock_nestable(lck)) {
260     KMP_FATAL(LockSimpleUsedAsNestable, func);
261   }
262   return __kmp_test_nested_tas_lock(lck, gtid);
263 }
264 
265 int __kmp_release_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
266   KMP_DEBUG_ASSERT(gtid >= 0);
267 
268   KMP_MB();
269   if (--(lck->lk.depth_locked) == 0) {
270     __kmp_release_tas_lock(lck, gtid);
271     return KMP_LOCK_RELEASED;
272   }
273   return KMP_LOCK_STILL_HELD;
274 }
275 
276 static int __kmp_release_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
277                                                      kmp_int32 gtid) {
278   char const *const func = "omp_unset_nest_lock";
279   KMP_MB(); /* in case another processor initialized lock */
280   if (!__kmp_is_tas_lock_nestable(lck)) {
281     KMP_FATAL(LockSimpleUsedAsNestable, func);
282   }
283   if (__kmp_get_tas_lock_owner(lck) == -1) {
284     KMP_FATAL(LockUnsettingFree, func);
285   }
286   if (__kmp_get_tas_lock_owner(lck) != gtid) {
287     KMP_FATAL(LockUnsettingSetByAnother, func);
288   }
289   return __kmp_release_nested_tas_lock(lck, gtid);
290 }
291 
292 void __kmp_init_nested_tas_lock(kmp_tas_lock_t *lck) {
293   __kmp_init_tas_lock(lck);
294   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
295 }
296 
297 static void __kmp_init_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
298   __kmp_init_nested_tas_lock(lck);
299 }
300 
301 void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck) {
302   __kmp_destroy_tas_lock(lck);
303   lck->lk.depth_locked = 0;
304 }
305 
306 static void __kmp_destroy_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
307   char const *const func = "omp_destroy_nest_lock";
308   if (!__kmp_is_tas_lock_nestable(lck)) {
309     KMP_FATAL(LockSimpleUsedAsNestable, func);
310   }
311   if (__kmp_get_tas_lock_owner(lck) != -1) {
312     KMP_FATAL(LockStillOwned, func);
313   }
314   __kmp_destroy_nested_tas_lock(lck);
315 }
316 
317 #if KMP_USE_FUTEX
318 
319 /* ------------------------------------------------------------------------ */
320 /* futex locks */
321 
322 // futex locks are really just test and set locks, with a different method
323 // of handling contention.  They take the same amount of space as test and
324 // set locks, and are allocated the same way (i.e. use the area allocated by
325 // the compiler for non-nested locks / allocate nested locks on the heap).
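//
// Lock word states used in this section (summary of the code below, stated
// here for reference):
//   poll == KMP_LOCK_FREE(futex)                  lock is free
//   poll == KMP_LOCK_BUSY((gtid+1) << 1, futex)   held by gtid, no known
//                                                 waiters
//   low bit of the stripped poll value set        held, and at least one
//                                                 thread may be sleeping in
//                                                 futex_wait, so the releaser
//                                                 must issue a futex_wake
//
// A minimal sketch of the protocol, with the KMP macros spelled out as plain
// operations (illustrative only, not the runtime code):
//
//   acquire(gtid):
//     while ((old = CAS(&poll, 0, (gtid + 1) << 1)) != 0) {
//       if ((old & 1) == 0 && !CAS(&poll, old, old | 1))
//         continue;                 // lost the race to set the waiter bit
//       futex_wait(&poll, old | 1); // sleep until the lock word changes
//       // on wakeup, re-acquire with the waiter bit kept set ("gtid_code |= 1"
//       // below) so that other sleepers are not forgotten
//     }
//
//   release(gtid):
//     old = exchange(&poll, 0);
//     if (old & 1)
//       futex_wake(&poll, 1);       // wake one waiter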
326 
327 static kmp_int32 __kmp_get_futex_lock_owner(kmp_futex_lock_t *lck) {
328   return KMP_LOCK_STRIP((TCR_4(lck->lk.poll) >> 1)) - 1;
329 }
330 
331 static inline bool __kmp_is_futex_lock_nestable(kmp_futex_lock_t *lck) {
332   return lck->lk.depth_locked != -1;
333 }
334 
335 __forceinline static int
336 __kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) {
337   kmp_int32 gtid_code = (gtid + 1) << 1;
338 
339   KMP_MB();
340 
341 #ifdef USE_LOCK_PROFILE
342   kmp_uint32 curr = KMP_LOCK_STRIP(TCR_4(lck->lk.poll));
343   if ((curr != 0) && (curr != gtid_code))
344     __kmp_printf("LOCK CONTENTION: %p\n", lck);
345 /* else __kmp_printf( "." );*/
346 #endif /* USE_LOCK_PROFILE */
347 
348   KMP_FSYNC_PREPARE(lck);
349   KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d entering\n",
350                   lck, lck->lk.poll, gtid));
351 
352   kmp_int32 poll_val;
353 
354   while ((poll_val = KMP_COMPARE_AND_STORE_RET32(
355               &(lck->lk.poll), KMP_LOCK_FREE(futex),
356               KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {
357 
358     kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;
359     KA_TRACE(
360         1000,
361         ("__kmp_acquire_futex_lock: lck:%p, T#%d poll_val = 0x%x cond = 0x%x\n",
362          lck, gtid, poll_val, cond));
363 
    // NOTE: if you try to use the following condition for this branch
    //
    // if ( poll_val & 1 == 0 )
    //
    // Then the 12.0 compiler has a bug where the following block will
    // always be skipped, regardless of the value of the LSB of poll_val.
    // (Under C operator precedence that expression parses as
    // "poll_val & (1 == 0)", i.e. always 0, which is why the explicit
    // "cond" variable above is used instead.)
370     if (!cond) {
371       // Try to set the lsb in the poll to indicate to the owner
372       // thread that they need to wake this thread up.
373       if (!KMP_COMPARE_AND_STORE_REL32(&(lck->lk.poll), poll_val,
374                                        poll_val | KMP_LOCK_BUSY(1, futex))) {
375         KA_TRACE(
376             1000,
377             ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d can't set bit 0\n",
378              lck, lck->lk.poll, gtid));
379         continue;
380       }
381       poll_val |= KMP_LOCK_BUSY(1, futex);
382 
383       KA_TRACE(1000,
384                ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d bit 0 set\n", lck,
385                 lck->lk.poll, gtid));
386     }
387 
388     KA_TRACE(
389         1000,
390         ("__kmp_acquire_futex_lock: lck:%p, T#%d before futex_wait(0x%x)\n",
391          lck, gtid, poll_val));
392 
393     kmp_int32 rc;
394     if ((rc = syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAIT, poll_val, NULL,
395                       NULL, 0)) != 0) {
396       KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) "
397                       "failed (rc=%d errno=%d)\n",
398                       lck, gtid, poll_val, rc, errno));
399       continue;
400     }
401 
402     KA_TRACE(1000,
403              ("__kmp_acquire_futex_lock: lck:%p, T#%d after futex_wait(0x%x)\n",
404               lck, gtid, poll_val));
    // This thread has now done a successful futex wait call and has been
    // placed on the OS futex queue.  We must therefore perform a futex wake
    // call when releasing the lock, as we have no idea how many other threads
    // are in the queue.
408     gtid_code |= 1;
409   }
410 
411   KMP_FSYNC_ACQUIRED(lck);
412   KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
413                   lck->lk.poll, gtid));
414   return KMP_LOCK_ACQUIRED_FIRST;
415 }
416 
417 int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
418   int retval = __kmp_acquire_futex_lock_timed_template(lck, gtid);
419   ANNOTATE_FUTEX_ACQUIRED(lck);
420   return retval;
421 }
422 
423 static int __kmp_acquire_futex_lock_with_checks(kmp_futex_lock_t *lck,
424                                                 kmp_int32 gtid) {
425   char const *const func = "omp_set_lock";
426   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
427       __kmp_is_futex_lock_nestable(lck)) {
428     KMP_FATAL(LockNestableUsedAsSimple, func);
429   }
430   if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) == gtid)) {
431     KMP_FATAL(LockIsAlreadyOwned, func);
432   }
433   return __kmp_acquire_futex_lock(lck, gtid);
434 }
435 
436 int __kmp_test_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
437   if (KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(futex),
438                                   KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {
439     KMP_FSYNC_ACQUIRED(lck);
440     return TRUE;
441   }
442   return FALSE;
443 }
444 
445 static int __kmp_test_futex_lock_with_checks(kmp_futex_lock_t *lck,
446                                              kmp_int32 gtid) {
447   char const *const func = "omp_test_lock";
448   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
449       __kmp_is_futex_lock_nestable(lck)) {
450     KMP_FATAL(LockNestableUsedAsSimple, func);
451   }
452   return __kmp_test_futex_lock(lck, gtid);
453 }
454 
455 int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
456   KMP_MB(); /* Flush all pending memory write invalidates.  */
457 
458   KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d entering\n",
459                   lck, lck->lk.poll, gtid));
460 
461   KMP_FSYNC_RELEASING(lck);
462   ANNOTATE_FUTEX_RELEASED(lck);
463 
464   kmp_int32 poll_val = KMP_XCHG_FIXED32(&(lck->lk.poll), KMP_LOCK_FREE(futex));
465 
466   KA_TRACE(1000,
467            ("__kmp_release_futex_lock: lck:%p, T#%d released poll_val = 0x%x\n",
468             lck, gtid, poll_val));
469 
470   if (KMP_LOCK_STRIP(poll_val) & 1) {
471     KA_TRACE(1000,
472              ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n",
473               lck, gtid));
474     syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex),
475             NULL, NULL, 0);
476   }
477 
478   KMP_MB(); /* Flush all pending memory write invalidates.  */
479 
480   KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
481                   lck->lk.poll, gtid));
482 
483   KMP_YIELD(TCR_4(__kmp_nth) >
484             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
485   return KMP_LOCK_RELEASED;
486 }
487 
488 static int __kmp_release_futex_lock_with_checks(kmp_futex_lock_t *lck,
489                                                 kmp_int32 gtid) {
490   char const *const func = "omp_unset_lock";
491   KMP_MB(); /* in case another processor initialized lock */
492   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
493       __kmp_is_futex_lock_nestable(lck)) {
494     KMP_FATAL(LockNestableUsedAsSimple, func);
495   }
496   if (__kmp_get_futex_lock_owner(lck) == -1) {
497     KMP_FATAL(LockUnsettingFree, func);
498   }
499   if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) >= 0) &&
500       (__kmp_get_futex_lock_owner(lck) != gtid)) {
501     KMP_FATAL(LockUnsettingSetByAnother, func);
502   }
503   return __kmp_release_futex_lock(lck, gtid);
504 }
505 
506 void __kmp_init_futex_lock(kmp_futex_lock_t *lck) {
507   TCW_4(lck->lk.poll, KMP_LOCK_FREE(futex));
508 }
509 
510 static void __kmp_init_futex_lock_with_checks(kmp_futex_lock_t *lck) {
511   __kmp_init_futex_lock(lck);
512 }
513 
514 void __kmp_destroy_futex_lock(kmp_futex_lock_t *lck) { lck->lk.poll = 0; }
515 
516 static void __kmp_destroy_futex_lock_with_checks(kmp_futex_lock_t *lck) {
517   char const *const func = "omp_destroy_lock";
518   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
519       __kmp_is_futex_lock_nestable(lck)) {
520     KMP_FATAL(LockNestableUsedAsSimple, func);
521   }
522   if (__kmp_get_futex_lock_owner(lck) != -1) {
523     KMP_FATAL(LockStillOwned, func);
524   }
525   __kmp_destroy_futex_lock(lck);
526 }
527 
528 // nested futex locks
529 
530 int __kmp_acquire_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
531   KMP_DEBUG_ASSERT(gtid >= 0);
532 
533   if (__kmp_get_futex_lock_owner(lck) == gtid) {
534     lck->lk.depth_locked += 1;
535     return KMP_LOCK_ACQUIRED_NEXT;
536   } else {
537     __kmp_acquire_futex_lock_timed_template(lck, gtid);
538     ANNOTATE_FUTEX_ACQUIRED(lck);
539     lck->lk.depth_locked = 1;
540     return KMP_LOCK_ACQUIRED_FIRST;
541   }
542 }
543 
544 static int __kmp_acquire_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
545                                                        kmp_int32 gtid) {
546   char const *const func = "omp_set_nest_lock";
547   if (!__kmp_is_futex_lock_nestable(lck)) {
548     KMP_FATAL(LockSimpleUsedAsNestable, func);
549   }
550   return __kmp_acquire_nested_futex_lock(lck, gtid);
551 }
552 
553 int __kmp_test_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
554   int retval;
555 
556   KMP_DEBUG_ASSERT(gtid >= 0);
557 
558   if (__kmp_get_futex_lock_owner(lck) == gtid) {
559     retval = ++lck->lk.depth_locked;
560   } else if (!__kmp_test_futex_lock(lck, gtid)) {
561     retval = 0;
562   } else {
563     KMP_MB();
564     retval = lck->lk.depth_locked = 1;
565   }
566   return retval;
567 }
568 
569 static int __kmp_test_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
570                                                     kmp_int32 gtid) {
571   char const *const func = "omp_test_nest_lock";
572   if (!__kmp_is_futex_lock_nestable(lck)) {
573     KMP_FATAL(LockSimpleUsedAsNestable, func);
574   }
575   return __kmp_test_nested_futex_lock(lck, gtid);
576 }
577 
578 int __kmp_release_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
579   KMP_DEBUG_ASSERT(gtid >= 0);
580 
581   KMP_MB();
582   if (--(lck->lk.depth_locked) == 0) {
583     __kmp_release_futex_lock(lck, gtid);
584     return KMP_LOCK_RELEASED;
585   }
586   return KMP_LOCK_STILL_HELD;
587 }
588 
589 static int __kmp_release_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
590                                                        kmp_int32 gtid) {
591   char const *const func = "omp_unset_nest_lock";
592   KMP_MB(); /* in case another processor initialized lock */
593   if (!__kmp_is_futex_lock_nestable(lck)) {
594     KMP_FATAL(LockSimpleUsedAsNestable, func);
595   }
596   if (__kmp_get_futex_lock_owner(lck) == -1) {
597     KMP_FATAL(LockUnsettingFree, func);
598   }
599   if (__kmp_get_futex_lock_owner(lck) != gtid) {
600     KMP_FATAL(LockUnsettingSetByAnother, func);
601   }
602   return __kmp_release_nested_futex_lock(lck, gtid);
603 }
604 
605 void __kmp_init_nested_futex_lock(kmp_futex_lock_t *lck) {
606   __kmp_init_futex_lock(lck);
607   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
608 }
609 
610 static void __kmp_init_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
611   __kmp_init_nested_futex_lock(lck);
612 }
613 
614 void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck) {
615   __kmp_destroy_futex_lock(lck);
616   lck->lk.depth_locked = 0;
617 }
618 
619 static void __kmp_destroy_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
620   char const *const func = "omp_destroy_nest_lock";
621   if (!__kmp_is_futex_lock_nestable(lck)) {
622     KMP_FATAL(LockSimpleUsedAsNestable, func);
623   }
624   if (__kmp_get_futex_lock_owner(lck) != -1) {
625     KMP_FATAL(LockStillOwned, func);
626   }
627   __kmp_destroy_nested_futex_lock(lck);
628 }
629 
630 #endif // KMP_USE_FUTEX
631 
632 /* ------------------------------------------------------------------------ */
633 /* ticket (bakery) locks */
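
/* The ticket lock is the bakery algorithm specialized to two shared counters:
   each acquiring thread atomically takes the next ticket and spins until
   now_serving reaches that ticket; release simply advances now_serving.
   A minimal standalone sketch using std::atomic (illustrative only, not the
   runtime code):

     std::atomic<unsigned> next_ticket{0}, now_serving{0};

     void acquire() {
       unsigned my_ticket = next_ticket.fetch_add(1, std::memory_order_relaxed);
       while (now_serving.load(std::memory_order_acquire) != my_ticket)
         ; // spin; the runtime yields / backs off here (KMP_WAIT_YIELD_PTR)
     }

     void release() {
       now_serving.fetch_add(1, std::memory_order_release);
     }

   Unsigned wraparound keeps the distance next_ticket - now_serving meaningful
   across counter overflow; see __kmp_validate_locks at the top of this file. */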
634 
635 static kmp_int32 __kmp_get_ticket_lock_owner(kmp_ticket_lock_t *lck) {
636   return std::atomic_load_explicit(&lck->lk.owner_id,
637                                    std::memory_order_relaxed) -
638          1;
639 }
640 
641 static inline bool __kmp_is_ticket_lock_nestable(kmp_ticket_lock_t *lck) {
642   return std::atomic_load_explicit(&lck->lk.depth_locked,
643                                    std::memory_order_relaxed) != -1;
644 }
645 
646 static kmp_uint32 __kmp_bakery_check(void *now_serving, kmp_uint32 my_ticket) {
647   return std::atomic_load_explicit((std::atomic<unsigned> *)now_serving,
648                                    std::memory_order_acquire) == my_ticket;
649 }
650 
651 __forceinline static int
652 __kmp_acquire_ticket_lock_timed_template(kmp_ticket_lock_t *lck,
653                                          kmp_int32 gtid) {
654   kmp_uint32 my_ticket = std::atomic_fetch_add_explicit(
655       &lck->lk.next_ticket, 1U, std::memory_order_relaxed);
656 
657 #ifdef USE_LOCK_PROFILE
658   if (std::atomic_load_explicit(&lck->lk.now_serving,
659                                 std::memory_order_relaxed) != my_ticket)
660     __kmp_printf("LOCK CONTENTION: %p\n", lck);
661 /* else __kmp_printf( "." );*/
662 #endif /* USE_LOCK_PROFILE */
663 
664   if (std::atomic_load_explicit(&lck->lk.now_serving,
665                                 std::memory_order_acquire) == my_ticket) {
666     return KMP_LOCK_ACQUIRED_FIRST;
667   }
668   KMP_WAIT_YIELD_PTR(&lck->lk.now_serving, my_ticket, __kmp_bakery_check, lck);
669   return KMP_LOCK_ACQUIRED_FIRST;
670 }
671 
672 int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
673   int retval = __kmp_acquire_ticket_lock_timed_template(lck, gtid);
674   ANNOTATE_TICKET_ACQUIRED(lck);
675   return retval;
676 }
677 
678 static int __kmp_acquire_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
679                                                  kmp_int32 gtid) {
680   char const *const func = "omp_set_lock";
681 
682   if (!std::atomic_load_explicit(&lck->lk.initialized,
683                                  std::memory_order_relaxed)) {
684     KMP_FATAL(LockIsUninitialized, func);
685   }
686   if (lck->lk.self != lck) {
687     KMP_FATAL(LockIsUninitialized, func);
688   }
689   if (__kmp_is_ticket_lock_nestable(lck)) {
690     KMP_FATAL(LockNestableUsedAsSimple, func);
691   }
692   if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) == gtid)) {
693     KMP_FATAL(LockIsAlreadyOwned, func);
694   }
695 
696   __kmp_acquire_ticket_lock(lck, gtid);
697 
698   std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
699                              std::memory_order_relaxed);
700   return KMP_LOCK_ACQUIRED_FIRST;
701 }
702 
703 int __kmp_test_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
704   kmp_uint32 my_ticket = std::atomic_load_explicit(&lck->lk.next_ticket,
705                                                    std::memory_order_relaxed);
706 
707   if (std::atomic_load_explicit(&lck->lk.now_serving,
708                                 std::memory_order_relaxed) == my_ticket) {
709     kmp_uint32 next_ticket = my_ticket + 1;
710     if (std::atomic_compare_exchange_strong_explicit(
711             &lck->lk.next_ticket, &my_ticket, next_ticket,
712             std::memory_order_acquire, std::memory_order_acquire)) {
713       return TRUE;
714     }
715   }
716   return FALSE;
717 }
718 
719 static int __kmp_test_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
720                                               kmp_int32 gtid) {
721   char const *const func = "omp_test_lock";
722 
723   if (!std::atomic_load_explicit(&lck->lk.initialized,
724                                  std::memory_order_relaxed)) {
725     KMP_FATAL(LockIsUninitialized, func);
726   }
727   if (lck->lk.self != lck) {
728     KMP_FATAL(LockIsUninitialized, func);
729   }
730   if (__kmp_is_ticket_lock_nestable(lck)) {
731     KMP_FATAL(LockNestableUsedAsSimple, func);
732   }
733 
734   int retval = __kmp_test_ticket_lock(lck, gtid);
735 
736   if (retval) {
737     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
738                                std::memory_order_relaxed);
739   }
740   return retval;
741 }
742 
743 int __kmp_release_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
744   kmp_uint32 distance = std::atomic_load_explicit(&lck->lk.next_ticket,
745                                                   std::memory_order_relaxed) -
746                         std::atomic_load_explicit(&lck->lk.now_serving,
747                                                   std::memory_order_relaxed);
748 
749   ANNOTATE_TICKET_RELEASED(lck);
750   std::atomic_fetch_add_explicit(&lck->lk.now_serving, 1U,
751                                  std::memory_order_release);
752 
753   KMP_YIELD(distance >
754             (kmp_uint32)(__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
755   return KMP_LOCK_RELEASED;
756 }
757 
758 static int __kmp_release_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
759                                                  kmp_int32 gtid) {
760   char const *const func = "omp_unset_lock";
761 
762   if (!std::atomic_load_explicit(&lck->lk.initialized,
763                                  std::memory_order_relaxed)) {
764     KMP_FATAL(LockIsUninitialized, func);
765   }
766   if (lck->lk.self != lck) {
767     KMP_FATAL(LockIsUninitialized, func);
768   }
769   if (__kmp_is_ticket_lock_nestable(lck)) {
770     KMP_FATAL(LockNestableUsedAsSimple, func);
771   }
772   if (__kmp_get_ticket_lock_owner(lck) == -1) {
773     KMP_FATAL(LockUnsettingFree, func);
774   }
775   if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) >= 0) &&
776       (__kmp_get_ticket_lock_owner(lck) != gtid)) {
777     KMP_FATAL(LockUnsettingSetByAnother, func);
778   }
779   std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
780   return __kmp_release_ticket_lock(lck, gtid);
781 }
782 
783 void __kmp_init_ticket_lock(kmp_ticket_lock_t *lck) {
784   lck->lk.location = NULL;
785   lck->lk.self = lck;
786   std::atomic_store_explicit(&lck->lk.next_ticket, 0U,
787                              std::memory_order_relaxed);
788   std::atomic_store_explicit(&lck->lk.now_serving, 0U,
789                              std::memory_order_relaxed);
790   std::atomic_store_explicit(
791       &lck->lk.owner_id, 0,
792       std::memory_order_relaxed); // no thread owns the lock.
793   std::atomic_store_explicit(
794       &lck->lk.depth_locked, -1,
795       std::memory_order_relaxed); // -1 => not a nested lock.
796   std::atomic_store_explicit(&lck->lk.initialized, true,
797                              std::memory_order_release);
798 }
799 
800 static void __kmp_init_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
801   __kmp_init_ticket_lock(lck);
802 }
803 
804 void __kmp_destroy_ticket_lock(kmp_ticket_lock_t *lck) {
805   std::atomic_store_explicit(&lck->lk.initialized, false,
806                              std::memory_order_release);
807   lck->lk.self = NULL;
808   lck->lk.location = NULL;
809   std::atomic_store_explicit(&lck->lk.next_ticket, 0U,
810                              std::memory_order_relaxed);
811   std::atomic_store_explicit(&lck->lk.now_serving, 0U,
812                              std::memory_order_relaxed);
813   std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
814   std::atomic_store_explicit(&lck->lk.depth_locked, -1,
815                              std::memory_order_relaxed);
816 }
817 
818 static void __kmp_destroy_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
819   char const *const func = "omp_destroy_lock";
820 
821   if (!std::atomic_load_explicit(&lck->lk.initialized,
822                                  std::memory_order_relaxed)) {
823     KMP_FATAL(LockIsUninitialized, func);
824   }
825   if (lck->lk.self != lck) {
826     KMP_FATAL(LockIsUninitialized, func);
827   }
828   if (__kmp_is_ticket_lock_nestable(lck)) {
829     KMP_FATAL(LockNestableUsedAsSimple, func);
830   }
831   if (__kmp_get_ticket_lock_owner(lck) != -1) {
832     KMP_FATAL(LockStillOwned, func);
833   }
834   __kmp_destroy_ticket_lock(lck);
835 }
836 
837 // nested ticket locks
838 
839 int __kmp_acquire_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
840   KMP_DEBUG_ASSERT(gtid >= 0);
841 
842   if (__kmp_get_ticket_lock_owner(lck) == gtid) {
843     std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1,
844                                    std::memory_order_relaxed);
845     return KMP_LOCK_ACQUIRED_NEXT;
846   } else {
847     __kmp_acquire_ticket_lock_timed_template(lck, gtid);
848     ANNOTATE_TICKET_ACQUIRED(lck);
849     std::atomic_store_explicit(&lck->lk.depth_locked, 1,
850                                std::memory_order_relaxed);
851     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
852                                std::memory_order_relaxed);
853     return KMP_LOCK_ACQUIRED_FIRST;
854   }
855 }
856 
857 static int __kmp_acquire_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
858                                                         kmp_int32 gtid) {
859   char const *const func = "omp_set_nest_lock";
860 
861   if (!std::atomic_load_explicit(&lck->lk.initialized,
862                                  std::memory_order_relaxed)) {
863     KMP_FATAL(LockIsUninitialized, func);
864   }
865   if (lck->lk.self != lck) {
866     KMP_FATAL(LockIsUninitialized, func);
867   }
868   if (!__kmp_is_ticket_lock_nestable(lck)) {
869     KMP_FATAL(LockSimpleUsedAsNestable, func);
870   }
871   return __kmp_acquire_nested_ticket_lock(lck, gtid);
872 }
873 
874 int __kmp_test_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
875   int retval;
876 
877   KMP_DEBUG_ASSERT(gtid >= 0);
878 
879   if (__kmp_get_ticket_lock_owner(lck) == gtid) {
880     retval = std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1,
881                                             std::memory_order_relaxed) +
882              1;
883   } else if (!__kmp_test_ticket_lock(lck, gtid)) {
884     retval = 0;
885   } else {
886     std::atomic_store_explicit(&lck->lk.depth_locked, 1,
887                                std::memory_order_relaxed);
888     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
889                                std::memory_order_relaxed);
890     retval = 1;
891   }
892   return retval;
893 }
894 
895 static int __kmp_test_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
896                                                      kmp_int32 gtid) {
897   char const *const func = "omp_test_nest_lock";
898 
899   if (!std::atomic_load_explicit(&lck->lk.initialized,
900                                  std::memory_order_relaxed)) {
901     KMP_FATAL(LockIsUninitialized, func);
902   }
903   if (lck->lk.self != lck) {
904     KMP_FATAL(LockIsUninitialized, func);
905   }
906   if (!__kmp_is_ticket_lock_nestable(lck)) {
907     KMP_FATAL(LockSimpleUsedAsNestable, func);
908   }
909   return __kmp_test_nested_ticket_lock(lck, gtid);
910 }
911 
912 int __kmp_release_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
913   KMP_DEBUG_ASSERT(gtid >= 0);
914 
915   if ((std::atomic_fetch_add_explicit(&lck->lk.depth_locked, -1,
916                                       std::memory_order_relaxed) -
917        1) == 0) {
918     std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
919     __kmp_release_ticket_lock(lck, gtid);
920     return KMP_LOCK_RELEASED;
921   }
922   return KMP_LOCK_STILL_HELD;
923 }
924 
925 static int __kmp_release_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
926                                                         kmp_int32 gtid) {
927   char const *const func = "omp_unset_nest_lock";
928 
929   if (!std::atomic_load_explicit(&lck->lk.initialized,
930                                  std::memory_order_relaxed)) {
931     KMP_FATAL(LockIsUninitialized, func);
932   }
933   if (lck->lk.self != lck) {
934     KMP_FATAL(LockIsUninitialized, func);
935   }
936   if (!__kmp_is_ticket_lock_nestable(lck)) {
937     KMP_FATAL(LockSimpleUsedAsNestable, func);
938   }
939   if (__kmp_get_ticket_lock_owner(lck) == -1) {
940     KMP_FATAL(LockUnsettingFree, func);
941   }
942   if (__kmp_get_ticket_lock_owner(lck) != gtid) {
943     KMP_FATAL(LockUnsettingSetByAnother, func);
944   }
945   return __kmp_release_nested_ticket_lock(lck, gtid);
946 }
947 
948 void __kmp_init_nested_ticket_lock(kmp_ticket_lock_t *lck) {
949   __kmp_init_ticket_lock(lck);
950   std::atomic_store_explicit(&lck->lk.depth_locked, 0,
951                              std::memory_order_relaxed);
952   // >= 0 for nestable locks, -1 for simple locks
953 }
954 
955 static void __kmp_init_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
956   __kmp_init_nested_ticket_lock(lck);
957 }
958 
959 void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck) {
960   __kmp_destroy_ticket_lock(lck);
961   std::atomic_store_explicit(&lck->lk.depth_locked, 0,
962                              std::memory_order_relaxed);
963 }
964 
965 static void
966 __kmp_destroy_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
967   char const *const func = "omp_destroy_nest_lock";
968 
969   if (!std::atomic_load_explicit(&lck->lk.initialized,
970                                  std::memory_order_relaxed)) {
971     KMP_FATAL(LockIsUninitialized, func);
972   }
973   if (lck->lk.self != lck) {
974     KMP_FATAL(LockIsUninitialized, func);
975   }
976   if (!__kmp_is_ticket_lock_nestable(lck)) {
977     KMP_FATAL(LockSimpleUsedAsNestable, func);
978   }
979   if (__kmp_get_ticket_lock_owner(lck) != -1) {
980     KMP_FATAL(LockStillOwned, func);
981   }
982   __kmp_destroy_nested_ticket_lock(lck);
983 }
984 
985 // access functions to fields which don't exist for all lock kinds.
986 
987 static int __kmp_is_ticket_lock_initialized(kmp_ticket_lock_t *lck) {
988   return std::atomic_load_explicit(&lck->lk.initialized,
989                                    std::memory_order_relaxed) &&
990          (lck->lk.self == lck);
991 }
992 
993 static const ident_t *__kmp_get_ticket_lock_location(kmp_ticket_lock_t *lck) {
994   return lck->lk.location;
995 }
996 
997 static void __kmp_set_ticket_lock_location(kmp_ticket_lock_t *lck,
998                                            const ident_t *loc) {
999   lck->lk.location = loc;
1000 }
1001 
1002 static kmp_lock_flags_t __kmp_get_ticket_lock_flags(kmp_ticket_lock_t *lck) {
1003   return lck->lk.flags;
1004 }
1005 
1006 static void __kmp_set_ticket_lock_flags(kmp_ticket_lock_t *lck,
1007                                         kmp_lock_flags_t flags) {
1008   lck->lk.flags = flags;
1009 }
1010 
1011 /* ------------------------------------------------------------------------ */
1012 /* queuing locks */
1013 
1014 /* First the states
1015    (head,tail) =              0, 0  means lock is unheld, nobody on queue
1016                  UINT_MAX or -1, 0  means lock is held, nobody on queue
1017                               h, h  means lock held or about to transition,
1018                                     1 element on queue
1019                               h, t  h <> t, means lock is held or about to
1020                                     transition, >1 elements on queue
1021 
1022    Now the transitions
1023       Acquire(0,0)  = -1 ,0
1024       Release(0,0)  = Error
1025       Acquire(-1,0) =  h ,h    h > 0
1026       Release(-1,0) =  0 ,0
1027       Acquire(h,h)  =  h ,t    h > 0, t > 0, h <> t
1028       Release(h,h)  = -1 ,0    h > 0
1029       Acquire(h,t)  =  h ,t'   h > 0, t > 0, t' > 0, h <> t, h <> t', t <> t'
1030       Release(h,t)  =  h',t    h > 0, t > 0, h <> t, h <> h', h' maybe = t
1031 
1032    And pictorially
1033 
1034            +-----+
1035            | 0, 0|------- release -------> Error
1036            +-----+
1037              |  ^
1038       acquire|  |release
1039              |  |
1040              |  |
1041              v  |
1042            +-----+
1043            |-1, 0|
1044            +-----+
1045              |  ^
1046       acquire|  |release
1047              |  |
1048              |  |
1049              v  |
1050            +-----+
1051            | h, h|
1052            +-----+
1053              |  ^
1054       acquire|  |release
1055              |  |
1056              |  |
1057              v  |
1058            +-----+
1059            | h, t|----- acquire, release loopback ---+
1060            +-----+                                   |
1061                 ^                                    |
1062                 |                                    |
1063                 +------------------------------------+
1064  */
1065 
1066 #ifdef DEBUG_QUEUING_LOCKS
1067 
1068 /* Stuff for circular trace buffer */
1069 #define TRACE_BUF_ELE 1024
1070 static char traces[TRACE_BUF_ELE][128] = {0};
1071 static int tc = 0;
1072 #define TRACE_LOCK(X, Y)                                                       \
1073   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s\n", X, Y);
1074 #define TRACE_LOCK_T(X, Y, Z)                                                  \
1075   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s%d\n", X, Y, Z);
1076 #define TRACE_LOCK_HT(X, Y, Z, Q)                                              \
1077   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s %d,%d\n", X, Y,   \
1078                Z, Q);
1079 
1080 static void __kmp_dump_queuing_lock(kmp_info_t *this_thr, kmp_int32 gtid,
1081                                     kmp_queuing_lock_t *lck, kmp_int32 head_id,
1082                                     kmp_int32 tail_id) {
1083   kmp_int32 t, i;
1084 
1085   __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: TRACE BEGINS HERE! \n");
1086 
1087   i = tc % TRACE_BUF_ELE;
1088   __kmp_printf_no_lock("%s\n", traces[i]);
1089   i = (i + 1) % TRACE_BUF_ELE;
1090   while (i != (tc % TRACE_BUF_ELE)) {
1091     __kmp_printf_no_lock("%s", traces[i]);
1092     i = (i + 1) % TRACE_BUF_ELE;
1093   }
1094   __kmp_printf_no_lock("\n");
1095 
1096   __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: gtid+1:%d, spin_here:%d, "
1097                        "next_wait:%d, head_id:%d, tail_id:%d\n",
1098                        gtid + 1, this_thr->th.th_spin_here,
1099                        this_thr->th.th_next_waiting, head_id, tail_id);
1100 
1101   __kmp_printf_no_lock("\t\thead: %d ", lck->lk.head_id);
1102 
1103   if (lck->lk.head_id >= 1) {
1104     t = __kmp_threads[lck->lk.head_id - 1]->th.th_next_waiting;
1105     while (t > 0) {
1106       __kmp_printf_no_lock("-> %d ", t);
1107       t = __kmp_threads[t - 1]->th.th_next_waiting;
1108     }
1109   }
1110   __kmp_printf_no_lock(";  tail: %d ", lck->lk.tail_id);
1111   __kmp_printf_no_lock("\n\n");
1112 }
1113 
1114 #endif /* DEBUG_QUEUING_LOCKS */
1115 
1116 static kmp_int32 __kmp_get_queuing_lock_owner(kmp_queuing_lock_t *lck) {
1117   return TCR_4(lck->lk.owner_id) - 1;
1118 }
1119 
1120 static inline bool __kmp_is_queuing_lock_nestable(kmp_queuing_lock_t *lck) {
1121   return lck->lk.depth_locked != -1;
1122 }
1123 
/* Acquire a lock using the queuing lock implementation */
1125 template <bool takeTime>
1126 /* [TLW] The unused template above is left behind because of what BEB believes
1127    is a potential compiler problem with __forceinline. */
1128 __forceinline static int
1129 __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck,
1130                                           kmp_int32 gtid) {
1131   kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid);
1132   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1133   volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
1134   volatile kmp_uint32 *spin_here_p;
1135   kmp_int32 need_mf = 1;
1136 
1137 #if OMPT_SUPPORT
1138   omp_state_t prev_state = omp_state_undefined;
1139 #endif
1140 
1141   KA_TRACE(1000,
1142            ("__kmp_acquire_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
1143 
1144   KMP_FSYNC_PREPARE(lck);
1145   KMP_DEBUG_ASSERT(this_thr != NULL);
1146   spin_here_p = &this_thr->th.th_spin_here;
1147 
1148 #ifdef DEBUG_QUEUING_LOCKS
1149   TRACE_LOCK(gtid + 1, "acq ent");
1150   if (*spin_here_p)
1151     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1152   if (this_thr->th.th_next_waiting != 0)
1153     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1154 #endif
1155   KMP_DEBUG_ASSERT(!*spin_here_p);
1156   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1157 
1158   /* The following st.rel to spin_here_p needs to precede the cmpxchg.acq to
1159      head_id_p that may follow, not just in execution order, but also in
1160      visibility order. This way, when a releasing thread observes the changes to
1161      the queue by this thread, it can rightly assume that spin_here_p has
1162      already been set to TRUE, so that when it sets spin_here_p to FALSE, it is
1163      not premature.  If the releasing thread sets spin_here_p to FALSE before
1164      this thread sets it to TRUE, this thread will hang. */
1165   *spin_here_p = TRUE; /* before enqueuing to prevent race */
1166 
1167   while (1) {
1168     kmp_int32 enqueued;
1169     kmp_int32 head;
1170     kmp_int32 tail;
1171 
1172     head = *head_id_p;
1173 
1174     switch (head) {
1175 
1176     case -1: {
1177 #ifdef DEBUG_QUEUING_LOCKS
1178       tail = *tail_id_p;
1179       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1180 #endif
      tail = 0; /* to make sure the next link, read asynchronously, is not set
                accidentally; this assignment prevents us from entering the
                if ( tail > 0 ) branch in the enqueued case below, which is not
                necessary for this state transition */
1185 
1186       need_mf = 0;
1187       /* try (-1,0)->(tid,tid) */
1188       enqueued = KMP_COMPARE_AND_STORE_ACQ64((volatile kmp_int64 *)tail_id_p,
1189                                              KMP_PACK_64(-1, 0),
1190                                              KMP_PACK_64(gtid + 1, gtid + 1));
1191 #ifdef DEBUG_QUEUING_LOCKS
1192       if (enqueued)
1193         TRACE_LOCK(gtid + 1, "acq enq: (-1,0)->(tid,tid)");
1194 #endif
1195     } break;
1196 
1197     default: {
1198       tail = *tail_id_p;
1199       KMP_DEBUG_ASSERT(tail != gtid + 1);
1200 
1201 #ifdef DEBUG_QUEUING_LOCKS
1202       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1203 #endif
1204 
1205       if (tail == 0) {
1206         enqueued = FALSE;
1207       } else {
1208         need_mf = 0;
1209         /* try (h,t) or (h,h)->(h,tid) */
1210         enqueued = KMP_COMPARE_AND_STORE_ACQ32(tail_id_p, tail, gtid + 1);
1211 
1212 #ifdef DEBUG_QUEUING_LOCKS
1213         if (enqueued)
1214           TRACE_LOCK(gtid + 1, "acq enq: (h,t)->(h,tid)");
1215 #endif
1216       }
1217     } break;
1218 
1219     case 0: /* empty queue */
1220     {
1221       kmp_int32 grabbed_lock;
1222 
1223 #ifdef DEBUG_QUEUING_LOCKS
1224       tail = *tail_id_p;
1225       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1226 #endif
1227       /* try (0,0)->(-1,0) */
1228 
1229       /* only legal transition out of head = 0 is head = -1 with no change to
1230        * tail */
1231       grabbed_lock = KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1);
1232 
1233       if (grabbed_lock) {
1234 
1235         *spin_here_p = FALSE;
1236 
1237         KA_TRACE(
1238             1000,
1239             ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: no queuing\n",
1240              lck, gtid));
1241 #ifdef DEBUG_QUEUING_LOCKS
1242         TRACE_LOCK_HT(gtid + 1, "acq exit: ", head, 0);
1243 #endif
1244 
1245 #if OMPT_SUPPORT
1246         if (ompt_enabled.enabled && prev_state != omp_state_undefined) {
1247           /* change the state before clearing wait_id */
1248           this_thr->th.ompt_thread_info.state = prev_state;
1249           this_thr->th.ompt_thread_info.wait_id = 0;
1250         }
1251 #endif
1252 
1253         KMP_FSYNC_ACQUIRED(lck);
1254         return KMP_LOCK_ACQUIRED_FIRST; /* lock holder cannot be on queue */
1255       }
1256       enqueued = FALSE;
1257     } break;
1258     }
1259 
1260 #if OMPT_SUPPORT
1261     if (ompt_enabled.enabled && prev_state == omp_state_undefined) {
1262       /* this thread will spin; set wait_id before entering wait state */
1263       prev_state = this_thr->th.ompt_thread_info.state;
1264       this_thr->th.ompt_thread_info.wait_id = (uint64_t)lck;
1265       this_thr->th.ompt_thread_info.state = omp_state_wait_lock;
1266     }
1267 #endif
1268 
1269     if (enqueued) {
1270       if (tail > 0) {
1271         kmp_info_t *tail_thr = __kmp_thread_from_gtid(tail - 1);
1272         KMP_ASSERT(tail_thr != NULL);
1273         tail_thr->th.th_next_waiting = gtid + 1;
1274         /* corresponding wait for this write in release code */
1275       }
1276       KA_TRACE(1000,
1277                ("__kmp_acquire_queuing_lock: lck:%p, T#%d waiting for lock\n",
1278                 lck, gtid));
1279 
1280       /* ToDo: May want to consider using __kmp_wait_sleep  or something that
1281          sleeps for throughput only here. */
1282       KMP_MB();
1283       KMP_WAIT_YIELD(spin_here_p, FALSE, KMP_EQ, lck);
1284 
1285 #ifdef DEBUG_QUEUING_LOCKS
1286       TRACE_LOCK(gtid + 1, "acq spin");
1287 
1288       if (this_thr->th.th_next_waiting != 0)
1289         __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1290 #endif
1291       KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1292       KA_TRACE(1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: after "
1293                       "waiting on queue\n",
1294                       lck, gtid));
1295 
1296 #ifdef DEBUG_QUEUING_LOCKS
1297       TRACE_LOCK(gtid + 1, "acq exit 2");
1298 #endif
1299 
1300 #if OMPT_SUPPORT
1301       /* change the state before clearing wait_id */
1302       this_thr->th.ompt_thread_info.state = prev_state;
1303       this_thr->th.ompt_thread_info.wait_id = 0;
1304 #endif
1305 
1306       /* got lock, we were dequeued by the thread that released lock */
1307       return KMP_LOCK_ACQUIRED_FIRST;
1308     }
1309 
1310     /* Yield if number of threads > number of logical processors */
1311     /* ToDo: Not sure why this should only be in oversubscription case,
1312        maybe should be traditional YIELD_INIT/YIELD_WHEN loop */
1313     KMP_YIELD(TCR_4(__kmp_nth) >
1314               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
1315 #ifdef DEBUG_QUEUING_LOCKS
1316     TRACE_LOCK(gtid + 1, "acq retry");
1317 #endif
1318   }
1319   KMP_ASSERT2(0, "should not get here");
1320   return KMP_LOCK_ACQUIRED_FIRST;
1321 }
1322 
1323 int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1324   KMP_DEBUG_ASSERT(gtid >= 0);
1325 
1326   int retval = __kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
1327   ANNOTATE_QUEUING_ACQUIRED(lck);
1328   return retval;
1329 }
1330 
1331 static int __kmp_acquire_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1332                                                   kmp_int32 gtid) {
1333   char const *const func = "omp_set_lock";
1334   if (lck->lk.initialized != lck) {
1335     KMP_FATAL(LockIsUninitialized, func);
1336   }
1337   if (__kmp_is_queuing_lock_nestable(lck)) {
1338     KMP_FATAL(LockNestableUsedAsSimple, func);
1339   }
1340   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1341     KMP_FATAL(LockIsAlreadyOwned, func);
1342   }
1343 
1344   __kmp_acquire_queuing_lock(lck, gtid);
1345 
1346   lck->lk.owner_id = gtid + 1;
1347   return KMP_LOCK_ACQUIRED_FIRST;
1348 }
1349 
1350 int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1351   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1352   kmp_int32 head;
1353 #ifdef KMP_DEBUG
1354   kmp_info_t *this_thr;
1355 #endif
1356 
1357   KA_TRACE(1000, ("__kmp_test_queuing_lock: T#%d entering\n", gtid));
1358   KMP_DEBUG_ASSERT(gtid >= 0);
1359 #ifdef KMP_DEBUG
1360   this_thr = __kmp_thread_from_gtid(gtid);
1361   KMP_DEBUG_ASSERT(this_thr != NULL);
1362   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
1363 #endif
1364 
1365   head = *head_id_p;
1366 
1367   if (head == 0) { /* nobody on queue, nobody holding */
1368     /* try (0,0)->(-1,0) */
1369     if (KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1)) {
1370       KA_TRACE(1000,
1371                ("__kmp_test_queuing_lock: T#%d exiting: holding lock\n", gtid));
1372       KMP_FSYNC_ACQUIRED(lck);
1373       ANNOTATE_QUEUING_ACQUIRED(lck);
1374       return TRUE;
1375     }
1376   }
1377 
1378   KA_TRACE(1000,
1379            ("__kmp_test_queuing_lock: T#%d exiting: without lock\n", gtid));
1380   return FALSE;
1381 }
1382 
1383 static int __kmp_test_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1384                                                kmp_int32 gtid) {
1385   char const *const func = "omp_test_lock";
1386   if (lck->lk.initialized != lck) {
1387     KMP_FATAL(LockIsUninitialized, func);
1388   }
1389   if (__kmp_is_queuing_lock_nestable(lck)) {
1390     KMP_FATAL(LockNestableUsedAsSimple, func);
1391   }
1392 
1393   int retval = __kmp_test_queuing_lock(lck, gtid);
1394 
1395   if (retval) {
1396     lck->lk.owner_id = gtid + 1;
1397   }
1398   return retval;
1399 }
1400 
1401 int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1402   kmp_info_t *this_thr;
1403   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1404   volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
1405 
1406   KA_TRACE(1000,
1407            ("__kmp_release_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
1408   KMP_DEBUG_ASSERT(gtid >= 0);
1409   this_thr = __kmp_thread_from_gtid(gtid);
1410   KMP_DEBUG_ASSERT(this_thr != NULL);
1411 #ifdef DEBUG_QUEUING_LOCKS
1412   TRACE_LOCK(gtid + 1, "rel ent");
1413 
1414   if (this_thr->th.th_spin_here)
1415     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1416   if (this_thr->th.th_next_waiting != 0)
1417     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1418 #endif
1419   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
1420   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1421 
1422   KMP_FSYNC_RELEASING(lck);
1423   ANNOTATE_QUEUING_RELEASED(lck);
1424 
1425   while (1) {
1426     kmp_int32 dequeued;
1427     kmp_int32 head;
1428     kmp_int32 tail;
1429 
1430     head = *head_id_p;
1431 
1432 #ifdef DEBUG_QUEUING_LOCKS
1433     tail = *tail_id_p;
1434     TRACE_LOCK_HT(gtid + 1, "rel read: ", head, tail);
1435     if (head == 0)
1436       __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1437 #endif
1438     KMP_DEBUG_ASSERT(head !=
1439                      0); /* holding the lock, head must be -1 or queue head */
1440 
1441     if (head == -1) { /* nobody on queue */
1442       /* try (-1,0)->(0,0) */
1443       if (KMP_COMPARE_AND_STORE_REL32(head_id_p, -1, 0)) {
1444         KA_TRACE(
1445             1000,
1446             ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: queue empty\n",
1447              lck, gtid));
1448 #ifdef DEBUG_QUEUING_LOCKS
1449         TRACE_LOCK_HT(gtid + 1, "rel exit: ", 0, 0);
1450 #endif
1451 
1452 #if OMPT_SUPPORT
1453 /* nothing to do - no other thread is trying to shift blame */
1454 #endif
1455         return KMP_LOCK_RELEASED;
1456       }
1457       dequeued = FALSE;
1458     } else {
1459       KMP_MB();
1460       tail = *tail_id_p;
1461       if (head == tail) { /* only one thread on the queue */
1462 #ifdef DEBUG_QUEUING_LOCKS
1463         if (head <= 0)
1464           __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1465 #endif
1466         KMP_DEBUG_ASSERT(head > 0);
1467 
1468         /* try (h,h)->(-1,0) */
1469         dequeued = KMP_COMPARE_AND_STORE_REL64(
1470             RCAST(volatile kmp_int64 *, tail_id_p), KMP_PACK_64(head, head),
1471             KMP_PACK_64(-1, 0));
1472 #ifdef DEBUG_QUEUING_LOCKS
1473         TRACE_LOCK(gtid + 1, "rel deq: (h,h)->(-1,0)");
1474 #endif
1475 
1476       } else {
1477         volatile kmp_int32 *waiting_id_p;
1478         kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
1479         KMP_DEBUG_ASSERT(head_thr != NULL);
1480         waiting_id_p = &head_thr->th.th_next_waiting;
1481 
1482 /* Does this require synchronous reads? */
1483 #ifdef DEBUG_QUEUING_LOCKS
1484         if (head <= 0 || tail <= 0)
1485           __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1486 #endif
1487         KMP_DEBUG_ASSERT(head > 0 && tail > 0);
1488 
1489         /* try (h,t)->(h',t) or (t,t) */
1490         KMP_MB();
1491         /* make sure enqueuing thread has time to update next waiting thread
1492          * field */
1493         *head_id_p = KMP_WAIT_YIELD((volatile kmp_uint32 *)waiting_id_p, 0,
1494                                     KMP_NEQ, NULL);
1495 #ifdef DEBUG_QUEUING_LOCKS
1496         TRACE_LOCK(gtid + 1, "rel deq: (h,t)->(h',t)");
1497 #endif
1498         dequeued = TRUE;
1499       }
1500     }
1501 
1502     if (dequeued) {
1503       kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
1504       KMP_DEBUG_ASSERT(head_thr != NULL);
1505 
1506 /* Does this require synchronous reads? */
1507 #ifdef DEBUG_QUEUING_LOCKS
1508       if (head <= 0 || tail <= 0)
1509         __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1510 #endif
1511       KMP_DEBUG_ASSERT(head > 0 && tail > 0);
1512 
      /* For clean code only. The thread is not actually released until
         th_spin_here is cleared below, which prevents a race with the acquire
         code. */
1515       head_thr->th.th_next_waiting = 0;
1516 #ifdef DEBUG_QUEUING_LOCKS
1517       TRACE_LOCK_T(gtid + 1, "rel nw=0 for t=", head);
1518 #endif
1519 
1520       KMP_MB();
1521       /* reset spin value */
1522       head_thr->th.th_spin_here = FALSE;
1523 
1524       KA_TRACE(1000, ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: after "
1525                       "dequeuing\n",
1526                       lck, gtid));
1527 #ifdef DEBUG_QUEUING_LOCKS
1528       TRACE_LOCK(gtid + 1, "rel exit 2");
1529 #endif
1530       return KMP_LOCK_RELEASED;
1531     }
1532 /* KMP_CPU_PAUSE(); don't want to make releasing thread hold up acquiring
1533    threads */
1534 
1535 #ifdef DEBUG_QUEUING_LOCKS
1536     TRACE_LOCK(gtid + 1, "rel retry");
1537 #endif
1538 
1539   } /* while */
1540   KMP_ASSERT2(0, "should not get here");
1541   return KMP_LOCK_RELEASED;
1542 }
1543 
1544 static int __kmp_release_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1545                                                   kmp_int32 gtid) {
1546   char const *const func = "omp_unset_lock";
1547   KMP_MB(); /* in case another processor initialized lock */
1548   if (lck->lk.initialized != lck) {
1549     KMP_FATAL(LockIsUninitialized, func);
1550   }
1551   if (__kmp_is_queuing_lock_nestable(lck)) {
1552     KMP_FATAL(LockNestableUsedAsSimple, func);
1553   }
1554   if (__kmp_get_queuing_lock_owner(lck) == -1) {
1555     KMP_FATAL(LockUnsettingFree, func);
1556   }
1557   if (__kmp_get_queuing_lock_owner(lck) != gtid) {
1558     KMP_FATAL(LockUnsettingSetByAnother, func);
1559   }
1560   lck->lk.owner_id = 0;
1561   return __kmp_release_queuing_lock(lck, gtid);
1562 }
1563 
1564 void __kmp_init_queuing_lock(kmp_queuing_lock_t *lck) {
1565   lck->lk.location = NULL;
1566   lck->lk.head_id = 0;
1567   lck->lk.tail_id = 0;
1568   lck->lk.next_ticket = 0;
1569   lck->lk.now_serving = 0;
1570   lck->lk.owner_id = 0; // no thread owns the lock.
1571   lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
1572   lck->lk.initialized = lck;
1573 
1574   KA_TRACE(1000, ("__kmp_init_queuing_lock: lock %p initialized\n", lck));
1575 }
1576 
1577 static void __kmp_init_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1578   __kmp_init_queuing_lock(lck);
1579 }
1580 
1581 void __kmp_destroy_queuing_lock(kmp_queuing_lock_t *lck) {
1582   lck->lk.initialized = NULL;
1583   lck->lk.location = NULL;
1584   lck->lk.head_id = 0;
1585   lck->lk.tail_id = 0;
1586   lck->lk.next_ticket = 0;
1587   lck->lk.now_serving = 0;
1588   lck->lk.owner_id = 0;
1589   lck->lk.depth_locked = -1;
1590 }
1591 
1592 static void __kmp_destroy_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1593   char const *const func = "omp_destroy_lock";
1594   if (lck->lk.initialized != lck) {
1595     KMP_FATAL(LockIsUninitialized, func);
1596   }
1597   if (__kmp_is_queuing_lock_nestable(lck)) {
1598     KMP_FATAL(LockNestableUsedAsSimple, func);
1599   }
1600   if (__kmp_get_queuing_lock_owner(lck) != -1) {
1601     KMP_FATAL(LockStillOwned, func);
1602   }
1603   __kmp_destroy_queuing_lock(lck);
1604 }
1605 
1606 // nested queuing locks
1607 
1608 int __kmp_acquire_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1609   KMP_DEBUG_ASSERT(gtid >= 0);
1610 
1611   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1612     lck->lk.depth_locked += 1;
1613     return KMP_LOCK_ACQUIRED_NEXT;
1614   } else {
1615     __kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
1616     ANNOTATE_QUEUING_ACQUIRED(lck);
1617     KMP_MB();
1618     lck->lk.depth_locked = 1;
1619     KMP_MB();
1620     lck->lk.owner_id = gtid + 1;
1621     return KMP_LOCK_ACQUIRED_FIRST;
1622   }
1623 }
1624 
1625 static int
1626 __kmp_acquire_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1627                                               kmp_int32 gtid) {
1628   char const *const func = "omp_set_nest_lock";
1629   if (lck->lk.initialized != lck) {
1630     KMP_FATAL(LockIsUninitialized, func);
1631   }
1632   if (!__kmp_is_queuing_lock_nestable(lck)) {
1633     KMP_FATAL(LockSimpleUsedAsNestable, func);
1634   }
1635   return __kmp_acquire_nested_queuing_lock(lck, gtid);
1636 }
1637 
1638 int __kmp_test_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1639   int retval;
1640 
1641   KMP_DEBUG_ASSERT(gtid >= 0);
1642 
1643   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1644     retval = ++lck->lk.depth_locked;
1645   } else if (!__kmp_test_queuing_lock(lck, gtid)) {
1646     retval = 0;
1647   } else {
1648     KMP_MB();
1649     retval = lck->lk.depth_locked = 1;
1650     KMP_MB();
1651     lck->lk.owner_id = gtid + 1;
1652   }
1653   return retval;
1654 }
1655 
1656 static int __kmp_test_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1657                                                       kmp_int32 gtid) {
1658   char const *const func = "omp_test_nest_lock";
1659   if (lck->lk.initialized != lck) {
1660     KMP_FATAL(LockIsUninitialized, func);
1661   }
1662   if (!__kmp_is_queuing_lock_nestable(lck)) {
1663     KMP_FATAL(LockSimpleUsedAsNestable, func);
1664   }
1665   return __kmp_test_nested_queuing_lock(lck, gtid);
1666 }
1667 
1668 int __kmp_release_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1669   KMP_DEBUG_ASSERT(gtid >= 0);
1670 
1671   KMP_MB();
1672   if (--(lck->lk.depth_locked) == 0) {
1673     KMP_MB();
1674     lck->lk.owner_id = 0;
1675     __kmp_release_queuing_lock(lck, gtid);
1676     return KMP_LOCK_RELEASED;
1677   }
1678   return KMP_LOCK_STILL_HELD;
1679 }
1680 
1681 static int
1682 __kmp_release_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1683                                               kmp_int32 gtid) {
1684   char const *const func = "omp_unset_nest_lock";
1685   KMP_MB(); /* in case another processor initialized lock */
1686   if (lck->lk.initialized != lck) {
1687     KMP_FATAL(LockIsUninitialized, func);
1688   }
1689   if (!__kmp_is_queuing_lock_nestable(lck)) {
1690     KMP_FATAL(LockSimpleUsedAsNestable, func);
1691   }
1692   if (__kmp_get_queuing_lock_owner(lck) == -1) {
1693     KMP_FATAL(LockUnsettingFree, func);
1694   }
1695   if (__kmp_get_queuing_lock_owner(lck) != gtid) {
1696     KMP_FATAL(LockUnsettingSetByAnother, func);
1697   }
1698   return __kmp_release_nested_queuing_lock(lck, gtid);
1699 }
1700 
1701 void __kmp_init_nested_queuing_lock(kmp_queuing_lock_t *lck) {
1702   __kmp_init_queuing_lock(lck);
1703   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
1704 }
1705 
1706 static void
1707 __kmp_init_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1708   __kmp_init_nested_queuing_lock(lck);
1709 }
1710 
1711 void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck) {
1712   __kmp_destroy_queuing_lock(lck);
1713   lck->lk.depth_locked = 0;
1714 }
1715 
1716 static void
1717 __kmp_destroy_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1718   char const *const func = "omp_destroy_nest_lock";
1719   if (lck->lk.initialized != lck) {
1720     KMP_FATAL(LockIsUninitialized, func);
1721   }
1722   if (!__kmp_is_queuing_lock_nestable(lck)) {
1723     KMP_FATAL(LockSimpleUsedAsNestable, func);
1724   }
1725   if (__kmp_get_queuing_lock_owner(lck) != -1) {
1726     KMP_FATAL(LockStillOwned, func);
1727   }
1728   __kmp_destroy_nested_queuing_lock(lck);
1729 }
1730 
1731 // access functions to fields which don't exist for all lock kinds.
1732 
1733 static int __kmp_is_queuing_lock_initialized(kmp_queuing_lock_t *lck) {
1734   return lck == lck->lk.initialized;
1735 }
1736 
1737 static const ident_t *__kmp_get_queuing_lock_location(kmp_queuing_lock_t *lck) {
1738   return lck->lk.location;
1739 }
1740 
1741 static void __kmp_set_queuing_lock_location(kmp_queuing_lock_t *lck,
1742                                             const ident_t *loc) {
1743   lck->lk.location = loc;
1744 }
1745 
1746 static kmp_lock_flags_t __kmp_get_queuing_lock_flags(kmp_queuing_lock_t *lck) {
1747   return lck->lk.flags;
1748 }
1749 
1750 static void __kmp_set_queuing_lock_flags(kmp_queuing_lock_t *lck,
1751                                          kmp_lock_flags_t flags) {
1752   lck->lk.flags = flags;
1753 }
1754 
1755 #if KMP_USE_ADAPTIVE_LOCKS
1756 
1757 /* RTM Adaptive locks */
1758 
1759 #if KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300
1760 
1761 #include <immintrin.h>
1762 #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT)
1763 
1764 #else
1765 
1766 // Values from the status register after failed speculation.
1767 #define _XBEGIN_STARTED (~0u)
1768 #define _XABORT_EXPLICIT (1 << 0)
1769 #define _XABORT_RETRY (1 << 1)
1770 #define _XABORT_CONFLICT (1 << 2)
1771 #define _XABORT_CAPACITY (1 << 3)
1772 #define _XABORT_DEBUG (1 << 4)
1773 #define _XABORT_NESTED (1 << 5)
1774 #define _XABORT_CODE(x) ((unsigned char)(((x) >> 24) & 0xFF))
1775 
1776 // Aborts for which it's worth trying again immediately
1777 #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT)
1778 
1779 #define STRINGIZE_INTERNAL(arg) #arg
1780 #define STRINGIZE(arg) STRINGIZE_INTERNAL(arg)
1781 
1782 // Access to RTM instructions
/* A version of XBegin that returns -1 when speculation starts successfully,
   and the value of EAX on an abort. This is the same definition as the
   compiler intrinsic that will be supported at some point. */
1786 static __inline int _xbegin() {
1787   int res = -1;
1788 
1789 #if KMP_OS_WINDOWS
1790 #if KMP_ARCH_X86_64
1791   _asm {
1792         _emit 0xC7
1793         _emit 0xF8
1794         _emit 2
1795         _emit 0
1796         _emit 0
1797         _emit 0
1798         jmp   L2
1799         mov   res, eax
1800     L2:
1801   }
1802 #else /* IA32 */
1803   _asm {
1804         _emit 0xC7
1805         _emit 0xF8
1806         _emit 2
1807         _emit 0
1808         _emit 0
1809         _emit 0
1810         jmp   L2
1811         mov   res, eax
1812     L2:
1813   }
1814 #endif // KMP_ARCH_X86_64
1815 #else
1816   /* Note that %eax must be noted as killed (clobbered), because the XSR is
1817      returned in %eax(%rax) on abort.  Other register values are restored, so
1818      don't need to be killed.
1819 
1820      We must also mark 'res' as an input and an output, since otherwise
1821      'res=-1' may be dropped as being dead, whereas we do need the assignment on
1822      the successful (i.e., non-abort) path. */
1823   __asm__ volatile("1: .byte  0xC7; .byte 0xF8;\n"
1824                    "   .long  1f-1b-6\n"
1825                    "    jmp   2f\n"
1826                    "1:  movl  %%eax,%0\n"
1827                    "2:"
1828                    : "+r"(res)::"memory", "%eax");
1829 #endif // KMP_OS_WINDOWS
1830   return res;
1831 }
1832 
1833 /* Transaction end */
1834 static __inline void _xend() {
1835 #if KMP_OS_WINDOWS
1836   __asm {
1837         _emit 0x0f
1838         _emit 0x01
1839         _emit 0xd5
1840   }
1841 #else
1842   __asm__ volatile(".byte 0x0f; .byte 0x01; .byte 0xd5" ::: "memory");
1843 #endif
1844 }
1845 
/* This is a macro; the argument must be a single-byte constant that can be
   evaluated by the inline assembler, since it is emitted as a byte into the
   assembly code. */
1849 // clang-format off
1850 #if KMP_OS_WINDOWS
1851 #define _xabort(ARG) _asm _emit 0xc6 _asm _emit 0xf8 _asm _emit ARG
1852 #else
1853 #define _xabort(ARG)                                                           \
1854   __asm__ volatile(".byte 0xC6; .byte 0xF8; .byte " STRINGIZE(ARG):::"memory");
1855 #endif
1856 // clang-format on
1857 #endif // KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300
1858 
// Statistics are collected for testing purposes
1860 #if KMP_DEBUG_ADAPTIVE_LOCKS
1861 
1862 // We accumulate speculative lock statistics when the lock is destroyed. We
1863 // keep locks that haven't been destroyed in the liveLocks list so that we can
1864 // grab their statistics too.
1865 static kmp_adaptive_lock_statistics_t destroyedStats;
1866 
1867 // To hold the list of live locks.
1868 static kmp_adaptive_lock_info_t liveLocks;
1869 
1870 // A lock so we can safely update the list of locks.
1871 static kmp_bootstrap_lock_t chain_lock;
1872 
1873 // Initialize the list of stats.
1874 void __kmp_init_speculative_stats() {
1875   kmp_adaptive_lock_info_t *lck = &liveLocks;
1876 
1877   memset((void *)&(lck->stats), 0, sizeof(lck->stats));
1878   lck->stats.next = lck;
1879   lck->stats.prev = lck;
1880 
1881   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1882   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1883 
1884   __kmp_init_bootstrap_lock(&chain_lock);
1885 }
1886 
1887 // Insert the lock into the circular list
1888 static void __kmp_remember_lock(kmp_adaptive_lock_info_t *lck) {
1889   __kmp_acquire_bootstrap_lock(&chain_lock);
1890 
1891   lck->stats.next = liveLocks.stats.next;
1892   lck->stats.prev = &liveLocks;
1893 
1894   liveLocks.stats.next = lck;
1895   lck->stats.next->stats.prev = lck;
1896 
1897   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1898   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1899 
1900   __kmp_release_bootstrap_lock(&chain_lock);
1901 }
1902 
1903 static void __kmp_forget_lock(kmp_adaptive_lock_info_t *lck) {
1904   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1905   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1906 
1907   kmp_adaptive_lock_info_t *n = lck->stats.next;
1908   kmp_adaptive_lock_info_t *p = lck->stats.prev;
1909 
1910   n->stats.prev = p;
1911   p->stats.next = n;
1912 }
1913 
1914 static void __kmp_zero_speculative_stats(kmp_adaptive_lock_info_t *lck) {
1915   memset((void *)&lck->stats, 0, sizeof(lck->stats));
1916   __kmp_remember_lock(lck);
1917 }
1918 
1919 static void __kmp_add_stats(kmp_adaptive_lock_statistics_t *t,
1920                             kmp_adaptive_lock_info_t *lck) {
1921   kmp_adaptive_lock_statistics_t volatile *s = &lck->stats;
1922 
1923   t->nonSpeculativeAcquireAttempts += lck->acquire_attempts;
1924   t->successfulSpeculations += s->successfulSpeculations;
1925   t->hardFailedSpeculations += s->hardFailedSpeculations;
1926   t->softFailedSpeculations += s->softFailedSpeculations;
1927   t->nonSpeculativeAcquires += s->nonSpeculativeAcquires;
1928   t->lemmingYields += s->lemmingYields;
1929 }
1930 
1931 static void __kmp_accumulate_speculative_stats(kmp_adaptive_lock_info_t *lck) {
1932   kmp_adaptive_lock_statistics_t *t = &destroyedStats;
1933 
1934   __kmp_acquire_bootstrap_lock(&chain_lock);
1935 
  __kmp_add_stats(t, lck);
1937   __kmp_forget_lock(lck);
1938 
1939   __kmp_release_bootstrap_lock(&chain_lock);
1940 }
1941 
1942 static float percent(kmp_uint32 count, kmp_uint32 total) {
1943   return (total == 0) ? 0.0 : (100.0 * count) / total;
1944 }
1945 
1946 static FILE *__kmp_open_stats_file() {
1947   if (strcmp(__kmp_speculative_statsfile, "-") == 0)
1948     return stdout;
1949 
1950   size_t buffLen = KMP_STRLEN(__kmp_speculative_statsfile) + 20;
1951   char buffer[buffLen];
1952   KMP_SNPRINTF(&buffer[0], buffLen, __kmp_speculative_statsfile,
1953                (kmp_int32)getpid());
1954   FILE *result = fopen(&buffer[0], "w");
1955 
1956   // Maybe we should issue a warning here...
1957   return result ? result : stdout;
1958 }
1959 
1960 void __kmp_print_speculative_stats() {
1961   if (__kmp_user_lock_kind != lk_adaptive)
1962     return;
1963 
1964   FILE *statsFile = __kmp_open_stats_file();
1965 
1966   kmp_adaptive_lock_statistics_t total = destroyedStats;
1967   kmp_adaptive_lock_info_t *lck;
1968 
1969   for (lck = liveLocks.stats.next; lck != &liveLocks; lck = lck->stats.next) {
1970     __kmp_add_stats(&total, lck);
1971   }
1972   kmp_adaptive_lock_statistics_t *t = &total;
1973   kmp_uint32 totalSections =
1974       t->nonSpeculativeAcquires + t->successfulSpeculations;
1975   kmp_uint32 totalSpeculations = t->successfulSpeculations +
1976                                  t->hardFailedSpeculations +
1977                                  t->softFailedSpeculations;
1978 
1979   fprintf(statsFile, "Speculative lock statistics (all approximate!)\n");
1980   fprintf(statsFile, " Lock parameters: \n"
1981                      "   max_soft_retries               : %10d\n"
1982                      "   max_badness                    : %10d\n",
1983           __kmp_adaptive_backoff_params.max_soft_retries,
1984           __kmp_adaptive_backoff_params.max_badness);
1985   fprintf(statsFile, " Non-speculative acquire attempts : %10d\n",
1986           t->nonSpeculativeAcquireAttempts);
1987   fprintf(statsFile, " Total critical sections          : %10d\n",
1988           totalSections);
1989   fprintf(statsFile, " Successful speculations          : %10d (%5.1f%%)\n",
1990           t->successfulSpeculations,
1991           percent(t->successfulSpeculations, totalSections));
1992   fprintf(statsFile, " Non-speculative acquires         : %10d (%5.1f%%)\n",
1993           t->nonSpeculativeAcquires,
1994           percent(t->nonSpeculativeAcquires, totalSections));
1995   fprintf(statsFile, " Lemming yields                   : %10d\n\n",
1996           t->lemmingYields);
1997 
1998   fprintf(statsFile, " Speculative acquire attempts     : %10d\n",
1999           totalSpeculations);
2000   fprintf(statsFile, " Successes                        : %10d (%5.1f%%)\n",
2001           t->successfulSpeculations,
2002           percent(t->successfulSpeculations, totalSpeculations));
2003   fprintf(statsFile, " Soft failures                    : %10d (%5.1f%%)\n",
2004           t->softFailedSpeculations,
2005           percent(t->softFailedSpeculations, totalSpeculations));
2006   fprintf(statsFile, " Hard failures                    : %10d (%5.1f%%)\n",
2007           t->hardFailedSpeculations,
2008           percent(t->hardFailedSpeculations, totalSpeculations));
2009 
2010   if (statsFile != stdout)
2011     fclose(statsFile);
2012 }
2013 
2014 #define KMP_INC_STAT(lck, stat) (lck->lk.adaptive.stats.stat++)
2015 #else
2016 #define KMP_INC_STAT(lck, stat)
2017 
2018 #endif // KMP_DEBUG_ADAPTIVE_LOCKS
2019 
2020 static inline bool __kmp_is_unlocked_queuing_lock(kmp_queuing_lock_t *lck) {
2021   // It is enough to check that the head_id is zero.
2022   // We don't also need to check the tail.
2023   bool res = lck->lk.head_id == 0;
2024 
2025 // We need a fence here, since we must ensure that no memory operations
2026 // from later in this thread float above that read.
2027 #if KMP_COMPILER_ICC
2028   _mm_mfence();
2029 #else
2030   __sync_synchronize();
2031 #endif
2032 
2033   return res;
2034 }
2035 
2036 // Functions for manipulating the badness
2037 static __inline void
2038 __kmp_update_badness_after_success(kmp_adaptive_lock_t *lck) {
2039   // Reset the badness to zero so we eagerly try to speculate again
2040   lck->lk.adaptive.badness = 0;
2041   KMP_INC_STAT(lck, successfulSpeculations);
2042 }
2043 
2044 // Create a bit mask with one more set bit.
2045 static __inline void __kmp_step_badness(kmp_adaptive_lock_t *lck) {
2046   kmp_uint32 newBadness = (lck->lk.adaptive.badness << 1) | 1;
2047   if (newBadness > lck->lk.adaptive.max_badness) {
2048     return;
2049   } else {
2050     lck->lk.adaptive.badness = newBadness;
2051   }
2052 }
2053 
2054 // Check whether speculation should be attempted.
2055 static __inline int __kmp_should_speculate(kmp_adaptive_lock_t *lck,
2056                                            kmp_int32 gtid) {
2057   kmp_uint32 badness = lck->lk.adaptive.badness;
2058   kmp_uint32 attempts = lck->lk.adaptive.acquire_attempts;
2059   int res = (attempts & badness) == 0;
2060   return res;
2061 }
2062 
2063 // Attempt to acquire only the speculative lock.
2064 // Does not back off to the non-speculative lock.
2065 static int __kmp_test_adaptive_lock_only(kmp_adaptive_lock_t *lck,
2066                                          kmp_int32 gtid) {
2067   int retries = lck->lk.adaptive.max_soft_retries;
2068 
  // We don't explicitly count the start of speculation; rather, we record the
  // results (success, hard fail, soft fail). The sum of all of those is the
  // total number of times we started speculation, since all speculations must
  // end in one of those ways.
2073   do {
2074     kmp_uint32 status = _xbegin();
2075     // Switch this in to disable actual speculation but exercise at least some
2076     // of the rest of the code. Useful for debugging...
2077     // kmp_uint32 status = _XABORT_NESTED;
2078 
2079     if (status == _XBEGIN_STARTED) {
2080       /* We have successfully started speculation. Check that no-one acquired
2081          the lock for real between when we last looked and now. This also gets
2082          the lock cache line into our read-set, which we need so that we'll
2083          abort if anyone later claims it for real. */
2084       if (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2085         // Lock is now visibly acquired, so someone beat us to it. Abort the
2086         // transaction so we'll restart from _xbegin with the failure status.
2087         _xabort(0x01);
2088         KMP_ASSERT2(0, "should not get here");
2089       }
2090       return 1; // Lock has been acquired (speculatively)
2091     } else {
2092       // We have aborted, update the statistics
2093       if (status & SOFT_ABORT_MASK) {
2094         KMP_INC_STAT(lck, softFailedSpeculations);
2095         // and loop round to retry.
2096       } else {
2097         KMP_INC_STAT(lck, hardFailedSpeculations);
2098         // Give up if we had a hard failure.
2099         break;
2100       }
2101     }
2102   } while (retries--); // Loop while we have retries, and didn't fail hard.
2103 
2104   // Either we had a hard failure or we didn't succeed softly after
2105   // the full set of attempts, so back off the badness.
2106   __kmp_step_badness(lck);
2107   return 0;
2108 }
2109 
2110 // Attempt to acquire the speculative lock, or back off to the non-speculative
2111 // one if the speculative lock cannot be acquired.
2112 // We can succeed speculatively, non-speculatively, or fail.
2113 static int __kmp_test_adaptive_lock(kmp_adaptive_lock_t *lck, kmp_int32 gtid) {
2114   // First try to acquire the lock speculatively
2115   if (__kmp_should_speculate(lck, gtid) &&
2116       __kmp_test_adaptive_lock_only(lck, gtid))
2117     return 1;
2118 
2119   // Speculative acquisition failed, so try to acquire it non-speculatively.
2120   // Count the non-speculative acquire attempt
2121   lck->lk.adaptive.acquire_attempts++;
2122 
2123   // Use base, non-speculative lock.
2124   if (__kmp_test_queuing_lock(GET_QLK_PTR(lck), gtid)) {
2125     KMP_INC_STAT(lck, nonSpeculativeAcquires);
2126     return 1; // Lock is acquired (non-speculatively)
2127   } else {
2128     return 0; // Failed to acquire the lock, it's already visibly locked.
2129   }
2130 }
2131 
2132 static int __kmp_test_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2133                                                 kmp_int32 gtid) {
2134   char const *const func = "omp_test_lock";
2135   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2136     KMP_FATAL(LockIsUninitialized, func);
2137   }
2138 
2139   int retval = __kmp_test_adaptive_lock(lck, gtid);
2140 
2141   if (retval) {
2142     lck->lk.qlk.owner_id = gtid + 1;
2143   }
2144   return retval;
2145 }
2146 
// Block until we can acquire a speculative, adaptive lock. We check whether we
// should be trying to speculate. If we should be, we check the real lock to see
// if it is free, and, if not, pause without attempting to acquire it until it
// is. Then we try the speculative acquire. This means that although we suffer
// from lemmings a little (because we can't acquire the lock speculatively until
// the queue of waiting threads has cleared), we don't get into a state where we
// can never acquire the lock speculatively (because we force the queue to clear
// by preventing new arrivals from entering the queue). This does mean that when
// we're trying to break lemmings, the lock is no longer fair. However, OpenMP
// makes no guarantee that its locks are fair, so this isn't a real problem.
2158 static void __kmp_acquire_adaptive_lock(kmp_adaptive_lock_t *lck,
2159                                         kmp_int32 gtid) {
2160   if (__kmp_should_speculate(lck, gtid)) {
2161     if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2162       if (__kmp_test_adaptive_lock_only(lck, gtid))
2163         return;
2164       // We tried speculation and failed, so give up.
2165     } else {
      // We can't try speculation until the lock is free, so we pause here
      // (without suspending on the queuing lock) to allow it to drain, then
      // try again. All other threads will also see the same result from
      // __kmp_should_speculate, so they will be doing the same if they try to
      // claim the lock from now on.
2171       while (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2172         KMP_INC_STAT(lck, lemmingYields);
2173         __kmp_yield(TRUE);
2174       }
2175 
2176       if (__kmp_test_adaptive_lock_only(lck, gtid))
2177         return;
2178     }
2179   }
2180 
2181   // Speculative acquisition failed, so acquire it non-speculatively.
2182   // Count the non-speculative acquire attempt
2183   lck->lk.adaptive.acquire_attempts++;
2184 
2185   __kmp_acquire_queuing_lock_timed_template<FALSE>(GET_QLK_PTR(lck), gtid);
2186   // We have acquired the base lock, so count that.
2187   KMP_INC_STAT(lck, nonSpeculativeAcquires);
2188   ANNOTATE_QUEUING_ACQUIRED(lck);
2189 }
2190 
2191 static void __kmp_acquire_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2192                                                     kmp_int32 gtid) {
2193   char const *const func = "omp_set_lock";
2194   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2195     KMP_FATAL(LockIsUninitialized, func);
2196   }
2197   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == gtid) {
2198     KMP_FATAL(LockIsAlreadyOwned, func);
2199   }
2200 
2201   __kmp_acquire_adaptive_lock(lck, gtid);
2202 
2203   lck->lk.qlk.owner_id = gtid + 1;
2204 }
2205 
2206 static int __kmp_release_adaptive_lock(kmp_adaptive_lock_t *lck,
2207                                        kmp_int32 gtid) {
2208   if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(
2209           lck))) { // If the lock doesn't look claimed we must be speculating.
2210     // (Or the user's code is buggy and they're releasing without locking;
2211     // if we had XTEST we'd be able to check that case...)
2212     _xend(); // Exit speculation
2213     __kmp_update_badness_after_success(lck);
2214   } else { // Since the lock *is* visibly locked we're not speculating,
2215     // so should use the underlying lock's release scheme.
2216     __kmp_release_queuing_lock(GET_QLK_PTR(lck), gtid);
2217   }
2218   return KMP_LOCK_RELEASED;
2219 }
2220 
2221 static int __kmp_release_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2222                                                    kmp_int32 gtid) {
2223   char const *const func = "omp_unset_lock";
2224   KMP_MB(); /* in case another processor initialized lock */
2225   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2226     KMP_FATAL(LockIsUninitialized, func);
2227   }
2228   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == -1) {
2229     KMP_FATAL(LockUnsettingFree, func);
2230   }
2231   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != gtid) {
2232     KMP_FATAL(LockUnsettingSetByAnother, func);
2233   }
2234   lck->lk.qlk.owner_id = 0;
2235   __kmp_release_adaptive_lock(lck, gtid);
2236   return KMP_LOCK_RELEASED;
2237 }
2238 
2239 static void __kmp_init_adaptive_lock(kmp_adaptive_lock_t *lck) {
2240   __kmp_init_queuing_lock(GET_QLK_PTR(lck));
2241   lck->lk.adaptive.badness = 0;
2242   lck->lk.adaptive.acquire_attempts = 0; // nonSpeculativeAcquireAttempts = 0;
2243   lck->lk.adaptive.max_soft_retries =
2244       __kmp_adaptive_backoff_params.max_soft_retries;
2245   lck->lk.adaptive.max_badness = __kmp_adaptive_backoff_params.max_badness;
2246 #if KMP_DEBUG_ADAPTIVE_LOCKS
2247   __kmp_zero_speculative_stats(&lck->lk.adaptive);
2248 #endif
2249   KA_TRACE(1000, ("__kmp_init_adaptive_lock: lock %p initialized\n", lck));
2250 }
2251 
2252 static void __kmp_init_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
2253   __kmp_init_adaptive_lock(lck);
2254 }
2255 
2256 static void __kmp_destroy_adaptive_lock(kmp_adaptive_lock_t *lck) {
2257 #if KMP_DEBUG_ADAPTIVE_LOCKS
2258   __kmp_accumulate_speculative_stats(&lck->lk.adaptive);
2259 #endif
2260   __kmp_destroy_queuing_lock(GET_QLK_PTR(lck));
2261   // Nothing needed for the speculative part.
2262 }
2263 
2264 static void __kmp_destroy_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
2265   char const *const func = "omp_destroy_lock";
2266   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2267     KMP_FATAL(LockIsUninitialized, func);
2268   }
2269   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != -1) {
2270     KMP_FATAL(LockStillOwned, func);
2271   }
2272   __kmp_destroy_adaptive_lock(lck);
2273 }
2274 
2275 #endif // KMP_USE_ADAPTIVE_LOCKS
2276 
2277 /* ------------------------------------------------------------------------ */
2278 /* DRDPA ticket locks                                                */
2279 /* "DRDPA" means Dynamically Reconfigurable Distributed Polling Area */
2280 
2281 static kmp_int32 __kmp_get_drdpa_lock_owner(kmp_drdpa_lock_t *lck) {
2282   return TCR_4(lck->lk.owner_id) - 1;
2283 }
2284 
2285 static inline bool __kmp_is_drdpa_lock_nestable(kmp_drdpa_lock_t *lck) {
2286   return lck->lk.depth_locked != -1;
2287 }
2288 
2289 __forceinline static int
2290 __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2291   kmp_uint64 ticket =
2292       KMP_TEST_THEN_INC64(RCAST(volatile kmp_int64 *, &lck->lk.next_ticket));
2293   kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
2294   volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls;
2295 
2296 #ifdef USE_LOCK_PROFILE
2297   if (TCR_8(polls[ticket & mask].poll) != ticket)
2298     __kmp_printf("LOCK CONTENTION: %p\n", lck);
2299 /* else __kmp_printf( "." );*/
2300 #endif /* USE_LOCK_PROFILE */
2301 
2302   // Now spin-wait, but reload the polls pointer and mask, in case the
2303   // polling area has been reconfigured.  Unless it is reconfigured, the
2304   // reloads stay in L1 cache and are cheap.
2305   //
2306   // Keep this code in sync with KMP_WAIT_YIELD, in kmp_dispatch.cpp !!!
2307   //
2308   // The current implementation of KMP_WAIT_YIELD doesn't allow for mask
2309   // and poll to be re-read every spin iteration.
2310   kmp_uint32 spins;
2311 
2312   KMP_FSYNC_PREPARE(lck);
2313   KMP_INIT_YIELD(spins);
2314   while (TCR_8(polls[ticket & mask].poll) < ticket) { // volatile load
2315     // If we are oversubscribed,
2316     // or have waited a bit (and KMP_LIBRARY=turnaround), then yield.
2317     // CPU Pause is in the macros for yield.
2318     //
2319     KMP_YIELD(TCR_4(__kmp_nth) >
2320               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
2321     KMP_YIELD_SPIN(spins);
2322 
2323     // Re-read the mask and the poll pointer from the lock structure.
2324     //
2325     // Make certain that "mask" is read before "polls" !!!
2326     //
    // If another thread reconfigures the polling area and updates these
    // values, and we read the new value of mask but the old polls pointer, we
    // could access memory beyond the end of the old polling area.
2330     mask = TCR_8(lck->lk.mask); // volatile load
2331     polls = lck->lk.polls; // volatile load
2332   }
2333 
2334   // Critical section starts here
2335   KMP_FSYNC_ACQUIRED(lck);
2336   KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld acquired lock %p\n",
2337                   ticket, lck));
2338   lck->lk.now_serving = ticket; // non-volatile store
2339 
2340   // Deallocate a garbage polling area if we know that we are the last
2341   // thread that could possibly access it.
2342   //
2343   // The >= check is in case __kmp_test_drdpa_lock() allocated the cleanup
2344   // ticket.
2345   if ((lck->lk.old_polls != NULL) && (ticket >= lck->lk.cleanup_ticket)) {
2346     __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.old_polls));
2347     lck->lk.old_polls = NULL;
2348     lck->lk.cleanup_ticket = 0;
2349   }
2350 
2351   // Check to see if we should reconfigure the polling area.
2352   // If there is still a garbage polling area to be deallocated from a
2353   // previous reconfiguration, let a later thread reconfigure it.
2354   if (lck->lk.old_polls == NULL) {
2355     bool reconfigure = false;
2356     volatile struct kmp_base_drdpa_lock::kmp_lock_poll *old_polls = polls;
2357     kmp_uint32 num_polls = TCR_4(lck->lk.num_polls);
2358 
2359     if (TCR_4(__kmp_nth) >
2360         (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
2361       // We are in oversubscription mode.  Contract the polling area
2362       // down to a single location, if that hasn't been done already.
2363       if (num_polls > 1) {
2364         reconfigure = true;
2365         num_polls = TCR_4(lck->lk.num_polls);
2366         mask = 0;
2367         num_polls = 1;
2368         polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
2369             __kmp_allocate(num_polls * sizeof(*polls));
2370         polls[0].poll = ticket;
2371       }
2372     } else {
2373       // We are in under/fully subscribed mode.  Check the number of
2374       // threads waiting on the lock.  The size of the polling area
2375       // should be at least the number of threads waiting.
2376       kmp_uint64 num_waiting = TCR_8(lck->lk.next_ticket) - ticket - 1;
2377       if (num_waiting > num_polls) {
2378         kmp_uint32 old_num_polls = num_polls;
2379         reconfigure = true;
2380         do {
2381           mask = (mask << 1) | 1;
2382           num_polls *= 2;
2383         } while (num_polls <= num_waiting);
2384 
2385         // Allocate the new polling area, and copy the relevant portion
2386         // of the old polling area to the new area.  __kmp_allocate()
2387         // zeroes the memory it allocates, and most of the old area is
2388         // just zero padding, so we only copy the release counters.
2389         polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
2390             __kmp_allocate(num_polls * sizeof(*polls));
2391         kmp_uint32 i;
2392         for (i = 0; i < old_num_polls; i++) {
2393           polls[i].poll = old_polls[i].poll;
2394         }
2395       }
2396     }
2397 
2398     if (reconfigure) {
2399       // Now write the updated fields back to the lock structure.
2400       //
2401       // Make certain that "polls" is written before "mask" !!!
2402       //
      // If another thread picks up the new value of mask and the old polls
      // pointer, it could access memory beyond the end of the old polling
      // area.
2406       //
2407       // On x86, we need memory fences.
2408       KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld reconfiguring "
2409                       "lock %p to %d polls\n",
2410                       ticket, lck, num_polls));
2411 
2412       lck->lk.old_polls = old_polls; // non-volatile store
2413       lck->lk.polls = polls; // volatile store
2414 
2415       KMP_MB();
2416 
2417       lck->lk.num_polls = num_polls; // non-volatile store
2418       lck->lk.mask = mask; // volatile store
2419 
2420       KMP_MB();
2421 
2422       // Only after the new polling area and mask have been flushed
2423       // to main memory can we update the cleanup ticket field.
2424       //
2425       // volatile load / non-volatile store
2426       lck->lk.cleanup_ticket = TCR_8(lck->lk.next_ticket);
2427     }
2428   }
2429   return KMP_LOCK_ACQUIRED_FIRST;
2430 }
2431 
2432 int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2433   int retval = __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
2434   ANNOTATE_DRDPA_ACQUIRED(lck);
2435   return retval;
2436 }
2437 
2438 static int __kmp_acquire_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2439                                                 kmp_int32 gtid) {
2440   char const *const func = "omp_set_lock";
2441   if (lck->lk.initialized != lck) {
2442     KMP_FATAL(LockIsUninitialized, func);
2443   }
2444   if (__kmp_is_drdpa_lock_nestable(lck)) {
2445     KMP_FATAL(LockNestableUsedAsSimple, func);
2446   }
2447   if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) == gtid)) {
2448     KMP_FATAL(LockIsAlreadyOwned, func);
2449   }
2450 
2451   __kmp_acquire_drdpa_lock(lck, gtid);
2452 
2453   lck->lk.owner_id = gtid + 1;
2454   return KMP_LOCK_ACQUIRED_FIRST;
2455 }
2456 
2457 int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2458   // First get a ticket, then read the polls pointer and the mask.
2459   // The polls pointer must be read before the mask!!! (See above)
2460   kmp_uint64 ticket = TCR_8(lck->lk.next_ticket); // volatile load
2461   volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls;
2462   kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
2463   if (TCR_8(polls[ticket & mask].poll) == ticket) {
2464     kmp_uint64 next_ticket = ticket + 1;
2465     if (KMP_COMPARE_AND_STORE_ACQ64(&lck->lk.next_ticket, ticket,
2466                                     next_ticket)) {
2467       KMP_FSYNC_ACQUIRED(lck);
2468       KA_TRACE(1000, ("__kmp_test_drdpa_lock: ticket #%lld acquired lock %p\n",
2469                       ticket, lck));
2470       lck->lk.now_serving = ticket; // non-volatile store
2471 
2472       // Since no threads are waiting, there is no possibility that we would
2473       // want to reconfigure the polling area.  We might have the cleanup ticket
2474       // value (which says that it is now safe to deallocate old_polls), but
2475       // we'll let a later thread which calls __kmp_acquire_lock do that - this
2476       // routine isn't supposed to block, and we would risk blocks if we called
2477       // __kmp_free() to do the deallocation.
2478       return TRUE;
2479     }
2480   }
2481   return FALSE;
2482 }
2483 
2484 static int __kmp_test_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2485                                              kmp_int32 gtid) {
2486   char const *const func = "omp_test_lock";
2487   if (lck->lk.initialized != lck) {
2488     KMP_FATAL(LockIsUninitialized, func);
2489   }
2490   if (__kmp_is_drdpa_lock_nestable(lck)) {
2491     KMP_FATAL(LockNestableUsedAsSimple, func);
2492   }
2493 
2494   int retval = __kmp_test_drdpa_lock(lck, gtid);
2495 
2496   if (retval) {
2497     lck->lk.owner_id = gtid + 1;
2498   }
2499   return retval;
2500 }
2501 
2502 int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2503   // Read the ticket value from the lock data struct, then the polls pointer and
2504   // the mask.  The polls pointer must be read before the mask!!! (See above)
2505   kmp_uint64 ticket = lck->lk.now_serving + 1; // non-volatile load
2506   volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls;
2507   kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
2508   KA_TRACE(1000, ("__kmp_release_drdpa_lock: ticket #%lld released lock %p\n",
2509                   ticket - 1, lck));
2510   KMP_FSYNC_RELEASING(lck);
2511   ANNOTATE_DRDPA_RELEASED(lck);
2512   KMP_ST_REL64(&(polls[ticket & mask].poll), ticket); // volatile store
2513   return KMP_LOCK_RELEASED;
2514 }
2515 
2516 static int __kmp_release_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2517                                                 kmp_int32 gtid) {
2518   char const *const func = "omp_unset_lock";
2519   KMP_MB(); /* in case another processor initialized lock */
2520   if (lck->lk.initialized != lck) {
2521     KMP_FATAL(LockIsUninitialized, func);
2522   }
2523   if (__kmp_is_drdpa_lock_nestable(lck)) {
2524     KMP_FATAL(LockNestableUsedAsSimple, func);
2525   }
2526   if (__kmp_get_drdpa_lock_owner(lck) == -1) {
2527     KMP_FATAL(LockUnsettingFree, func);
2528   }
2529   if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) >= 0) &&
2530       (__kmp_get_drdpa_lock_owner(lck) != gtid)) {
2531     KMP_FATAL(LockUnsettingSetByAnother, func);
2532   }
2533   lck->lk.owner_id = 0;
2534   return __kmp_release_drdpa_lock(lck, gtid);
2535 }
2536 
2537 void __kmp_init_drdpa_lock(kmp_drdpa_lock_t *lck) {
2538   lck->lk.location = NULL;
2539   lck->lk.mask = 0;
2540   lck->lk.num_polls = 1;
2541   lck->lk.polls =
2542       (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)__kmp_allocate(
2543           lck->lk.num_polls * sizeof(*(lck->lk.polls)));
2544   lck->lk.cleanup_ticket = 0;
2545   lck->lk.old_polls = NULL;
2546   lck->lk.next_ticket = 0;
2547   lck->lk.now_serving = 0;
2548   lck->lk.owner_id = 0; // no thread owns the lock.
2549   lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
2550   lck->lk.initialized = lck;
2551 
2552   KA_TRACE(1000, ("__kmp_init_drdpa_lock: lock %p initialized\n", lck));
2553 }
2554 
2555 static void __kmp_init_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2556   __kmp_init_drdpa_lock(lck);
2557 }
2558 
2559 void __kmp_destroy_drdpa_lock(kmp_drdpa_lock_t *lck) {
2560   lck->lk.initialized = NULL;
2561   lck->lk.location = NULL;
2562   if (lck->lk.polls != NULL) {
2563     __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.polls));
2564     lck->lk.polls = NULL;
2565   }
2566   if (lck->lk.old_polls != NULL) {
2567     __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.old_polls));
2568     lck->lk.old_polls = NULL;
2569   }
2570   lck->lk.mask = 0;
2571   lck->lk.num_polls = 0;
2572   lck->lk.cleanup_ticket = 0;
2573   lck->lk.next_ticket = 0;
2574   lck->lk.now_serving = 0;
2575   lck->lk.owner_id = 0;
2576   lck->lk.depth_locked = -1;
2577 }
2578 
2579 static void __kmp_destroy_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2580   char const *const func = "omp_destroy_lock";
2581   if (lck->lk.initialized != lck) {
2582     KMP_FATAL(LockIsUninitialized, func);
2583   }
2584   if (__kmp_is_drdpa_lock_nestable(lck)) {
2585     KMP_FATAL(LockNestableUsedAsSimple, func);
2586   }
2587   if (__kmp_get_drdpa_lock_owner(lck) != -1) {
2588     KMP_FATAL(LockStillOwned, func);
2589   }
2590   __kmp_destroy_drdpa_lock(lck);
2591 }
2592 
2593 // nested drdpa ticket locks
2594 
2595 int __kmp_acquire_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2596   KMP_DEBUG_ASSERT(gtid >= 0);
2597 
2598   if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
2599     lck->lk.depth_locked += 1;
2600     return KMP_LOCK_ACQUIRED_NEXT;
2601   } else {
2602     __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
2603     ANNOTATE_DRDPA_ACQUIRED(lck);
2604     KMP_MB();
2605     lck->lk.depth_locked = 1;
2606     KMP_MB();
2607     lck->lk.owner_id = gtid + 1;
2608     return KMP_LOCK_ACQUIRED_FIRST;
2609   }
2610 }
2611 
2612 static void __kmp_acquire_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2613                                                         kmp_int32 gtid) {
2614   char const *const func = "omp_set_nest_lock";
2615   if (lck->lk.initialized != lck) {
2616     KMP_FATAL(LockIsUninitialized, func);
2617   }
2618   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2619     KMP_FATAL(LockSimpleUsedAsNestable, func);
2620   }
2621   __kmp_acquire_nested_drdpa_lock(lck, gtid);
2622 }
2623 
2624 int __kmp_test_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2625   int retval;
2626 
2627   KMP_DEBUG_ASSERT(gtid >= 0);
2628 
2629   if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
2630     retval = ++lck->lk.depth_locked;
2631   } else if (!__kmp_test_drdpa_lock(lck, gtid)) {
2632     retval = 0;
2633   } else {
2634     KMP_MB();
2635     retval = lck->lk.depth_locked = 1;
2636     KMP_MB();
2637     lck->lk.owner_id = gtid + 1;
2638   }
2639   return retval;
2640 }
2641 
2642 static int __kmp_test_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2643                                                     kmp_int32 gtid) {
2644   char const *const func = "omp_test_nest_lock";
2645   if (lck->lk.initialized != lck) {
2646     KMP_FATAL(LockIsUninitialized, func);
2647   }
2648   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2649     KMP_FATAL(LockSimpleUsedAsNestable, func);
2650   }
2651   return __kmp_test_nested_drdpa_lock(lck, gtid);
2652 }
2653 
2654 int __kmp_release_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2655   KMP_DEBUG_ASSERT(gtid >= 0);
2656 
2657   KMP_MB();
2658   if (--(lck->lk.depth_locked) == 0) {
2659     KMP_MB();
2660     lck->lk.owner_id = 0;
2661     __kmp_release_drdpa_lock(lck, gtid);
2662     return KMP_LOCK_RELEASED;
2663   }
2664   return KMP_LOCK_STILL_HELD;
2665 }
2666 
2667 static int __kmp_release_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2668                                                        kmp_int32 gtid) {
2669   char const *const func = "omp_unset_nest_lock";
2670   KMP_MB(); /* in case another processor initialized lock */
2671   if (lck->lk.initialized != lck) {
2672     KMP_FATAL(LockIsUninitialized, func);
2673   }
2674   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2675     KMP_FATAL(LockSimpleUsedAsNestable, func);
2676   }
2677   if (__kmp_get_drdpa_lock_owner(lck) == -1) {
2678     KMP_FATAL(LockUnsettingFree, func);
2679   }
2680   if (__kmp_get_drdpa_lock_owner(lck) != gtid) {
2681     KMP_FATAL(LockUnsettingSetByAnother, func);
2682   }
2683   return __kmp_release_nested_drdpa_lock(lck, gtid);
2684 }
2685 
2686 void __kmp_init_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
2687   __kmp_init_drdpa_lock(lck);
2688   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
2689 }
2690 
2691 static void __kmp_init_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2692   __kmp_init_nested_drdpa_lock(lck);
2693 }
2694 
2695 void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
2696   __kmp_destroy_drdpa_lock(lck);
2697   lck->lk.depth_locked = 0;
2698 }
2699 
2700 static void __kmp_destroy_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2701   char const *const func = "omp_destroy_nest_lock";
2702   if (lck->lk.initialized != lck) {
2703     KMP_FATAL(LockIsUninitialized, func);
2704   }
2705   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2706     KMP_FATAL(LockSimpleUsedAsNestable, func);
2707   }
2708   if (__kmp_get_drdpa_lock_owner(lck) != -1) {
2709     KMP_FATAL(LockStillOwned, func);
2710   }
2711   __kmp_destroy_nested_drdpa_lock(lck);
2712 }
2713 
2714 // access functions to fields which don't exist for all lock kinds.
2715 
2716 static int __kmp_is_drdpa_lock_initialized(kmp_drdpa_lock_t *lck) {
2717   return lck == lck->lk.initialized;
2718 }
2719 
2720 static const ident_t *__kmp_get_drdpa_lock_location(kmp_drdpa_lock_t *lck) {
2721   return lck->lk.location;
2722 }
2723 
2724 static void __kmp_set_drdpa_lock_location(kmp_drdpa_lock_t *lck,
2725                                           const ident_t *loc) {
2726   lck->lk.location = loc;
2727 }
2728 
2729 static kmp_lock_flags_t __kmp_get_drdpa_lock_flags(kmp_drdpa_lock_t *lck) {
2730   return lck->lk.flags;
2731 }
2732 
2733 static void __kmp_set_drdpa_lock_flags(kmp_drdpa_lock_t *lck,
2734                                        kmp_lock_flags_t flags) {
2735   lck->lk.flags = flags;
2736 }
2737 
2738 // Time stamp counter
2739 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
2740 #define __kmp_tsc() __kmp_hardware_timestamp()
2741 // Runtime's default backoff parameters
2742 kmp_backoff_t __kmp_spin_backoff_params = {1, 4096, 100};
2743 #else
2744 // Use nanoseconds for other platforms
2745 extern kmp_uint64 __kmp_now_nsec();
2746 kmp_backoff_t __kmp_spin_backoff_params = {1, 256, 100};
2747 #define __kmp_tsc() __kmp_now_nsec()
2748 #endif
2749 
// A useful predicate for dealing with timestamps that may wrap.
// Is a before b? Since the timestamps may wrap, this is asking whether it's
// shorter to go clockwise from a to b around the clock-face, or anti-clockwise.
// Times where going clockwise is less distance than going anti-clockwise are in
// the future, others are in the past. E.g. with a = MAX-1 and b = MAX+1 (= 0),
// a > b is true but does not mean that a reached b, whereas the signed view
// (signed(a) = -2, signed(b) = 0) captures the actual difference.
2757 static inline bool before(kmp_uint64 a, kmp_uint64 b) {
2758   return ((kmp_int64)b - (kmp_int64)a) > 0;
2759 }
2760 
2761 // Truncated binary exponential backoff function
2762 void __kmp_spin_backoff(kmp_backoff_t *boff) {
  // We could flatten this loop, but making it a nested loop gives better results.
2764   kmp_uint32 i;
2765   for (i = boff->step; i > 0; i--) {
2766     kmp_uint64 goal = __kmp_tsc() + boff->min_tick;
2767     do {
2768       KMP_CPU_PAUSE();
2769     } while (before(__kmp_tsc(), goal));
2770   }
2771   boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1);
2772 }
2773 
2774 #if KMP_USE_DYNAMIC_LOCK
2775 
// Direct lock initializer. It simply writes a tag to the low 8 bits of the
// lock word.
2778 static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck,
2779                                    kmp_dyna_lockseq_t seq) {
2780   TCW_4(*lck, KMP_GET_D_TAG(seq));
2781   KA_TRACE(
2782       20,
2783       ("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq));
2784 }
2785 
2786 #if KMP_USE_TSX
2787 
2788 // HLE lock functions - imported from the testbed runtime.
2789 #define HLE_ACQUIRE ".byte 0xf2;"
2790 #define HLE_RELEASE ".byte 0xf3;"
2791 
2792 static inline kmp_uint32 swap4(kmp_uint32 volatile *p, kmp_uint32 v) {
2793   __asm__ volatile(HLE_ACQUIRE "xchg %1,%0" : "+r"(v), "+m"(*p) : : "memory");
2794   return v;
2795 }
2796 
2797 static void __kmp_destroy_hle_lock(kmp_dyna_lock_t *lck) { TCW_4(*lck, 0); }
2798 
2799 static void __kmp_acquire_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2800   // Use gtid for KMP_LOCK_BUSY if necessary
2801   if (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle)) {
2802     int delay = 1;
2803     do {
2804       while (*(kmp_uint32 volatile *)lck != KMP_LOCK_FREE(hle)) {
2805         for (int i = delay; i != 0; --i)
2806           KMP_CPU_PAUSE();
2807         delay = ((delay << 1) | 1) & 7;
2808       }
2809     } while (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle));
2810   }
2811 }
2812 
2813 static void __kmp_acquire_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2814                                                kmp_int32 gtid) {
2815   __kmp_acquire_hle_lock(lck, gtid); // TODO: add checks
2816 }
2817 
2818 static int __kmp_release_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2819   __asm__ volatile(HLE_RELEASE "movl %1,%0"
2820                    : "=m"(*lck)
2821                    : "r"(KMP_LOCK_FREE(hle))
2822                    : "memory");
2823   return KMP_LOCK_RELEASED;
2824 }
2825 
2826 static int __kmp_release_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2827                                               kmp_int32 gtid) {
2828   return __kmp_release_hle_lock(lck, gtid); // TODO: add checks
2829 }
2830 
2831 static int __kmp_test_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2832   return swap4(lck, KMP_LOCK_BUSY(1, hle)) == KMP_LOCK_FREE(hle);
2833 }
2834 
2835 static int __kmp_test_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2836                                            kmp_int32 gtid) {
2837   return __kmp_test_hle_lock(lck, gtid); // TODO: add checks
2838 }
2839 
2840 static void __kmp_init_rtm_lock(kmp_queuing_lock_t *lck) {
2841   __kmp_init_queuing_lock(lck);
2842 }
2843 
2844 static void __kmp_destroy_rtm_lock(kmp_queuing_lock_t *lck) {
2845   __kmp_destroy_queuing_lock(lck);
2846 }
2847 
2848 static void __kmp_acquire_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
2849   unsigned retries = 3, status;
2850   do {
2851     status = _xbegin();
2852     if (status == _XBEGIN_STARTED) {
2853       if (__kmp_is_unlocked_queuing_lock(lck))
2854         return;
2855       _xabort(0xff);
2856     }
2857     if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) {
2858       // Wait until lock becomes free
2859       while (!__kmp_is_unlocked_queuing_lock(lck))
2860         __kmp_yield(TRUE);
2861     } else if (!(status & _XABORT_RETRY))
2862       break;
2863   } while (retries--);
2864 
  // Fall back to the non-speculative (queuing) lock
2866   __kmp_acquire_queuing_lock(lck, gtid);
2867 }
2868 
2869 static void __kmp_acquire_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
2870                                                kmp_int32 gtid) {
2871   __kmp_acquire_rtm_lock(lck, gtid);
2872 }
2873 
2874 static int __kmp_release_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
2875   if (__kmp_is_unlocked_queuing_lock(lck)) {
2876     // Releasing from speculation
2877     _xend();
2878   } else {
2879     // Releasing from a real lock
2880     __kmp_release_queuing_lock(lck, gtid);
2881   }
2882   return KMP_LOCK_RELEASED;
2883 }
2884 
2885 static int __kmp_release_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
2886                                               kmp_int32 gtid) {
2887   return __kmp_release_rtm_lock(lck, gtid);
2888 }
2889 
2890 static int __kmp_test_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
2891   unsigned retries = 3, status;
2892   do {
2893     status = _xbegin();
2894     if (status == _XBEGIN_STARTED && __kmp_is_unlocked_queuing_lock(lck)) {
2895       return 1;
2896     }
2897     if (!(status & _XABORT_RETRY))
2898       break;
2899   } while (retries--);
2900 
2901   return (__kmp_is_unlocked_queuing_lock(lck)) ? 1 : 0;
2902 }
2903 
2904 static int __kmp_test_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
2905                                            kmp_int32 gtid) {
2906   return __kmp_test_rtm_lock(lck, gtid);
2907 }
2908 
2909 #endif // KMP_USE_TSX
2910 
2911 // Entry functions for indirect locks (first element of direct lock jump tables)
2912 static void __kmp_init_indirect_lock(kmp_dyna_lock_t *l,
2913                                      kmp_dyna_lockseq_t tag);
2914 static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock);
2915 static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2916 static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2917 static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2918 static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2919                                                kmp_int32);
2920 static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2921                                                  kmp_int32);
2922 static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2923                                                 kmp_int32);
2924 
2925 // Jump tables for the indirect lock functions
// Only fill in the odd entries; that avoids the need to shift out the low bit.
2927 
2928 // init functions
2929 #define expand(l, op) 0, __kmp_init_direct_lock,
2930 void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t) = {
2931     __kmp_init_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, init)};
2932 #undef expand
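
// Informal sketch of how a lock word selects an entry (the masking expression
// is an assumption about the companion dispatch macros in kmp_lock.h, not
// something defined here): direct locks store an odd tag and so land on the
// odd, populated slots, while a word with a clear low bit denotes an indirect
// lock and collapses to slot 0, which routes to the indirect handler.
//
//   idx = (word & 0xFF) & -(word & 1); // odd tag for direct, 0 for indirect
//   __kmp_direct_init[idx](lck, seq);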
2933 
2934 // destroy functions
2935 #define expand(l, op) 0, (void (*)(kmp_dyna_lock_t *))__kmp_##op##_##l##_lock,
2936 void (*__kmp_direct_destroy[])(kmp_dyna_lock_t *) = {
2937     __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)};
2938 #undef expand
2939 
2940 // set/acquire functions
2941 #define expand(l, op)                                                          \
2942   0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
2943 static int (*direct_set[])(kmp_dyna_lock_t *, kmp_int32) = {
2944     __kmp_set_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, acquire)};
2945 #undef expand
2946 #define expand(l, op)                                                          \
2947   0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks,
2948 static int (*direct_set_check[])(kmp_dyna_lock_t *, kmp_int32) = {
2949     __kmp_set_indirect_lock_with_checks, 0,
2950     KMP_FOREACH_D_LOCK(expand, acquire)};
2951 #undef expand
2952 
2953 // unset/release and test functions
2954 #define expand(l, op)                                                          \
2955   0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
2956 static int (*direct_unset[])(kmp_dyna_lock_t *, kmp_int32) = {
2957     __kmp_unset_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, release)};
2958 static int (*direct_test[])(kmp_dyna_lock_t *, kmp_int32) = {
2959     __kmp_test_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, test)};
2960 #undef expand
2961 #define expand(l, op)                                                          \
2962   0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks,
2963 static int (*direct_unset_check[])(kmp_dyna_lock_t *, kmp_int32) = {
2964     __kmp_unset_indirect_lock_with_checks, 0,
2965     KMP_FOREACH_D_LOCK(expand, release)};
2966 static int (*direct_test_check[])(kmp_dyna_lock_t *, kmp_int32) = {
2967     __kmp_test_indirect_lock_with_checks, 0, KMP_FOREACH_D_LOCK(expand, test)};
2968 #undef expand
2969 
2970 // Exposes only one set of jump tables (*lock or *lock_with_checks).
2971 int (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32) = 0;
2972 int (*(*__kmp_direct_unset))(kmp_dyna_lock_t *, kmp_int32) = 0;
2973 int (*(*__kmp_direct_test))(kmp_dyna_lock_t *, kmp_int32) = 0;
2974 
2975 // Jump tables for the indirect lock functions
2976 #define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock,
2977 void (*__kmp_indirect_init[])(kmp_user_lock_p) = {
2978     KMP_FOREACH_I_LOCK(expand, init)};
2979 void (*__kmp_indirect_destroy[])(kmp_user_lock_p) = {
2980     KMP_FOREACH_I_LOCK(expand, destroy)};
2981 #undef expand
2982 
2983 // set/acquire functions
2984 #define expand(l, op)                                                          \
2985   (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
2986 static int (*indirect_set[])(kmp_user_lock_p,
2987                              kmp_int32) = {KMP_FOREACH_I_LOCK(expand, acquire)};
2988 #undef expand
2989 #define expand(l, op)                                                          \
2990   (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
2991 static int (*indirect_set_check[])(kmp_user_lock_p, kmp_int32) = {
2992     KMP_FOREACH_I_LOCK(expand, acquire)};
2993 #undef expand
2994 
2995 // unset/release and test functions
2996 #define expand(l, op)                                                          \
2997   (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
2998 static int (*indirect_unset[])(kmp_user_lock_p, kmp_int32) = {
2999     KMP_FOREACH_I_LOCK(expand, release)};
3000 static int (*indirect_test[])(kmp_user_lock_p,
3001                               kmp_int32) = {KMP_FOREACH_I_LOCK(expand, test)};
3002 #undef expand
3003 #define expand(l, op)                                                          \
3004   (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
3005 static int (*indirect_unset_check[])(kmp_user_lock_p, kmp_int32) = {
3006     KMP_FOREACH_I_LOCK(expand, release)};
3007 static int (*indirect_test_check[])(kmp_user_lock_p, kmp_int32) = {
3008     KMP_FOREACH_I_LOCK(expand, test)};
3009 #undef expand
3010 
3011 // Exposes only one set of jump tables (*lock or *lock_with_checks).
3012 int (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32) = 0;
3013 int (*(*__kmp_indirect_unset))(kmp_user_lock_p, kmp_int32) = 0;
3014 int (*(*__kmp_indirect_test))(kmp_user_lock_p, kmp_int32) = 0;
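// Note: the direct and indirect dispatch pointers above are bound to either
// the plain or the *_check jump tables in __kmp_init_dynamic_user_locks(),
// depending on __kmp_env_consistency_check.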
3015 
3016 // Lock index table.
3017 kmp_indirect_lock_table_t __kmp_i_lock_table;
3018 
3019 // Size of indirect locks.
3020 static kmp_uint32 __kmp_indirect_lock_size[KMP_NUM_I_LOCKS] = {0};
3021 
3022 // Jump tables for lock accessor/modifier.
3023 void (*__kmp_indirect_set_location[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
3024                                                      const ident_t *) = {0};
3025 void (*__kmp_indirect_set_flags[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
3026                                                   kmp_lock_flags_t) = {0};
3027 const ident_t *(*__kmp_indirect_get_location[KMP_NUM_I_LOCKS])(
3028     kmp_user_lock_p) = {0};
3029 kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])(
3030     kmp_user_lock_p) = {0};
3031 
3032 // Use different lock pools for different lock types.
3033 static kmp_indirect_lock_t *__kmp_indirect_lock_pool[KMP_NUM_I_LOCKS] = {0};
3034 
3035 // User lock allocator for dynamically dispatched indirect locks. Every entry of
3036 // the indirect lock table holds the address and type of the allocated indirect
3037 // lock (kmp_indirect_lock_t), and the size of the table doubles when it is
3038 // full. A destroyed indirect lock object is returned to the reusable pool of
3039 // locks, unique to each lock type.
3040 kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock,
3041                                                   kmp_int32 gtid,
3042                                                   kmp_indirect_locktag_t tag) {
3043   kmp_indirect_lock_t *lck;
3044   kmp_lock_index_t idx;
3045 
3046   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3047 
3048   if (__kmp_indirect_lock_pool[tag] != NULL) {
3049     // Reuse the allocated and destroyed lock object
3050     lck = __kmp_indirect_lock_pool[tag];
3051     if (OMP_LOCK_T_SIZE < sizeof(void *))
3052       idx = lck->lock->pool.index;
3053     __kmp_indirect_lock_pool[tag] = (kmp_indirect_lock_t *)lck->lock->pool.next;
3054     KA_TRACE(20, ("__kmp_allocate_indirect_lock: reusing an existing lock %p\n",
3055                   lck));
3056   } else {
3057     idx = __kmp_i_lock_table.next;
3058     // Check capacity and double the size if it is full
3059     if (idx == __kmp_i_lock_table.size) {
3060       // Double up the space for block pointers
3061       int row = __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK;
3062       kmp_indirect_lock_t **new_table = (kmp_indirect_lock_t **)__kmp_allocate(
3063           2 * row * sizeof(kmp_indirect_lock_t *));
3064       KMP_MEMCPY(new_table, __kmp_i_lock_table.table,
3065                  row * sizeof(kmp_indirect_lock_t *));
3066       kmp_indirect_lock_t **old_table = __kmp_i_lock_table.table;
3067       __kmp_i_lock_table.table = new_table;
3068       __kmp_free(old_table);
3069       // Allocate new objects in the new blocks
3070       for (int i = row; i < 2 * row; ++i)
3071         *(__kmp_i_lock_table.table + i) = (kmp_indirect_lock_t *)__kmp_allocate(
3072             KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
3073       __kmp_i_lock_table.size = 2 * idx;
3074     }
3075     __kmp_i_lock_table.next++;
3076     lck = KMP_GET_I_LOCK(idx);
3077     // Allocate a new base lock object
3078     lck->lock = (kmp_user_lock_p)__kmp_allocate(__kmp_indirect_lock_size[tag]);
3079     KA_TRACE(20,
3080              ("__kmp_allocate_indirect_lock: allocated a new lock %p\n", lck));
3081   }
3082 
3083   __kmp_release_lock(&__kmp_global_lock, gtid);
3084 
3085   lck->type = tag;
3086 
3087   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3088     *((kmp_lock_index_t *)user_lock) = idx
3089                                        << 1; // indirect lock word must be even
3090   } else {
3091     *((kmp_indirect_lock_t **)user_lock) = lck;
3092   }
3093 
3094   return lck;
3095 }
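
// Resulting content of the user lock word written above (assuming
// KMP_EXTRACT_I_INDEX in the lookup below simply undoes the shift):
//   OMP_LOCK_T_SIZE < sizeof(void *): *user_lock == (idx << 1)  (always even)
//   otherwise:                        *user_lock == lck         (raw pointer)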
3096 
3097 // User lock lookup for dynamically dispatched locks.
3098 static __forceinline kmp_indirect_lock_t *
3099 __kmp_lookup_indirect_lock(void **user_lock, const char *func) {
3100   if (__kmp_env_consistency_check) {
3101     kmp_indirect_lock_t *lck = NULL;
3102     if (user_lock == NULL) {
3103       KMP_FATAL(LockIsUninitialized, func);
3104     }
3105     if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3106       kmp_lock_index_t idx = KMP_EXTRACT_I_INDEX(user_lock);
3107       if (idx >= __kmp_i_lock_table.size) {
3108         KMP_FATAL(LockIsUninitialized, func);
3109       }
3110       lck = KMP_GET_I_LOCK(idx);
3111     } else {
3112       lck = *((kmp_indirect_lock_t **)user_lock);
3113     }
3114     if (lck == NULL) {
3115       KMP_FATAL(LockIsUninitialized, func);
3116     }
3117     return lck;
3118   } else {
3119     if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3120       return KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(user_lock));
3121     } else {
3122       return *((kmp_indirect_lock_t **)user_lock);
3123     }
3124   }
3125 }
3126 
3127 static void __kmp_init_indirect_lock(kmp_dyna_lock_t *lock,
3128                                      kmp_dyna_lockseq_t seq) {
3129 #if KMP_USE_ADAPTIVE_LOCKS
3130   if (seq == lockseq_adaptive && !__kmp_cpuinfo.rtm) {
3131     KMP_WARNING(AdaptiveNotSupported, "kmp_lockseq_t", "adaptive");
3132     seq = lockseq_queuing;
3133   }
3134 #endif
3135 #if KMP_USE_TSX
3136   if (seq == lockseq_rtm && !__kmp_cpuinfo.rtm) {
3137     seq = lockseq_queuing;
3138   }
3139 #endif
3140   kmp_indirect_locktag_t tag = KMP_GET_I_TAG(seq);
3141   kmp_indirect_lock_t *l =
3142       __kmp_allocate_indirect_lock((void **)lock, __kmp_entry_gtid(), tag);
3143   KMP_I_LOCK_FUNC(l, init)(l->lock);
3144   KA_TRACE(
3145       20, ("__kmp_init_indirect_lock: initialized indirect lock with type#%d\n",
3146            seq));
3147 }
3148 
3149 static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock) {
3150   kmp_uint32 gtid = __kmp_entry_gtid();
3151   kmp_indirect_lock_t *l =
3152       __kmp_lookup_indirect_lock((void **)lock, "omp_destroy_lock");
3153   KMP_I_LOCK_FUNC(l, destroy)(l->lock);
3154   kmp_indirect_locktag_t tag = l->type;
3155 
3156   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3157 
3158   // Use the base lock's space to keep the pool chain.
3159   l->lock->pool.next = (kmp_user_lock_p)__kmp_indirect_lock_pool[tag];
3160   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3161     l->lock->pool.index = KMP_EXTRACT_I_INDEX(lock);
3162   }
3163   __kmp_indirect_lock_pool[tag] = l;
3164 
3165   __kmp_release_lock(&__kmp_global_lock, gtid);
3166 }
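
// A destroyed indirect lock is not freed here: its base lock object is chained
// into __kmp_indirect_lock_pool[tag] (via the pool.next/pool.index fields) and
// handed out again by __kmp_allocate_indirect_lock. The memory is released
// only in __kmp_cleanup_indirect_user_locks().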
3167 
3168 static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
3169   kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
3170   return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
3171 }
3172 
3173 static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
3174   kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
3175   return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
3176 }
3177 
3178 static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
3179   kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
3180   return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
3181 }
3182 
3183 static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
3184                                                kmp_int32 gtid) {
3185   kmp_indirect_lock_t *l =
3186       __kmp_lookup_indirect_lock((void **)lock, "omp_set_lock");
3187   return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
3188 }
3189 
3190 static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
3191                                                  kmp_int32 gtid) {
3192   kmp_indirect_lock_t *l =
3193       __kmp_lookup_indirect_lock((void **)lock, "omp_unset_lock");
3194   return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
3195 }
3196 
3197 static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
3198                                                 kmp_int32 gtid) {
3199   kmp_indirect_lock_t *l =
3200       __kmp_lookup_indirect_lock((void **)lock, "omp_test_lock");
3201   return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
3202 }
3203 
3204 kmp_dyna_lockseq_t __kmp_user_lock_seq = lockseq_queuing;
3205 
3206 // This is used only in kmp_error.cpp when consistency checking is on.
3207 kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p lck, kmp_uint32 seq) {
3208   switch (seq) {
3209   case lockseq_tas:
3210   case lockseq_nested_tas:
3211     return __kmp_get_tas_lock_owner((kmp_tas_lock_t *)lck);
3212 #if KMP_USE_FUTEX
3213   case lockseq_futex:
3214   case lockseq_nested_futex:
3215     return __kmp_get_futex_lock_owner((kmp_futex_lock_t *)lck);
3216 #endif
3217   case lockseq_ticket:
3218   case lockseq_nested_ticket:
3219     return __kmp_get_ticket_lock_owner((kmp_ticket_lock_t *)lck);
3220   case lockseq_queuing:
3221   case lockseq_nested_queuing:
3222 #if KMP_USE_ADAPTIVE_LOCKS
3223   case lockseq_adaptive:
3224 #endif
3225     return __kmp_get_queuing_lock_owner((kmp_queuing_lock_t *)lck);
3226   case lockseq_drdpa:
3227   case lockseq_nested_drdpa:
3228     return __kmp_get_drdpa_lock_owner((kmp_drdpa_lock_t *)lck);
3229   default:
3230     return 0;
3231   }
3232 }
3233 
3234 // Initializes data for dynamic user locks.
3235 void __kmp_init_dynamic_user_locks() {
3236   // Initialize jump table for the lock functions
3237   if (__kmp_env_consistency_check) {
3238     __kmp_direct_set = direct_set_check;
3239     __kmp_direct_unset = direct_unset_check;
3240     __kmp_direct_test = direct_test_check;
3241     __kmp_indirect_set = indirect_set_check;
3242     __kmp_indirect_unset = indirect_unset_check;
3243     __kmp_indirect_test = indirect_test_check;
3244   } else {
3245     __kmp_direct_set = direct_set;
3246     __kmp_direct_unset = direct_unset;
3247     __kmp_direct_test = direct_test;
3248     __kmp_indirect_set = indirect_set;
3249     __kmp_indirect_unset = indirect_unset;
3250     __kmp_indirect_test = indirect_test;
3251   }
3252   // If the user locks have already been initialized, then return. Allow the
3253   // switch between different KMP_CONSISTENCY_CHECK values, but do not allocate
3254   // new lock tables if they have already been allocated.
3255   if (__kmp_init_user_locks)
3256     return;
3257 
3258   // Initialize lock index table
3259   __kmp_i_lock_table.size = KMP_I_LOCK_CHUNK;
3260   __kmp_i_lock_table.table =
3261       (kmp_indirect_lock_t **)__kmp_allocate(sizeof(kmp_indirect_lock_t *));
3262   *(__kmp_i_lock_table.table) = (kmp_indirect_lock_t *)__kmp_allocate(
3263       KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
3264   __kmp_i_lock_table.next = 0;
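  // The table starts with a single block of KMP_I_LOCK_CHUNK locks;
  // __kmp_allocate_indirect_lock doubles the number of blocks whenever the
  // table fills up.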
3265 
3266   // Indirect lock size
3267   __kmp_indirect_lock_size[locktag_ticket] = sizeof(kmp_ticket_lock_t);
3268   __kmp_indirect_lock_size[locktag_queuing] = sizeof(kmp_queuing_lock_t);
3269 #if KMP_USE_ADAPTIVE_LOCKS
3270   __kmp_indirect_lock_size[locktag_adaptive] = sizeof(kmp_adaptive_lock_t);
3271 #endif
3272   __kmp_indirect_lock_size[locktag_drdpa] = sizeof(kmp_drdpa_lock_t);
3273 #if KMP_USE_TSX
3274   __kmp_indirect_lock_size[locktag_rtm] = sizeof(kmp_queuing_lock_t);
3275 #endif
3276   __kmp_indirect_lock_size[locktag_nested_tas] = sizeof(kmp_tas_lock_t);
3277 #if KMP_USE_FUTEX
3278   __kmp_indirect_lock_size[locktag_nested_futex] = sizeof(kmp_futex_lock_t);
3279 #endif
3280   __kmp_indirect_lock_size[locktag_nested_ticket] = sizeof(kmp_ticket_lock_t);
3281   __kmp_indirect_lock_size[locktag_nested_queuing] = sizeof(kmp_queuing_lock_t);
3282   __kmp_indirect_lock_size[locktag_nested_drdpa] = sizeof(kmp_drdpa_lock_t);
3283 
3284 // Initialize lock accessor/modifier
3285 #define fill_jumps(table, expand, sep)                                         \
3286   {                                                                            \
3287     table[locktag##sep##ticket] = expand(ticket);                              \
3288     table[locktag##sep##queuing] = expand(queuing);                            \
3289     table[locktag##sep##drdpa] = expand(drdpa);                                \
3290   }
3291 
3292 #if KMP_USE_ADAPTIVE_LOCKS
3293 #define fill_table(table, expand)                                              \
3294   {                                                                            \
3295     fill_jumps(table, expand, _);                                              \
3296     table[locktag_adaptive] = expand(queuing);                                 \
3297     fill_jumps(table, expand, _nested_);                                       \
3298   }
3299 #else
3300 #define fill_table(table, expand)                                              \
3301   {                                                                            \
3302     fill_jumps(table, expand, _);                                              \
3303     fill_jumps(table, expand, _nested_);                                       \
3304   }
3305 #endif // KMP_USE_ADAPTIVE_LOCKS
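
// Accessor/modifier entries are filled only for ticket, queuing (adaptive
// reuses the queuing accessors), and drdpa locks; the remaining tags keep the
// zero initializers from the table definitions above, so a NULL entry is
// expected to mean "not supported" for that lock type.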
3306 
3307 #define expand(l)                                                              \
3308   (void (*)(kmp_user_lock_p, const ident_t *)) __kmp_set_##l##_lock_location
3309   fill_table(__kmp_indirect_set_location, expand);
3310 #undef expand
3311 #define expand(l)                                                              \
3312   (void (*)(kmp_user_lock_p, kmp_lock_flags_t)) __kmp_set_##l##_lock_flags
3313   fill_table(__kmp_indirect_set_flags, expand);
3314 #undef expand
3315 #define expand(l)                                                              \
3316   (const ident_t *(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_location
3317   fill_table(__kmp_indirect_get_location, expand);
3318 #undef expand
3319 #define expand(l)                                                              \
3320   (kmp_lock_flags_t(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_flags
3321   fill_table(__kmp_indirect_get_flags, expand);
3322 #undef expand
3323 
3324   __kmp_init_user_locks = TRUE;
3325 }
3326 
3327 // Clean up the lock table.
3328 void __kmp_cleanup_indirect_user_locks() {
3329   kmp_lock_index_t i;
3330   int k;
3331 
3332   // Clean up locks in the pools first (they were already destroyed before going
3333   // into the pools).
3334   for (k = 0; k < KMP_NUM_I_LOCKS; ++k) {
3335     kmp_indirect_lock_t *l = __kmp_indirect_lock_pool[k];
3336     while (l != NULL) {
3337       kmp_indirect_lock_t *ll = l;
3338       l = (kmp_indirect_lock_t *)l->lock->pool.next;
3339       KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: freeing %p from pool\n",
3340                     ll));
3341       __kmp_free(ll->lock);
3342       ll->lock = NULL;
3343     }
3344     __kmp_indirect_lock_pool[k] = NULL;
3345   }
3346   // Clean up the remaining undestroyed locks.
3347   for (i = 0; i < __kmp_i_lock_table.next; i++) {
3348     kmp_indirect_lock_t *l = KMP_GET_I_LOCK(i);
3349     if (l->lock != NULL) {
3350       // Locks not destroyed explicitly need to be destroyed here.
3351       KMP_I_LOCK_FUNC(l, destroy)(l->lock);
3352       KA_TRACE(
3353           20,
3354           ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p from table\n",
3355            l));
3356       __kmp_free(l->lock);
3357     }
3358   }
3359   // Free the table
3360   for (i = 0; i < __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK; i++)
3361     __kmp_free(__kmp_i_lock_table.table[i]);
3362   __kmp_free(__kmp_i_lock_table.table);
3363 
3364   __kmp_init_user_locks = FALSE;
3365 }
3366 
3367 enum kmp_lock_kind __kmp_user_lock_kind = lk_default;
3368 int __kmp_num_locks_in_block = 1; // FIXME - tune this value
3369 
3370 #else // KMP_USE_DYNAMIC_LOCK
3371 
3372 /* user locks
3373  * They are implemented as a table of function pointers which are set to the
3374  * lock functions of the appropriate kind, once that has been determined. */
3375 
3376 enum kmp_lock_kind __kmp_user_lock_kind = lk_default;
3377 
3378 size_t __kmp_base_user_lock_size = 0;
3379 size_t __kmp_user_lock_size = 0;
3380 
3381 kmp_int32 (*__kmp_get_user_lock_owner_)(kmp_user_lock_p lck) = NULL;
3382 int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
3383                                             kmp_int32 gtid) = NULL;
3384 
3385 int (*__kmp_test_user_lock_with_checks_)(kmp_user_lock_p lck,
3386                                          kmp_int32 gtid) = NULL;
3387 int (*__kmp_release_user_lock_with_checks_)(kmp_user_lock_p lck,
3388                                             kmp_int32 gtid) = NULL;
3389 void (*__kmp_init_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3390 void (*__kmp_destroy_user_lock_)(kmp_user_lock_p lck) = NULL;
3391 void (*__kmp_destroy_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3392 int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
3393                                                    kmp_int32 gtid) = NULL;
3394 
3395 int (*__kmp_test_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
3396                                                 kmp_int32 gtid) = NULL;
3397 int (*__kmp_release_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
3398                                                    kmp_int32 gtid) = NULL;
3399 void (*__kmp_init_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3400 void (*__kmp_destroy_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3401 
3402 int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck) = NULL;
3403 const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck) = NULL;
3404 void (*__kmp_set_user_lock_location_)(kmp_user_lock_p lck,
3405                                       const ident_t *loc) = NULL;
3406 kmp_lock_flags_t (*__kmp_get_user_lock_flags_)(kmp_user_lock_p lck) = NULL;
3407 void (*__kmp_set_user_lock_flags_)(kmp_user_lock_p lck,
3408                                    kmp_lock_flags_t flags) = NULL;
3409 
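// The function pointers above are bound to one concrete lock implementation by
// __kmp_set_user_lock_vptrs() below, based on the selected kmp_lock_kind_t.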
3410 void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind) {
3411   switch (user_lock_kind) {
3412   case lk_default:
3413   default:
3414     KMP_ASSERT(0);
3415 
3416   case lk_tas: {
3417     __kmp_base_user_lock_size = sizeof(kmp_base_tas_lock_t);
3418     __kmp_user_lock_size = sizeof(kmp_tas_lock_t);
3419 
3420     __kmp_get_user_lock_owner_ =
3421         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_tas_lock_owner);
3422 
3423     if (__kmp_env_consistency_check) {
3424       KMP_BIND_USER_LOCK_WITH_CHECKS(tas);
3425       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(tas);
3426     } else {
3427       KMP_BIND_USER_LOCK(tas);
3428       KMP_BIND_NESTED_USER_LOCK(tas);
3429     }
3430 
3431     __kmp_destroy_user_lock_ =
3432         (void (*)(kmp_user_lock_p))(&__kmp_destroy_tas_lock);
3433 
3434     __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL;
3435 
3436     __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL;
3437 
3438     __kmp_set_user_lock_location_ =
3439         (void (*)(kmp_user_lock_p, const ident_t *))NULL;
3440 
3441     __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL;
3442 
3443     __kmp_set_user_lock_flags_ =
3444         (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL;
3445   } break;
3446 
3447 #if KMP_USE_FUTEX
3448 
3449   case lk_futex: {
3450     __kmp_base_user_lock_size = sizeof(kmp_base_futex_lock_t);
3451     __kmp_user_lock_size = sizeof(kmp_futex_lock_t);
3452 
3453     __kmp_get_user_lock_owner_ =
3454         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_futex_lock_owner);
3455 
3456     if (__kmp_env_consistency_check) {
3457       KMP_BIND_USER_LOCK_WITH_CHECKS(futex);
3458       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(futex);
3459     } else {
3460       KMP_BIND_USER_LOCK(futex);
3461       KMP_BIND_NESTED_USER_LOCK(futex);
3462     }
3463 
3464     __kmp_destroy_user_lock_ =
3465         (void (*)(kmp_user_lock_p))(&__kmp_destroy_futex_lock);
3466 
3467     __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL;
3468 
3469     __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL;
3470 
3471     __kmp_set_user_lock_location_ =
3472         (void (*)(kmp_user_lock_p, const ident_t *))NULL;
3473 
3474     __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL;
3475 
3476     __kmp_set_user_lock_flags_ =
3477         (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL;
3478   } break;
3479 
3480 #endif // KMP_USE_FUTEX
3481 
3482   case lk_ticket: {
3483     __kmp_base_user_lock_size = sizeof(kmp_base_ticket_lock_t);
3484     __kmp_user_lock_size = sizeof(kmp_ticket_lock_t);
3485 
3486     __kmp_get_user_lock_owner_ =
3487         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_owner);
3488 
3489     if (__kmp_env_consistency_check) {
3490       KMP_BIND_USER_LOCK_WITH_CHECKS(ticket);
3491       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(ticket);
3492     } else {
3493       KMP_BIND_USER_LOCK(ticket);
3494       KMP_BIND_NESTED_USER_LOCK(ticket);
3495     }
3496 
3497     __kmp_destroy_user_lock_ =
3498         (void (*)(kmp_user_lock_p))(&__kmp_destroy_ticket_lock);
3499 
3500     __kmp_is_user_lock_initialized_ =
3501         (int (*)(kmp_user_lock_p))(&__kmp_is_ticket_lock_initialized);
3502 
3503     __kmp_get_user_lock_location_ =
3504         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_location);
3505 
3506     __kmp_set_user_lock_location_ = (void (*)(
3507         kmp_user_lock_p, const ident_t *))(&__kmp_set_ticket_lock_location);
3508 
3509     __kmp_get_user_lock_flags_ =
3510         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_flags);
3511 
3512     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3513         &__kmp_set_ticket_lock_flags);
3514   } break;
3515 
3516   case lk_queuing: {
3517     __kmp_base_user_lock_size = sizeof(kmp_base_queuing_lock_t);
3518     __kmp_user_lock_size = sizeof(kmp_queuing_lock_t);
3519 
3520     __kmp_get_user_lock_owner_ =
3521         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner);
3522 
3523     if (__kmp_env_consistency_check) {
3524       KMP_BIND_USER_LOCK_WITH_CHECKS(queuing);
3525       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(queuing);
3526     } else {
3527       KMP_BIND_USER_LOCK(queuing);
3528       KMP_BIND_NESTED_USER_LOCK(queuing);
3529     }
3530 
3531     __kmp_destroy_user_lock_ =
3532         (void (*)(kmp_user_lock_p))(&__kmp_destroy_queuing_lock);
3533 
3534     __kmp_is_user_lock_initialized_ =
3535         (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized);
3536 
3537     __kmp_get_user_lock_location_ =
3538         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location);
3539 
3540     __kmp_set_user_lock_location_ = (void (*)(
3541         kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location);
3542 
3543     __kmp_get_user_lock_flags_ =
3544         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags);
3545 
3546     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3547         &__kmp_set_queuing_lock_flags);
3548   } break;
3549 
3550 #if KMP_USE_ADAPTIVE_LOCKS
3551   case lk_adaptive: {
3552     __kmp_base_user_lock_size = sizeof(kmp_base_adaptive_lock_t);
3553     __kmp_user_lock_size = sizeof(kmp_adaptive_lock_t);
3554 
3555     __kmp_get_user_lock_owner_ =
3556         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner);
3557 
3558     if (__kmp_env_consistency_check) {
3559       KMP_BIND_USER_LOCK_WITH_CHECKS(adaptive);
3560     } else {
3561       KMP_BIND_USER_LOCK(adaptive);
3562     }
3563 
3564     __kmp_destroy_user_lock_ =
3565         (void (*)(kmp_user_lock_p))(&__kmp_destroy_adaptive_lock);
3566 
3567     __kmp_is_user_lock_initialized_ =
3568         (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized);
3569 
3570     __kmp_get_user_lock_location_ =
3571         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location);
3572 
3573     __kmp_set_user_lock_location_ = (void (*)(
3574         kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location);
3575 
3576     __kmp_get_user_lock_flags_ =
3577         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags);
3578 
3579     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3580         &__kmp_set_queuing_lock_flags);
3581 
3582   } break;
3583 #endif // KMP_USE_ADAPTIVE_LOCKS
3584 
3585   case lk_drdpa: {
3586     __kmp_base_user_lock_size = sizeof(kmp_base_drdpa_lock_t);
3587     __kmp_user_lock_size = sizeof(kmp_drdpa_lock_t);
3588 
3589     __kmp_get_user_lock_owner_ =
3590         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_owner);
3591 
3592     if (__kmp_env_consistency_check) {
3593       KMP_BIND_USER_LOCK_WITH_CHECKS(drdpa);
3594       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(drdpa);
3595     } else {
3596       KMP_BIND_USER_LOCK(drdpa);
3597       KMP_BIND_NESTED_USER_LOCK(drdpa);
3598     }
3599 
3600     __kmp_destroy_user_lock_ =
3601         (void (*)(kmp_user_lock_p))(&__kmp_destroy_drdpa_lock);
3602 
3603     __kmp_is_user_lock_initialized_ =
3604         (int (*)(kmp_user_lock_p))(&__kmp_is_drdpa_lock_initialized);
3605 
3606     __kmp_get_user_lock_location_ =
3607         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_location);
3608 
3609     __kmp_set_user_lock_location_ = (void (*)(
3610         kmp_user_lock_p, const ident_t *))(&__kmp_set_drdpa_lock_location);
3611 
3612     __kmp_get_user_lock_flags_ =
3613         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_flags);
3614 
3615     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3616         &__kmp_set_drdpa_lock_flags);
3617   } break;
3618   }
3619 }
3620 
3621 // ----------------------------------------------------------------------------
3622 // User lock table & lock allocation
3623 
3624 kmp_lock_table_t __kmp_user_lock_table = {1, 0, NULL};
3625 kmp_user_lock_p __kmp_lock_pool = NULL;
3626 
3627 // Lock block-allocation support.
3628 kmp_block_of_locks *__kmp_lock_blocks = NULL;
3629 int __kmp_num_locks_in_block = 1; // FIXME - tune this value
3630 
3631 static kmp_lock_index_t __kmp_lock_table_insert(kmp_user_lock_p lck) {
3632   // Assume that kmp_global_lock is held upon entry/exit.
3633   kmp_lock_index_t index;
3634   if (__kmp_user_lock_table.used >= __kmp_user_lock_table.allocated) {
3635     kmp_lock_index_t size;
3636     kmp_user_lock_p *table;
3637     // Reallocate lock table.
3638     if (__kmp_user_lock_table.allocated == 0) {
3639       size = 1024;
3640     } else {
3641       size = __kmp_user_lock_table.allocated * 2;
3642     }
3643     table = (kmp_user_lock_p *)__kmp_allocate(sizeof(kmp_user_lock_p) * size);
3644     KMP_MEMCPY(table + 1, __kmp_user_lock_table.table + 1,
3645                sizeof(kmp_user_lock_p) * (__kmp_user_lock_table.used - 1));
3646     table[0] = (kmp_user_lock_p)__kmp_user_lock_table.table;
3647     // We cannot free the previous table now, since it may be in use by other
3648     // threads. So save the pointer to the previous table in the first
3649     // element of the new table. All the tables will be organized into a list,
3650     // and can be freed when the library shuts down.
3651     __kmp_user_lock_table.table = table;
3652     __kmp_user_lock_table.allocated = size;
3653   }
3654   KMP_DEBUG_ASSERT(__kmp_user_lock_table.used <
3655                    __kmp_user_lock_table.allocated);
3656   index = __kmp_user_lock_table.used;
3657   __kmp_user_lock_table.table[index] = lck;
3658   ++__kmp_user_lock_table.used;
3659   return index;
3660 }
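
// Entry [0] of each lock table is reserved for the pointer to the previous
// (smaller) table, which is why the table starts with used == 1 and why
// __kmp_cleanup_user_locks() stops once used drops back to 1.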
3661 
3662 static kmp_user_lock_p __kmp_lock_block_allocate() {
3663   // Assume that kmp_global_lock is held upon entry/exit.
3664   static int last_index = 0;
3665   if ((last_index >= __kmp_num_locks_in_block) || (__kmp_lock_blocks == NULL)) {
3666     // Restart the index.
3667     last_index = 0;
3668     // Need to allocate a new block.
3669     KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0);
3670     size_t space_for_locks = __kmp_user_lock_size * __kmp_num_locks_in_block;
3671     char *buffer =
3672         (char *)__kmp_allocate(space_for_locks + sizeof(kmp_block_of_locks));
3673     // Set up the new block.
3674     kmp_block_of_locks *new_block =
3675         (kmp_block_of_locks *)(&buffer[space_for_locks]);
3676     new_block->next_block = __kmp_lock_blocks;
3677     new_block->locks = (void *)buffer;
3678     // Publish the new block.
3679     KMP_MB();
3680     __kmp_lock_blocks = new_block;
3681   }
3682   kmp_user_lock_p ret = (kmp_user_lock_p)(&(
3683       ((char *)(__kmp_lock_blocks->locks))[last_index * __kmp_user_lock_size]));
3684   last_index++;
3685   return ret;
3686 }
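
// Block layout: the locks occupy the first space_for_locks bytes of the buffer
// and the kmp_block_of_locks header sits at its end, so freeing
// block_ptr->locks in __kmp_cleanup_user_locks() releases the header as well.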
3687 
3688 // Get memory for a lock. It may be freshly allocated memory or reused memory
3689 // from the lock pool.
3690 kmp_user_lock_p __kmp_user_lock_allocate(void **user_lock, kmp_int32 gtid,
3691                                          kmp_lock_flags_t flags) {
3692   kmp_user_lock_p lck;
3693   kmp_lock_index_t index;
3694   KMP_DEBUG_ASSERT(user_lock);
3695 
3696   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3697 
3698   if (__kmp_lock_pool == NULL) {
3699     // Lock pool is empty. Allocate new memory.
3700 
3701     // ANNOTATION: Found no good way to express the synchronization
3702     // between allocation and usage, so ignore the allocation.
3703     ANNOTATE_IGNORE_WRITES_BEGIN();
3704     if (__kmp_num_locks_in_block <= 1) { // Tune this cutoff point.
3705       lck = (kmp_user_lock_p)__kmp_allocate(__kmp_user_lock_size);
3706     } else {
3707       lck = __kmp_lock_block_allocate();
3708     }
3709     ANNOTATE_IGNORE_WRITES_END();
3710 
3711     // Insert the lock in the table so that it can be freed in __kmp_cleanup,
3712     // and the debugger has info on all allocated locks.
3713     index = __kmp_lock_table_insert(lck);
3714   } else {
3715     // Pick up lock from pool.
3716     lck = __kmp_lock_pool;
3717     index = __kmp_lock_pool->pool.index;
3718     __kmp_lock_pool = __kmp_lock_pool->pool.next;
3719   }
3720 
3721   // We could potentially differentiate between nested and regular locks
3722   // here, and do the lock table lookup for regular locks only.
3723   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3724     *((kmp_lock_index_t *)user_lock) = index;
3725   } else {
3726     *((kmp_user_lock_p *)user_lock) = lck;
3727   }
3728 
3729   // Mark the lock if it is a critical section lock.
3730   __kmp_set_user_lock_flags(lck, flags);
3731 
3732   __kmp_release_lock(&__kmp_global_lock, gtid); // AC: TODO move this line up
3733 
3734   return lck;
3735 }
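
// As with indirect locks, when omp_lock_t cannot hold a pointer the user lock
// word stores a table index instead; __kmp_lookup_user_lock() below resolves
// it through __kmp_user_lock_table.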
3736 
3737 // Return the lock's memory to the pool for reuse.
3738 void __kmp_user_lock_free(void **user_lock, kmp_int32 gtid,
3739                           kmp_user_lock_p lck) {
3740   KMP_DEBUG_ASSERT(user_lock != NULL);
3741   KMP_DEBUG_ASSERT(lck != NULL);
3742 
3743   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3744 
3745   lck->pool.next = __kmp_lock_pool;
3746   __kmp_lock_pool = lck;
3747   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3748     kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
3749     KMP_DEBUG_ASSERT(0 < index && index <= __kmp_user_lock_table.used);
3750     lck->pool.index = index;
3751   }
3752 
3753   __kmp_release_lock(&__kmp_global_lock, gtid);
3754 }
3755 
3756 kmp_user_lock_p __kmp_lookup_user_lock(void **user_lock, char const *func) {
3757   kmp_user_lock_p lck = NULL;
3758 
3759   if (__kmp_env_consistency_check) {
3760     if (user_lock == NULL) {
3761       KMP_FATAL(LockIsUninitialized, func);
3762     }
3763   }
3764 
3765   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3766     kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
3767     if (__kmp_env_consistency_check) {
3768       if (!(0 < index && index < __kmp_user_lock_table.used)) {
3769         KMP_FATAL(LockIsUninitialized, func);
3770       }
3771     }
3772     KMP_DEBUG_ASSERT(0 < index && index < __kmp_user_lock_table.used);
3773     KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0);
3774     lck = __kmp_user_lock_table.table[index];
3775   } else {
3776     lck = *((kmp_user_lock_p *)user_lock);
3777   }
3778 
3779   if (__kmp_env_consistency_check) {
3780     if (lck == NULL) {
3781       KMP_FATAL(LockIsUninitialized, func);
3782     }
3783   }
3784 
3785   return lck;
3786 }
3787 
3788 void __kmp_cleanup_user_locks(void) {
3789   // Reset lock pool. Don't worry about locks in the pool--we will free them when
3790   // iterating through the lock table (it includes all the locks, dead or alive).
3791   __kmp_lock_pool = NULL;
3792 
3793 #define IS_CRITICAL(lck)                                                       \
3794   ((__kmp_get_user_lock_flags_ != NULL) &&                                     \
3795    ((*__kmp_get_user_lock_flags_)(lck)&kmp_lf_critical_section))
3796 
3797   // Loop through the lock table and free all locks.
3798   // Do not free item [0]; it is reserved for the lock tables list.
3799   //
3800   // FIXME - we are iterating through a list of (pointers to) objects of type
3801   // union kmp_user_lock, but we have no way of knowing whether the base type is
3802   // currently "pool" or whatever the global user lock type is.
3803   //
3804   // We are relying on the fact that for all of the user lock types
3805   // (except "tas"), the first field in the lock struct is the "initialized"
3806   // field, which is set to the address of the lock object itself when
3807   // the lock is initialized.  When the union is of type "pool", the
3808   // first field is a pointer to the next object in the free list, which
3809   // will not be the same address as the object itself.
3810   //
3811   // This means that the check (*__kmp_is_user_lock_initialized_)(lck) will fail
3812   // for "pool" objects on the free list.  This must happen as the "location"
3813   // field of real user locks overlaps the "index" field of "pool" objects.
3814   //
3815   // It would be better to run through the free list, and remove all "pool"
3816   // objects from the lock table before executing this loop.  However,
3817   // "pool" objects do not always have their index field set (only on
3818   // lin_32e), and I don't want to search the lock table for the address
3819   // of every "pool" object on the free list.
3820   while (__kmp_user_lock_table.used > 1) {
3821     const ident_t *loc;
3822 
3823     // Reduce __kmp_user_lock_table.used before freeing the lock,
3824     // so that the state of the locks stays consistent.
3825     kmp_user_lock_p lck =
3826         __kmp_user_lock_table.table[--__kmp_user_lock_table.used];
3827 
3828     if ((__kmp_is_user_lock_initialized_ != NULL) &&
3829         (*__kmp_is_user_lock_initialized_)(lck)) {
3830       // Issue a warning if: KMP_CONSISTENCY_CHECK AND lock is initialized AND
3831       // it is NOT a critical section (user is not responsible for destroying
3832       // criticals) AND we know the source location to report.
3833       if (__kmp_env_consistency_check && (!IS_CRITICAL(lck)) &&
3834           ((loc = __kmp_get_user_lock_location(lck)) != NULL) &&
3835           (loc->psource != NULL)) {
3836         kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 0);
3837         KMP_WARNING(CnsLockNotDestroyed, str_loc.file, str_loc.line);
3838         __kmp_str_loc_free(&str_loc);
3839       }
3840 
3841 #ifdef KMP_DEBUG
3842       if (IS_CRITICAL(lck)) {
3843         KA_TRACE(
3844             20,
3845             ("__kmp_cleanup_user_locks: free critical section lock %p (%p)\n",
3846              lck, *(void **)lck));
3847       } else {
3848         KA_TRACE(20, ("__kmp_cleanup_user_locks: free lock %p (%p)\n", lck,
3849                       *(void **)lck));
3850       }
3851 #endif // KMP_DEBUG
3852 
3853       // Cleanup internal lock dynamic resources (for drdpa locks particularly).
3854       __kmp_destroy_user_lock(lck);
3855     }
3856 
3857     // Free the lock if block allocation of locks is not used.
3858     if (__kmp_lock_blocks == NULL) {
3859       __kmp_free(lck);
3860     }
3861   }
3862 
3863 #undef IS_CRITICAL
3864 
3865   // delete lock table(s).
3866   kmp_user_lock_p *table_ptr = __kmp_user_lock_table.table;
3867   __kmp_user_lock_table.table = NULL;
3868   __kmp_user_lock_table.allocated = 0;
3869 
3870   while (table_ptr != NULL) {
3871     // In the first element we saved the pointer to the previous
3872     // (smaller) lock table.
3873     kmp_user_lock_p *next = (kmp_user_lock_p *)(table_ptr[0]);
3874     __kmp_free(table_ptr);
3875     table_ptr = next;
3876   }
3877 
3878   // Free buffers allocated for blocks of locks.
3879   kmp_block_of_locks_t *block_ptr = __kmp_lock_blocks;
3880   __kmp_lock_blocks = NULL;
3881 
3882   while (block_ptr != NULL) {
3883     kmp_block_of_locks_t *next = block_ptr->next_block;
3884     __kmp_free(block_ptr->locks);
3885     // *block_ptr itself was allocated at the end of the locks vector.
3886     block_ptr = next;
3887   }
3888 
3889   TCW_4(__kmp_init_user_locks, FALSE);
3890 }
3891 
3892 #endif // KMP_USE_DYNAMIC_LOCK
3893