1 /*
2  * kmp_lock.cpp -- lock-related functions
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include <stddef.h>
17 #include <atomic>
18 
19 #include "kmp.h"
20 #include "kmp_i18n.h"
21 #include "kmp_io.h"
22 #include "kmp_itt.h"
23 #include "kmp_lock.h"
24 
25 #include "tsan_annotations.h"
26 
27 #if KMP_USE_FUTEX
28 #include <sys/syscall.h>
29 #include <unistd.h>
// We should really include <futex.h>, but that causes compatibility problems
// on different Linux* OS distributions that either require that you include
// (or break when you try to include) <pci/types.h>. Since all we need are the
// two macros below (which are part of the kernel ABI, so they can't change),
// we just define the constants here rather than including <futex.h>.
35 #ifndef FUTEX_WAIT
36 #define FUTEX_WAIT 0
37 #endif
38 #ifndef FUTEX_WAKE
39 #define FUTEX_WAKE 1
40 #endif
41 #endif
42 
43 /* Implement spin locks for internal library use.             */
44 /* The algorithm implemented is Lamport's bakery lock [1974]. */
45 
46 void __kmp_validate_locks(void) {
47   int i;
48   kmp_uint32 x, y;
49 
  /* Check to make sure unsigned arithmetic wraps properly */
51   x = ~((kmp_uint32)0) - 2;
52   y = x - 2;
53 
54   for (i = 0; i < 8; ++i, ++x, ++y) {
55     kmp_uint32 z = (x - y);
56     KMP_ASSERT(z == 2);
57   }
58 
59   KMP_ASSERT(offsetof(kmp_base_queuing_lock, tail_id) % 8 == 0);
60 }
61 
62 /* ------------------------------------------------------------------------ */
63 /* test and set locks */
64 
// For the non-nested locks, we can only assume that the first 4 bytes were
// allocated, since gcc only allocates 4 bytes for omp_lock_t, and the Intel
// compiler only allocates a 4-byte pointer on the IA-32 architecture. On
// Windows* OS on Intel(R) 64, we can assume that all 8 bytes were allocated.
69 //
70 // gcc reserves >= 8 bytes for nested locks, so we can assume that the
71 // entire 8 bytes were allocated for nested locks on all 64-bit platforms.
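//
// This is also why the consistency checks in this section only inspect
// lk.depth_locked when sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE: for a
// non-nested lock that does not fit in the user-allocated object, that field
// may lie outside the allocated storage and is not safe to read.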
72 
73 static kmp_int32 __kmp_get_tas_lock_owner(kmp_tas_lock_t *lck) {
74   return KMP_LOCK_STRIP(TCR_4(lck->lk.poll)) - 1;
75 }
76 
77 static inline bool __kmp_is_tas_lock_nestable(kmp_tas_lock_t *lck) {
78   return lck->lk.depth_locked != -1;
79 }
80 
81 __forceinline static int
82 __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) {
83   KMP_MB();
84 
85 #ifdef USE_LOCK_PROFILE
86   kmp_uint32 curr = KMP_LOCK_STRIP(TCR_4(lck->lk.poll));
87   if ((curr != 0) && (curr != gtid + 1))
88     __kmp_printf("LOCK CONTENTION: %p\n", lck);
89 /* else __kmp_printf( "." );*/
90 #endif /* USE_LOCK_PROFILE */
91 
92   if ((lck->lk.poll == KMP_LOCK_FREE(tas)) &&
93       KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
94                                   KMP_LOCK_BUSY(gtid + 1, tas))) {
95     KMP_FSYNC_ACQUIRED(lck);
96     return KMP_LOCK_ACQUIRED_FIRST;
97   }
98 
99   kmp_uint32 spins;
100   KMP_FSYNC_PREPARE(lck);
101   KMP_INIT_YIELD(spins);
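  // If the system is oversubscribed (more runtime threads than available
  // processors), give up the processor outright; otherwise spin-yield first.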
102   if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
103     KMP_YIELD(TRUE);
104   } else {
105     KMP_YIELD_SPIN(spins);
106   }
107 
108   kmp_backoff_t backoff = __kmp_spin_backoff_params;
109   while ((lck->lk.poll != KMP_LOCK_FREE(tas)) ||
110          (!KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
111                                        KMP_LOCK_BUSY(gtid + 1, tas)))) {
112 
113     __kmp_spin_backoff(&backoff);
114     if (TCR_4(__kmp_nth) >
115         (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
116       KMP_YIELD(TRUE);
117     } else {
118       KMP_YIELD_SPIN(spins);
119     }
120   }
121   KMP_FSYNC_ACQUIRED(lck);
122   return KMP_LOCK_ACQUIRED_FIRST;
123 }
124 
125 int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
126   int retval = __kmp_acquire_tas_lock_timed_template(lck, gtid);
127   ANNOTATE_TAS_ACQUIRED(lck);
128   return retval;
129 }
130 
131 static int __kmp_acquire_tas_lock_with_checks(kmp_tas_lock_t *lck,
132                                               kmp_int32 gtid) {
133   char const *const func = "omp_set_lock";
134   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
135       __kmp_is_tas_lock_nestable(lck)) {
136     KMP_FATAL(LockNestableUsedAsSimple, func);
137   }
138   if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) == gtid)) {
139     KMP_FATAL(LockIsAlreadyOwned, func);
140   }
141   return __kmp_acquire_tas_lock(lck, gtid);
142 }
143 
144 int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
145   if ((lck->lk.poll == KMP_LOCK_FREE(tas)) &&
146       KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
147                                   KMP_LOCK_BUSY(gtid + 1, tas))) {
148     KMP_FSYNC_ACQUIRED(lck);
149     return TRUE;
150   }
151   return FALSE;
152 }
153 
154 static int __kmp_test_tas_lock_with_checks(kmp_tas_lock_t *lck,
155                                            kmp_int32 gtid) {
156   char const *const func = "omp_test_lock";
157   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
158       __kmp_is_tas_lock_nestable(lck)) {
159     KMP_FATAL(LockNestableUsedAsSimple, func);
160   }
161   return __kmp_test_tas_lock(lck, gtid);
162 }
163 
164 int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
165   KMP_MB(); /* Flush all pending memory write invalidates.  */
166 
167   KMP_FSYNC_RELEASING(lck);
168   ANNOTATE_TAS_RELEASED(lck);
169   KMP_ST_REL32(&(lck->lk.poll), KMP_LOCK_FREE(tas));
170   KMP_MB(); /* Flush all pending memory write invalidates.  */
171 
172   KMP_YIELD(TCR_4(__kmp_nth) >
173             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
174   return KMP_LOCK_RELEASED;
175 }
176 
177 static int __kmp_release_tas_lock_with_checks(kmp_tas_lock_t *lck,
178                                               kmp_int32 gtid) {
179   char const *const func = "omp_unset_lock";
180   KMP_MB(); /* in case another processor initialized lock */
181   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
182       __kmp_is_tas_lock_nestable(lck)) {
183     KMP_FATAL(LockNestableUsedAsSimple, func);
184   }
185   if (__kmp_get_tas_lock_owner(lck) == -1) {
186     KMP_FATAL(LockUnsettingFree, func);
187   }
188   if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) >= 0) &&
189       (__kmp_get_tas_lock_owner(lck) != gtid)) {
190     KMP_FATAL(LockUnsettingSetByAnother, func);
191   }
192   return __kmp_release_tas_lock(lck, gtid);
193 }
194 
195 void __kmp_init_tas_lock(kmp_tas_lock_t *lck) {
196   TCW_4(lck->lk.poll, KMP_LOCK_FREE(tas));
197 }
198 
199 static void __kmp_init_tas_lock_with_checks(kmp_tas_lock_t *lck) {
200   __kmp_init_tas_lock(lck);
201 }
202 
203 void __kmp_destroy_tas_lock(kmp_tas_lock_t *lck) { lck->lk.poll = 0; }
204 
205 static void __kmp_destroy_tas_lock_with_checks(kmp_tas_lock_t *lck) {
206   char const *const func = "omp_destroy_lock";
207   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
208       __kmp_is_tas_lock_nestable(lck)) {
209     KMP_FATAL(LockNestableUsedAsSimple, func);
210   }
211   if (__kmp_get_tas_lock_owner(lck) != -1) {
212     KMP_FATAL(LockStillOwned, func);
213   }
214   __kmp_destroy_tas_lock(lck);
215 }
216 
217 // nested test and set locks
218 
219 int __kmp_acquire_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
220   KMP_DEBUG_ASSERT(gtid >= 0);
221 
222   if (__kmp_get_tas_lock_owner(lck) == gtid) {
223     lck->lk.depth_locked += 1;
224     return KMP_LOCK_ACQUIRED_NEXT;
225   } else {
226     __kmp_acquire_tas_lock_timed_template(lck, gtid);
227     ANNOTATE_TAS_ACQUIRED(lck);
228     lck->lk.depth_locked = 1;
229     return KMP_LOCK_ACQUIRED_FIRST;
230   }
231 }
232 
233 static int __kmp_acquire_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
234                                                      kmp_int32 gtid) {
235   char const *const func = "omp_set_nest_lock";
236   if (!__kmp_is_tas_lock_nestable(lck)) {
237     KMP_FATAL(LockSimpleUsedAsNestable, func);
238   }
239   return __kmp_acquire_nested_tas_lock(lck, gtid);
240 }
241 
242 int __kmp_test_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
243   int retval;
244 
245   KMP_DEBUG_ASSERT(gtid >= 0);
246 
247   if (__kmp_get_tas_lock_owner(lck) == gtid) {
248     retval = ++lck->lk.depth_locked;
249   } else if (!__kmp_test_tas_lock(lck, gtid)) {
250     retval = 0;
251   } else {
252     KMP_MB();
253     retval = lck->lk.depth_locked = 1;
254   }
255   return retval;
256 }
257 
258 static int __kmp_test_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
259                                                   kmp_int32 gtid) {
260   char const *const func = "omp_test_nest_lock";
261   if (!__kmp_is_tas_lock_nestable(lck)) {
262     KMP_FATAL(LockSimpleUsedAsNestable, func);
263   }
264   return __kmp_test_nested_tas_lock(lck, gtid);
265 }
266 
267 int __kmp_release_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
268   KMP_DEBUG_ASSERT(gtid >= 0);
269 
270   KMP_MB();
271   if (--(lck->lk.depth_locked) == 0) {
272     __kmp_release_tas_lock(lck, gtid);
273     return KMP_LOCK_RELEASED;
274   }
275   return KMP_LOCK_STILL_HELD;
276 }
277 
278 static int __kmp_release_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
279                                                      kmp_int32 gtid) {
280   char const *const func = "omp_unset_nest_lock";
281   KMP_MB(); /* in case another processor initialized lock */
282   if (!__kmp_is_tas_lock_nestable(lck)) {
283     KMP_FATAL(LockSimpleUsedAsNestable, func);
284   }
285   if (__kmp_get_tas_lock_owner(lck) == -1) {
286     KMP_FATAL(LockUnsettingFree, func);
287   }
288   if (__kmp_get_tas_lock_owner(lck) != gtid) {
289     KMP_FATAL(LockUnsettingSetByAnother, func);
290   }
291   return __kmp_release_nested_tas_lock(lck, gtid);
292 }
293 
294 void __kmp_init_nested_tas_lock(kmp_tas_lock_t *lck) {
295   __kmp_init_tas_lock(lck);
296   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
297 }
298 
299 static void __kmp_init_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
300   __kmp_init_nested_tas_lock(lck);
301 }
302 
303 void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck) {
304   __kmp_destroy_tas_lock(lck);
305   lck->lk.depth_locked = 0;
306 }
307 
308 static void __kmp_destroy_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
309   char const *const func = "omp_destroy_nest_lock";
310   if (!__kmp_is_tas_lock_nestable(lck)) {
311     KMP_FATAL(LockSimpleUsedAsNestable, func);
312   }
313   if (__kmp_get_tas_lock_owner(lck) != -1) {
314     KMP_FATAL(LockStillOwned, func);
315   }
316   __kmp_destroy_nested_tas_lock(lck);
317 }
318 
319 #if KMP_USE_FUTEX
320 
321 /* ------------------------------------------------------------------------ */
322 /* futex locks */
323 
324 // futex locks are really just test and set locks, with a different method
325 // of handling contention.  They take the same amount of space as test and
326 // set locks, and are allocated the same way (i.e. use the area allocated by
327 // the compiler for non-nested locks / allocate nested locks on the heap).
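//
// Illustrative summary of the poll-word encoding used below (restating what
// the code in this section does, for documentation only):
//   - lk.poll == KMP_LOCK_FREE(futex): the lock is free.
//   - lk.poll == KMP_LOCK_BUSY((gtid + 1) << 1, futex): held by thread gtid
//     with no known waiters.
//   - If the low bit of the gtid code is also set (the acquire path ORs in
//     KMP_LOCK_BUSY(1, futex)), one or more waiters may be parked in
//     futex_wait, so the release path must issue a futex_wake.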
328 
329 static kmp_int32 __kmp_get_futex_lock_owner(kmp_futex_lock_t *lck) {
330   return KMP_LOCK_STRIP((TCR_4(lck->lk.poll) >> 1)) - 1;
331 }
332 
333 static inline bool __kmp_is_futex_lock_nestable(kmp_futex_lock_t *lck) {
334   return lck->lk.depth_locked != -1;
335 }
336 
337 __forceinline static int
338 __kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) {
339   kmp_int32 gtid_code = (gtid + 1) << 1;
340 
341   KMP_MB();
342 
343 #ifdef USE_LOCK_PROFILE
344   kmp_uint32 curr = KMP_LOCK_STRIP(TCR_4(lck->lk.poll));
345   if ((curr != 0) && (curr != gtid_code))
346     __kmp_printf("LOCK CONTENTION: %p\n", lck);
347 /* else __kmp_printf( "." );*/
348 #endif /* USE_LOCK_PROFILE */
349 
350   KMP_FSYNC_PREPARE(lck);
351   KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d entering\n",
352                   lck, lck->lk.poll, gtid));
353 
354   kmp_int32 poll_val;
355 
356   while ((poll_val = KMP_COMPARE_AND_STORE_RET32(
357               &(lck->lk.poll), KMP_LOCK_FREE(futex),
358               KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {
359 
360     kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;
361     KA_TRACE(
362         1000,
363         ("__kmp_acquire_futex_lock: lck:%p, T#%d poll_val = 0x%x cond = 0x%x\n",
364          lck, gtid, poll_val, cond));
365 
    // NOTE: do not try to write the condition for this branch as
    //
    //   if ( poll_val & 1 == 0 )
    //
    // Because == binds more tightly than &, that expression parses as
    // poll_val & (1 == 0), so the following block would always be skipped,
    // regardless of the value of the LSB of poll_val.
372     if (!cond) {
373       // Try to set the lsb in the poll to indicate to the owner
374       // thread that they need to wake this thread up.
375       if (!KMP_COMPARE_AND_STORE_REL32(&(lck->lk.poll), poll_val,
376                                        poll_val | KMP_LOCK_BUSY(1, futex))) {
377         KA_TRACE(
378             1000,
379             ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d can't set bit 0\n",
380              lck, lck->lk.poll, gtid));
381         continue;
382       }
383       poll_val |= KMP_LOCK_BUSY(1, futex);
384 
385       KA_TRACE(1000,
386                ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d bit 0 set\n", lck,
387                 lck->lk.poll, gtid));
388     }
389 
390     KA_TRACE(
391         1000,
392         ("__kmp_acquire_futex_lock: lck:%p, T#%d before futex_wait(0x%x)\n",
393          lck, gtid, poll_val));
394 
395     kmp_int32 rc;
396     if ((rc = syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAIT, poll_val, NULL,
397                       NULL, 0)) != 0) {
398       KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) "
399                       "failed (rc=%d errno=%d)\n",
400                       lck, gtid, poll_val, rc, errno));
401       continue;
402     }
403 
404     KA_TRACE(1000,
405              ("__kmp_acquire_futex_lock: lck:%p, T#%d after futex_wait(0x%x)\n",
406               lck, gtid, poll_val));
    // This thread has now completed a successful futex wait call and has been
    // placed on the OS futex queue.  We must therefore perform a futex wake
    // call when releasing the lock, since we have no idea how many other
    // threads are in the queue.
410     gtid_code |= 1;
411   }
412 
413   KMP_FSYNC_ACQUIRED(lck);
414   KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
415                   lck->lk.poll, gtid));
416   return KMP_LOCK_ACQUIRED_FIRST;
417 }
418 
419 int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
420   int retval = __kmp_acquire_futex_lock_timed_template(lck, gtid);
421   ANNOTATE_FUTEX_ACQUIRED(lck);
422   return retval;
423 }
424 
425 static int __kmp_acquire_futex_lock_with_checks(kmp_futex_lock_t *lck,
426                                                 kmp_int32 gtid) {
427   char const *const func = "omp_set_lock";
428   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
429       __kmp_is_futex_lock_nestable(lck)) {
430     KMP_FATAL(LockNestableUsedAsSimple, func);
431   }
432   if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) == gtid)) {
433     KMP_FATAL(LockIsAlreadyOwned, func);
434   }
435   return __kmp_acquire_futex_lock(lck, gtid);
436 }
437 
438 int __kmp_test_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
439   if (KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(futex),
440                                   KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {
441     KMP_FSYNC_ACQUIRED(lck);
442     return TRUE;
443   }
444   return FALSE;
445 }
446 
447 static int __kmp_test_futex_lock_with_checks(kmp_futex_lock_t *lck,
448                                              kmp_int32 gtid) {
449   char const *const func = "omp_test_lock";
450   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
451       __kmp_is_futex_lock_nestable(lck)) {
452     KMP_FATAL(LockNestableUsedAsSimple, func);
453   }
454   return __kmp_test_futex_lock(lck, gtid);
455 }
456 
457 int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
458   KMP_MB(); /* Flush all pending memory write invalidates.  */
459 
460   KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d entering\n",
461                   lck, lck->lk.poll, gtid));
462 
463   KMP_FSYNC_RELEASING(lck);
464   ANNOTATE_FUTEX_RELEASED(lck);
465 
466   kmp_int32 poll_val = KMP_XCHG_FIXED32(&(lck->lk.poll), KMP_LOCK_FREE(futex));
467 
468   KA_TRACE(1000,
469            ("__kmp_release_futex_lock: lck:%p, T#%d released poll_val = 0x%x\n",
470             lck, gtid, poll_val));
471 
472   if (KMP_LOCK_STRIP(poll_val) & 1) {
473     KA_TRACE(1000,
474              ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n",
475               lck, gtid));
476     syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex),
477             NULL, NULL, 0);
478   }
479 
480   KMP_MB(); /* Flush all pending memory write invalidates.  */
481 
482   KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
483                   lck->lk.poll, gtid));
484 
485   KMP_YIELD(TCR_4(__kmp_nth) >
486             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
487   return KMP_LOCK_RELEASED;
488 }
489 
490 static int __kmp_release_futex_lock_with_checks(kmp_futex_lock_t *lck,
491                                                 kmp_int32 gtid) {
492   char const *const func = "omp_unset_lock";
493   KMP_MB(); /* in case another processor initialized lock */
494   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
495       __kmp_is_futex_lock_nestable(lck)) {
496     KMP_FATAL(LockNestableUsedAsSimple, func);
497   }
498   if (__kmp_get_futex_lock_owner(lck) == -1) {
499     KMP_FATAL(LockUnsettingFree, func);
500   }
501   if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) >= 0) &&
502       (__kmp_get_futex_lock_owner(lck) != gtid)) {
503     KMP_FATAL(LockUnsettingSetByAnother, func);
504   }
505   return __kmp_release_futex_lock(lck, gtid);
506 }
507 
508 void __kmp_init_futex_lock(kmp_futex_lock_t *lck) {
509   TCW_4(lck->lk.poll, KMP_LOCK_FREE(futex));
510 }
511 
512 static void __kmp_init_futex_lock_with_checks(kmp_futex_lock_t *lck) {
513   __kmp_init_futex_lock(lck);
514 }
515 
516 void __kmp_destroy_futex_lock(kmp_futex_lock_t *lck) { lck->lk.poll = 0; }
517 
518 static void __kmp_destroy_futex_lock_with_checks(kmp_futex_lock_t *lck) {
519   char const *const func = "omp_destroy_lock";
520   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
521       __kmp_is_futex_lock_nestable(lck)) {
522     KMP_FATAL(LockNestableUsedAsSimple, func);
523   }
524   if (__kmp_get_futex_lock_owner(lck) != -1) {
525     KMP_FATAL(LockStillOwned, func);
526   }
527   __kmp_destroy_futex_lock(lck);
528 }
529 
530 // nested futex locks
531 
532 int __kmp_acquire_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
533   KMP_DEBUG_ASSERT(gtid >= 0);
534 
535   if (__kmp_get_futex_lock_owner(lck) == gtid) {
536     lck->lk.depth_locked += 1;
537     return KMP_LOCK_ACQUIRED_NEXT;
538   } else {
539     __kmp_acquire_futex_lock_timed_template(lck, gtid);
540     ANNOTATE_FUTEX_ACQUIRED(lck);
541     lck->lk.depth_locked = 1;
542     return KMP_LOCK_ACQUIRED_FIRST;
543   }
544 }
545 
546 static int __kmp_acquire_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
547                                                        kmp_int32 gtid) {
548   char const *const func = "omp_set_nest_lock";
549   if (!__kmp_is_futex_lock_nestable(lck)) {
550     KMP_FATAL(LockSimpleUsedAsNestable, func);
551   }
552   return __kmp_acquire_nested_futex_lock(lck, gtid);
553 }
554 
555 int __kmp_test_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
556   int retval;
557 
558   KMP_DEBUG_ASSERT(gtid >= 0);
559 
560   if (__kmp_get_futex_lock_owner(lck) == gtid) {
561     retval = ++lck->lk.depth_locked;
562   } else if (!__kmp_test_futex_lock(lck, gtid)) {
563     retval = 0;
564   } else {
565     KMP_MB();
566     retval = lck->lk.depth_locked = 1;
567   }
568   return retval;
569 }
570 
571 static int __kmp_test_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
572                                                     kmp_int32 gtid) {
573   char const *const func = "omp_test_nest_lock";
574   if (!__kmp_is_futex_lock_nestable(lck)) {
575     KMP_FATAL(LockSimpleUsedAsNestable, func);
576   }
577   return __kmp_test_nested_futex_lock(lck, gtid);
578 }
579 
580 int __kmp_release_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
581   KMP_DEBUG_ASSERT(gtid >= 0);
582 
583   KMP_MB();
584   if (--(lck->lk.depth_locked) == 0) {
585     __kmp_release_futex_lock(lck, gtid);
586     return KMP_LOCK_RELEASED;
587   }
588   return KMP_LOCK_STILL_HELD;
589 }
590 
591 static int __kmp_release_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
592                                                        kmp_int32 gtid) {
593   char const *const func = "omp_unset_nest_lock";
594   KMP_MB(); /* in case another processor initialized lock */
595   if (!__kmp_is_futex_lock_nestable(lck)) {
596     KMP_FATAL(LockSimpleUsedAsNestable, func);
597   }
598   if (__kmp_get_futex_lock_owner(lck) == -1) {
599     KMP_FATAL(LockUnsettingFree, func);
600   }
601   if (__kmp_get_futex_lock_owner(lck) != gtid) {
602     KMP_FATAL(LockUnsettingSetByAnother, func);
603   }
604   return __kmp_release_nested_futex_lock(lck, gtid);
605 }
606 
607 void __kmp_init_nested_futex_lock(kmp_futex_lock_t *lck) {
608   __kmp_init_futex_lock(lck);
609   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
610 }
611 
612 static void __kmp_init_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
613   __kmp_init_nested_futex_lock(lck);
614 }
615 
616 void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck) {
617   __kmp_destroy_futex_lock(lck);
618   lck->lk.depth_locked = 0;
619 }
620 
621 static void __kmp_destroy_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
622   char const *const func = "omp_destroy_nest_lock";
623   if (!__kmp_is_futex_lock_nestable(lck)) {
624     KMP_FATAL(LockSimpleUsedAsNestable, func);
625   }
626   if (__kmp_get_futex_lock_owner(lck) != -1) {
627     KMP_FATAL(LockStillOwned, func);
628   }
629   __kmp_destroy_nested_futex_lock(lck);
630 }
631 
632 #endif // KMP_USE_FUTEX
633 
634 /* ------------------------------------------------------------------------ */
635 /* ticket (bakery) locks */
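
// The lock below is a classic ticket (bakery-style) lock: each acquirer
// atomically takes the next ticket and spins until now_serving reaches it;
// the releaser simply advances now_serving.  A minimal sketch of the idea in
// isolation (illustrative only; the real code below adds yielding, profiling
// and consistency checks):
//
//   my_ticket = atomic_fetch_add(&lk.next_ticket, 1);  // take a ticket
//   while (atomic_load(&lk.now_serving) != my_ticket)  // wait for my turn
//     ;                                                // (spin / yield)
//   /* ... critical section ... */
//   atomic_fetch_add(&lk.now_serving, 1);              // serve the next one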
636 
637 static kmp_int32 __kmp_get_ticket_lock_owner(kmp_ticket_lock_t *lck) {
638   return std::atomic_load_explicit(&lck->lk.owner_id,
639                                    std::memory_order_relaxed) -
640          1;
641 }
642 
643 static inline bool __kmp_is_ticket_lock_nestable(kmp_ticket_lock_t *lck) {
644   return std::atomic_load_explicit(&lck->lk.depth_locked,
645                                    std::memory_order_relaxed) != -1;
646 }
647 
648 static kmp_uint32 __kmp_bakery_check(void *now_serving, kmp_uint32 my_ticket) {
649   return std::atomic_load_explicit((std::atomic<unsigned> *)now_serving,
650                                    std::memory_order_acquire) == my_ticket;
651 }
652 
653 __forceinline static int
654 __kmp_acquire_ticket_lock_timed_template(kmp_ticket_lock_t *lck,
655                                          kmp_int32 gtid) {
656   kmp_uint32 my_ticket = std::atomic_fetch_add_explicit(
657       &lck->lk.next_ticket, 1U, std::memory_order_relaxed);
658 
659 #ifdef USE_LOCK_PROFILE
660   if (std::atomic_load_explicit(&lck->lk.now_serving,
661                                 std::memory_order_relaxed) != my_ticket)
662     __kmp_printf("LOCK CONTENTION: %p\n", lck);
663 /* else __kmp_printf( "." );*/
664 #endif /* USE_LOCK_PROFILE */
665 
666   if (std::atomic_load_explicit(&lck->lk.now_serving,
667                                 std::memory_order_acquire) == my_ticket) {
668     return KMP_LOCK_ACQUIRED_FIRST;
669   }
670   KMP_WAIT_YIELD_PTR(&lck->lk.now_serving, my_ticket, __kmp_bakery_check, lck);
671   return KMP_LOCK_ACQUIRED_FIRST;
672 }
673 
674 int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
675   int retval = __kmp_acquire_ticket_lock_timed_template(lck, gtid);
676   ANNOTATE_TICKET_ACQUIRED(lck);
677   return retval;
678 }
679 
680 static int __kmp_acquire_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
681                                                  kmp_int32 gtid) {
682   char const *const func = "omp_set_lock";
683 
684   if (!std::atomic_load_explicit(&lck->lk.initialized,
685                                  std::memory_order_relaxed)) {
686     KMP_FATAL(LockIsUninitialized, func);
687   }
688   if (lck->lk.self != lck) {
689     KMP_FATAL(LockIsUninitialized, func);
690   }
691   if (__kmp_is_ticket_lock_nestable(lck)) {
692     KMP_FATAL(LockNestableUsedAsSimple, func);
693   }
694   if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) == gtid)) {
695     KMP_FATAL(LockIsAlreadyOwned, func);
696   }
697 
698   __kmp_acquire_ticket_lock(lck, gtid);
699 
700   std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
701                              std::memory_order_relaxed);
702   return KMP_LOCK_ACQUIRED_FIRST;
703 }
704 
705 int __kmp_test_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
706   kmp_uint32 my_ticket = std::atomic_load_explicit(&lck->lk.next_ticket,
707                                                    std::memory_order_relaxed);
708 
709   if (std::atomic_load_explicit(&lck->lk.now_serving,
710                                 std::memory_order_relaxed) == my_ticket) {
711     kmp_uint32 next_ticket = my_ticket + 1;
712     if (std::atomic_compare_exchange_strong_explicit(
713             &lck->lk.next_ticket, &my_ticket, next_ticket,
714             std::memory_order_acquire, std::memory_order_acquire)) {
715       return TRUE;
716     }
717   }
718   return FALSE;
719 }
720 
721 static int __kmp_test_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
722                                               kmp_int32 gtid) {
723   char const *const func = "omp_test_lock";
724 
725   if (!std::atomic_load_explicit(&lck->lk.initialized,
726                                  std::memory_order_relaxed)) {
727     KMP_FATAL(LockIsUninitialized, func);
728   }
729   if (lck->lk.self != lck) {
730     KMP_FATAL(LockIsUninitialized, func);
731   }
732   if (__kmp_is_ticket_lock_nestable(lck)) {
733     KMP_FATAL(LockNestableUsedAsSimple, func);
734   }
735 
736   int retval = __kmp_test_ticket_lock(lck, gtid);
737 
738   if (retval) {
739     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
740                                std::memory_order_relaxed);
741   }
742   return retval;
743 }
744 
745 int __kmp_release_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
746   kmp_uint32 distance = std::atomic_load_explicit(&lck->lk.next_ticket,
747                                                   std::memory_order_relaxed) -
748                         std::atomic_load_explicit(&lck->lk.now_serving,
749                                                   std::memory_order_relaxed);
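  // 'distance' counts the tickets handed out but not yet served, i.e. the
  // current holder plus any queued waiters; yield below if this exceeds the
  // number of available processors.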
750 
751   ANNOTATE_TICKET_RELEASED(lck);
752   std::atomic_fetch_add_explicit(&lck->lk.now_serving, 1U,
753                                  std::memory_order_release);
754 
755   KMP_YIELD(distance >
756             (kmp_uint32)(__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
757   return KMP_LOCK_RELEASED;
758 }
759 
760 static int __kmp_release_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
761                                                  kmp_int32 gtid) {
762   char const *const func = "omp_unset_lock";
763 
764   if (!std::atomic_load_explicit(&lck->lk.initialized,
765                                  std::memory_order_relaxed)) {
766     KMP_FATAL(LockIsUninitialized, func);
767   }
768   if (lck->lk.self != lck) {
769     KMP_FATAL(LockIsUninitialized, func);
770   }
771   if (__kmp_is_ticket_lock_nestable(lck)) {
772     KMP_FATAL(LockNestableUsedAsSimple, func);
773   }
774   if (__kmp_get_ticket_lock_owner(lck) == -1) {
775     KMP_FATAL(LockUnsettingFree, func);
776   }
777   if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) >= 0) &&
778       (__kmp_get_ticket_lock_owner(lck) != gtid)) {
779     KMP_FATAL(LockUnsettingSetByAnother, func);
780   }
781   std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
782   return __kmp_release_ticket_lock(lck, gtid);
783 }
784 
785 void __kmp_init_ticket_lock(kmp_ticket_lock_t *lck) {
786   lck->lk.location = NULL;
787   lck->lk.self = lck;
788   std::atomic_store_explicit(&lck->lk.next_ticket, 0U,
789                              std::memory_order_relaxed);
790   std::atomic_store_explicit(&lck->lk.now_serving, 0U,
791                              std::memory_order_relaxed);
792   std::atomic_store_explicit(
793       &lck->lk.owner_id, 0,
794       std::memory_order_relaxed); // no thread owns the lock.
795   std::atomic_store_explicit(
796       &lck->lk.depth_locked, -1,
797       std::memory_order_relaxed); // -1 => not a nested lock.
798   std::atomic_store_explicit(&lck->lk.initialized, true,
799                              std::memory_order_release);
800 }
801 
802 static void __kmp_init_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
803   __kmp_init_ticket_lock(lck);
804 }
805 
806 void __kmp_destroy_ticket_lock(kmp_ticket_lock_t *lck) {
807   std::atomic_store_explicit(&lck->lk.initialized, false,
808                              std::memory_order_release);
809   lck->lk.self = NULL;
810   lck->lk.location = NULL;
811   std::atomic_store_explicit(&lck->lk.next_ticket, 0U,
812                              std::memory_order_relaxed);
813   std::atomic_store_explicit(&lck->lk.now_serving, 0U,
814                              std::memory_order_relaxed);
815   std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
816   std::atomic_store_explicit(&lck->lk.depth_locked, -1,
817                              std::memory_order_relaxed);
818 }
819 
820 static void __kmp_destroy_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
821   char const *const func = "omp_destroy_lock";
822 
823   if (!std::atomic_load_explicit(&lck->lk.initialized,
824                                  std::memory_order_relaxed)) {
825     KMP_FATAL(LockIsUninitialized, func);
826   }
827   if (lck->lk.self != lck) {
828     KMP_FATAL(LockIsUninitialized, func);
829   }
830   if (__kmp_is_ticket_lock_nestable(lck)) {
831     KMP_FATAL(LockNestableUsedAsSimple, func);
832   }
833   if (__kmp_get_ticket_lock_owner(lck) != -1) {
834     KMP_FATAL(LockStillOwned, func);
835   }
836   __kmp_destroy_ticket_lock(lck);
837 }
838 
839 // nested ticket locks
840 
841 int __kmp_acquire_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
842   KMP_DEBUG_ASSERT(gtid >= 0);
843 
844   if (__kmp_get_ticket_lock_owner(lck) == gtid) {
845     std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1,
846                                    std::memory_order_relaxed);
847     return KMP_LOCK_ACQUIRED_NEXT;
848   } else {
849     __kmp_acquire_ticket_lock_timed_template(lck, gtid);
850     ANNOTATE_TICKET_ACQUIRED(lck);
851     std::atomic_store_explicit(&lck->lk.depth_locked, 1,
852                                std::memory_order_relaxed);
853     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
854                                std::memory_order_relaxed);
855     return KMP_LOCK_ACQUIRED_FIRST;
856   }
857 }
858 
859 static int __kmp_acquire_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
860                                                         kmp_int32 gtid) {
861   char const *const func = "omp_set_nest_lock";
862 
863   if (!std::atomic_load_explicit(&lck->lk.initialized,
864                                  std::memory_order_relaxed)) {
865     KMP_FATAL(LockIsUninitialized, func);
866   }
867   if (lck->lk.self != lck) {
868     KMP_FATAL(LockIsUninitialized, func);
869   }
870   if (!__kmp_is_ticket_lock_nestable(lck)) {
871     KMP_FATAL(LockSimpleUsedAsNestable, func);
872   }
873   return __kmp_acquire_nested_ticket_lock(lck, gtid);
874 }
875 
876 int __kmp_test_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
877   int retval;
878 
879   KMP_DEBUG_ASSERT(gtid >= 0);
880 
881   if (__kmp_get_ticket_lock_owner(lck) == gtid) {
882     retval = std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1,
883                                             std::memory_order_relaxed) +
884              1;
885   } else if (!__kmp_test_ticket_lock(lck, gtid)) {
886     retval = 0;
887   } else {
888     std::atomic_store_explicit(&lck->lk.depth_locked, 1,
889                                std::memory_order_relaxed);
890     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
891                                std::memory_order_relaxed);
892     retval = 1;
893   }
894   return retval;
895 }
896 
897 static int __kmp_test_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
898                                                      kmp_int32 gtid) {
899   char const *const func = "omp_test_nest_lock";
900 
901   if (!std::atomic_load_explicit(&lck->lk.initialized,
902                                  std::memory_order_relaxed)) {
903     KMP_FATAL(LockIsUninitialized, func);
904   }
905   if (lck->lk.self != lck) {
906     KMP_FATAL(LockIsUninitialized, func);
907   }
908   if (!__kmp_is_ticket_lock_nestable(lck)) {
909     KMP_FATAL(LockSimpleUsedAsNestable, func);
910   }
911   return __kmp_test_nested_ticket_lock(lck, gtid);
912 }
913 
914 int __kmp_release_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
915   KMP_DEBUG_ASSERT(gtid >= 0);
916 
917   if ((std::atomic_fetch_add_explicit(&lck->lk.depth_locked, -1,
918                                       std::memory_order_relaxed) -
919        1) == 0) {
920     std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
921     __kmp_release_ticket_lock(lck, gtid);
922     return KMP_LOCK_RELEASED;
923   }
924   return KMP_LOCK_STILL_HELD;
925 }
926 
927 static int __kmp_release_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
928                                                         kmp_int32 gtid) {
929   char const *const func = "omp_unset_nest_lock";
930 
931   if (!std::atomic_load_explicit(&lck->lk.initialized,
932                                  std::memory_order_relaxed)) {
933     KMP_FATAL(LockIsUninitialized, func);
934   }
935   if (lck->lk.self != lck) {
936     KMP_FATAL(LockIsUninitialized, func);
937   }
938   if (!__kmp_is_ticket_lock_nestable(lck)) {
939     KMP_FATAL(LockSimpleUsedAsNestable, func);
940   }
941   if (__kmp_get_ticket_lock_owner(lck) == -1) {
942     KMP_FATAL(LockUnsettingFree, func);
943   }
944   if (__kmp_get_ticket_lock_owner(lck) != gtid) {
945     KMP_FATAL(LockUnsettingSetByAnother, func);
946   }
947   return __kmp_release_nested_ticket_lock(lck, gtid);
948 }
949 
950 void __kmp_init_nested_ticket_lock(kmp_ticket_lock_t *lck) {
951   __kmp_init_ticket_lock(lck);
952   std::atomic_store_explicit(&lck->lk.depth_locked, 0,
953                              std::memory_order_relaxed); // >= 0 for nestable
954                                                          // locks, -1 for simple
955                                                          // locks
956 }
957 
958 static void __kmp_init_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
959   __kmp_init_nested_ticket_lock(lck);
960 }
961 
962 void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck) {
963   __kmp_destroy_ticket_lock(lck);
964   std::atomic_store_explicit(&lck->lk.depth_locked, 0,
965                              std::memory_order_relaxed);
966 }
967 
968 static void
969 __kmp_destroy_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
970   char const *const func = "omp_destroy_nest_lock";
971 
972   if (!std::atomic_load_explicit(&lck->lk.initialized,
973                                  std::memory_order_relaxed)) {
974     KMP_FATAL(LockIsUninitialized, func);
975   }
976   if (lck->lk.self != lck) {
977     KMP_FATAL(LockIsUninitialized, func);
978   }
979   if (!__kmp_is_ticket_lock_nestable(lck)) {
980     KMP_FATAL(LockSimpleUsedAsNestable, func);
981   }
982   if (__kmp_get_ticket_lock_owner(lck) != -1) {
983     KMP_FATAL(LockStillOwned, func);
984   }
985   __kmp_destroy_nested_ticket_lock(lck);
986 }
987 
// Access functions for fields that don't exist in all lock kinds.
989 
990 static int __kmp_is_ticket_lock_initialized(kmp_ticket_lock_t *lck) {
991   return std::atomic_load_explicit(&lck->lk.initialized,
992                                    std::memory_order_relaxed) &&
993          (lck->lk.self == lck);
994 }
995 
996 static const ident_t *__kmp_get_ticket_lock_location(kmp_ticket_lock_t *lck) {
997   return lck->lk.location;
998 }
999 
1000 static void __kmp_set_ticket_lock_location(kmp_ticket_lock_t *lck,
1001                                            const ident_t *loc) {
1002   lck->lk.location = loc;
1003 }
1004 
1005 static kmp_lock_flags_t __kmp_get_ticket_lock_flags(kmp_ticket_lock_t *lck) {
1006   return lck->lk.flags;
1007 }
1008 
1009 static void __kmp_set_ticket_lock_flags(kmp_ticket_lock_t *lck,
1010                                         kmp_lock_flags_t flags) {
1011   lck->lk.flags = flags;
1012 }
1013 
1014 /* ------------------------------------------------------------------------ */
1015 /* queuing locks */
1016 
1017 /* First the states
1018    (head,tail) =              0, 0  means lock is unheld, nobody on queue
1019                  UINT_MAX or -1, 0  means lock is held, nobody on queue
1020                               h, h  means lock held or about to transition,
1021                                     1 element on queue
1022                               h, t  h <> t, means lock is held or about to
1023                                     transition, >1 elements on queue
1024 
1025    Now the transitions
1026       Acquire(0,0)  = -1 ,0
1027       Release(0,0)  = Error
1028       Acquire(-1,0) =  h ,h    h > 0
1029       Release(-1,0) =  0 ,0
1030       Acquire(h,h)  =  h ,t    h > 0, t > 0, h <> t
1031       Release(h,h)  = -1 ,0    h > 0
1032       Acquire(h,t)  =  h ,t'   h > 0, t > 0, t' > 0, h <> t, h <> t', t <> t'
1033       Release(h,t)  =  h',t    h > 0, t > 0, h <> t, h <> h', h' maybe = t
1034 
1035    And pictorially
1036 
1037            +-----+
1038            | 0, 0|------- release -------> Error
1039            +-----+
1040              |  ^
1041       acquire|  |release
1042              |  |
1043              |  |
1044              v  |
1045            +-----+
1046            |-1, 0|
1047            +-----+
1048              |  ^
1049       acquire|  |release
1050              |  |
1051              |  |
1052              v  |
1053            +-----+
1054            | h, h|
1055            +-----+
1056              |  ^
1057       acquire|  |release
1058              |  |
1059              |  |
1060              v  |
1061            +-----+
1062            | h, t|----- acquire, release loopback ---+
1063            +-----+                                   |
1064                 ^                                    |
1065                 |                                    |
1066                 +------------------------------------+
1067  */
1068 
1069 #ifdef DEBUG_QUEUING_LOCKS
1070 
1071 /* Stuff for circular trace buffer */
1072 #define TRACE_BUF_ELE 1024
1073 static char traces[TRACE_BUF_ELE][128] = {0};
1074 static int tc = 0;
1075 #define TRACE_LOCK(X, Y)                                                       \
1076   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s\n", X, Y);
1077 #define TRACE_LOCK_T(X, Y, Z)                                                  \
1078   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s%d\n", X, Y, Z);
1079 #define TRACE_LOCK_HT(X, Y, Z, Q)                                              \
1080   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s %d,%d\n", X, Y,   \
1081                Z, Q);
1082 
1083 static void __kmp_dump_queuing_lock(kmp_info_t *this_thr, kmp_int32 gtid,
1084                                     kmp_queuing_lock_t *lck, kmp_int32 head_id,
1085                                     kmp_int32 tail_id) {
1086   kmp_int32 t, i;
1087 
1088   __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: TRACE BEGINS HERE! \n");
1089 
1090   i = tc % TRACE_BUF_ELE;
1091   __kmp_printf_no_lock("%s\n", traces[i]);
1092   i = (i + 1) % TRACE_BUF_ELE;
1093   while (i != (tc % TRACE_BUF_ELE)) {
1094     __kmp_printf_no_lock("%s", traces[i]);
1095     i = (i + 1) % TRACE_BUF_ELE;
1096   }
1097   __kmp_printf_no_lock("\n");
1098 
1099   __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: gtid+1:%d, spin_here:%d, "
1100                        "next_wait:%d, head_id:%d, tail_id:%d\n",
1101                        gtid + 1, this_thr->th.th_spin_here,
1102                        this_thr->th.th_next_waiting, head_id, tail_id);
1103 
1104   __kmp_printf_no_lock("\t\thead: %d ", lck->lk.head_id);
1105 
1106   if (lck->lk.head_id >= 1) {
1107     t = __kmp_threads[lck->lk.head_id - 1]->th.th_next_waiting;
1108     while (t > 0) {
1109       __kmp_printf_no_lock("-> %d ", t);
1110       t = __kmp_threads[t - 1]->th.th_next_waiting;
1111     }
1112   }
1113   __kmp_printf_no_lock(";  tail: %d ", lck->lk.tail_id);
1114   __kmp_printf_no_lock("\n\n");
1115 }
1116 
1117 #endif /* DEBUG_QUEUING_LOCKS */
1118 
1119 static kmp_int32 __kmp_get_queuing_lock_owner(kmp_queuing_lock_t *lck) {
1120   return TCR_4(lck->lk.owner_id) - 1;
1121 }
1122 
1123 static inline bool __kmp_is_queuing_lock_nestable(kmp_queuing_lock_t *lck) {
1124   return lck->lk.depth_locked != -1;
1125 }
1126 
/* Acquire a lock using the queuing lock implementation */
1128 template <bool takeTime>
1129 /* [TLW] The unused template above is left behind because of what BEB believes
1130    is a potential compiler problem with __forceinline. */
1131 __forceinline static int
1132 __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck,
1133                                           kmp_int32 gtid) {
  kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid);
1135   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1136   volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
1137   volatile kmp_uint32 *spin_here_p;
1138   kmp_int32 need_mf = 1;
1139 
1140 #if OMPT_SUPPORT
1141   ompt_state_t prev_state = ompt_state_undefined;
1142 #endif
1143 
1144   KA_TRACE(1000,
1145            ("__kmp_acquire_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
1146 
1147   KMP_FSYNC_PREPARE(lck);
1148   KMP_DEBUG_ASSERT(this_thr != NULL);
1149   spin_here_p = &this_thr->th.th_spin_here;
1150 
1151 #ifdef DEBUG_QUEUING_LOCKS
1152   TRACE_LOCK(gtid + 1, "acq ent");
1153   if (*spin_here_p)
1154     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1155   if (this_thr->th.th_next_waiting != 0)
1156     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1157 #endif
1158   KMP_DEBUG_ASSERT(!*spin_here_p);
1159   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1160 
1161   /* The following st.rel to spin_here_p needs to precede the cmpxchg.acq to
1162      head_id_p that may follow, not just in execution order, but also in
1163      visibility order. This way, when a releasing thread observes the changes to
1164      the queue by this thread, it can rightly assume that spin_here_p has
1165      already been set to TRUE, so that when it sets spin_here_p to FALSE, it is
1166      not premature.  If the releasing thread sets spin_here_p to FALSE before
1167      this thread sets it to TRUE, this thread will hang. */
1168   *spin_here_p = TRUE; /* before enqueuing to prevent race */
1169 
1170   while (1) {
1171     kmp_int32 enqueued;
1172     kmp_int32 head;
1173     kmp_int32 tail;
1174 
1175     head = *head_id_p;
1176 
1177     switch (head) {
1178 
1179     case -1: {
1180 #ifdef DEBUG_QUEUING_LOCKS
1181       tail = *tail_id_p;
1182       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1183 #endif
      tail = 0; /* to make sure the asynchronously read next link is not set
                   accidentally; this assignment keeps us from entering the
                   if ( tail > 0 ) branch in the enqueued case below, which is
                   not necessary for this state transition */
1188 
1189       need_mf = 0;
1190       /* try (-1,0)->(tid,tid) */
1191       enqueued = KMP_COMPARE_AND_STORE_ACQ64((volatile kmp_int64 *)tail_id_p,
1192                                              KMP_PACK_64(-1, 0),
1193                                              KMP_PACK_64(gtid + 1, gtid + 1));
1194 #ifdef DEBUG_QUEUING_LOCKS
1195       if (enqueued)
1196         TRACE_LOCK(gtid + 1, "acq enq: (-1,0)->(tid,tid)");
1197 #endif
1198     } break;
1199 
1200     default: {
1201       tail = *tail_id_p;
1202       KMP_DEBUG_ASSERT(tail != gtid + 1);
1203 
1204 #ifdef DEBUG_QUEUING_LOCKS
1205       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1206 #endif
1207 
1208       if (tail == 0) {
1209         enqueued = FALSE;
1210       } else {
1211         need_mf = 0;
1212         /* try (h,t) or (h,h)->(h,tid) */
1213         enqueued = KMP_COMPARE_AND_STORE_ACQ32(tail_id_p, tail, gtid + 1);
1214 
1215 #ifdef DEBUG_QUEUING_LOCKS
1216         if (enqueued)
1217           TRACE_LOCK(gtid + 1, "acq enq: (h,t)->(h,tid)");
1218 #endif
1219       }
1220     } break;
1221 
1222     case 0: /* empty queue */
1223     {
1224       kmp_int32 grabbed_lock;
1225 
1226 #ifdef DEBUG_QUEUING_LOCKS
1227       tail = *tail_id_p;
1228       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1229 #endif
1230       /* try (0,0)->(-1,0) */
1231 
1232       /* only legal transition out of head = 0 is head = -1 with no change to
1233        * tail */
1234       grabbed_lock = KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1);
1235 
1236       if (grabbed_lock) {
1237 
1238         *spin_here_p = FALSE;
1239 
1240         KA_TRACE(
1241             1000,
1242             ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: no queuing\n",
1243              lck, gtid));
1244 #ifdef DEBUG_QUEUING_LOCKS
1245         TRACE_LOCK_HT(gtid + 1, "acq exit: ", head, 0);
1246 #endif
1247 
1248 #if OMPT_SUPPORT
1249         if (ompt_enabled && prev_state != ompt_state_undefined) {
1250           /* change the state before clearing wait_id */
1251           this_thr->th.ompt_thread_info.state = prev_state;
1252           this_thr->th.ompt_thread_info.wait_id = 0;
1253         }
1254 #endif
1255 
1256         KMP_FSYNC_ACQUIRED(lck);
1257         return KMP_LOCK_ACQUIRED_FIRST; /* lock holder cannot be on queue */
1258       }
1259       enqueued = FALSE;
1260     } break;
1261     }
1262 
1263 #if OMPT_SUPPORT
1264     if (ompt_enabled && prev_state == ompt_state_undefined) {
1265       /* this thread will spin; set wait_id before entering wait state */
1266       prev_state = this_thr->th.ompt_thread_info.state;
1267       this_thr->th.ompt_thread_info.wait_id = (uint64_t)lck;
1268       this_thr->th.ompt_thread_info.state = ompt_state_wait_lock;
1269     }
1270 #endif
1271 
1272     if (enqueued) {
1273       if (tail > 0) {
1274         kmp_info_t *tail_thr = __kmp_thread_from_gtid(tail - 1);
1275         KMP_ASSERT(tail_thr != NULL);
1276         tail_thr->th.th_next_waiting = gtid + 1;
1277         /* corresponding wait for this write in release code */
1278       }
1279       KA_TRACE(1000,
1280                ("__kmp_acquire_queuing_lock: lck:%p, T#%d waiting for lock\n",
1281                 lck, gtid));
1282 
1283       /* ToDo: May want to consider using __kmp_wait_sleep  or something that
1284          sleeps for throughput only here. */
1285       KMP_MB();
1286       KMP_WAIT_YIELD(spin_here_p, FALSE, KMP_EQ, lck);
1287 
1288 #ifdef DEBUG_QUEUING_LOCKS
1289       TRACE_LOCK(gtid + 1, "acq spin");
1290 
1291       if (this_thr->th.th_next_waiting != 0)
1292         __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1293 #endif
1294       KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1295       KA_TRACE(1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: after "
1296                       "waiting on queue\n",
1297                       lck, gtid));
1298 
1299 #ifdef DEBUG_QUEUING_LOCKS
1300       TRACE_LOCK(gtid + 1, "acq exit 2");
1301 #endif
1302 
1303 #if OMPT_SUPPORT
1304       /* change the state before clearing wait_id */
1305       this_thr->th.ompt_thread_info.state = prev_state;
1306       this_thr->th.ompt_thread_info.wait_id = 0;
1307 #endif
1308 
1309       /* got lock, we were dequeued by the thread that released lock */
1310       return KMP_LOCK_ACQUIRED_FIRST;
1311     }
1312 
1313     /* Yield if number of threads > number of logical processors */
1314     /* ToDo: Not sure why this should only be in oversubscription case,
1315        maybe should be traditional YIELD_INIT/YIELD_WHEN loop */
1316     KMP_YIELD(TCR_4(__kmp_nth) >
1317               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
1318 #ifdef DEBUG_QUEUING_LOCKS
1319     TRACE_LOCK(gtid + 1, "acq retry");
1320 #endif
1321   }
1322   KMP_ASSERT2(0, "should not get here");
1323   return KMP_LOCK_ACQUIRED_FIRST;
1324 }
1325 
1326 int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1327   KMP_DEBUG_ASSERT(gtid >= 0);
1328 
1329   int retval = __kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
1330   ANNOTATE_QUEUING_ACQUIRED(lck);
1331   return retval;
1332 }
1333 
1334 static int __kmp_acquire_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1335                                                   kmp_int32 gtid) {
1336   char const *const func = "omp_set_lock";
1337   if (lck->lk.initialized != lck) {
1338     KMP_FATAL(LockIsUninitialized, func);
1339   }
1340   if (__kmp_is_queuing_lock_nestable(lck)) {
1341     KMP_FATAL(LockNestableUsedAsSimple, func);
1342   }
1343   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1344     KMP_FATAL(LockIsAlreadyOwned, func);
1345   }
1346 
1347   __kmp_acquire_queuing_lock(lck, gtid);
1348 
1349   lck->lk.owner_id = gtid + 1;
1350   return KMP_LOCK_ACQUIRED_FIRST;
1351 }
1352 
1353 int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1354   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1355   kmp_int32 head;
1356 #ifdef KMP_DEBUG
1357   kmp_info_t *this_thr;
1358 #endif
1359 
1360   KA_TRACE(1000, ("__kmp_test_queuing_lock: T#%d entering\n", gtid));
1361   KMP_DEBUG_ASSERT(gtid >= 0);
1362 #ifdef KMP_DEBUG
1363   this_thr = __kmp_thread_from_gtid(gtid);
1364   KMP_DEBUG_ASSERT(this_thr != NULL);
1365   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
1366 #endif
1367 
1368   head = *head_id_p;
1369 
1370   if (head == 0) { /* nobody on queue, nobody holding */
1371     /* try (0,0)->(-1,0) */
1372     if (KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1)) {
1373       KA_TRACE(1000,
1374                ("__kmp_test_queuing_lock: T#%d exiting: holding lock\n", gtid));
1375       KMP_FSYNC_ACQUIRED(lck);
1376       ANNOTATE_QUEUING_ACQUIRED(lck);
1377       return TRUE;
1378     }
1379   }
1380 
1381   KA_TRACE(1000,
1382            ("__kmp_test_queuing_lock: T#%d exiting: without lock\n", gtid));
1383   return FALSE;
1384 }
1385 
1386 static int __kmp_test_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1387                                                kmp_int32 gtid) {
1388   char const *const func = "omp_test_lock";
1389   if (lck->lk.initialized != lck) {
1390     KMP_FATAL(LockIsUninitialized, func);
1391   }
1392   if (__kmp_is_queuing_lock_nestable(lck)) {
1393     KMP_FATAL(LockNestableUsedAsSimple, func);
1394   }
1395 
1396   int retval = __kmp_test_queuing_lock(lck, gtid);
1397 
1398   if (retval) {
1399     lck->lk.owner_id = gtid + 1;
1400   }
1401   return retval;
1402 }
1403 
1404 int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
  kmp_info_t *this_thr;
1406   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1407   volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
1408 
1409   KA_TRACE(1000,
1410            ("__kmp_release_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
1411   KMP_DEBUG_ASSERT(gtid >= 0);
1412   this_thr = __kmp_thread_from_gtid(gtid);
1413   KMP_DEBUG_ASSERT(this_thr != NULL);
1414 #ifdef DEBUG_QUEUING_LOCKS
1415   TRACE_LOCK(gtid + 1, "rel ent");
1416 
1417   if (this_thr->th.th_spin_here)
1418     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1419   if (this_thr->th.th_next_waiting != 0)
1420     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1421 #endif
1422   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
1423   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1424 
1425   KMP_FSYNC_RELEASING(lck);
1426   ANNOTATE_QUEUING_RELEASED(lck);
1427 
1428   while (1) {
1429     kmp_int32 dequeued;
1430     kmp_int32 head;
1431     kmp_int32 tail;
1432 
1433     head = *head_id_p;
1434 
1435 #ifdef DEBUG_QUEUING_LOCKS
1436     tail = *tail_id_p;
1437     TRACE_LOCK_HT(gtid + 1, "rel read: ", head, tail);
1438     if (head == 0)
1439       __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1440 #endif
1441     KMP_DEBUG_ASSERT(head !=
1442                      0); /* holding the lock, head must be -1 or queue head */
1443 
1444     if (head == -1) { /* nobody on queue */
1445       /* try (-1,0)->(0,0) */
1446       if (KMP_COMPARE_AND_STORE_REL32(head_id_p, -1, 0)) {
1447         KA_TRACE(
1448             1000,
1449             ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: queue empty\n",
1450              lck, gtid));
1451 #ifdef DEBUG_QUEUING_LOCKS
1452         TRACE_LOCK_HT(gtid + 1, "rel exit: ", 0, 0);
1453 #endif
1454 
1455 #if OMPT_SUPPORT
1456 /* nothing to do - no other thread is trying to shift blame */
1457 #endif
1458         return KMP_LOCK_RELEASED;
1459       }
1460       dequeued = FALSE;
1461     } else {
1462       tail = *tail_id_p;
1463       if (head == tail) { /* only one thread on the queue */
1464 #ifdef DEBUG_QUEUING_LOCKS
1465         if (head <= 0)
1466           __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1467 #endif
1468         KMP_DEBUG_ASSERT(head > 0);
1469 
1470         /* try (h,h)->(-1,0) */
1471         dequeued = KMP_COMPARE_AND_STORE_REL64((kmp_int64 *)tail_id_p,
1472                                                KMP_PACK_64(head, head),
1473                                                KMP_PACK_64(-1, 0));
1474 #ifdef DEBUG_QUEUING_LOCKS
1475         TRACE_LOCK(gtid + 1, "rel deq: (h,h)->(-1,0)");
1476 #endif
1477 
1478       } else {
1479         volatile kmp_int32 *waiting_id_p;
1480         kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
1481         KMP_DEBUG_ASSERT(head_thr != NULL);
1482         waiting_id_p = &head_thr->th.th_next_waiting;
1483 
1484 /* Does this require synchronous reads? */
1485 #ifdef DEBUG_QUEUING_LOCKS
1486         if (head <= 0 || tail <= 0)
1487           __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1488 #endif
1489         KMP_DEBUG_ASSERT(head > 0 && tail > 0);
1490 
1491         /* try (h,t)->(h',t) or (t,t) */
1492         KMP_MB();
1493         /* make sure enqueuing thread has time to update next waiting thread
1494          * field */
1495         *head_id_p = KMP_WAIT_YIELD((volatile kmp_uint32 *)waiting_id_p, 0,
1496                                     KMP_NEQ, NULL);
1497 #ifdef DEBUG_QUEUING_LOCKS
1498         TRACE_LOCK(gtid + 1, "rel deq: (h,t)->(h',t)");
1499 #endif
1500         dequeued = TRUE;
1501       }
1502     }
1503 
1504     if (dequeued) {
1505       kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
1506       KMP_DEBUG_ASSERT(head_thr != NULL);
1507 
1508 /* Does this require synchronous reads? */
1509 #ifdef DEBUG_QUEUING_LOCKS
1510       if (head <= 0 || tail <= 0)
1511         __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1512 #endif
1513       KMP_DEBUG_ASSERT(head > 0 && tail > 0);
1514 
      /* For clean code only. The waiting thread is not released until the next
         statement, which prevents a race with the acquire code. */
1517       head_thr->th.th_next_waiting = 0;
1518 #ifdef DEBUG_QUEUING_LOCKS
1519       TRACE_LOCK_T(gtid + 1, "rel nw=0 for t=", head);
1520 #endif
1521 
1522       KMP_MB();
1523       /* reset spin value */
1524       head_thr->th.th_spin_here = FALSE;
1525 
1526       KA_TRACE(1000, ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: after "
1527                       "dequeuing\n",
1528                       lck, gtid));
1529 #ifdef DEBUG_QUEUING_LOCKS
1530       TRACE_LOCK(gtid + 1, "rel exit 2");
1531 #endif
1532       return KMP_LOCK_RELEASED;
1533     }
1534 /* KMP_CPU_PAUSE(); don't want to make releasing thread hold up acquiring
1535    threads */
1536 
1537 #ifdef DEBUG_QUEUING_LOCKS
1538     TRACE_LOCK(gtid + 1, "rel retry");
1539 #endif
1540 
1541   } /* while */
1542   KMP_ASSERT2(0, "should not get here");
1543   return KMP_LOCK_RELEASED;
1544 }
1545 
1546 static int __kmp_release_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1547                                                   kmp_int32 gtid) {
1548   char const *const func = "omp_unset_lock";
1549   KMP_MB(); /* in case another processor initialized lock */
1550   if (lck->lk.initialized != lck) {
1551     KMP_FATAL(LockIsUninitialized, func);
1552   }
1553   if (__kmp_is_queuing_lock_nestable(lck)) {
1554     KMP_FATAL(LockNestableUsedAsSimple, func);
1555   }
1556   if (__kmp_get_queuing_lock_owner(lck) == -1) {
1557     KMP_FATAL(LockUnsettingFree, func);
1558   }
1559   if (__kmp_get_queuing_lock_owner(lck) != gtid) {
1560     KMP_FATAL(LockUnsettingSetByAnother, func);
1561   }
1562   lck->lk.owner_id = 0;
1563   return __kmp_release_queuing_lock(lck, gtid);
1564 }
1565 
1566 void __kmp_init_queuing_lock(kmp_queuing_lock_t *lck) {
1567   lck->lk.location = NULL;
1568   lck->lk.head_id = 0;
1569   lck->lk.tail_id = 0;
1570   lck->lk.next_ticket = 0;
1571   lck->lk.now_serving = 0;
1572   lck->lk.owner_id = 0; // no thread owns the lock.
1573   lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
1574   lck->lk.initialized = lck;
1575 
1576   KA_TRACE(1000, ("__kmp_init_queuing_lock: lock %p initialized\n", lck));
1577 }
1578 
1579 static void __kmp_init_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1580   __kmp_init_queuing_lock(lck);
1581 }
1582 
1583 void __kmp_destroy_queuing_lock(kmp_queuing_lock_t *lck) {
1584   lck->lk.initialized = NULL;
1585   lck->lk.location = NULL;
1586   lck->lk.head_id = 0;
1587   lck->lk.tail_id = 0;
1588   lck->lk.next_ticket = 0;
1589   lck->lk.now_serving = 0;
1590   lck->lk.owner_id = 0;
1591   lck->lk.depth_locked = -1;
1592 }
1593 
1594 static void __kmp_destroy_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1595   char const *const func = "omp_destroy_lock";
1596   if (lck->lk.initialized != lck) {
1597     KMP_FATAL(LockIsUninitialized, func);
1598   }
1599   if (__kmp_is_queuing_lock_nestable(lck)) {
1600     KMP_FATAL(LockNestableUsedAsSimple, func);
1601   }
1602   if (__kmp_get_queuing_lock_owner(lck) != -1) {
1603     KMP_FATAL(LockStillOwned, func);
1604   }
1605   __kmp_destroy_queuing_lock(lck);
1606 }
1607 
1608 // nested queuing locks
1609 
1610 int __kmp_acquire_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1611   KMP_DEBUG_ASSERT(gtid >= 0);
1612 
1613   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1614     lck->lk.depth_locked += 1;
1615     return KMP_LOCK_ACQUIRED_NEXT;
1616   } else {
1617     __kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
1618     ANNOTATE_QUEUING_ACQUIRED(lck);
1619     KMP_MB();
1620     lck->lk.depth_locked = 1;
1621     KMP_MB();
1622     lck->lk.owner_id = gtid + 1;
1623     return KMP_LOCK_ACQUIRED_FIRST;
1624   }
1625 }
1626 
1627 static int
1628 __kmp_acquire_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1629                                               kmp_int32 gtid) {
1630   char const *const func = "omp_set_nest_lock";
1631   if (lck->lk.initialized != lck) {
1632     KMP_FATAL(LockIsUninitialized, func);
1633   }
1634   if (!__kmp_is_queuing_lock_nestable(lck)) {
1635     KMP_FATAL(LockSimpleUsedAsNestable, func);
1636   }
1637   return __kmp_acquire_nested_queuing_lock(lck, gtid);
1638 }
1639 
1640 int __kmp_test_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1641   int retval;
1642 
1643   KMP_DEBUG_ASSERT(gtid >= 0);
1644 
1645   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1646     retval = ++lck->lk.depth_locked;
1647   } else if (!__kmp_test_queuing_lock(lck, gtid)) {
1648     retval = 0;
1649   } else {
1650     KMP_MB();
1651     retval = lck->lk.depth_locked = 1;
1652     KMP_MB();
1653     lck->lk.owner_id = gtid + 1;
1654   }
1655   return retval;
1656 }
1657 
1658 static int __kmp_test_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1659                                                       kmp_int32 gtid) {
1660   char const *const func = "omp_test_nest_lock";
1661   if (lck->lk.initialized != lck) {
1662     KMP_FATAL(LockIsUninitialized, func);
1663   }
1664   if (!__kmp_is_queuing_lock_nestable(lck)) {
1665     KMP_FATAL(LockSimpleUsedAsNestable, func);
1666   }
1667   return __kmp_test_nested_queuing_lock(lck, gtid);
1668 }
1669 
1670 int __kmp_release_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1671   KMP_DEBUG_ASSERT(gtid >= 0);
1672 
1673   KMP_MB();
1674   if (--(lck->lk.depth_locked) == 0) {
1675     KMP_MB();
1676     lck->lk.owner_id = 0;
1677     __kmp_release_queuing_lock(lck, gtid);
1678     return KMP_LOCK_RELEASED;
1679   }
1680   return KMP_LOCK_STILL_HELD;
1681 }
1682 
1683 static int
1684 __kmp_release_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1685                                               kmp_int32 gtid) {
1686   char const *const func = "omp_unset_nest_lock";
1687   KMP_MB(); /* in case another processor initialized lock */
1688   if (lck->lk.initialized != lck) {
1689     KMP_FATAL(LockIsUninitialized, func);
1690   }
1691   if (!__kmp_is_queuing_lock_nestable(lck)) {
1692     KMP_FATAL(LockSimpleUsedAsNestable, func);
1693   }
1694   if (__kmp_get_queuing_lock_owner(lck) == -1) {
1695     KMP_FATAL(LockUnsettingFree, func);
1696   }
1697   if (__kmp_get_queuing_lock_owner(lck) != gtid) {
1698     KMP_FATAL(LockUnsettingSetByAnother, func);
1699   }
1700   return __kmp_release_nested_queuing_lock(lck, gtid);
1701 }
1702 
1703 void __kmp_init_nested_queuing_lock(kmp_queuing_lock_t *lck) {
1704   __kmp_init_queuing_lock(lck);
1705   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
1706 }
1707 
1708 static void
1709 __kmp_init_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1710   __kmp_init_nested_queuing_lock(lck);
1711 }
1712 
1713 void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck) {
1714   __kmp_destroy_queuing_lock(lck);
1715   lck->lk.depth_locked = 0;
1716 }
1717 
1718 static void
1719 __kmp_destroy_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1720   char const *const func = "omp_destroy_nest_lock";
1721   if (lck->lk.initialized != lck) {
1722     KMP_FATAL(LockIsUninitialized, func);
1723   }
1724   if (!__kmp_is_queuing_lock_nestable(lck)) {
1725     KMP_FATAL(LockSimpleUsedAsNestable, func);
1726   }
1727   if (__kmp_get_queuing_lock_owner(lck) != -1) {
1728     KMP_FATAL(LockStillOwned, func);
1729   }
1730   __kmp_destroy_nested_queuing_lock(lck);
1731 }
1732 
1733 // access functions to fields which don't exist for all lock kinds.
1734 
1735 static int __kmp_is_queuing_lock_initialized(kmp_queuing_lock_t *lck) {
1736   return lck == lck->lk.initialized;
1737 }
1738 
1739 static const ident_t *__kmp_get_queuing_lock_location(kmp_queuing_lock_t *lck) {
1740   return lck->lk.location;
1741 }
1742 
1743 static void __kmp_set_queuing_lock_location(kmp_queuing_lock_t *lck,
1744                                             const ident_t *loc) {
1745   lck->lk.location = loc;
1746 }
1747 
1748 static kmp_lock_flags_t __kmp_get_queuing_lock_flags(kmp_queuing_lock_t *lck) {
1749   return lck->lk.flags;
1750 }
1751 
1752 static void __kmp_set_queuing_lock_flags(kmp_queuing_lock_t *lck,
1753                                          kmp_lock_flags_t flags) {
1754   lck->lk.flags = flags;
1755 }
1756 
1757 #if KMP_USE_ADAPTIVE_LOCKS
1758 
1759 /* RTM Adaptive locks */
1760 
1761 #if KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300
1762 
1763 #include <immintrin.h>
1764 #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT)
1765 
1766 #else
1767 
1768 // Values from the status register after failed speculation.
1769 #define _XBEGIN_STARTED (~0u)
1770 #define _XABORT_EXPLICIT (1 << 0)
1771 #define _XABORT_RETRY (1 << 1)
1772 #define _XABORT_CONFLICT (1 << 2)
1773 #define _XABORT_CAPACITY (1 << 3)
1774 #define _XABORT_DEBUG (1 << 4)
1775 #define _XABORT_NESTED (1 << 5)
1776 #define _XABORT_CODE(x) ((unsigned char)(((x) >> 24) & 0xFF))
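// Note: the argument given to _xabort() is reported back in bits 31:24 of the
// status (EAX), which is what _XABORT_CODE() extracts.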
1777 
1778 // Aborts for which it's worth trying again immediately
1779 #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT)
1780 
1781 #define STRINGIZE_INTERNAL(arg) #arg
1782 #define STRINGIZE(arg) STRINGIZE_INTERNAL(arg)
1783 
1784 // Access to RTM instructions
1785 /* A version of XBegin which returns -1 when speculation starts successfully,
1786    and the value of EAX on an abort. This is the same definition as the
1787    compiler intrinsic that will be supported at some point. */
1788 static __inline int _xbegin() {
1789   int res = -1;
1790 
1791 #if KMP_OS_WINDOWS
1792 #if KMP_ARCH_X86_64
1793   _asm {
1794         _emit 0xC7
1795         _emit 0xF8
1796         _emit 2
1797         _emit 0
1798         _emit 0
1799         _emit 0
1800         jmp   L2
1801         mov   res, eax
1802     L2:
1803   }
1804 #else /* IA32 */
1805   _asm {
1806         _emit 0xC7
1807         _emit 0xF8
1808         _emit 2
1809         _emit 0
1810         _emit 0
1811         _emit 0
1812         jmp   L2
1813         mov   res, eax
1814     L2:
1815   }
1816 #endif // KMP_ARCH_X86_64
1817 #else
1818   /* Note that %eax must be noted as killed (clobbered), because the XSR is
1819      returned in %eax(%rax) on abort.  Other register values are restored, so
1820      don't need to be killed.
1821 
1822      We must also mark 'res' as an input and an output, since otherwise
1823      'res=-1' may be dropped as being dead, whereas we do need the assignment on
1824      the successful (i.e., non-abort) path. */
1825   __asm__ volatile("1: .byte  0xC7; .byte 0xF8;\n"
1826                    "   .long  1f-1b-6\n"
1827                    "    jmp   2f\n"
1828                    "1:  movl  %%eax,%0\n"
1829                    "2:"
1830                    : "+r"(res)::"memory", "%eax");
1831 #endif // KMP_OS_WINDOWS
1832   return res;
1833 }
1834 
1835 /* Transaction end */
1836 static __inline void _xend() {
1837 #if KMP_OS_WINDOWS
1838   __asm {
1839         _emit 0x0f
1840         _emit 0x01
1841         _emit 0xd5
1842   }
1843 #else
1844   __asm__ volatile(".byte 0x0f; .byte 0x01; .byte 0xd5" ::: "memory");
1845 #endif
1846 }
1847 
1848 /* This is a macro; the argument must be a single-byte constant which can be
1849    evaluated by the inline assembler, since it is emitted as a byte into the
1850    assembly code. */
1851 // clang-format off
1852 #if KMP_OS_WINDOWS
1853 #define _xabort(ARG) _asm _emit 0xc6 _asm _emit 0xf8 _asm _emit ARG
1854 #else
1855 #define _xabort(ARG)                                                           \
1856   __asm__ volatile(".byte 0xC6; .byte 0xF8; .byte " STRINGIZE(ARG):::"memory");
1857 #endif
1858 // clang-format on
1859 #endif // KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300
1860 
1861 // Statistics are collected for testing purposes.
1862 #if KMP_DEBUG_ADAPTIVE_LOCKS
1863 
1864 // We accumulate speculative lock statistics when the lock is destroyed. We
1865 // keep locks that haven't been destroyed in the liveLocks list so that we can
1866 // grab their statistics too.
1867 static kmp_adaptive_lock_statistics_t destroyedStats;
1868 
1869 // To hold the list of live locks.
1870 static kmp_adaptive_lock_info_t liveLocks;
1871 
1872 // A lock so we can safely update the list of locks.
1873 static kmp_bootstrap_lock_t chain_lock;
1874 
1875 // Initialize the list of stats.
1876 void __kmp_init_speculative_stats() {
1877   kmp_adaptive_lock_info_t *lck = &liveLocks;
1878 
1879   memset((void *)&(lck->stats), 0, sizeof(lck->stats));
1880   lck->stats.next = lck;
1881   lck->stats.prev = lck;
1882 
1883   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1884   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1885 
1886   __kmp_init_bootstrap_lock(&chain_lock);
1887 }
1888 
1889 // Insert the lock into the circular list
1890 static void __kmp_remember_lock(kmp_adaptive_lock_info_t *lck) {
1891   __kmp_acquire_bootstrap_lock(&chain_lock);
1892 
1893   lck->stats.next = liveLocks.stats.next;
1894   lck->stats.prev = &liveLocks;
1895 
1896   liveLocks.stats.next = lck;
1897   lck->stats.next->stats.prev = lck;
1898 
1899   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1900   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1901 
1902   __kmp_release_bootstrap_lock(&chain_lock);
1903 }
1904 
1905 static void __kmp_forget_lock(kmp_adaptive_lock_info_t *lck) {
1906   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1907   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1908 
1909   kmp_adaptive_lock_info_t *n = lck->stats.next;
1910   kmp_adaptive_lock_info_t *p = lck->stats.prev;
1911 
1912   n->stats.prev = p;
1913   p->stats.next = n;
1914 }
1915 
1916 static void __kmp_zero_speculative_stats(kmp_adaptive_lock_info_t *lck) {
1917   memset((void *)&lck->stats, 0, sizeof(lck->stats));
1918   __kmp_remember_lock(lck);
1919 }
1920 
1921 static void __kmp_add_stats(kmp_adaptive_lock_statistics_t *t,
1922                             kmp_adaptive_lock_info_t *lck) {
1923   kmp_adaptive_lock_statistics_t volatile *s = &lck->stats;
1924 
1925   t->nonSpeculativeAcquireAttempts += lck->acquire_attempts;
1926   t->successfulSpeculations += s->successfulSpeculations;
1927   t->hardFailedSpeculations += s->hardFailedSpeculations;
1928   t->softFailedSpeculations += s->softFailedSpeculations;
1929   t->nonSpeculativeAcquires += s->nonSpeculativeAcquires;
1930   t->lemmingYields += s->lemmingYields;
1931 }
1932 
1933 static void __kmp_accumulate_speculative_stats(kmp_adaptive_lock_info_t *lck) {
1934   kmp_adaptive_lock_statistics_t *t = &destroyedStats;
1935 
1936   __kmp_acquire_bootstrap_lock(&chain_lock);
1937 
1938   __kmp_add_stats(&destroyedStats, lck);
1939   __kmp_forget_lock(lck);
1940 
1941   __kmp_release_bootstrap_lock(&chain_lock);
1942 }
1943 
1944 static float percent(kmp_uint32 count, kmp_uint32 total) {
1945   return (total == 0) ? 0.0 : (100.0 * count) / total;
1946 }
1947 
1948 static FILE *__kmp_open_stats_file() {
1949   if (strcmp(__kmp_speculative_statsfile, "-") == 0)
1950     return stdout;
1951 
1952   size_t buffLen = KMP_STRLEN(__kmp_speculative_statsfile) + 20;
1953   char buffer[buffLen];
1954   KMP_SNPRINTF(&buffer[0], buffLen, __kmp_speculative_statsfile,
1955                (kmp_int32)getpid());
1956   FILE *result = fopen(&buffer[0], "w");
1957 
1958   // Maybe we should issue a warning here...
1959   return result ? result : stdout;
1960 }
1961 
1962 void __kmp_print_speculative_stats() {
1963   if (__kmp_user_lock_kind != lk_adaptive)
1964     return;
1965 
1966   FILE *statsFile = __kmp_open_stats_file();
1967 
1968   kmp_adaptive_lock_statistics_t total = destroyedStats;
1969   kmp_adaptive_lock_info_t *lck;
1970 
1971   for (lck = liveLocks.stats.next; lck != &liveLocks; lck = lck->stats.next) {
1972     __kmp_add_stats(&total, lck);
1973   }
1974   kmp_adaptive_lock_statistics_t *t = &total;
1975   kmp_uint32 totalSections =
1976       t->nonSpeculativeAcquires + t->successfulSpeculations;
1977   kmp_uint32 totalSpeculations = t->successfulSpeculations +
1978                                  t->hardFailedSpeculations +
1979                                  t->softFailedSpeculations;
1980 
1981   fprintf(statsFile, "Speculative lock statistics (all approximate!)\n");
1982   fprintf(statsFile, " Lock parameters: \n"
1983                      "   max_soft_retries               : %10d\n"
1984                      "   max_badness                    : %10d\n",
1985           __kmp_adaptive_backoff_params.max_soft_retries,
1986           __kmp_adaptive_backoff_params.max_badness);
1987   fprintf(statsFile, " Non-speculative acquire attempts : %10d\n",
1988           t->nonSpeculativeAcquireAttempts);
1989   fprintf(statsFile, " Total critical sections          : %10d\n",
1990           totalSections);
1991   fprintf(statsFile, " Successful speculations          : %10d (%5.1f%%)\n",
1992           t->successfulSpeculations,
1993           percent(t->successfulSpeculations, totalSections));
1994   fprintf(statsFile, " Non-speculative acquires         : %10d (%5.1f%%)\n",
1995           t->nonSpeculativeAcquires,
1996           percent(t->nonSpeculativeAcquires, totalSections));
1997   fprintf(statsFile, " Lemming yields                   : %10d\n\n",
1998           t->lemmingYields);
1999 
2000   fprintf(statsFile, " Speculative acquire attempts     : %10d\n",
2001           totalSpeculations);
2002   fprintf(statsFile, " Successes                        : %10d (%5.1f%%)\n",
2003           t->successfulSpeculations,
2004           percent(t->successfulSpeculations, totalSpeculations));
2005   fprintf(statsFile, " Soft failures                    : %10d (%5.1f%%)\n",
2006           t->softFailedSpeculations,
2007           percent(t->softFailedSpeculations, totalSpeculations));
2008   fprintf(statsFile, " Hard failures                    : %10d (%5.1f%%)\n",
2009           t->hardFailedSpeculations,
2010           percent(t->hardFailedSpeculations, totalSpeculations));
2011 
2012   if (statsFile != stdout)
2013     fclose(statsFile);
2014 }
2015 
2016 #define KMP_INC_STAT(lck, stat) (lck->lk.adaptive.stats.stat++)
2017 #else
2018 #define KMP_INC_STAT(lck, stat)
2019 
2020 #endif // KMP_DEBUG_ADAPTIVE_LOCKS
2021 
2022 static inline bool __kmp_is_unlocked_queuing_lock(kmp_queuing_lock_t *lck) {
2023   // It is enough to check that the head_id is zero.
2024   // We don't also need to check the tail.
2025   bool res = lck->lk.head_id == 0;
2026 
2027 // We need a fence here, since we must ensure that no memory operations
2028 // from later in this thread float above that read.
2029 #if KMP_COMPILER_ICC
2030   _mm_mfence();
2031 #else
2032   __sync_synchronize();
2033 #endif
2034 
2035   return res;
2036 }
2037 
2038 // Functions for manipulating the badness
2039 static __inline void
2040 __kmp_update_badness_after_success(kmp_adaptive_lock_t *lck) {
2041   // Reset the badness to zero so we eagerly try to speculate again
2042   lck->lk.adaptive.badness = 0;
2043   KMP_INC_STAT(lck, successfulSpeculations);
2044 }
2045 
2046 // Create a bit mask with one more set bit.
2047 static __inline void __kmp_step_badness(kmp_adaptive_lock_t *lck) {
2048   kmp_uint32 newBadness = (lck->lk.adaptive.badness << 1) | 1;
2049   if (newBadness > lck->lk.adaptive.max_badness) {
2050     return;
2051   } else {
2052     lck->lk.adaptive.badness = newBadness;
2053   }
2054 }
2055 
2056 // Check whether speculation should be attempted.
2057 static __inline int __kmp_should_speculate(kmp_adaptive_lock_t *lck,
2058                                            kmp_int32 gtid) {
2059   kmp_uint32 badness = lck->lk.adaptive.badness;
2060   kmp_uint32 attempts = lck->lk.adaptive.acquire_attempts;
2061   int res = (attempts & badness) == 0;
2062   return res;
2063 }
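// Rough illustration of how badness throttles speculation: with badness == 0
// the (attempts & badness) test always passes, so speculation is always
// considered; after one failed speculation round badness becomes 1 and the
// test passes only for even values of acquire_attempts; after two failures
// badness becomes 3 and it passes only every fourth value, and so on, capped
// by max_badness.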
2064 
2065 // Attempt to acquire only the speculative lock.
2066 // Does not back off to the non-speculative lock.
2067 static int __kmp_test_adaptive_lock_only(kmp_adaptive_lock_t *lck,
2068                                          kmp_int32 gtid) {
2069   int retries = lck->lk.adaptive.max_soft_retries;
2070 
2071   // We don't explicitly count the start of speculation; rather, we record the
2072   // results (success, hard fail, soft fail). The sum of all of those is the
2073   // total number of times we started speculation, since every speculation must
2074   // end in one of those ways.
2075   do {
2076     kmp_uint32 status = _xbegin();
2077     // Switch this in to disable actual speculation but exercise at least some
2078     // of the rest of the code. Useful for debugging...
2079     // kmp_uint32 status = _XABORT_NESTED;
2080 
2081     if (status == _XBEGIN_STARTED) {
2082       /* We have successfully started speculation. Check that no-one acquired
2083          the lock for real between when we last looked and now. This also gets
2084          the lock cache line into our read-set, which we need so that we'll
2085          abort if anyone later claims it for real. */
2086       if (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2087         // Lock is now visibly acquired, so someone beat us to it. Abort the
2088         // transaction so we'll restart from _xbegin with the failure status.
2089         _xabort(0x01);
2090         KMP_ASSERT2(0, "should not get here");
2091       }
2092       return 1; // Lock has been acquired (speculatively)
2093     } else {
2094       // We have aborted, update the statistics
2095       if (status & SOFT_ABORT_MASK) {
2096         KMP_INC_STAT(lck, softFailedSpeculations);
2097         // and loop round to retry.
2098       } else {
2099         KMP_INC_STAT(lck, hardFailedSpeculations);
2100         // Give up if we had a hard failure.
2101         break;
2102       }
2103     }
2104   } while (retries--); // Loop while we have retries, and didn't fail hard.
2105 
2106   // Either we had a hard failure or we didn't succeed softly after
2107   // the full set of attempts, so back off the badness.
2108   __kmp_step_badness(lck);
2109   return 0;
2110 }
2111 
2112 // Attempt to acquire the speculative lock, or back off to the non-speculative
2113 // one if the speculative lock cannot be acquired.
2114 // We can succeed speculatively, non-speculatively, or fail.
2115 static int __kmp_test_adaptive_lock(kmp_adaptive_lock_t *lck, kmp_int32 gtid) {
2116   // First try to acquire the lock speculatively
2117   if (__kmp_should_speculate(lck, gtid) &&
2118       __kmp_test_adaptive_lock_only(lck, gtid))
2119     return 1;
2120 
2121   // Speculative acquisition failed, so try to acquire it non-speculatively.
2122   // Count the non-speculative acquire attempt
2123   lck->lk.adaptive.acquire_attempts++;
2124 
2125   // Use base, non-speculative lock.
2126   if (__kmp_test_queuing_lock(GET_QLK_PTR(lck), gtid)) {
2127     KMP_INC_STAT(lck, nonSpeculativeAcquires);
2128     return 1; // Lock is acquired (non-speculatively)
2129   } else {
2130     return 0; // Failed to acquire the lock, it's already visibly locked.
2131   }
2132 }
2133 
2134 static int __kmp_test_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2135                                                 kmp_int32 gtid) {
2136   char const *const func = "omp_test_lock";
2137   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2138     KMP_FATAL(LockIsUninitialized, func);
2139   }
2140 
2141   int retval = __kmp_test_adaptive_lock(lck, gtid);
2142 
2143   if (retval) {
2144     lck->lk.qlk.owner_id = gtid + 1;
2145   }
2146   return retval;
2147 }
2148 
2149 // Block until we can acquire a speculative, adaptive lock. We check whether we
2150 // should be trying to speculate. If we should be, we check the real lock to see
2151 // if it is free, and, if not, pause without attempting to acquire it until it
2152 // is. Then we try the speculative acquire. This means that although we suffer
2153 // from lemmings a little (because we can't acquire the lock speculatively
2154 // until the queue of waiting threads has cleared), we don't get into a state
2155 // where we can never acquire the lock speculatively (because we force the queue
2156 // to clear by preventing new arrivals from entering the queue). This does mean
2157 // that when we're trying to break lemmings, the lock is no longer fair. However,
2158 // OpenMP makes no guarantee that its locks are fair, so this isn't a real
2159 // problem.
2160 static void __kmp_acquire_adaptive_lock(kmp_adaptive_lock_t *lck,
2161                                         kmp_int32 gtid) {
2162   if (__kmp_should_speculate(lck, gtid)) {
2163     if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2164       if (__kmp_test_adaptive_lock_only(lck, gtid))
2165         return;
2166       // We tried speculation and failed, so give up.
2167     } else {
2168       // We can't try speculation until the lock is free, so we pause here
2169       // (without suspending on the queuing lock) to allow it to drain, then
2170       // try again. All other threads will also see the same result for
2171       // __kmp_should_speculate, so they will be doing the same if they try
2172       // to claim the lock from now on.
2173       while (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2174         KMP_INC_STAT(lck, lemmingYields);
2175         __kmp_yield(TRUE);
2176       }
2177 
2178       if (__kmp_test_adaptive_lock_only(lck, gtid))
2179         return;
2180     }
2181   }
2182 
2183   // Speculative acquisition failed, so acquire it non-speculatively.
2184   // Count the non-speculative acquire attempt
2185   lck->lk.adaptive.acquire_attempts++;
2186 
2187   __kmp_acquire_queuing_lock_timed_template<FALSE>(GET_QLK_PTR(lck), gtid);
2188   // We have acquired the base lock, so count that.
2189   KMP_INC_STAT(lck, nonSpeculativeAcquires);
2190   ANNOTATE_QUEUING_ACQUIRED(lck);
2191 }
2192 
2193 static void __kmp_acquire_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2194                                                     kmp_int32 gtid) {
2195   char const *const func = "omp_set_lock";
2196   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2197     KMP_FATAL(LockIsUninitialized, func);
2198   }
2199   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == gtid) {
2200     KMP_FATAL(LockIsAlreadyOwned, func);
2201   }
2202 
2203   __kmp_acquire_adaptive_lock(lck, gtid);
2204 
2205   lck->lk.qlk.owner_id = gtid + 1;
2206 }
2207 
2208 static int __kmp_release_adaptive_lock(kmp_adaptive_lock_t *lck,
2209                                        kmp_int32 gtid) {
2210   if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(
2211           lck))) { // If the lock doesn't look claimed we must be speculating.
2212     // (Or the user's code is buggy and they're releasing without locking;
2213     // if we had XTEST we'd be able to check that case...)
2214     _xend(); // Exit speculation
2215     __kmp_update_badness_after_success(lck);
2216   } else { // Since the lock *is* visibly locked we're not speculating,
2217     // so should use the underlying lock's release scheme.
2218     __kmp_release_queuing_lock(GET_QLK_PTR(lck), gtid);
2219   }
2220   return KMP_LOCK_RELEASED;
2221 }
2222 
2223 static int __kmp_release_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2224                                                    kmp_int32 gtid) {
2225   char const *const func = "omp_unset_lock";
2226   KMP_MB(); /* in case another processor initialized lock */
2227   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2228     KMP_FATAL(LockIsUninitialized, func);
2229   }
2230   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == -1) {
2231     KMP_FATAL(LockUnsettingFree, func);
2232   }
2233   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != gtid) {
2234     KMP_FATAL(LockUnsettingSetByAnother, func);
2235   }
2236   lck->lk.qlk.owner_id = 0;
2237   __kmp_release_adaptive_lock(lck, gtid);
2238   return KMP_LOCK_RELEASED;
2239 }
2240 
2241 static void __kmp_init_adaptive_lock(kmp_adaptive_lock_t *lck) {
2242   __kmp_init_queuing_lock(GET_QLK_PTR(lck));
2243   lck->lk.adaptive.badness = 0;
2244   lck->lk.adaptive.acquire_attempts = 0; // nonSpeculativeAcquireAttempts = 0;
2245   lck->lk.adaptive.max_soft_retries =
2246       __kmp_adaptive_backoff_params.max_soft_retries;
2247   lck->lk.adaptive.max_badness = __kmp_adaptive_backoff_params.max_badness;
2248 #if KMP_DEBUG_ADAPTIVE_LOCKS
2249   __kmp_zero_speculative_stats(&lck->lk.adaptive);
2250 #endif
2251   KA_TRACE(1000, ("__kmp_init_adaptive_lock: lock %p initialized\n", lck));
2252 }
2253 
2254 static void __kmp_init_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
2255   __kmp_init_adaptive_lock(lck);
2256 }
2257 
2258 static void __kmp_destroy_adaptive_lock(kmp_adaptive_lock_t *lck) {
2259 #if KMP_DEBUG_ADAPTIVE_LOCKS
2260   __kmp_accumulate_speculative_stats(&lck->lk.adaptive);
2261 #endif
2262   __kmp_destroy_queuing_lock(GET_QLK_PTR(lck));
2263   // Nothing needed for the speculative part.
2264 }
2265 
2266 static void __kmp_destroy_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
2267   char const *const func = "omp_destroy_lock";
2268   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2269     KMP_FATAL(LockIsUninitialized, func);
2270   }
2271   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != -1) {
2272     KMP_FATAL(LockStillOwned, func);
2273   }
2274   __kmp_destroy_adaptive_lock(lck);
2275 }
2276 
2277 #endif // KMP_USE_ADAPTIVE_LOCKS
2278 
2279 /* ------------------------------------------------------------------------ */
2280 /* DRDPA ticket locks                                                */
2281 /* "DRDPA" means Dynamically Reconfigurable Distributed Polling Area */
2282 
2283 static kmp_int32 __kmp_get_drdpa_lock_owner(kmp_drdpa_lock_t *lck) {
2284   return TCR_4(lck->lk.owner_id) - 1;
2285 }
2286 
2287 static inline bool __kmp_is_drdpa_lock_nestable(kmp_drdpa_lock_t *lck) {
2288   return lck->lk.depth_locked != -1;
2289 }
2290 
2291 __forceinline static int
2292 __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2293   kmp_uint64 ticket = KMP_TEST_THEN_INC64((kmp_int64 *)&lck->lk.next_ticket);
2294   kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
2295   volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls =
2296       (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)TCR_PTR(
2297           lck->lk.polls); // volatile load
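  // Each waiter will spin on its own slot, polls[ticket & mask], rather than
  // on a single shared location; this is the "distributed polling area" the
  // lock is named after.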
2298 
2299 #ifdef USE_LOCK_PROFILE
2300   if (TCR_8(polls[ticket & mask].poll) != ticket)
2301     __kmp_printf("LOCK CONTENTION: %p\n", lck);
2302 /* else __kmp_printf( "." );*/
2303 #endif /* USE_LOCK_PROFILE */
2304 
2305   // Now spin-wait, but reload the polls pointer and mask, in case the
2306   // polling area has been reconfigured.  Unless it is reconfigured, the
2307   // reloads stay in L1 cache and are cheap.
2308   //
2309   // Keep this code in sync with KMP_WAIT_YIELD, in kmp_dispatch.cpp !!!
2310   //
2311   // The current implementation of KMP_WAIT_YIELD doesn't allow for mask
2312   // and poll to be re-read every spin iteration.
2313   kmp_uint32 spins;
2314 
2315   KMP_FSYNC_PREPARE(lck);
2316   KMP_INIT_YIELD(spins);
2317   while (TCR_8(polls[ticket & mask].poll) < ticket) { // volatile load
2318     // If we are oversubscribed,
2319     // or have waited a bit (and KMP_LIBRARY=turnaround), then yield.
2320     // CPU Pause is in the macros for yield.
2321     //
2322     KMP_YIELD(TCR_4(__kmp_nth) >
2323               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
2324     KMP_YIELD_SPIN(spins);
2325 
2326     // Re-read the mask and the poll pointer from the lock structure.
2327     //
2328     // Make certain that "mask" is read before "polls" !!!
2329     //
2330     // If another thread reconfigures the polling area and updates these
2331     // values, and we get the new value of mask but the old polls pointer, we
2332     // could access memory beyond the end of the old polling area.
2333     mask = TCR_8(lck->lk.mask); // volatile load
2334     polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)TCR_PTR(
2335         lck->lk.polls); // volatile load
2336   }
2337 
2338   // Critical section starts here
2339   KMP_FSYNC_ACQUIRED(lck);
2340   KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld acquired lock %p\n",
2341                   ticket, lck));
2342   lck->lk.now_serving = ticket; // non-volatile store
2343 
2344   // Deallocate a garbage polling area if we know that we are the last
2345   // thread that could possibly access it.
2346   //
2347   // The >= check is in case __kmp_test_drdpa_lock() allocated the cleanup
2348   // ticket.
2349   if ((lck->lk.old_polls != NULL) && (ticket >= lck->lk.cleanup_ticket)) {
2350     __kmp_free((void *)lck->lk.old_polls);
2351     lck->lk.old_polls = NULL;
2352     lck->lk.cleanup_ticket = 0;
2353   }
2354 
2355   // Check to see if we should reconfigure the polling area.
2356   // If there is still a garbage polling area to be deallocated from a
2357   // previous reconfiguration, let a later thread reconfigure it.
2358   if (lck->lk.old_polls == NULL) {
2359     bool reconfigure = false;
2360     volatile struct kmp_base_drdpa_lock::kmp_lock_poll *old_polls = polls;
2361     kmp_uint32 num_polls = TCR_4(lck->lk.num_polls);
2362 
2363     if (TCR_4(__kmp_nth) >
2364         (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
2365       // We are in oversubscription mode.  Contract the polling area
2366       // down to a single location, if that hasn't been done already.
2367       if (num_polls > 1) {
2368         reconfigure = true;
2369         num_polls = TCR_4(lck->lk.num_polls);
2370         mask = 0;
2371         num_polls = 1;
2372         polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
2373             __kmp_allocate(num_polls * sizeof(*polls));
2374         polls[0].poll = ticket;
2375       }
2376     } else {
2377       // We are in under/fully subscribed mode.  Check the number of
2378       // threads waiting on the lock.  The size of the polling area
2379       // should be at least the number of threads waiting.
2380       kmp_uint64 num_waiting = TCR_8(lck->lk.next_ticket) - ticket - 1;
2381       if (num_waiting > num_polls) {
2382         kmp_uint32 old_num_polls = num_polls;
2383         reconfigure = true;
2384         do {
2385           mask = (mask << 1) | 1;
2386           num_polls *= 2;
2387         } while (num_polls <= num_waiting);
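        // Illustrative example: growing from num_polls == 4 (mask == 3) with
        // num_waiting == 9 leaves num_polls == 16 and mask == 0xf.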
2388 
2389         // Allocate the new polling area, and copy the relevant portion
2390         // of the old polling area to the new area.  __kmp_allocate()
2391         // zeroes the memory it allocates, and most of the old area is
2392         // just zero padding, so we only copy the release counters.
2393         polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
2394             __kmp_allocate(num_polls * sizeof(*polls));
2395         kmp_uint32 i;
2396         for (i = 0; i < old_num_polls; i++) {
2397           polls[i].poll = old_polls[i].poll;
2398         }
2399       }
2400     }
2401 
2402     if (reconfigure) {
2403       // Now write the updated fields back to the lock structure.
2404       //
2405       // Make certain that "polls" is written before "mask" !!!
2406       //
2407       // If another thread picks up the new value of mask and the old polls
2408       // pointer, it could access memory beyond the end of the old polling
2409       // area.
2410       //
2411       // On x86, we need memory fences.
2412       KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld reconfiguring "
2413                       "lock %p to %d polls\n",
2414                       ticket, lck, num_polls));
2415 
2416       lck->lk.old_polls = old_polls; // non-volatile store
2417       lck->lk.polls = polls; // volatile store
2418 
2419       KMP_MB();
2420 
2421       lck->lk.num_polls = num_polls; // non-volatile store
2422       lck->lk.mask = mask; // volatile store
2423 
2424       KMP_MB();
2425 
2426       // Only after the new polling area and mask have been flushed
2427       // to main memory can we update the cleanup ticket field.
2428       //
2429       // volatile load / non-volatile store
2430       lck->lk.cleanup_ticket = TCR_8(lck->lk.next_ticket);
2431     }
2432   }
2433   return KMP_LOCK_ACQUIRED_FIRST;
2434 }
2435 
2436 int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2437   int retval = __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
2438   ANNOTATE_DRDPA_ACQUIRED(lck);
2439   return retval;
2440 }
2441 
2442 static int __kmp_acquire_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2443                                                 kmp_int32 gtid) {
2444   char const *const func = "omp_set_lock";
2445   if (lck->lk.initialized != lck) {
2446     KMP_FATAL(LockIsUninitialized, func);
2447   }
2448   if (__kmp_is_drdpa_lock_nestable(lck)) {
2449     KMP_FATAL(LockNestableUsedAsSimple, func);
2450   }
2451   if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) == gtid)) {
2452     KMP_FATAL(LockIsAlreadyOwned, func);
2453   }
2454 
2455   __kmp_acquire_drdpa_lock(lck, gtid);
2456 
2457   lck->lk.owner_id = gtid + 1;
2458   return KMP_LOCK_ACQUIRED_FIRST;
2459 }
2460 
2461 int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2462   // First get a ticket, then read the polls pointer and the mask.
2463   // The polls pointer must be read before the mask!!! (See above)
2464   kmp_uint64 ticket = TCR_8(lck->lk.next_ticket); // volatile load
2465   volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls =
2466       (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)TCR_PTR(
2467           lck->lk.polls); // volatile load
2468   kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
2469   if (TCR_8(polls[ticket & mask].poll) == ticket) {
2470     kmp_uint64 next_ticket = ticket + 1;
2471     if (KMP_COMPARE_AND_STORE_ACQ64((kmp_int64 *)&lck->lk.next_ticket, ticket,
2472                                     next_ticket)) {
2473       KMP_FSYNC_ACQUIRED(lck);
2474       KA_TRACE(1000, ("__kmp_test_drdpa_lock: ticket #%lld acquired lock %p\n",
2475                       ticket, lck));
2476       lck->lk.now_serving = ticket; // non-volatile store
2477 
2478       // Since no threads are waiting, there is no possibility that we would
2479       // want to reconfigure the polling area.  We might have the cleanup ticket
2480       // value (which says that it is now safe to deallocate old_polls), but
2481       // we'll let a later thread which calls __kmp_acquire_lock do that - this
2482       // routine isn't supposed to block, and we would risk blocks if we called
2483       // __kmp_free() to do the deallocation.
2484       return TRUE;
2485     }
2486   }
2487   return FALSE;
2488 }
2489 
2490 static int __kmp_test_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2491                                              kmp_int32 gtid) {
2492   char const *const func = "omp_test_lock";
2493   if (lck->lk.initialized != lck) {
2494     KMP_FATAL(LockIsUninitialized, func);
2495   }
2496   if (__kmp_is_drdpa_lock_nestable(lck)) {
2497     KMP_FATAL(LockNestableUsedAsSimple, func);
2498   }
2499 
2500   int retval = __kmp_test_drdpa_lock(lck, gtid);
2501 
2502   if (retval) {
2503     lck->lk.owner_id = gtid + 1;
2504   }
2505   return retval;
2506 }
2507 
2508 int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2509   // Read the ticket value from the lock data struct, then the polls pointer and
2510   // the mask.  The polls pointer must be read before the mask!!! (See above)
2511   kmp_uint64 ticket = lck->lk.now_serving + 1; // non-volatile load
2512   volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls =
2513       (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)TCR_PTR(
2514           lck->lk.polls); // volatile load
2515   kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
2516   KA_TRACE(1000, ("__kmp_release_drdpa_lock: ticket #%lld released lock %p\n",
2517                   ticket - 1, lck));
2518   KMP_FSYNC_RELEASING(lck);
2519   ANNOTATE_DRDPA_RELEASED(lck);
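  // Hand the lock to the next ticket holder: the thread waiting on 'ticket'
  // spins on polls[ticket & mask].poll, so this store releases it.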
2520   KMP_ST_REL64(&(polls[ticket & mask].poll), ticket); // volatile store
2521   return KMP_LOCK_RELEASED;
2522 }
2523 
2524 static int __kmp_release_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2525                                                 kmp_int32 gtid) {
2526   char const *const func = "omp_unset_lock";
2527   KMP_MB(); /* in case another processor initialized lock */
2528   if (lck->lk.initialized != lck) {
2529     KMP_FATAL(LockIsUninitialized, func);
2530   }
2531   if (__kmp_is_drdpa_lock_nestable(lck)) {
2532     KMP_FATAL(LockNestableUsedAsSimple, func);
2533   }
2534   if (__kmp_get_drdpa_lock_owner(lck) == -1) {
2535     KMP_FATAL(LockUnsettingFree, func);
2536   }
2537   if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) >= 0) &&
2538       (__kmp_get_drdpa_lock_owner(lck) != gtid)) {
2539     KMP_FATAL(LockUnsettingSetByAnother, func);
2540   }
2541   lck->lk.owner_id = 0;
2542   return __kmp_release_drdpa_lock(lck, gtid);
2543 }
2544 
2545 void __kmp_init_drdpa_lock(kmp_drdpa_lock_t *lck) {
2546   lck->lk.location = NULL;
2547   lck->lk.mask = 0;
2548   lck->lk.num_polls = 1;
2549   lck->lk.polls =
2550       (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)__kmp_allocate(
2551           lck->lk.num_polls * sizeof(*(lck->lk.polls)));
2552   lck->lk.cleanup_ticket = 0;
2553   lck->lk.old_polls = NULL;
2554   lck->lk.next_ticket = 0;
2555   lck->lk.now_serving = 0;
2556   lck->lk.owner_id = 0; // no thread owns the lock.
2557   lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
2558   lck->lk.initialized = lck;
2559 
2560   KA_TRACE(1000, ("__kmp_init_drdpa_lock: lock %p initialized\n", lck));
2561 }
2562 
2563 static void __kmp_init_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2564   __kmp_init_drdpa_lock(lck);
2565 }
2566 
2567 void __kmp_destroy_drdpa_lock(kmp_drdpa_lock_t *lck) {
2568   lck->lk.initialized = NULL;
2569   lck->lk.location = NULL;
2570   if (lck->lk.polls != NULL) {
2571     __kmp_free((void *)lck->lk.polls);
2572     lck->lk.polls = NULL;
2573   }
2574   if (lck->lk.old_polls != NULL) {
2575     __kmp_free((void *)lck->lk.old_polls);
2576     lck->lk.old_polls = NULL;
2577   }
2578   lck->lk.mask = 0;
2579   lck->lk.num_polls = 0;
2580   lck->lk.cleanup_ticket = 0;
2581   lck->lk.next_ticket = 0;
2582   lck->lk.now_serving = 0;
2583   lck->lk.owner_id = 0;
2584   lck->lk.depth_locked = -1;
2585 }
2586 
2587 static void __kmp_destroy_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2588   char const *const func = "omp_destroy_lock";
2589   if (lck->lk.initialized != lck) {
2590     KMP_FATAL(LockIsUninitialized, func);
2591   }
2592   if (__kmp_is_drdpa_lock_nestable(lck)) {
2593     KMP_FATAL(LockNestableUsedAsSimple, func);
2594   }
2595   if (__kmp_get_drdpa_lock_owner(lck) != -1) {
2596     KMP_FATAL(LockStillOwned, func);
2597   }
2598   __kmp_destroy_drdpa_lock(lck);
2599 }
2600 
2601 // nested drdpa ticket locks
2602 
2603 int __kmp_acquire_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2604   KMP_DEBUG_ASSERT(gtid >= 0);
2605 
2606   if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
2607     lck->lk.depth_locked += 1;
2608     return KMP_LOCK_ACQUIRED_NEXT;
2609   } else {
2610     __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
2611     ANNOTATE_DRDPA_ACQUIRED(lck);
2612     KMP_MB();
2613     lck->lk.depth_locked = 1;
2614     KMP_MB();
2615     lck->lk.owner_id = gtid + 1;
2616     return KMP_LOCK_ACQUIRED_FIRST;
2617   }
2618 }
2619 
2620 static void __kmp_acquire_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2621                                                         kmp_int32 gtid) {
2622   char const *const func = "omp_set_nest_lock";
2623   if (lck->lk.initialized != lck) {
2624     KMP_FATAL(LockIsUninitialized, func);
2625   }
2626   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2627     KMP_FATAL(LockSimpleUsedAsNestable, func);
2628   }
2629   __kmp_acquire_nested_drdpa_lock(lck, gtid);
2630 }
2631 
2632 int __kmp_test_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2633   int retval;
2634 
2635   KMP_DEBUG_ASSERT(gtid >= 0);
2636 
2637   if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
2638     retval = ++lck->lk.depth_locked;
2639   } else if (!__kmp_test_drdpa_lock(lck, gtid)) {
2640     retval = 0;
2641   } else {
2642     KMP_MB();
2643     retval = lck->lk.depth_locked = 1;
2644     KMP_MB();
2645     lck->lk.owner_id = gtid + 1;
2646   }
2647   return retval;
2648 }
2649 
2650 static int __kmp_test_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2651                                                     kmp_int32 gtid) {
2652   char const *const func = "omp_test_nest_lock";
2653   if (lck->lk.initialized != lck) {
2654     KMP_FATAL(LockIsUninitialized, func);
2655   }
2656   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2657     KMP_FATAL(LockSimpleUsedAsNestable, func);
2658   }
2659   return __kmp_test_nested_drdpa_lock(lck, gtid);
2660 }
2661 
2662 int __kmp_release_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2663   KMP_DEBUG_ASSERT(gtid >= 0);
2664 
2665   KMP_MB();
2666   if (--(lck->lk.depth_locked) == 0) {
2667     KMP_MB();
2668     lck->lk.owner_id = 0;
2669     __kmp_release_drdpa_lock(lck, gtid);
2670     return KMP_LOCK_RELEASED;
2671   }
2672   return KMP_LOCK_STILL_HELD;
2673 }
2674 
2675 static int __kmp_release_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2676                                                        kmp_int32 gtid) {
2677   char const *const func = "omp_unset_nest_lock";
2678   KMP_MB(); /* in case another processor initialized lock */
2679   if (lck->lk.initialized != lck) {
2680     KMP_FATAL(LockIsUninitialized, func);
2681   }
2682   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2683     KMP_FATAL(LockSimpleUsedAsNestable, func);
2684   }
2685   if (__kmp_get_drdpa_lock_owner(lck) == -1) {
2686     KMP_FATAL(LockUnsettingFree, func);
2687   }
2688   if (__kmp_get_drdpa_lock_owner(lck) != gtid) {
2689     KMP_FATAL(LockUnsettingSetByAnother, func);
2690   }
2691   return __kmp_release_nested_drdpa_lock(lck, gtid);
2692 }
2693 
2694 void __kmp_init_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
2695   __kmp_init_drdpa_lock(lck);
2696   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
2697 }
2698 
2699 static void __kmp_init_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2700   __kmp_init_nested_drdpa_lock(lck);
2701 }
2702 
2703 void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
2704   __kmp_destroy_drdpa_lock(lck);
2705   lck->lk.depth_locked = 0;
2706 }
2707 
2708 static void __kmp_destroy_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2709   char const *const func = "omp_destroy_nest_lock";
2710   if (lck->lk.initialized != lck) {
2711     KMP_FATAL(LockIsUninitialized, func);
2712   }
2713   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2714     KMP_FATAL(LockSimpleUsedAsNestable, func);
2715   }
2716   if (__kmp_get_drdpa_lock_owner(lck) != -1) {
2717     KMP_FATAL(LockStillOwned, func);
2718   }
2719   __kmp_destroy_nested_drdpa_lock(lck);
2720 }
2721 
2722 // access functions to fields which don't exist for all lock kinds.
2723 
2724 static int __kmp_is_drdpa_lock_initialized(kmp_drdpa_lock_t *lck) {
2725   return lck == lck->lk.initialized;
2726 }
2727 
2728 static const ident_t *__kmp_get_drdpa_lock_location(kmp_drdpa_lock_t *lck) {
2729   return lck->lk.location;
2730 }
2731 
2732 static void __kmp_set_drdpa_lock_location(kmp_drdpa_lock_t *lck,
2733                                           const ident_t *loc) {
2734   lck->lk.location = loc;
2735 }
2736 
2737 static kmp_lock_flags_t __kmp_get_drdpa_lock_flags(kmp_drdpa_lock_t *lck) {
2738   return lck->lk.flags;
2739 }
2740 
2741 static void __kmp_set_drdpa_lock_flags(kmp_drdpa_lock_t *lck,
2742                                        kmp_lock_flags_t flags) {
2743   lck->lk.flags = flags;
2744 }
2745 
2746 // Time stamp counter
2747 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
2748 #define __kmp_tsc() __kmp_hardware_timestamp()
2749 // Runtime's default backoff parameters
2750 kmp_backoff_t __kmp_spin_backoff_params = {1, 4096, 100};
2751 #else
2752 // Use nanoseconds for other platforms
2753 extern kmp_uint64 __kmp_now_nsec();
2754 kmp_backoff_t __kmp_spin_backoff_params = {1, 256, 100};
2755 #define __kmp_tsc() __kmp_now_nsec()
2756 #endif
2757 
2758 // A useful predicate for dealing with timestamps that may wrap.
2759 // Is a before b? Since the timestamps may wrap, this is asking whether it's
2760 // shorter to go clockwise from a to b around the clock-face, or anti-clockwise.
2761 // Times where going clockwise is less distance than going anti-clockwise
2762 // are in the future, others are in the past. E.g. with a = MAX-1 and
2763 // b = MAX+1 (= 0), a > b is true yet does not mean a has reached b; whereas
2764 // signed(a) = -2 and signed(b) = 0 capture the actual difference.
2765 static inline bool before(kmp_uint64 a, kmp_uint64 b) {
2766   return ((kmp_int64)b - (kmp_int64)a) > 0;
2767 }
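// Example: with a == (kmp_uint64)-2 and b == 1, before(a, b) is true because
// (kmp_int64)b - (kmp_int64)a == 3 > 0, even though a > b as unsigned values;
// the wrap-around is handled as intended.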
2768 
2769 // Truncated binary exponential backoff function
2770 void __kmp_spin_backoff(kmp_backoff_t *boff) {
2771   // We could flatten this loop, but making it a nested loop gives better results.
2772   kmp_uint32 i;
2773   for (i = boff->step; i > 0; i--) {
2774     kmp_uint64 goal = __kmp_tsc() + boff->min_tick;
2775     do {
2776       KMP_CPU_PAUSE();
2777     } while (before(__kmp_tsc(), goal));
2778   }
2779   boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1);
2780 }
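// Illustrative behaviour of the backoff above: starting from 1, boff->step
// evolves as 1, 3, 7, 15, ... and is truncated by max_backoff - 1 (an
// all-ones cap when max_backoff is a power of two); each outer iteration
// spins for roughly min_tick ticks, so one call waits about step * min_tick
// ticks.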
2781 
2782 #if KMP_USE_DYNAMIC_LOCK
2783 
2784 // Direct lock initializer. It simply writes a tag to the low 8 bits of the
2785 // lock word.
2786 static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck,
2787                                    kmp_dyna_lockseq_t seq) {
2788   TCW_4(*lck, KMP_GET_D_TAG(seq));
2789   KA_TRACE(
2790       20,
2791       ("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq));
2792 }
2793 
2794 #if KMP_USE_TSX
2795 
2796 // HLE lock functions - imported from the testbed runtime.
2797 #define HLE_ACQUIRE ".byte 0xf2;"
2798 #define HLE_RELEASE ".byte 0xf3;"
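// 0xf2 / 0xf3 are the XACQUIRE / XRELEASE legacy prefixes: on TSX-capable
// hardware they elide the lock, while older CPUs ignore them, so the code
// below degrades gracefully to a plain xchg/store spin lock.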
2799 
2800 static inline kmp_uint32 swap4(kmp_uint32 volatile *p, kmp_uint32 v) {
2801   __asm__ volatile(HLE_ACQUIRE "xchg %1,%0" : "+r"(v), "+m"(*p) : : "memory");
2802   return v;
2803 }
2804 
2805 static void __kmp_destroy_hle_lock(kmp_dyna_lock_t *lck) { TCW_4(*lck, 0); }
2806 
2807 static void __kmp_acquire_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2808   // Use gtid for KMP_LOCK_BUSY if necessary
2809   if (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle)) {
2810     int delay = 1;
2811     do {
2812       while (*(kmp_uint32 volatile *)lck != KMP_LOCK_FREE(hle)) {
2813         for (int i = delay; i != 0; --i)
2814           KMP_CPU_PAUSE();
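        // Truncated exponential backoff on the pause count: 1, 3, 7, 7, ...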
2815         delay = ((delay << 1) | 1) & 7;
2816       }
2817     } while (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle));
2818   }
2819 }
2820 
2821 static void __kmp_acquire_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2822                                                kmp_int32 gtid) {
2823   __kmp_acquire_hle_lock(lck, gtid); // TODO: add checks
2824 }
2825 
2826 static int __kmp_release_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2827   __asm__ volatile(HLE_RELEASE "movl %1,%0"
2828                    : "=m"(*lck)
2829                    : "r"(KMP_LOCK_FREE(hle))
2830                    : "memory");
2831   return KMP_LOCK_RELEASED;
2832 }
2833 
2834 static int __kmp_release_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2835                                               kmp_int32 gtid) {
2836   return __kmp_release_hle_lock(lck, gtid); // TODO: add checks
2837 }
2838 
2839 static int __kmp_test_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2840   return swap4(lck, KMP_LOCK_BUSY(1, hle)) == KMP_LOCK_FREE(hle);
2841 }
2842 
2843 static int __kmp_test_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2844                                            kmp_int32 gtid) {
2845   return __kmp_test_hle_lock(lck, gtid); // TODO: add checks
2846 }
2847 
2848 static void __kmp_init_rtm_lock(kmp_queuing_lock_t *lck) {
2849   __kmp_init_queuing_lock(lck);
2850 }
2851 
2852 static void __kmp_destroy_rtm_lock(kmp_queuing_lock_t *lck) {
2853   __kmp_destroy_queuing_lock(lck);
2854 }
2855 
2856 static void __kmp_acquire_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
2857   unsigned retries = 3, status;
2858   do {
2859     status = _xbegin();
2860     if (status == _XBEGIN_STARTED) {
2861       if (__kmp_is_unlocked_queuing_lock(lck))
2862         return;
2863       _xabort(0xff);
2864     }
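    // An explicit abort with code 0xff (raised above) means the lock was
    // visibly held, so wait for it to become free instead of retrying blindly.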
2865     if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) {
2866       // Wait until lock becomes free
2867       while (!__kmp_is_unlocked_queuing_lock(lck))
2868         __kmp_yield(TRUE);
2869     } else if (!(status & _XABORT_RETRY))
2870       break;
2871   } while (retries--);
2872 
2873   // Fall-back non-speculative lock (xchg)
2874   __kmp_acquire_queuing_lock(lck, gtid);
2875 }
2876 
2877 static void __kmp_acquire_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
2878                                                kmp_int32 gtid) {
2879   __kmp_acquire_rtm_lock(lck, gtid);
2880 }
2881 
2882 static int __kmp_release_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
2883   if (__kmp_is_unlocked_queuing_lock(lck)) {
2884     // Releasing from speculation
2885     _xend();
2886   } else {
2887     // Releasing from a real lock
2888     __kmp_release_queuing_lock(lck, gtid);
2889   }
2890   return KMP_LOCK_RELEASED;
2891 }
2892 
2893 static int __kmp_release_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
2894                                               kmp_int32 gtid) {
2895   return __kmp_release_rtm_lock(lck, gtid);
2896 }
2897 
2898 static int __kmp_test_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
2899   unsigned retries = 3, status;
2900   do {
2901     status = _xbegin();
2902     if (status == _XBEGIN_STARTED && __kmp_is_unlocked_queuing_lock(lck)) {
2903       return 1;
2904     }
2905     if (!(status & _XABORT_RETRY))
2906       break;
2907   } while (retries--);
2908 
2909   return (__kmp_is_unlocked_queuing_lock(lck)) ? 1 : 0;
2910 }
2911 
2912 static int __kmp_test_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
2913                                            kmp_int32 gtid) {
2914   return __kmp_test_rtm_lock(lck, gtid);
2915 }
2916 
2917 #endif // KMP_USE_TSX
2918 
2919 // Entry functions for indirect locks (first element of direct lock jump tables)
2920 static void __kmp_init_indirect_lock(kmp_dyna_lock_t *l,
2921                                      kmp_dyna_lockseq_t tag);
2922 static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock);
2923 static void __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2924 static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2925 static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2926 static void __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2927                                                 kmp_int32);
2928 static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2929                                                  kmp_int32);
2930 static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2931                                                 kmp_int32);
2932 
2933 // Jump tables for the indirect lock functions
2934 // Only fill in the odd entries; that avoids the need to shift out the low bit.
2935 
2936 // init functions
2937 #define expand(l, op) 0, __kmp_init_direct_lock,
2938 void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t) = {
2939     __kmp_init_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, init)};
2940 #undef expand
2941 
2942 // destroy functions
2943 #define expand(l, op) 0, (void (*)(kmp_dyna_lock_t *))__kmp_##op##_##l##_lock,
2944 void (*__kmp_direct_destroy[])(kmp_dyna_lock_t *) = {
2945     __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)};
2946 #undef expand
2947 
2948 // set/acquire functions
2949 #define expand(l, op)                                                          \
2950   0, (void (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
2951 static void (*direct_set[])(kmp_dyna_lock_t *, kmp_int32) = {
2952     __kmp_set_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, acquire)};
2953 #undef expand
2954 #define expand(l, op)                                                          \
2955   0, (void (*)(kmp_dyna_lock_t *,                                              \
2956                kmp_int32))__kmp_##op##_##l##_lock_with_checks,
2957 static void (*direct_set_check[])(kmp_dyna_lock_t *, kmp_int32) = {
2958     __kmp_set_indirect_lock_with_checks, 0,
2959     KMP_FOREACH_D_LOCK(expand, acquire)};
2960 #undef expand
2961 
2962 // unset/release and test functions
2963 #define expand(l, op)                                                          \
2964   0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
2965 static int (*direct_unset[])(kmp_dyna_lock_t *, kmp_int32) = {
2966     __kmp_unset_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, release)};
2967 static int (*direct_test[])(kmp_dyna_lock_t *, kmp_int32) = {
2968     __kmp_test_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, test)};
2969 #undef expand
2970 #define expand(l, op)                                                          \
2971   0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks,
2972 static int (*direct_unset_check[])(kmp_dyna_lock_t *, kmp_int32) = {
2973     __kmp_unset_indirect_lock_with_checks, 0,
2974     KMP_FOREACH_D_LOCK(expand, release)};
2975 static int (*direct_test_check[])(kmp_dyna_lock_t *, kmp_int32) = {
2976     __kmp_test_indirect_lock_with_checks, 0, KMP_FOREACH_D_LOCK(expand, test)};
2977 #undef expand
2978 
2979 // Exposes only one set of jump tables (*lock or *lock_with_checks).
2980 void (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32) = 0;
2981 int (*(*__kmp_direct_unset))(kmp_dyna_lock_t *, kmp_int32) = 0;
2982 int (*(*__kmp_direct_test))(kmp_dyna_lock_t *, kmp_int32) = 0;
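// Hedged usage sketch: once __kmp_init_dynamic_user_locks() has pointed these
// at either the checked or the unchecked tables above, a dynamic lock
// operation dispatches through them by tag, roughly
//
//   __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);   // acquire
//   __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); // release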
2983 
2984 // Jump tables for the indirect lock functions
2985 #define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock,
2986 void (*__kmp_indirect_init[])(kmp_user_lock_p) = {
2987     KMP_FOREACH_I_LOCK(expand, init)};
2988 void (*__kmp_indirect_destroy[])(kmp_user_lock_p) = {
2989     KMP_FOREACH_I_LOCK(expand, destroy)};
2990 #undef expand
2991 
2992 // set/acquire functions
2993 #define expand(l, op)                                                          \
2994   (void (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
2995 static void (*indirect_set[])(kmp_user_lock_p, kmp_int32) = {
2996     KMP_FOREACH_I_LOCK(expand, acquire)};
2997 #undef expand
2998 #define expand(l, op)                                                          \
2999   (void (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
3000 static void (*indirect_set_check[])(kmp_user_lock_p, kmp_int32) = {
3001     KMP_FOREACH_I_LOCK(expand, acquire)};
3002 #undef expand
3003 
3004 // unset/release and test functions
3005 #define expand(l, op)                                                          \
3006   (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
3007 static int (*indirect_unset[])(kmp_user_lock_p, kmp_int32) = {
3008     KMP_FOREACH_I_LOCK(expand, release)};
3009 static int (*indirect_test[])(kmp_user_lock_p,
3010                               kmp_int32) = {KMP_FOREACH_I_LOCK(expand, test)};
3011 #undef expand
3012 #define expand(l, op)                                                          \
3013   (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
3014 static int (*indirect_unset_check[])(kmp_user_lock_p, kmp_int32) = {
3015     KMP_FOREACH_I_LOCK(expand, release)};
3016 static int (*indirect_test_check[])(kmp_user_lock_p, kmp_int32) = {
3017     KMP_FOREACH_I_LOCK(expand, test)};
3018 #undef expand
3019 
// Exposes only one set of jump tables (*lock or *lock_with_checks).
3021 void (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32) = 0;
3022 int (*(*__kmp_indirect_unset))(kmp_user_lock_p, kmp_int32) = 0;
3023 int (*(*__kmp_indirect_test))(kmp_user_lock_p, kmp_int32) = 0;
3024 
3025 // Lock index table.
3026 kmp_indirect_lock_table_t __kmp_i_lock_table;
3027 
3028 // Size of indirect locks.
3029 static kmp_uint32 __kmp_indirect_lock_size[KMP_NUM_I_LOCKS] = {0};
3030 
// Jump tables for lock accessor/modifier functions.
3032 void (*__kmp_indirect_set_location[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
3033                                                      const ident_t *) = {0};
3034 void (*__kmp_indirect_set_flags[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
3035                                                   kmp_lock_flags_t) = {0};
3036 const ident_t *(*__kmp_indirect_get_location[KMP_NUM_I_LOCKS])(
3037     kmp_user_lock_p) = {0};
3038 kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])(
3039     kmp_user_lock_p) = {0};
3040 
3041 // Use different lock pools for different lock types.
3042 static kmp_indirect_lock_t *__kmp_indirect_lock_pool[KMP_NUM_I_LOCKS] = {0};
3043 
3044 // User lock allocator for dynamically dispatched indirect locks. Every entry of
// the indirect lock table holds the address and type of the allocated indirect
3046 // lock (kmp_indirect_lock_t), and the size of the table doubles when it is
3047 // full. A destroyed indirect lock object is returned to the reusable pool of
3048 // locks, unique to each lock type.
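// For example, starting from the single block set up in
// __kmp_init_dynamic_user_locks() (size == KMP_I_LOCK_CHUNK, one block
// pointer), the first time next reaches size the row array grows from 1 to 2
// block pointers, a second block of KMP_I_LOCK_CHUNK lock slots is allocated,
// and size becomes 2 * KMP_I_LOCK_CHUNK; every later overflow doubles it again.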
3049 kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock,
3050                                                   kmp_int32 gtid,
3051                                                   kmp_indirect_locktag_t tag) {
3052   kmp_indirect_lock_t *lck;
3053   kmp_lock_index_t idx;
3054 
3055   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3056 
3057   if (__kmp_indirect_lock_pool[tag] != NULL) {
3058     // Reuse the allocated and destroyed lock object
3059     lck = __kmp_indirect_lock_pool[tag];
3060     if (OMP_LOCK_T_SIZE < sizeof(void *))
3061       idx = lck->lock->pool.index;
3062     __kmp_indirect_lock_pool[tag] = (kmp_indirect_lock_t *)lck->lock->pool.next;
3063     KA_TRACE(20, ("__kmp_allocate_indirect_lock: reusing an existing lock %p\n",
3064                   lck));
3065   } else {
3066     idx = __kmp_i_lock_table.next;
3067     // Check capacity and double the size if it is full
3068     if (idx == __kmp_i_lock_table.size) {
3069       // Double up the space for block pointers
3070       int row = __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK;
3071       kmp_indirect_lock_t **old_table = __kmp_i_lock_table.table;
3072       __kmp_i_lock_table.table = (kmp_indirect_lock_t **)__kmp_allocate(
3073           2 * row * sizeof(kmp_indirect_lock_t *));
3074       KMP_MEMCPY(__kmp_i_lock_table.table, old_table,
3075                  row * sizeof(kmp_indirect_lock_t *));
3076       __kmp_free(old_table);
3077       // Allocate new objects in the new blocks
3078       for (int i = row; i < 2 * row; ++i)
3079         *(__kmp_i_lock_table.table + i) = (kmp_indirect_lock_t *)__kmp_allocate(
3080             KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
3081       __kmp_i_lock_table.size = 2 * idx;
3082     }
3083     __kmp_i_lock_table.next++;
3084     lck = KMP_GET_I_LOCK(idx);
3085     // Allocate a new base lock object
3086     lck->lock = (kmp_user_lock_p)__kmp_allocate(__kmp_indirect_lock_size[tag]);
3087     KA_TRACE(20,
3088              ("__kmp_allocate_indirect_lock: allocated a new lock %p\n", lck));
3089   }
3090 
3091   __kmp_release_lock(&__kmp_global_lock, gtid);
3092 
3093   lck->type = tag;
3094 
3095   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3096     *((kmp_lock_index_t *)user_lock) = idx
3097                                        << 1; // indirect lock word must be even
3098   } else {
3099     *((kmp_indirect_lock_t **)user_lock) = lck;
3100   }
3101 
3102   return lck;
3103 }
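// Hedged sketch of the consumer side: for a small (32-bit) lock word the even
// value idx << 1 stored above is presumably decoded by shifting the low bit
// back out and indexing the two-level table, roughly
//
//   __kmp_i_lock_table.table[idx / KMP_I_LOCK_CHUNK] + idx % KMP_I_LOCK_CHUNK
//
// which is what the KMP_EXTRACT_I_INDEX / KMP_GET_I_LOCK helpers used below are
// expected to do; for pointer-sized lock words the kmp_indirect_lock_t* is
// stored and read back directly.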
3104 
3105 // User lock lookup for dynamically dispatched locks.
3106 static __forceinline kmp_indirect_lock_t *
3107 __kmp_lookup_indirect_lock(void **user_lock, const char *func) {
3108   if (__kmp_env_consistency_check) {
3109     kmp_indirect_lock_t *lck = NULL;
3110     if (user_lock == NULL) {
3111       KMP_FATAL(LockIsUninitialized, func);
3112     }
3113     if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3114       kmp_lock_index_t idx = KMP_EXTRACT_I_INDEX(user_lock);
3115       if (idx >= __kmp_i_lock_table.size) {
3116         KMP_FATAL(LockIsUninitialized, func);
3117       }
3118       lck = KMP_GET_I_LOCK(idx);
3119     } else {
3120       lck = *((kmp_indirect_lock_t **)user_lock);
3121     }
3122     if (lck == NULL) {
3123       KMP_FATAL(LockIsUninitialized, func);
3124     }
3125     return lck;
3126   } else {
3127     if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3128       return KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(user_lock));
3129     } else {
3130       return *((kmp_indirect_lock_t **)user_lock);
3131     }
3132   }
3133 }
3134 
3135 static void __kmp_init_indirect_lock(kmp_dyna_lock_t *lock,
3136                                      kmp_dyna_lockseq_t seq) {
3137 #if KMP_USE_ADAPTIVE_LOCKS
3138   if (seq == lockseq_adaptive && !__kmp_cpuinfo.rtm) {
3139     KMP_WARNING(AdaptiveNotSupported, "kmp_lockseq_t", "adaptive");
3140     seq = lockseq_queuing;
3141   }
3142 #endif
3143 #if KMP_USE_TSX
3144   if (seq == lockseq_rtm && !__kmp_cpuinfo.rtm) {
3145     seq = lockseq_queuing;
3146   }
3147 #endif
3148   kmp_indirect_locktag_t tag = KMP_GET_I_TAG(seq);
3149   kmp_indirect_lock_t *l =
3150       __kmp_allocate_indirect_lock((void **)lock, __kmp_entry_gtid(), tag);
3151   KMP_I_LOCK_FUNC(l, init)(l->lock);
3152   KA_TRACE(
3153       20, ("__kmp_init_indirect_lock: initialized indirect lock with type#%d\n",
3154            seq));
3155 }
3156 
3157 static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock) {
3158   kmp_uint32 gtid = __kmp_entry_gtid();
3159   kmp_indirect_lock_t *l =
3160       __kmp_lookup_indirect_lock((void **)lock, "omp_destroy_lock");
3161   KMP_I_LOCK_FUNC(l, destroy)(l->lock);
3162   kmp_indirect_locktag_t tag = l->type;
3163 
3164   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3165 
3166   // Use the base lock's space to keep the pool chain.
3167   l->lock->pool.next = (kmp_user_lock_p)__kmp_indirect_lock_pool[tag];
3168   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3169     l->lock->pool.index = KMP_EXTRACT_I_INDEX(lock);
3170   }
3171   __kmp_indirect_lock_pool[tag] = l;
3172 
3173   __kmp_release_lock(&__kmp_global_lock, gtid);
3174 }
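// Note that destroying an indirect lock does not free its storage: the base
// lock object is chained into the per-type pool through its pool.next (and,
// for small lock words, pool.index) fields, and __kmp_allocate_indirect_lock()
// pops it from that pool the next time a lock of the same type is requested.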
3175 
3176 static void __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
3177   kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
3178   KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
3179 }
3180 
3181 static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
3182   kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
3183   return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
3184 }
3185 
3186 static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
3187   kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
3188   return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
3189 }
3190 
3191 static void __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
3192                                                 kmp_int32 gtid) {
3193   kmp_indirect_lock_t *l =
3194       __kmp_lookup_indirect_lock((void **)lock, "omp_set_lock");
3195   KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
3196 }
3197 
3198 static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
3199                                                  kmp_int32 gtid) {
3200   kmp_indirect_lock_t *l =
3201       __kmp_lookup_indirect_lock((void **)lock, "omp_unset_lock");
3202   return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
3203 }
3204 
3205 static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
3206                                                 kmp_int32 gtid) {
3207   kmp_indirect_lock_t *l =
3208       __kmp_lookup_indirect_lock((void **)lock, "omp_test_lock");
3209   return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
3210 }
3211 
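// Default dispatch sequence for user locks when no specific kind has been
// requested; presumably overridden from the runtime's lock-kind settings.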
3212 kmp_dyna_lockseq_t __kmp_user_lock_seq = lockseq_queuing;
3213 
3214 // This is used only in kmp_error.cpp when consistency checking is on.
3215 kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p lck, kmp_uint32 seq) {
3216   switch (seq) {
3217   case lockseq_tas:
3218   case lockseq_nested_tas:
3219     return __kmp_get_tas_lock_owner((kmp_tas_lock_t *)lck);
3220 #if KMP_USE_FUTEX
3221   case lockseq_futex:
3222   case lockseq_nested_futex:
3223     return __kmp_get_futex_lock_owner((kmp_futex_lock_t *)lck);
3224 #endif
3225   case lockseq_ticket:
3226   case lockseq_nested_ticket:
3227     return __kmp_get_ticket_lock_owner((kmp_ticket_lock_t *)lck);
3228   case lockseq_queuing:
3229   case lockseq_nested_queuing:
3230 #if KMP_USE_ADAPTIVE_LOCKS
3231   case lockseq_adaptive:
3232 #endif
3233     return __kmp_get_queuing_lock_owner((kmp_queuing_lock_t *)lck);
3234   case lockseq_drdpa:
3235   case lockseq_nested_drdpa:
3236     return __kmp_get_drdpa_lock_owner((kmp_drdpa_lock_t *)lck);
3237   default:
3238     return 0;
3239   }
3240 }
3241 
3242 // Initializes data for dynamic user locks.
3243 void __kmp_init_dynamic_user_locks() {
3244   // Initialize jump table for the lock functions
3245   if (__kmp_env_consistency_check) {
3246     __kmp_direct_set = direct_set_check;
3247     __kmp_direct_unset = direct_unset_check;
3248     __kmp_direct_test = direct_test_check;
3249     __kmp_indirect_set = indirect_set_check;
3250     __kmp_indirect_unset = indirect_unset_check;
3251     __kmp_indirect_test = indirect_test_check;
3252   } else {
3253     __kmp_direct_set = direct_set;
3254     __kmp_direct_unset = direct_unset;
3255     __kmp_direct_test = direct_test;
3256     __kmp_indirect_set = indirect_set;
3257     __kmp_indirect_unset = indirect_unset;
3258     __kmp_indirect_test = indirect_test;
3259   }
3260   // If the user locks have already been initialized, then return. Allow the
3261   // switch between different KMP_CONSISTENCY_CHECK values, but do not allocate
3262   // new lock tables if they have already been allocated.
3263   if (__kmp_init_user_locks)
3264     return;
3265 
3266   // Initialize lock index table
3267   __kmp_i_lock_table.size = KMP_I_LOCK_CHUNK;
3268   __kmp_i_lock_table.table =
3269       (kmp_indirect_lock_t **)__kmp_allocate(sizeof(kmp_indirect_lock_t *));
3270   *(__kmp_i_lock_table.table) = (kmp_indirect_lock_t *)__kmp_allocate(
3271       KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
3272   __kmp_i_lock_table.next = 0;
3273 
3274   // Indirect lock size
3275   __kmp_indirect_lock_size[locktag_ticket] = sizeof(kmp_ticket_lock_t);
3276   __kmp_indirect_lock_size[locktag_queuing] = sizeof(kmp_queuing_lock_t);
3277 #if KMP_USE_ADAPTIVE_LOCKS
3278   __kmp_indirect_lock_size[locktag_adaptive] = sizeof(kmp_adaptive_lock_t);
3279 #endif
3280   __kmp_indirect_lock_size[locktag_drdpa] = sizeof(kmp_drdpa_lock_t);
3281 #if KMP_USE_TSX
3282   __kmp_indirect_lock_size[locktag_rtm] = sizeof(kmp_queuing_lock_t);
3283 #endif
3284   __kmp_indirect_lock_size[locktag_nested_tas] = sizeof(kmp_tas_lock_t);
3285 #if KMP_USE_FUTEX
3286   __kmp_indirect_lock_size[locktag_nested_futex] = sizeof(kmp_futex_lock_t);
3287 #endif
3288   __kmp_indirect_lock_size[locktag_nested_ticket] = sizeof(kmp_ticket_lock_t);
3289   __kmp_indirect_lock_size[locktag_nested_queuing] = sizeof(kmp_queuing_lock_t);
3290   __kmp_indirect_lock_size[locktag_nested_drdpa] = sizeof(kmp_drdpa_lock_t);
3291 
// Initialize the lock accessor/modifier jump tables
3293 #define fill_jumps(table, expand, sep)                                         \
3294   {                                                                            \
3295     table[locktag##sep##ticket] = expand(ticket);                              \
3296     table[locktag##sep##queuing] = expand(queuing);                            \
3297     table[locktag##sep##drdpa] = expand(drdpa);                                \
3298   }
3299 
3300 #if KMP_USE_ADAPTIVE_LOCKS
3301 #define fill_table(table, expand)                                              \
3302   {                                                                            \
3303     fill_jumps(table, expand, _);                                              \
3304     table[locktag_adaptive] = expand(queuing);                                 \
3305     fill_jumps(table, expand, _nested_);                                       \
3306   }
3307 #else
3308 #define fill_table(table, expand)                                              \
3309   {                                                                            \
3310     fill_jumps(table, expand, _);                                              \
3311     fill_jumps(table, expand, _nested_);                                       \
3312   }
3313 #endif // KMP_USE_ADAPTIVE_LOCKS
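// For example, fill_table(__kmp_indirect_set_location, expand) below expands
// (with the expand defined right after it) into assignments of the form
//
//   __kmp_indirect_set_location[locktag_ticket] =
//       (void (*)(kmp_user_lock_p, const ident_t *))
//           __kmp_set_ticket_lock_location;
//
// repeated for queuing and drdpa, for the adaptive tag (which reuses the
// queuing accessor when KMP_USE_ADAPTIVE_LOCKS is on), and for the
// nested_ticket/nested_queuing/nested_drdpa tags.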
3314 
3315 #define expand(l)                                                              \
3316   (void (*)(kmp_user_lock_p, const ident_t *)) __kmp_set_##l##_lock_location
3317   fill_table(__kmp_indirect_set_location, expand);
3318 #undef expand
3319 #define expand(l)                                                              \
3320   (void (*)(kmp_user_lock_p, kmp_lock_flags_t)) __kmp_set_##l##_lock_flags
3321   fill_table(__kmp_indirect_set_flags, expand);
3322 #undef expand
3323 #define expand(l)                                                              \
3324   (const ident_t *(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_location
3325   fill_table(__kmp_indirect_get_location, expand);
3326 #undef expand
3327 #define expand(l)                                                              \
3328   (kmp_lock_flags_t(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_flags
3329   fill_table(__kmp_indirect_get_flags, expand);
3330 #undef expand
3331 
3332   __kmp_init_user_locks = TRUE;
3333 }
3334 
3335 // Clean up the lock table.
3336 void __kmp_cleanup_indirect_user_locks() {
3337   kmp_lock_index_t i;
3338   int k;
3339 
3340   // Clean up locks in the pools first (they were already destroyed before going
3341   // into the pools).
3342   for (k = 0; k < KMP_NUM_I_LOCKS; ++k) {
3343     kmp_indirect_lock_t *l = __kmp_indirect_lock_pool[k];
3344     while (l != NULL) {
3345       kmp_indirect_lock_t *ll = l;
3346       l = (kmp_indirect_lock_t *)l->lock->pool.next;
3347       KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: freeing %p from pool\n",
3348                     ll));
3349       __kmp_free(ll->lock);
3350       ll->lock = NULL;
3351     }
3352     __kmp_indirect_lock_pool[k] = NULL;
3353   }
3354   // Clean up the remaining undestroyed locks.
3355   for (i = 0; i < __kmp_i_lock_table.next; i++) {
3356     kmp_indirect_lock_t *l = KMP_GET_I_LOCK(i);
3357     if (l->lock != NULL) {
3358       // Locks not destroyed explicitly need to be destroyed here.
3359       KMP_I_LOCK_FUNC(l, destroy)(l->lock);
3360       KA_TRACE(
3361           20,
3362           ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p from table\n",
3363            l));
3364       __kmp_free(l->lock);
3365     }
3366   }
3367   // Free the table
3368   for (i = 0; i < __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK; i++)
3369     __kmp_free(__kmp_i_lock_table.table[i]);
3370   __kmp_free(__kmp_i_lock_table.table);
3371 
3372   __kmp_init_user_locks = FALSE;
3373 }
3374 
3375 enum kmp_lock_kind __kmp_user_lock_kind = lk_default;
3376 int __kmp_num_locks_in_block = 1; // FIXME - tune this value
3377 
3378 #else // KMP_USE_DYNAMIC_LOCK
3379 
3380 /* user locks
3381  * They are implemented as a table of function pointers which are set to the
3382  * lock functions of the appropriate kind, once that has been determined. */
3383 
3384 enum kmp_lock_kind __kmp_user_lock_kind = lk_default;
3385 
3386 size_t __kmp_base_user_lock_size = 0;
3387 size_t __kmp_user_lock_size = 0;
3388 
3389 kmp_int32 (*__kmp_get_user_lock_owner_)(kmp_user_lock_p lck) = NULL;
3390 int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
3391                                             kmp_int32 gtid) = NULL;
3392 
3393 int (*__kmp_test_user_lock_with_checks_)(kmp_user_lock_p lck,
3394                                          kmp_int32 gtid) = NULL;
3395 int (*__kmp_release_user_lock_with_checks_)(kmp_user_lock_p lck,
3396                                             kmp_int32 gtid) = NULL;
3397 void (*__kmp_init_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3398 void (*__kmp_destroy_user_lock_)(kmp_user_lock_p lck) = NULL;
3399 void (*__kmp_destroy_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3400 int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
3401                                                    kmp_int32 gtid) = NULL;
3402 
3403 int (*__kmp_test_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
3404                                                 kmp_int32 gtid) = NULL;
3405 int (*__kmp_release_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
3406                                                    kmp_int32 gtid) = NULL;
3407 void (*__kmp_init_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3408 void (*__kmp_destroy_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
3409 
3410 int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck) = NULL;
3411 const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck) = NULL;
3412 void (*__kmp_set_user_lock_location_)(kmp_user_lock_p lck,
3413                                       const ident_t *loc) = NULL;
3414 kmp_lock_flags_t (*__kmp_get_user_lock_flags_)(kmp_user_lock_p lck) = NULL;
3415 void (*__kmp_set_user_lock_flags_)(kmp_user_lock_p lck,
3416                                    kmp_lock_flags_t flags) = NULL;
3417 
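// Bind the __kmp_*_user_lock_* function pointers above to the concrete lock
// implementation selected at runtime. The KMP_BIND_USER_LOCK* macros
// (presumably provided by kmp_lock.h) fill in the acquire/release/test/init
// pointers, while the accessor/modifier pointers are left NULL for lock kinds
// that do not track location or flags (tas and futex).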
3418 void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind) {
3419   switch (user_lock_kind) {
3420   case lk_default:
3421   default:
3422     KMP_ASSERT(0);
3423 
3424   case lk_tas: {
3425     __kmp_base_user_lock_size = sizeof(kmp_base_tas_lock_t);
3426     __kmp_user_lock_size = sizeof(kmp_tas_lock_t);
3427 
3428     __kmp_get_user_lock_owner_ =
3429         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_tas_lock_owner);
3430 
3431     if (__kmp_env_consistency_check) {
3432       KMP_BIND_USER_LOCK_WITH_CHECKS(tas);
3433       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(tas);
3434     } else {
3435       KMP_BIND_USER_LOCK(tas);
3436       KMP_BIND_NESTED_USER_LOCK(tas);
3437     }
3438 
3439     __kmp_destroy_user_lock_ =
3440         (void (*)(kmp_user_lock_p))(&__kmp_destroy_tas_lock);
3441 
3442     __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL;
3443 
3444     __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL;
3445 
3446     __kmp_set_user_lock_location_ =
3447         (void (*)(kmp_user_lock_p, const ident_t *))NULL;
3448 
3449     __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL;
3450 
3451     __kmp_set_user_lock_flags_ =
3452         (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL;
3453   } break;
3454 
3455 #if KMP_USE_FUTEX
3456 
3457   case lk_futex: {
3458     __kmp_base_user_lock_size = sizeof(kmp_base_futex_lock_t);
3459     __kmp_user_lock_size = sizeof(kmp_futex_lock_t);
3460 
3461     __kmp_get_user_lock_owner_ =
3462         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_futex_lock_owner);
3463 
3464     if (__kmp_env_consistency_check) {
3465       KMP_BIND_USER_LOCK_WITH_CHECKS(futex);
3466       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(futex);
3467     } else {
3468       KMP_BIND_USER_LOCK(futex);
3469       KMP_BIND_NESTED_USER_LOCK(futex);
3470     }
3471 
3472     __kmp_destroy_user_lock_ =
3473         (void (*)(kmp_user_lock_p))(&__kmp_destroy_futex_lock);
3474 
3475     __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL;
3476 
3477     __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL;
3478 
3479     __kmp_set_user_lock_location_ =
3480         (void (*)(kmp_user_lock_p, const ident_t *))NULL;
3481 
3482     __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL;
3483 
3484     __kmp_set_user_lock_flags_ =
3485         (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL;
3486   } break;
3487 
3488 #endif // KMP_USE_FUTEX
3489 
3490   case lk_ticket: {
3491     __kmp_base_user_lock_size = sizeof(kmp_base_ticket_lock_t);
3492     __kmp_user_lock_size = sizeof(kmp_ticket_lock_t);
3493 
3494     __kmp_get_user_lock_owner_ =
3495         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_owner);
3496 
3497     if (__kmp_env_consistency_check) {
3498       KMP_BIND_USER_LOCK_WITH_CHECKS(ticket);
3499       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(ticket);
3500     } else {
3501       KMP_BIND_USER_LOCK(ticket);
3502       KMP_BIND_NESTED_USER_LOCK(ticket);
3503     }
3504 
3505     __kmp_destroy_user_lock_ =
3506         (void (*)(kmp_user_lock_p))(&__kmp_destroy_ticket_lock);
3507 
3508     __kmp_is_user_lock_initialized_ =
3509         (int (*)(kmp_user_lock_p))(&__kmp_is_ticket_lock_initialized);
3510 
3511     __kmp_get_user_lock_location_ =
3512         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_location);
3513 
3514     __kmp_set_user_lock_location_ = (void (*)(
3515         kmp_user_lock_p, const ident_t *))(&__kmp_set_ticket_lock_location);
3516 
3517     __kmp_get_user_lock_flags_ =
3518         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_flags);
3519 
3520     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3521         &__kmp_set_ticket_lock_flags);
3522   } break;
3523 
3524   case lk_queuing: {
3525     __kmp_base_user_lock_size = sizeof(kmp_base_queuing_lock_t);
3526     __kmp_user_lock_size = sizeof(kmp_queuing_lock_t);
3527 
3528     __kmp_get_user_lock_owner_ =
3529         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner);
3530 
3531     if (__kmp_env_consistency_check) {
3532       KMP_BIND_USER_LOCK_WITH_CHECKS(queuing);
3533       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(queuing);
3534     } else {
3535       KMP_BIND_USER_LOCK(queuing);
3536       KMP_BIND_NESTED_USER_LOCK(queuing);
3537     }
3538 
3539     __kmp_destroy_user_lock_ =
3540         (void (*)(kmp_user_lock_p))(&__kmp_destroy_queuing_lock);
3541 
3542     __kmp_is_user_lock_initialized_ =
3543         (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized);
3544 
3545     __kmp_get_user_lock_location_ =
3546         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location);
3547 
3548     __kmp_set_user_lock_location_ = (void (*)(
3549         kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location);
3550 
3551     __kmp_get_user_lock_flags_ =
3552         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags);
3553 
3554     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3555         &__kmp_set_queuing_lock_flags);
3556   } break;
3557 
3558 #if KMP_USE_ADAPTIVE_LOCKS
3559   case lk_adaptive: {
3560     __kmp_base_user_lock_size = sizeof(kmp_base_adaptive_lock_t);
3561     __kmp_user_lock_size = sizeof(kmp_adaptive_lock_t);
3562 
3563     __kmp_get_user_lock_owner_ =
3564         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner);
3565 
3566     if (__kmp_env_consistency_check) {
3567       KMP_BIND_USER_LOCK_WITH_CHECKS(adaptive);
3568     } else {
3569       KMP_BIND_USER_LOCK(adaptive);
3570     }
3571 
3572     __kmp_destroy_user_lock_ =
3573         (void (*)(kmp_user_lock_p))(&__kmp_destroy_adaptive_lock);
3574 
3575     __kmp_is_user_lock_initialized_ =
3576         (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized);
3577 
3578     __kmp_get_user_lock_location_ =
3579         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location);
3580 
3581     __kmp_set_user_lock_location_ = (void (*)(
3582         kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location);
3583 
3584     __kmp_get_user_lock_flags_ =
3585         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags);
3586 
3587     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3588         &__kmp_set_queuing_lock_flags);
3589 
3590   } break;
3591 #endif // KMP_USE_ADAPTIVE_LOCKS
3592 
3593   case lk_drdpa: {
3594     __kmp_base_user_lock_size = sizeof(kmp_base_drdpa_lock_t);
3595     __kmp_user_lock_size = sizeof(kmp_drdpa_lock_t);
3596 
3597     __kmp_get_user_lock_owner_ =
3598         (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_owner);
3599 
3600     if (__kmp_env_consistency_check) {
3601       KMP_BIND_USER_LOCK_WITH_CHECKS(drdpa);
3602       KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(drdpa);
3603     } else {
3604       KMP_BIND_USER_LOCK(drdpa);
3605       KMP_BIND_NESTED_USER_LOCK(drdpa);
3606     }
3607 
3608     __kmp_destroy_user_lock_ =
3609         (void (*)(kmp_user_lock_p))(&__kmp_destroy_drdpa_lock);
3610 
3611     __kmp_is_user_lock_initialized_ =
3612         (int (*)(kmp_user_lock_p))(&__kmp_is_drdpa_lock_initialized);
3613 
3614     __kmp_get_user_lock_location_ =
3615         (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_location);
3616 
3617     __kmp_set_user_lock_location_ = (void (*)(
3618         kmp_user_lock_p, const ident_t *))(&__kmp_set_drdpa_lock_location);
3619 
3620     __kmp_get_user_lock_flags_ =
3621         (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_flags);
3622 
3623     __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
3624         &__kmp_set_drdpa_lock_flags);
3625   } break;
3626   }
3627 }
3628 
3629 // ----------------------------------------------------------------------------
3630 // User lock table & lock allocation
3631 
3632 kmp_lock_table_t __kmp_user_lock_table = {1, 0, NULL};
3633 kmp_user_lock_p __kmp_lock_pool = NULL;
3634 
3635 // Lock block-allocation support.
3636 kmp_block_of_locks *__kmp_lock_blocks = NULL;
3637 int __kmp_num_locks_in_block = 1; // FIXME - tune this value
3638 
3639 static kmp_lock_index_t __kmp_lock_table_insert(kmp_user_lock_p lck) {
  // Assume that __kmp_global_lock is held upon entry/exit.
3641   kmp_lock_index_t index;
3642   if (__kmp_user_lock_table.used >= __kmp_user_lock_table.allocated) {
3643     kmp_lock_index_t size;
3644     kmp_user_lock_p *table;
3645     // Reallocate lock table.
3646     if (__kmp_user_lock_table.allocated == 0) {
3647       size = 1024;
3648     } else {
3649       size = __kmp_user_lock_table.allocated * 2;
3650     }
3651     table = (kmp_user_lock_p *)__kmp_allocate(sizeof(kmp_user_lock_p) * size);
3652     KMP_MEMCPY(table + 1, __kmp_user_lock_table.table + 1,
3653                sizeof(kmp_user_lock_p) * (__kmp_user_lock_table.used - 1));
3654     table[0] = (kmp_user_lock_p)__kmp_user_lock_table.table;
    // We cannot free the previous table now, since it may be in use by other
    // threads. So save the pointer to the previous table in the first element
    // of the new table. All the tables are organized into a list, and they can
    // be freed when the library is shutting down.
3659     __kmp_user_lock_table.table = table;
3660     __kmp_user_lock_table.allocated = size;
3661   }
3662   KMP_DEBUG_ASSERT(__kmp_user_lock_table.used <
3663                    __kmp_user_lock_table.allocated);
3664   index = __kmp_user_lock_table.used;
3665   __kmp_user_lock_table.table[index] = lck;
3666   ++__kmp_user_lock_table.used;
3667   return index;
3668 }
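// After a couple of growth steps the tables form a singly linked list through
// element [0], which __kmp_cleanup_user_locks() walks when freeing them:
//
//   __kmp_user_lock_table.table (4096 slots): [0] -> previous 2048-slot table
//   previous 2048-slot table:                 [0] -> original 1024-slot table
//   original 1024-slot table:                 [0] -> NULL (the initial table
//                                                    pointer was NULL)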
3669 
3670 static kmp_user_lock_p __kmp_lock_block_allocate() {
  // Assume that __kmp_global_lock is held upon entry/exit.
3672   static int last_index = 0;
3673   if ((last_index >= __kmp_num_locks_in_block) || (__kmp_lock_blocks == NULL)) {
3674     // Restart the index.
3675     last_index = 0;
3676     // Need to allocate a new block.
3677     KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0);
3678     size_t space_for_locks = __kmp_user_lock_size * __kmp_num_locks_in_block;
3679     char *buffer =
3680         (char *)__kmp_allocate(space_for_locks + sizeof(kmp_block_of_locks));
3681     // Set up the new block.
3682     kmp_block_of_locks *new_block =
3683         (kmp_block_of_locks *)(&buffer[space_for_locks]);
3684     new_block->next_block = __kmp_lock_blocks;
3685     new_block->locks = (void *)buffer;
3686     // Publish the new block.
3687     KMP_MB();
3688     __kmp_lock_blocks = new_block;
3689   }
3690   kmp_user_lock_p ret = (kmp_user_lock_p)(&(
3691       ((char *)(__kmp_lock_blocks->locks))[last_index * __kmp_user_lock_size]));
3692   last_index++;
3693   return ret;
3694 }
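// Memory layout of one block allocation: the locks occupy the first
// space_for_locks bytes of the buffer and the kmp_block_of_locks header sits
// immediately after them, so freeing block_ptr->locks in
// __kmp_cleanup_user_locks() releases the header as part of the same buffer.
//
//   buffer: [ lock 0 | lock 1 | ... | lock N-1 | kmp_block_of_locks ]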
3695 
3696 // Get memory for a lock. It may be freshly allocated memory or reused memory
// from the lock pool.
3698 kmp_user_lock_p __kmp_user_lock_allocate(void **user_lock, kmp_int32 gtid,
3699                                          kmp_lock_flags_t flags) {
3700   kmp_user_lock_p lck;
3701   kmp_lock_index_t index;
3702   KMP_DEBUG_ASSERT(user_lock);
3703 
3704   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3705 
3706   if (__kmp_lock_pool == NULL) {
3707     // Lock pool is empty. Allocate new memory.
3708 
    // ANNOTATION: Found no good way to express the synchronisation
    // between allocation and usage, so ignore the allocation.
3711     ANNOTATE_IGNORE_WRITES_BEGIN();
3712     if (__kmp_num_locks_in_block <= 1) { // Tune this cutoff point.
3713       lck = (kmp_user_lock_p)__kmp_allocate(__kmp_user_lock_size);
3714     } else {
3715       lck = __kmp_lock_block_allocate();
3716     }
3717     ANNOTATE_IGNORE_WRITES_END();
3718 
    // Insert the lock in the table so that it can be freed in __kmp_cleanup,
    // and so the debugger has info on all allocated locks.
3721     index = __kmp_lock_table_insert(lck);
3722   } else {
    // Pick up a lock from the pool.
3724     lck = __kmp_lock_pool;
3725     index = __kmp_lock_pool->pool.index;
3726     __kmp_lock_pool = __kmp_lock_pool->pool.next;
3727   }
3728 
3729   // We could potentially differentiate between nested and regular locks
3730   // here, and do the lock table lookup for regular locks only.
3731   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3732     *((kmp_lock_index_t *)user_lock) = index;
3733   } else {
3734     *((kmp_user_lock_p *)user_lock) = lck;
3735   }
3736 
  // Mark the lock if it is a critical section lock.
3738   __kmp_set_user_lock_flags(lck, flags);
3739 
3740   __kmp_release_lock(&__kmp_global_lock, gtid); // AC: TODO move this line upper
3741 
3742   return lck;
3743 }
3744 
// Return the lock's memory to the pool for reuse.
3746 void __kmp_user_lock_free(void **user_lock, kmp_int32 gtid,
3747                           kmp_user_lock_p lck) {
3748   KMP_DEBUG_ASSERT(user_lock != NULL);
3749   KMP_DEBUG_ASSERT(lck != NULL);
3750 
3751   __kmp_acquire_lock(&__kmp_global_lock, gtid);
3752 
3753   lck->pool.next = __kmp_lock_pool;
3754   __kmp_lock_pool = lck;
3755   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3756     kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
3757     KMP_DEBUG_ASSERT(0 < index && index <= __kmp_user_lock_table.used);
3758     lck->pool.index = index;
3759   }
3760 
3761   __kmp_release_lock(&__kmp_global_lock, gtid);
3762 }
3763 
3764 kmp_user_lock_p __kmp_lookup_user_lock(void **user_lock, char const *func) {
3765   kmp_user_lock_p lck = NULL;
3766 
3767   if (__kmp_env_consistency_check) {
3768     if (user_lock == NULL) {
3769       KMP_FATAL(LockIsUninitialized, func);
3770     }
3771   }
3772 
3773   if (OMP_LOCK_T_SIZE < sizeof(void *)) {
3774     kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
3775     if (__kmp_env_consistency_check) {
3776       if (!(0 < index && index < __kmp_user_lock_table.used)) {
3777         KMP_FATAL(LockIsUninitialized, func);
3778       }
3779     }
3780     KMP_DEBUG_ASSERT(0 < index && index < __kmp_user_lock_table.used);
3781     KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0);
3782     lck = __kmp_user_lock_table.table[index];
3783   } else {
3784     lck = *((kmp_user_lock_p *)user_lock);
3785   }
3786 
3787   if (__kmp_env_consistency_check) {
3788     if (lck == NULL) {
3789       KMP_FATAL(LockIsUninitialized, func);
3790     }
3791   }
3792 
3793   return lck;
3794 }
3795 
3796 void __kmp_cleanup_user_locks(void) {
  // Reset the lock pool. Don't worry about the locks in the pool -- we will
  // free them when iterating through the lock table (it includes all the
  // locks, dead or alive).
3799   __kmp_lock_pool = NULL;
3800 
3801 #define IS_CRITICAL(lck)                                                       \
3802   ((__kmp_get_user_lock_flags_ != NULL) &&                                     \
3803    ((*__kmp_get_user_lock_flags_)(lck)&kmp_lf_critical_section))
3804 
  // Loop through the lock table and free all locks.
  // Do not free item [0]; it is reserved for the lock tables list.
3807   //
3808   // FIXME - we are iterating through a list of (pointers to) objects of type
3809   // union kmp_user_lock, but we have no way of knowing whether the base type is
3810   // currently "pool" or whatever the global user lock type is.
3811   //
3812   // We are relying on the fact that for all of the user lock types
3813   // (except "tas"), the first field in the lock struct is the "initialized"
3814   // field, which is set to the address of the lock object itself when
3815   // the lock is initialized.  When the union is of type "pool", the
3816   // first field is a pointer to the next object in the free list, which
3817   // will not be the same address as the object itself.
3818   //
3819   // This means that the check (*__kmp_is_user_lock_initialized_)(lck) will fail
3820   // for "pool" objects on the free list.  This must happen as the "location"
3821   // field of real user locks overlaps the "index" field of "pool" objects.
3822   //
3823   // It would be better to run through the free list, and remove all "pool"
3824   // objects from the lock table before executing this loop.  However,
3825   // "pool" objects do not always have their index field set (only on
3826   // lin_32e), and I don't want to search the lock table for the address
3827   // of every "pool" object on the free list.
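  // Hedged sketch of the check described above: for, say, a queuing lock the
  // predicate bound to __kmp_is_user_lock_initialized_ is expected to reduce
  // to something like
  //
  //   return lck == lck->lk.initialized;
  //
  // which is false for "pool" entries, whose first field holds the next free
  // lock rather than the lock's own address.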
3828   while (__kmp_user_lock_table.used > 1) {
    const ident_t *loc;
3830 
    // Reduce __kmp_user_lock_table.used before freeing the lock,
    // so that the state of the locks stays consistent.
3833     kmp_user_lock_p lck =
3834         __kmp_user_lock_table.table[--__kmp_user_lock_table.used];
3835 
3836     if ((__kmp_is_user_lock_initialized_ != NULL) &&
3837         (*__kmp_is_user_lock_initialized_)(lck)) {
      // Issue a warning if: KMP_CONSISTENCY_CHECK is on AND the lock is
      // initialized AND it is NOT a critical section (the user is not
      // responsible for destroying criticals) AND we know the source location
      // to report.
3841       if (__kmp_env_consistency_check && (!IS_CRITICAL(lck)) &&
3842           ((loc = __kmp_get_user_lock_location(lck)) != NULL) &&
3843           (loc->psource != NULL)) {
3844         kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 0);
3845         KMP_WARNING(CnsLockNotDestroyed, str_loc.file, str_loc.line);
3846         __kmp_str_loc_free(&str_loc);
3847       }
3848 
3849 #ifdef KMP_DEBUG
3850       if (IS_CRITICAL(lck)) {
3851         KA_TRACE(
3852             20,
3853             ("__kmp_cleanup_user_locks: free critical section lock %p (%p)\n",
3854              lck, *(void **)lck));
3855       } else {
3856         KA_TRACE(20, ("__kmp_cleanup_user_locks: free lock %p (%p)\n", lck,
3857                       *(void **)lck));
3858       }
3859 #endif // KMP_DEBUG
3860 
3861       // Cleanup internal lock dynamic resources (for drdpa locks particularly).
3862       __kmp_destroy_user_lock(lck);
3863     }
3864 
3865     // Free the lock if block allocation of locks is not used.
3866     if (__kmp_lock_blocks == NULL) {
3867       __kmp_free(lck);
3868     }
3869   }
3870 
3871 #undef IS_CRITICAL
3872 
  // Delete the lock table(s).
3874   kmp_user_lock_p *table_ptr = __kmp_user_lock_table.table;
3875   __kmp_user_lock_table.table = NULL;
3876   __kmp_user_lock_table.allocated = 0;
3877 
3878   while (table_ptr != NULL) {
3879     // In the first element we saved the pointer to the previous
3880     // (smaller) lock table.
3881     kmp_user_lock_p *next = (kmp_user_lock_p *)(table_ptr[0]);
3882     __kmp_free(table_ptr);
3883     table_ptr = next;
3884   }
3885 
3886   // Free buffers allocated for blocks of locks.
3887   kmp_block_of_locks_t *block_ptr = __kmp_lock_blocks;
3888   __kmp_lock_blocks = NULL;
3889 
3890   while (block_ptr != NULL) {
3891     kmp_block_of_locks_t *next = block_ptr->next_block;
3892     __kmp_free(block_ptr->locks);
3893     // *block_ptr itself was allocated at the end of the locks vector.
3894     block_ptr = next;
3895   }
3896 
3897   TCW_4(__kmp_init_user_locks, FALSE);
3898 }
3899 
3900 #endif // KMP_USE_DYNAMIC_LOCK
3901