1 /*
2  * kmp_lock.cpp -- lock-related functions
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include <stddef.h>
15 #include <atomic>
16 
17 #include "kmp.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_lock.h"
22 
23 #include "tsan_annotations.h"
24 
25 #if KMP_USE_FUTEX
26 #include <sys/syscall.h>
27 #include <unistd.h>
// We should really include <futex.h>, but that causes compatibility problems
// on different Linux* OS distributions that either require that you include
// (or break when you try to include) <pci/types.h>. Since all we need are the
// two macros below (which are part of the kernel ABI, so can't change), we
// just define the constants here and don't include <futex.h>.
33 #ifndef FUTEX_WAIT
34 #define FUTEX_WAIT 0
35 #endif
36 #ifndef FUTEX_WAKE
37 #define FUTEX_WAKE 1
38 #endif
39 #endif
40 
41 /* Implement spin locks for internal library use.             */
42 /* The algorithm implemented is Lamport's bakery lock [1974]. */
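
/* A minimal sketch of the ticket (bakery-style) idea used by the
   kmp_ticket_lock_t implementation further below.  The type and function
   names here are hypothetical and exist only for exposition; the real lock
   keeps its state in kmp_base_ticket_lock and waits via KMP_WAIT_YIELD_PTR
   instead of busy-spinning:

     struct simple_ticket_lock {
       std::atomic<kmp_uint32> next_ticket{0}; // next ticket to hand out
       std::atomic<kmp_uint32> now_serving{0}; // ticket currently being served
     };

     static void simple_acquire(simple_ticket_lock *l) {
       // Take a ticket; unsigned wraparound keeps my_ticket - now_serving
       // well defined, which is what __kmp_validate_locks() checks below.
       kmp_uint32 my_ticket =
           l->next_ticket.fetch_add(1, std::memory_order_relaxed);
       while (l->now_serving.load(std::memory_order_acquire) != my_ticket)
         ; // spin; the runtime yields and backs off here instead
     }

     static void simple_release(simple_ticket_lock *l) {
       l->now_serving.fetch_add(1, std::memory_order_release);
     }
*/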
43 
44 void __kmp_validate_locks(void) {
45   int i;
46   kmp_uint32 x, y;
47 
  /* Check to make sure unsigned arithmetic wraps around properly. */
49   x = ~((kmp_uint32)0) - 2;
50   y = x - 2;
51 
52   for (i = 0; i < 8; ++i, ++x, ++y) {
53     kmp_uint32 z = (x - y);
54     KMP_ASSERT(z == 2);
55   }
56 
57   KMP_ASSERT(offsetof(kmp_base_queuing_lock, tail_id) % 8 == 0);
58 }
59 
60 /* ------------------------------------------------------------------------ */
61 /* test and set locks */
62 
63 // For the non-nested locks, we can only assume that the first 4 bytes were
64 // allocated, since gcc only allocates 4 bytes for omp_lock_t, and the Intel
65 // compiler only allocates a 4 byte pointer on IA-32 architecture.  On
66 // Windows* OS on Intel(R) 64, we can assume that all 8 bytes were allocated.
67 //
68 // gcc reserves >= 8 bytes for nested locks, so we can assume that the
69 // entire 8 bytes were allocated for nested locks on all 64-bit platforms.
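//
// Within those bytes, the test-and-set lock keeps all of its state in the
// single 32-bit poll word: KMP_LOCK_FREE(tas) when the lock is free, and
// KMP_LOCK_BUSY(gtid + 1, tas) when it is held by global thread id gtid.
// A rough sketch of that encoding, assuming the simple case where
// KMP_LOCK_FREE(tas) == 0 and no tag bits are in use (the "poll" below is a
// stand-in for lck->lk.poll, not additional runtime state):
//
//   std::atomic<kmp_int32> poll{0};          // 0 == free
//
//   bool try_acquire(kmp_int32 gtid) {
//     kmp_int32 expected = 0;                // KMP_LOCK_FREE(tas)
//     return poll.compare_exchange_strong(expected, gtid + 1,
//                                         std::memory_order_acquire);
//   }
//   void release() { poll.store(0, std::memory_order_release); }
//   kmp_int32 owner() { return poll.load(std::memory_order_relaxed) - 1; }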
70 
71 static kmp_int32 __kmp_get_tas_lock_owner(kmp_tas_lock_t *lck) {
72   return KMP_LOCK_STRIP(TCR_4(lck->lk.poll)) - 1;
73 }
74 
75 static inline bool __kmp_is_tas_lock_nestable(kmp_tas_lock_t *lck) {
76   return lck->lk.depth_locked != -1;
77 }
78 
79 __forceinline static int
80 __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) {
81   KMP_MB();
82 
83 #ifdef USE_LOCK_PROFILE
84   kmp_uint32 curr = KMP_LOCK_STRIP(TCR_4(lck->lk.poll));
85   if ((curr != 0) && (curr != gtid + 1))
86     __kmp_printf("LOCK CONTENTION: %p\n", lck);
87 /* else __kmp_printf( "." );*/
88 #endif /* USE_LOCK_PROFILE */
89 
90   if ((lck->lk.poll == KMP_LOCK_FREE(tas)) &&
91       KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
92                                   KMP_LOCK_BUSY(gtid + 1, tas))) {
93     KMP_FSYNC_ACQUIRED(lck);
94     return KMP_LOCK_ACQUIRED_FIRST;
95   }
96 
97   kmp_uint32 spins;
98   KMP_FSYNC_PREPARE(lck);
99   KMP_INIT_YIELD(spins);
100   if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
101     KMP_YIELD(TRUE);
102   } else {
103     KMP_YIELD_SPIN(spins);
104   }
105 
106   kmp_backoff_t backoff = __kmp_spin_backoff_params;
107   while ((lck->lk.poll != KMP_LOCK_FREE(tas)) ||
108          (!KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
109                                        KMP_LOCK_BUSY(gtid + 1, tas)))) {
110 
111     __kmp_spin_backoff(&backoff);
112     if (TCR_4(__kmp_nth) >
113         (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
114       KMP_YIELD(TRUE);
115     } else {
116       KMP_YIELD_SPIN(spins);
117     }
118   }
119   KMP_FSYNC_ACQUIRED(lck);
120   return KMP_LOCK_ACQUIRED_FIRST;
121 }
122 
123 int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
124   int retval = __kmp_acquire_tas_lock_timed_template(lck, gtid);
125   ANNOTATE_TAS_ACQUIRED(lck);
126   return retval;
127 }
128 
129 static int __kmp_acquire_tas_lock_with_checks(kmp_tas_lock_t *lck,
130                                               kmp_int32 gtid) {
131   char const *const func = "omp_set_lock";
132   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
133       __kmp_is_tas_lock_nestable(lck)) {
134     KMP_FATAL(LockNestableUsedAsSimple, func);
135   }
136   if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) == gtid)) {
137     KMP_FATAL(LockIsAlreadyOwned, func);
138   }
139   return __kmp_acquire_tas_lock(lck, gtid);
140 }
141 
142 int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
143   if ((lck->lk.poll == KMP_LOCK_FREE(tas)) &&
144       KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(tas),
145                                   KMP_LOCK_BUSY(gtid + 1, tas))) {
146     KMP_FSYNC_ACQUIRED(lck);
147     return TRUE;
148   }
149   return FALSE;
150 }
151 
152 static int __kmp_test_tas_lock_with_checks(kmp_tas_lock_t *lck,
153                                            kmp_int32 gtid) {
154   char const *const func = "omp_test_lock";
155   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
156       __kmp_is_tas_lock_nestable(lck)) {
157     KMP_FATAL(LockNestableUsedAsSimple, func);
158   }
159   return __kmp_test_tas_lock(lck, gtid);
160 }
161 
162 int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
163   KMP_MB(); /* Flush all pending memory write invalidates.  */
164 
165   KMP_FSYNC_RELEASING(lck);
166   ANNOTATE_TAS_RELEASED(lck);
167   KMP_ST_REL32(&(lck->lk.poll), KMP_LOCK_FREE(tas));
168   KMP_MB(); /* Flush all pending memory write invalidates.  */
169 
170   KMP_YIELD(TCR_4(__kmp_nth) >
171             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
172   return KMP_LOCK_RELEASED;
173 }
174 
175 static int __kmp_release_tas_lock_with_checks(kmp_tas_lock_t *lck,
176                                               kmp_int32 gtid) {
177   char const *const func = "omp_unset_lock";
178   KMP_MB(); /* in case another processor initialized lock */
179   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
180       __kmp_is_tas_lock_nestable(lck)) {
181     KMP_FATAL(LockNestableUsedAsSimple, func);
182   }
183   if (__kmp_get_tas_lock_owner(lck) == -1) {
184     KMP_FATAL(LockUnsettingFree, func);
185   }
186   if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) >= 0) &&
187       (__kmp_get_tas_lock_owner(lck) != gtid)) {
188     KMP_FATAL(LockUnsettingSetByAnother, func);
189   }
190   return __kmp_release_tas_lock(lck, gtid);
191 }
192 
193 void __kmp_init_tas_lock(kmp_tas_lock_t *lck) {
194   TCW_4(lck->lk.poll, KMP_LOCK_FREE(tas));
195 }
196 
197 static void __kmp_init_tas_lock_with_checks(kmp_tas_lock_t *lck) {
198   __kmp_init_tas_lock(lck);
199 }
200 
201 void __kmp_destroy_tas_lock(kmp_tas_lock_t *lck) { lck->lk.poll = 0; }
202 
203 static void __kmp_destroy_tas_lock_with_checks(kmp_tas_lock_t *lck) {
204   char const *const func = "omp_destroy_lock";
205   if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
206       __kmp_is_tas_lock_nestable(lck)) {
207     KMP_FATAL(LockNestableUsedAsSimple, func);
208   }
209   if (__kmp_get_tas_lock_owner(lck) != -1) {
210     KMP_FATAL(LockStillOwned, func);
211   }
212   __kmp_destroy_tas_lock(lck);
213 }
214 
215 // nested test and set locks
216 
217 int __kmp_acquire_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
218   KMP_DEBUG_ASSERT(gtid >= 0);
219 
220   if (__kmp_get_tas_lock_owner(lck) == gtid) {
221     lck->lk.depth_locked += 1;
222     return KMP_LOCK_ACQUIRED_NEXT;
223   } else {
224     __kmp_acquire_tas_lock_timed_template(lck, gtid);
225     ANNOTATE_TAS_ACQUIRED(lck);
226     lck->lk.depth_locked = 1;
227     return KMP_LOCK_ACQUIRED_FIRST;
228   }
229 }
230 
231 static int __kmp_acquire_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
232                                                      kmp_int32 gtid) {
233   char const *const func = "omp_set_nest_lock";
234   if (!__kmp_is_tas_lock_nestable(lck)) {
235     KMP_FATAL(LockSimpleUsedAsNestable, func);
236   }
237   return __kmp_acquire_nested_tas_lock(lck, gtid);
238 }
239 
240 int __kmp_test_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
241   int retval;
242 
243   KMP_DEBUG_ASSERT(gtid >= 0);
244 
245   if (__kmp_get_tas_lock_owner(lck) == gtid) {
246     retval = ++lck->lk.depth_locked;
247   } else if (!__kmp_test_tas_lock(lck, gtid)) {
248     retval = 0;
249   } else {
250     KMP_MB();
251     retval = lck->lk.depth_locked = 1;
252   }
253   return retval;
254 }
255 
256 static int __kmp_test_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
257                                                   kmp_int32 gtid) {
258   char const *const func = "omp_test_nest_lock";
259   if (!__kmp_is_tas_lock_nestable(lck)) {
260     KMP_FATAL(LockSimpleUsedAsNestable, func);
261   }
262   return __kmp_test_nested_tas_lock(lck, gtid);
263 }
264 
265 int __kmp_release_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
266   KMP_DEBUG_ASSERT(gtid >= 0);
267 
268   KMP_MB();
269   if (--(lck->lk.depth_locked) == 0) {
270     __kmp_release_tas_lock(lck, gtid);
271     return KMP_LOCK_RELEASED;
272   }
273   return KMP_LOCK_STILL_HELD;
274 }
275 
276 static int __kmp_release_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
277                                                      kmp_int32 gtid) {
278   char const *const func = "omp_unset_nest_lock";
279   KMP_MB(); /* in case another processor initialized lock */
280   if (!__kmp_is_tas_lock_nestable(lck)) {
281     KMP_FATAL(LockSimpleUsedAsNestable, func);
282   }
283   if (__kmp_get_tas_lock_owner(lck) == -1) {
284     KMP_FATAL(LockUnsettingFree, func);
285   }
286   if (__kmp_get_tas_lock_owner(lck) != gtid) {
287     KMP_FATAL(LockUnsettingSetByAnother, func);
288   }
289   return __kmp_release_nested_tas_lock(lck, gtid);
290 }
291 
292 void __kmp_init_nested_tas_lock(kmp_tas_lock_t *lck) {
293   __kmp_init_tas_lock(lck);
294   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
295 }
296 
297 static void __kmp_init_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
298   __kmp_init_nested_tas_lock(lck);
299 }
300 
301 void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck) {
302   __kmp_destroy_tas_lock(lck);
303   lck->lk.depth_locked = 0;
304 }
305 
306 static void __kmp_destroy_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
307   char const *const func = "omp_destroy_nest_lock";
308   if (!__kmp_is_tas_lock_nestable(lck)) {
309     KMP_FATAL(LockSimpleUsedAsNestable, func);
310   }
311   if (__kmp_get_tas_lock_owner(lck) != -1) {
312     KMP_FATAL(LockStillOwned, func);
313   }
314   __kmp_destroy_nested_tas_lock(lck);
315 }
316 
317 #if KMP_USE_FUTEX
318 
319 /* ------------------------------------------------------------------------ */
320 /* futex locks */
321 
322 // futex locks are really just test and set locks, with a different method
323 // of handling contention.  They take the same amount of space as test and
324 // set locks, and are allocated the same way (i.e. use the area allocated by
325 // the compiler for non-nested locks / allocate nested locks on the heap).
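//
// The contention handling sketched here is what the functions below
// implement; the values shown assume the simple case with no extra tag bits:
//
//   poll == KMP_LOCK_FREE(futex)         lock is free
//   poll == (gtid + 1) << 1              held by gtid, no waiters recorded
//   poll == ((gtid + 1) << 1) | 1        held by gtid, and at least one
//                                        waiter may be blocked in the kernel
//
//   // waiter side, after losing the CAS race (sketch):
//   poll_val |= 1;                       // advertise that a wake-up is needed
//   syscall(__NR_futex, &lck->lk.poll, FUTEX_WAIT, poll_val, NULL, NULL, 0);
//
//   // owner side on release (sketch):
//   if (KMP_XCHG_FIXED32(&lck->lk.poll, KMP_LOCK_FREE(futex)) & 1)
//     syscall(__NR_futex, &lck->lk.poll, FUTEX_WAKE, 1, NULL, NULL, 0);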
326 
327 static kmp_int32 __kmp_get_futex_lock_owner(kmp_futex_lock_t *lck) {
328   return KMP_LOCK_STRIP((TCR_4(lck->lk.poll) >> 1)) - 1;
329 }
330 
331 static inline bool __kmp_is_futex_lock_nestable(kmp_futex_lock_t *lck) {
332   return lck->lk.depth_locked != -1;
333 }
334 
335 __forceinline static int
336 __kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) {
337   kmp_int32 gtid_code = (gtid + 1) << 1;
338 
339   KMP_MB();
340 
341 #ifdef USE_LOCK_PROFILE
342   kmp_uint32 curr = KMP_LOCK_STRIP(TCR_4(lck->lk.poll));
343   if ((curr != 0) && (curr != gtid_code))
344     __kmp_printf("LOCK CONTENTION: %p\n", lck);
345 /* else __kmp_printf( "." );*/
346 #endif /* USE_LOCK_PROFILE */
347 
348   KMP_FSYNC_PREPARE(lck);
349   KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d entering\n",
350                   lck, lck->lk.poll, gtid));
351 
352   kmp_int32 poll_val;
353 
354   while ((poll_val = KMP_COMPARE_AND_STORE_RET32(
355               &(lck->lk.poll), KMP_LOCK_FREE(futex),
356               KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {
357 
358     kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;
359     KA_TRACE(
360         1000,
361         ("__kmp_acquire_futex_lock: lck:%p, T#%d poll_val = 0x%x cond = 0x%x\n",
362          lck, gtid, poll_val, cond));
363 
    // NOTE: if you try to use the following condition for this branch
    //
    // if ( poll_val & 1 == 0 )
    //
    // then the following block will always be skipped, regardless of the
    // value of the LSB of poll_val (this was originally blamed on a 12.0
    // compiler bug, but '==' binds more tightly than '&', so the expression
    // parses as 'poll_val & (1 == 0)', i.e. always 0).
370     if (!cond) {
371       // Try to set the lsb in the poll to indicate to the owner
372       // thread that they need to wake this thread up.
373       if (!KMP_COMPARE_AND_STORE_REL32(&(lck->lk.poll), poll_val,
374                                        poll_val | KMP_LOCK_BUSY(1, futex))) {
375         KA_TRACE(
376             1000,
377             ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d can't set bit 0\n",
378              lck, lck->lk.poll, gtid));
379         continue;
380       }
381       poll_val |= KMP_LOCK_BUSY(1, futex);
382 
383       KA_TRACE(1000,
384                ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d bit 0 set\n", lck,
385                 lck->lk.poll, gtid));
386     }
387 
388     KA_TRACE(
389         1000,
390         ("__kmp_acquire_futex_lock: lck:%p, T#%d before futex_wait(0x%x)\n",
391          lck, gtid, poll_val));
392 
393     kmp_int32 rc;
394     if ((rc = syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAIT, poll_val, NULL,
395                       NULL, 0)) != 0) {
396       KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) "
397                       "failed (rc=%d errno=%d)\n",
398                       lck, gtid, poll_val, rc, errno));
399       continue;
400     }
401 
402     KA_TRACE(1000,
403              ("__kmp_acquire_futex_lock: lck:%p, T#%d after futex_wait(0x%x)\n",
404               lck, gtid, poll_val));
    // This thread has now done a successful futex wait call and has been
    // placed on the OS futex queue. We must therefore perform a futex wake
    // call when releasing the lock, as we have no idea how many other threads
    // are in the queue.
408     gtid_code |= 1;
409   }
410 
411   KMP_FSYNC_ACQUIRED(lck);
412   KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
413                   lck->lk.poll, gtid));
414   return KMP_LOCK_ACQUIRED_FIRST;
415 }
416 
417 int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
418   int retval = __kmp_acquire_futex_lock_timed_template(lck, gtid);
419   ANNOTATE_FUTEX_ACQUIRED(lck);
420   return retval;
421 }
422 
423 static int __kmp_acquire_futex_lock_with_checks(kmp_futex_lock_t *lck,
424                                                 kmp_int32 gtid) {
425   char const *const func = "omp_set_lock";
426   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
427       __kmp_is_futex_lock_nestable(lck)) {
428     KMP_FATAL(LockNestableUsedAsSimple, func);
429   }
430   if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) == gtid)) {
431     KMP_FATAL(LockIsAlreadyOwned, func);
432   }
433   return __kmp_acquire_futex_lock(lck, gtid);
434 }
435 
436 int __kmp_test_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
437   if (KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(futex),
438                                   KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {
439     KMP_FSYNC_ACQUIRED(lck);
440     return TRUE;
441   }
442   return FALSE;
443 }
444 
445 static int __kmp_test_futex_lock_with_checks(kmp_futex_lock_t *lck,
446                                              kmp_int32 gtid) {
447   char const *const func = "omp_test_lock";
448   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
449       __kmp_is_futex_lock_nestable(lck)) {
450     KMP_FATAL(LockNestableUsedAsSimple, func);
451   }
452   return __kmp_test_futex_lock(lck, gtid);
453 }
454 
455 int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
456   KMP_MB(); /* Flush all pending memory write invalidates.  */
457 
458   KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d entering\n",
459                   lck, lck->lk.poll, gtid));
460 
461   KMP_FSYNC_RELEASING(lck);
462   ANNOTATE_FUTEX_RELEASED(lck);
463 
464   kmp_int32 poll_val = KMP_XCHG_FIXED32(&(lck->lk.poll), KMP_LOCK_FREE(futex));
465 
466   KA_TRACE(1000,
467            ("__kmp_release_futex_lock: lck:%p, T#%d released poll_val = 0x%x\n",
468             lck, gtid, poll_val));
469 
470   if (KMP_LOCK_STRIP(poll_val) & 1) {
471     KA_TRACE(1000,
472              ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n",
473               lck, gtid));
474     syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex),
475             NULL, NULL, 0);
476   }
477 
478   KMP_MB(); /* Flush all pending memory write invalidates.  */
479 
480   KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
481                   lck->lk.poll, gtid));
482 
483   KMP_YIELD(TCR_4(__kmp_nth) >
484             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
485   return KMP_LOCK_RELEASED;
486 }
487 
488 static int __kmp_release_futex_lock_with_checks(kmp_futex_lock_t *lck,
489                                                 kmp_int32 gtid) {
490   char const *const func = "omp_unset_lock";
491   KMP_MB(); /* in case another processor initialized lock */
492   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
493       __kmp_is_futex_lock_nestable(lck)) {
494     KMP_FATAL(LockNestableUsedAsSimple, func);
495   }
496   if (__kmp_get_futex_lock_owner(lck) == -1) {
497     KMP_FATAL(LockUnsettingFree, func);
498   }
499   if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) >= 0) &&
500       (__kmp_get_futex_lock_owner(lck) != gtid)) {
501     KMP_FATAL(LockUnsettingSetByAnother, func);
502   }
503   return __kmp_release_futex_lock(lck, gtid);
504 }
505 
506 void __kmp_init_futex_lock(kmp_futex_lock_t *lck) {
507   TCW_4(lck->lk.poll, KMP_LOCK_FREE(futex));
508 }
509 
510 static void __kmp_init_futex_lock_with_checks(kmp_futex_lock_t *lck) {
511   __kmp_init_futex_lock(lck);
512 }
513 
514 void __kmp_destroy_futex_lock(kmp_futex_lock_t *lck) { lck->lk.poll = 0; }
515 
516 static void __kmp_destroy_futex_lock_with_checks(kmp_futex_lock_t *lck) {
517   char const *const func = "omp_destroy_lock";
518   if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
519       __kmp_is_futex_lock_nestable(lck)) {
520     KMP_FATAL(LockNestableUsedAsSimple, func);
521   }
522   if (__kmp_get_futex_lock_owner(lck) != -1) {
523     KMP_FATAL(LockStillOwned, func);
524   }
525   __kmp_destroy_futex_lock(lck);
526 }
527 
528 // nested futex locks
529 
530 int __kmp_acquire_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
531   KMP_DEBUG_ASSERT(gtid >= 0);
532 
533   if (__kmp_get_futex_lock_owner(lck) == gtid) {
534     lck->lk.depth_locked += 1;
535     return KMP_LOCK_ACQUIRED_NEXT;
536   } else {
537     __kmp_acquire_futex_lock_timed_template(lck, gtid);
538     ANNOTATE_FUTEX_ACQUIRED(lck);
539     lck->lk.depth_locked = 1;
540     return KMP_LOCK_ACQUIRED_FIRST;
541   }
542 }
543 
544 static int __kmp_acquire_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
545                                                        kmp_int32 gtid) {
546   char const *const func = "omp_set_nest_lock";
547   if (!__kmp_is_futex_lock_nestable(lck)) {
548     KMP_FATAL(LockSimpleUsedAsNestable, func);
549   }
550   return __kmp_acquire_nested_futex_lock(lck, gtid);
551 }
552 
553 int __kmp_test_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
554   int retval;
555 
556   KMP_DEBUG_ASSERT(gtid >= 0);
557 
558   if (__kmp_get_futex_lock_owner(lck) == gtid) {
559     retval = ++lck->lk.depth_locked;
560   } else if (!__kmp_test_futex_lock(lck, gtid)) {
561     retval = 0;
562   } else {
563     KMP_MB();
564     retval = lck->lk.depth_locked = 1;
565   }
566   return retval;
567 }
568 
569 static int __kmp_test_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
570                                                     kmp_int32 gtid) {
571   char const *const func = "omp_test_nest_lock";
572   if (!__kmp_is_futex_lock_nestable(lck)) {
573     KMP_FATAL(LockSimpleUsedAsNestable, func);
574   }
575   return __kmp_test_nested_futex_lock(lck, gtid);
576 }
577 
578 int __kmp_release_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
579   KMP_DEBUG_ASSERT(gtid >= 0);
580 
581   KMP_MB();
582   if (--(lck->lk.depth_locked) == 0) {
583     __kmp_release_futex_lock(lck, gtid);
584     return KMP_LOCK_RELEASED;
585   }
586   return KMP_LOCK_STILL_HELD;
587 }
588 
589 static int __kmp_release_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
590                                                        kmp_int32 gtid) {
591   char const *const func = "omp_unset_nest_lock";
592   KMP_MB(); /* in case another processor initialized lock */
593   if (!__kmp_is_futex_lock_nestable(lck)) {
594     KMP_FATAL(LockSimpleUsedAsNestable, func);
595   }
596   if (__kmp_get_futex_lock_owner(lck) == -1) {
597     KMP_FATAL(LockUnsettingFree, func);
598   }
599   if (__kmp_get_futex_lock_owner(lck) != gtid) {
600     KMP_FATAL(LockUnsettingSetByAnother, func);
601   }
602   return __kmp_release_nested_futex_lock(lck, gtid);
603 }
604 
605 void __kmp_init_nested_futex_lock(kmp_futex_lock_t *lck) {
606   __kmp_init_futex_lock(lck);
607   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
608 }
609 
610 static void __kmp_init_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
611   __kmp_init_nested_futex_lock(lck);
612 }
613 
614 void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck) {
615   __kmp_destroy_futex_lock(lck);
616   lck->lk.depth_locked = 0;
617 }
618 
619 static void __kmp_destroy_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
620   char const *const func = "omp_destroy_nest_lock";
621   if (!__kmp_is_futex_lock_nestable(lck)) {
622     KMP_FATAL(LockSimpleUsedAsNestable, func);
623   }
624   if (__kmp_get_futex_lock_owner(lck) != -1) {
625     KMP_FATAL(LockStillOwned, func);
626   }
627   __kmp_destroy_nested_futex_lock(lck);
628 }
629 
630 #endif // KMP_USE_FUTEX
631 
632 /* ------------------------------------------------------------------------ */
633 /* ticket (bakery) locks */
634 
635 static kmp_int32 __kmp_get_ticket_lock_owner(kmp_ticket_lock_t *lck) {
636   return std::atomic_load_explicit(&lck->lk.owner_id,
637                                    std::memory_order_relaxed) -
638          1;
639 }
640 
641 static inline bool __kmp_is_ticket_lock_nestable(kmp_ticket_lock_t *lck) {
642   return std::atomic_load_explicit(&lck->lk.depth_locked,
643                                    std::memory_order_relaxed) != -1;
644 }
645 
646 static kmp_uint32 __kmp_bakery_check(void *now_serving, kmp_uint32 my_ticket) {
647   return std::atomic_load_explicit((std::atomic<unsigned> *)now_serving,
648                                    std::memory_order_acquire) == my_ticket;
649 }
650 
651 __forceinline static int
652 __kmp_acquire_ticket_lock_timed_template(kmp_ticket_lock_t *lck,
653                                          kmp_int32 gtid) {
654   kmp_uint32 my_ticket = std::atomic_fetch_add_explicit(
655       &lck->lk.next_ticket, 1U, std::memory_order_relaxed);
656 
657 #ifdef USE_LOCK_PROFILE
658   if (std::atomic_load_explicit(&lck->lk.now_serving,
659                                 std::memory_order_relaxed) != my_ticket)
660     __kmp_printf("LOCK CONTENTION: %p\n", lck);
661 /* else __kmp_printf( "." );*/
662 #endif /* USE_LOCK_PROFILE */
663 
664   if (std::atomic_load_explicit(&lck->lk.now_serving,
665                                 std::memory_order_acquire) == my_ticket) {
666     return KMP_LOCK_ACQUIRED_FIRST;
667   }
668   KMP_WAIT_YIELD_PTR(&lck->lk.now_serving, my_ticket, __kmp_bakery_check, lck);
669   return KMP_LOCK_ACQUIRED_FIRST;
670 }
671 
672 int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
673   int retval = __kmp_acquire_ticket_lock_timed_template(lck, gtid);
674   ANNOTATE_TICKET_ACQUIRED(lck);
675   return retval;
676 }
677 
678 static int __kmp_acquire_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
679                                                  kmp_int32 gtid) {
680   char const *const func = "omp_set_lock";
681 
682   if (!std::atomic_load_explicit(&lck->lk.initialized,
683                                  std::memory_order_relaxed)) {
684     KMP_FATAL(LockIsUninitialized, func);
685   }
686   if (lck->lk.self != lck) {
687     KMP_FATAL(LockIsUninitialized, func);
688   }
689   if (__kmp_is_ticket_lock_nestable(lck)) {
690     KMP_FATAL(LockNestableUsedAsSimple, func);
691   }
692   if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) == gtid)) {
693     KMP_FATAL(LockIsAlreadyOwned, func);
694   }
695 
696   __kmp_acquire_ticket_lock(lck, gtid);
697 
698   std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
699                              std::memory_order_relaxed);
700   return KMP_LOCK_ACQUIRED_FIRST;
701 }
702 
703 int __kmp_test_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
704   kmp_uint32 my_ticket = std::atomic_load_explicit(&lck->lk.next_ticket,
705                                                    std::memory_order_relaxed);
706 
707   if (std::atomic_load_explicit(&lck->lk.now_serving,
708                                 std::memory_order_relaxed) == my_ticket) {
709     kmp_uint32 next_ticket = my_ticket + 1;
710     if (std::atomic_compare_exchange_strong_explicit(
711             &lck->lk.next_ticket, &my_ticket, next_ticket,
712             std::memory_order_acquire, std::memory_order_acquire)) {
713       return TRUE;
714     }
715   }
716   return FALSE;
717 }
718 
719 static int __kmp_test_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
720                                               kmp_int32 gtid) {
721   char const *const func = "omp_test_lock";
722 
723   if (!std::atomic_load_explicit(&lck->lk.initialized,
724                                  std::memory_order_relaxed)) {
725     KMP_FATAL(LockIsUninitialized, func);
726   }
727   if (lck->lk.self != lck) {
728     KMP_FATAL(LockIsUninitialized, func);
729   }
730   if (__kmp_is_ticket_lock_nestable(lck)) {
731     KMP_FATAL(LockNestableUsedAsSimple, func);
732   }
733 
734   int retval = __kmp_test_ticket_lock(lck, gtid);
735 
736   if (retval) {
737     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
738                                std::memory_order_relaxed);
739   }
740   return retval;
741 }
742 
743 int __kmp_release_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
744   kmp_uint32 distance = std::atomic_load_explicit(&lck->lk.next_ticket,
745                                                   std::memory_order_relaxed) -
746                         std::atomic_load_explicit(&lck->lk.now_serving,
747                                                   std::memory_order_relaxed);
748 
749   ANNOTATE_TICKET_RELEASED(lck);
750   std::atomic_fetch_add_explicit(&lck->lk.now_serving, 1U,
751                                  std::memory_order_release);
752 
753   KMP_YIELD(distance >
754             (kmp_uint32)(__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
755   return KMP_LOCK_RELEASED;
756 }
757 
758 static int __kmp_release_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
759                                                  kmp_int32 gtid) {
760   char const *const func = "omp_unset_lock";
761 
762   if (!std::atomic_load_explicit(&lck->lk.initialized,
763                                  std::memory_order_relaxed)) {
764     KMP_FATAL(LockIsUninitialized, func);
765   }
766   if (lck->lk.self != lck) {
767     KMP_FATAL(LockIsUninitialized, func);
768   }
769   if (__kmp_is_ticket_lock_nestable(lck)) {
770     KMP_FATAL(LockNestableUsedAsSimple, func);
771   }
772   if (__kmp_get_ticket_lock_owner(lck) == -1) {
773     KMP_FATAL(LockUnsettingFree, func);
774   }
775   if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) >= 0) &&
776       (__kmp_get_ticket_lock_owner(lck) != gtid)) {
777     KMP_FATAL(LockUnsettingSetByAnother, func);
778   }
779   std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
780   return __kmp_release_ticket_lock(lck, gtid);
781 }
782 
783 void __kmp_init_ticket_lock(kmp_ticket_lock_t *lck) {
784   lck->lk.location = NULL;
785   lck->lk.self = lck;
786   std::atomic_store_explicit(&lck->lk.next_ticket, 0U,
787                              std::memory_order_relaxed);
788   std::atomic_store_explicit(&lck->lk.now_serving, 0U,
789                              std::memory_order_relaxed);
790   std::atomic_store_explicit(
791       &lck->lk.owner_id, 0,
792       std::memory_order_relaxed); // no thread owns the lock.
793   std::atomic_store_explicit(
794       &lck->lk.depth_locked, -1,
795       std::memory_order_relaxed); // -1 => not a nested lock.
796   std::atomic_store_explicit(&lck->lk.initialized, true,
797                              std::memory_order_release);
798 }
799 
800 static void __kmp_init_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
801   __kmp_init_ticket_lock(lck);
802 }
803 
804 void __kmp_destroy_ticket_lock(kmp_ticket_lock_t *lck) {
805   std::atomic_store_explicit(&lck->lk.initialized, false,
806                              std::memory_order_release);
807   lck->lk.self = NULL;
808   lck->lk.location = NULL;
809   std::atomic_store_explicit(&lck->lk.next_ticket, 0U,
810                              std::memory_order_relaxed);
811   std::atomic_store_explicit(&lck->lk.now_serving, 0U,
812                              std::memory_order_relaxed);
813   std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
814   std::atomic_store_explicit(&lck->lk.depth_locked, -1,
815                              std::memory_order_relaxed);
816 }
817 
818 static void __kmp_destroy_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
819   char const *const func = "omp_destroy_lock";
820 
821   if (!std::atomic_load_explicit(&lck->lk.initialized,
822                                  std::memory_order_relaxed)) {
823     KMP_FATAL(LockIsUninitialized, func);
824   }
825   if (lck->lk.self != lck) {
826     KMP_FATAL(LockIsUninitialized, func);
827   }
828   if (__kmp_is_ticket_lock_nestable(lck)) {
829     KMP_FATAL(LockNestableUsedAsSimple, func);
830   }
831   if (__kmp_get_ticket_lock_owner(lck) != -1) {
832     KMP_FATAL(LockStillOwned, func);
833   }
834   __kmp_destroy_ticket_lock(lck);
835 }
836 
837 // nested ticket locks
838 
839 int __kmp_acquire_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
840   KMP_DEBUG_ASSERT(gtid >= 0);
841 
842   if (__kmp_get_ticket_lock_owner(lck) == gtid) {
843     std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1,
844                                    std::memory_order_relaxed);
845     return KMP_LOCK_ACQUIRED_NEXT;
846   } else {
847     __kmp_acquire_ticket_lock_timed_template(lck, gtid);
848     ANNOTATE_TICKET_ACQUIRED(lck);
849     std::atomic_store_explicit(&lck->lk.depth_locked, 1,
850                                std::memory_order_relaxed);
851     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
852                                std::memory_order_relaxed);
853     return KMP_LOCK_ACQUIRED_FIRST;
854   }
855 }
856 
857 static int __kmp_acquire_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
858                                                         kmp_int32 gtid) {
859   char const *const func = "omp_set_nest_lock";
860 
861   if (!std::atomic_load_explicit(&lck->lk.initialized,
862                                  std::memory_order_relaxed)) {
863     KMP_FATAL(LockIsUninitialized, func);
864   }
865   if (lck->lk.self != lck) {
866     KMP_FATAL(LockIsUninitialized, func);
867   }
868   if (!__kmp_is_ticket_lock_nestable(lck)) {
869     KMP_FATAL(LockSimpleUsedAsNestable, func);
870   }
871   return __kmp_acquire_nested_ticket_lock(lck, gtid);
872 }
873 
874 int __kmp_test_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
875   int retval;
876 
877   KMP_DEBUG_ASSERT(gtid >= 0);
878 
879   if (__kmp_get_ticket_lock_owner(lck) == gtid) {
880     retval = std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1,
881                                             std::memory_order_relaxed) +
882              1;
883   } else if (!__kmp_test_ticket_lock(lck, gtid)) {
884     retval = 0;
885   } else {
886     std::atomic_store_explicit(&lck->lk.depth_locked, 1,
887                                std::memory_order_relaxed);
888     std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
889                                std::memory_order_relaxed);
890     retval = 1;
891   }
892   return retval;
893 }
894 
895 static int __kmp_test_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
896                                                      kmp_int32 gtid) {
897   char const *const func = "omp_test_nest_lock";
898 
899   if (!std::atomic_load_explicit(&lck->lk.initialized,
900                                  std::memory_order_relaxed)) {
901     KMP_FATAL(LockIsUninitialized, func);
902   }
903   if (lck->lk.self != lck) {
904     KMP_FATAL(LockIsUninitialized, func);
905   }
906   if (!__kmp_is_ticket_lock_nestable(lck)) {
907     KMP_FATAL(LockSimpleUsedAsNestable, func);
908   }
909   return __kmp_test_nested_ticket_lock(lck, gtid);
910 }
911 
912 int __kmp_release_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
913   KMP_DEBUG_ASSERT(gtid >= 0);
914 
915   if ((std::atomic_fetch_add_explicit(&lck->lk.depth_locked, -1,
916                                       std::memory_order_relaxed) -
917        1) == 0) {
918     std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
919     __kmp_release_ticket_lock(lck, gtid);
920     return KMP_LOCK_RELEASED;
921   }
922   return KMP_LOCK_STILL_HELD;
923 }
924 
925 static int __kmp_release_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
926                                                         kmp_int32 gtid) {
927   char const *const func = "omp_unset_nest_lock";
928 
929   if (!std::atomic_load_explicit(&lck->lk.initialized,
930                                  std::memory_order_relaxed)) {
931     KMP_FATAL(LockIsUninitialized, func);
932   }
933   if (lck->lk.self != lck) {
934     KMP_FATAL(LockIsUninitialized, func);
935   }
936   if (!__kmp_is_ticket_lock_nestable(lck)) {
937     KMP_FATAL(LockSimpleUsedAsNestable, func);
938   }
939   if (__kmp_get_ticket_lock_owner(lck) == -1) {
940     KMP_FATAL(LockUnsettingFree, func);
941   }
942   if (__kmp_get_ticket_lock_owner(lck) != gtid) {
943     KMP_FATAL(LockUnsettingSetByAnother, func);
944   }
945   return __kmp_release_nested_ticket_lock(lck, gtid);
946 }
947 
948 void __kmp_init_nested_ticket_lock(kmp_ticket_lock_t *lck) {
949   __kmp_init_ticket_lock(lck);
950   std::atomic_store_explicit(&lck->lk.depth_locked, 0,
951                              std::memory_order_relaxed);
952   // >= 0 for nestable locks, -1 for simple locks
953 }
954 
955 static void __kmp_init_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
956   __kmp_init_nested_ticket_lock(lck);
957 }
958 
959 void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck) {
960   __kmp_destroy_ticket_lock(lck);
961   std::atomic_store_explicit(&lck->lk.depth_locked, 0,
962                              std::memory_order_relaxed);
963 }
964 
965 static void
966 __kmp_destroy_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
967   char const *const func = "omp_destroy_nest_lock";
968 
969   if (!std::atomic_load_explicit(&lck->lk.initialized,
970                                  std::memory_order_relaxed)) {
971     KMP_FATAL(LockIsUninitialized, func);
972   }
973   if (lck->lk.self != lck) {
974     KMP_FATAL(LockIsUninitialized, func);
975   }
976   if (!__kmp_is_ticket_lock_nestable(lck)) {
977     KMP_FATAL(LockSimpleUsedAsNestable, func);
978   }
979   if (__kmp_get_ticket_lock_owner(lck) != -1) {
980     KMP_FATAL(LockStillOwned, func);
981   }
982   __kmp_destroy_nested_ticket_lock(lck);
983 }
984 
985 // access functions to fields which don't exist for all lock kinds.
986 
987 static int __kmp_is_ticket_lock_initialized(kmp_ticket_lock_t *lck) {
988   return std::atomic_load_explicit(&lck->lk.initialized,
989                                    std::memory_order_relaxed) &&
990          (lck->lk.self == lck);
991 }
992 
993 static const ident_t *__kmp_get_ticket_lock_location(kmp_ticket_lock_t *lck) {
994   return lck->lk.location;
995 }
996 
997 static void __kmp_set_ticket_lock_location(kmp_ticket_lock_t *lck,
998                                            const ident_t *loc) {
999   lck->lk.location = loc;
1000 }
1001 
1002 static kmp_lock_flags_t __kmp_get_ticket_lock_flags(kmp_ticket_lock_t *lck) {
1003   return lck->lk.flags;
1004 }
1005 
1006 static void __kmp_set_ticket_lock_flags(kmp_ticket_lock_t *lck,
1007                                         kmp_lock_flags_t flags) {
1008   lck->lk.flags = flags;
1009 }
1010 
1011 /* ------------------------------------------------------------------------ */
1012 /* queuing locks */
1013 
1014 /* First the states
1015    (head,tail) =              0, 0  means lock is unheld, nobody on queue
1016                  UINT_MAX or -1, 0  means lock is held, nobody on queue
1017                               h, h  means lock held or about to transition,
1018                                     1 element on queue
1019                               h, t  h <> t, means lock is held or about to
1020                                     transition, >1 elements on queue
1021 
1022    Now the transitions
1023       Acquire(0,0)  = -1 ,0
1024       Release(0,0)  = Error
1025       Acquire(-1,0) =  h ,h    h > 0
1026       Release(-1,0) =  0 ,0
1027       Acquire(h,h)  =  h ,t    h > 0, t > 0, h <> t
1028       Release(h,h)  = -1 ,0    h > 0
1029       Acquire(h,t)  =  h ,t'   h > 0, t > 0, t' > 0, h <> t, h <> t', t <> t'
1030       Release(h,t)  =  h',t    h > 0, t > 0, h <> t, h <> h', h' maybe = t
1031 
1032    And pictorially
1033 
1034            +-----+
1035            | 0, 0|------- release -------> Error
1036            +-----+
1037              |  ^
1038       acquire|  |release
1039              |  |
1040              |  |
1041              v  |
1042            +-----+
1043            |-1, 0|
1044            +-----+
1045              |  ^
1046       acquire|  |release
1047              |  |
1048              |  |
1049              v  |
1050            +-----+
1051            | h, h|
1052            +-----+
1053              |  ^
1054       acquire|  |release
1055              |  |
1056              |  |
1057              v  |
1058            +-----+
1059            | h, t|----- acquire, release loopback ---+
1060            +-----+                                   |
1061                 ^                                    |
1062                 |                                    |
1063                 +------------------------------------+
1064  */
1065 
1066 #ifdef DEBUG_QUEUING_LOCKS
1067 
1068 /* Stuff for circular trace buffer */
1069 #define TRACE_BUF_ELE 1024
1070 static char traces[TRACE_BUF_ELE][128] = {0};
1071 static int tc = 0;
1072 #define TRACE_LOCK(X, Y)                                                       \
1073   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s\n", X, Y);
1074 #define TRACE_LOCK_T(X, Y, Z)                                                  \
1075   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s%d\n", X, Y, Z);
1076 #define TRACE_LOCK_HT(X, Y, Z, Q)                                              \
1077   KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s %d,%d\n", X, Y,   \
1078                Z, Q);
1079 
1080 static void __kmp_dump_queuing_lock(kmp_info_t *this_thr, kmp_int32 gtid,
1081                                     kmp_queuing_lock_t *lck, kmp_int32 head_id,
1082                                     kmp_int32 tail_id) {
1083   kmp_int32 t, i;
1084 
1085   __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: TRACE BEGINS HERE! \n");
1086 
1087   i = tc % TRACE_BUF_ELE;
1088   __kmp_printf_no_lock("%s\n", traces[i]);
1089   i = (i + 1) % TRACE_BUF_ELE;
1090   while (i != (tc % TRACE_BUF_ELE)) {
1091     __kmp_printf_no_lock("%s", traces[i]);
1092     i = (i + 1) % TRACE_BUF_ELE;
1093   }
1094   __kmp_printf_no_lock("\n");
1095 
1096   __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: gtid+1:%d, spin_here:%d, "
1097                        "next_wait:%d, head_id:%d, tail_id:%d\n",
1098                        gtid + 1, this_thr->th.th_spin_here,
1099                        this_thr->th.th_next_waiting, head_id, tail_id);
1100 
1101   __kmp_printf_no_lock("\t\thead: %d ", lck->lk.head_id);
1102 
1103   if (lck->lk.head_id >= 1) {
1104     t = __kmp_threads[lck->lk.head_id - 1]->th.th_next_waiting;
1105     while (t > 0) {
1106       __kmp_printf_no_lock("-> %d ", t);
1107       t = __kmp_threads[t - 1]->th.th_next_waiting;
1108     }
1109   }
1110   __kmp_printf_no_lock(";  tail: %d ", lck->lk.tail_id);
1111   __kmp_printf_no_lock("\n\n");
1112 }
1113 
1114 #endif /* DEBUG_QUEUING_LOCKS */
1115 
1116 static kmp_int32 __kmp_get_queuing_lock_owner(kmp_queuing_lock_t *lck) {
1117   return TCR_4(lck->lk.owner_id) - 1;
1118 }
1119 
1120 static inline bool __kmp_is_queuing_lock_nestable(kmp_queuing_lock_t *lck) {
1121   return lck->lk.depth_locked != -1;
1122 }
1123 
/* Acquire a lock using the queuing lock implementation */
1125 template <bool takeTime>
1126 /* [TLW] The unused template above is left behind because of what BEB believes
1127    is a potential compiler problem with __forceinline. */
1128 __forceinline static int
1129 __kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck,
1130                                           kmp_int32 gtid) {
1131   kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid);
1132   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1133   volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
1134   volatile kmp_uint32 *spin_here_p;
1135   kmp_int32 need_mf = 1;
1136 
1137 #if OMPT_SUPPORT
1138   omp_state_t prev_state = omp_state_undefined;
1139 #endif
1140 
1141   KA_TRACE(1000,
1142            ("__kmp_acquire_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
1143 
1144   KMP_FSYNC_PREPARE(lck);
1145   KMP_DEBUG_ASSERT(this_thr != NULL);
1146   spin_here_p = &this_thr->th.th_spin_here;
1147 
1148 #ifdef DEBUG_QUEUING_LOCKS
1149   TRACE_LOCK(gtid + 1, "acq ent");
1150   if (*spin_here_p)
1151     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1152   if (this_thr->th.th_next_waiting != 0)
1153     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1154 #endif
1155   KMP_DEBUG_ASSERT(!*spin_here_p);
1156   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1157 
1158   /* The following st.rel to spin_here_p needs to precede the cmpxchg.acq to
1159      head_id_p that may follow, not just in execution order, but also in
1160      visibility order. This way, when a releasing thread observes the changes to
1161      the queue by this thread, it can rightly assume that spin_here_p has
1162      already been set to TRUE, so that when it sets spin_here_p to FALSE, it is
1163      not premature.  If the releasing thread sets spin_here_p to FALSE before
1164      this thread sets it to TRUE, this thread will hang. */
1165   *spin_here_p = TRUE; /* before enqueuing to prevent race */
1166 
1167   while (1) {
1168     kmp_int32 enqueued;
1169     kmp_int32 head;
1170     kmp_int32 tail;
1171 
1172     head = *head_id_p;
1173 
1174     switch (head) {
1175 
1176     case -1: {
1177 #ifdef DEBUG_QUEUING_LOCKS
1178       tail = *tail_id_p;
1179       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1180 #endif
      tail = 0; /* to make sure the asynchronously read next link is not set
                accidentally; this assignment prevents us from entering the
                if ( tail > 0 ) condition in the enqueued case below, which is
                not necessary for this state transition */
1185 
1186       need_mf = 0;
1187       /* try (-1,0)->(tid,tid) */
1188       enqueued = KMP_COMPARE_AND_STORE_ACQ64((volatile kmp_int64 *)tail_id_p,
1189                                              KMP_PACK_64(-1, 0),
1190                                              KMP_PACK_64(gtid + 1, gtid + 1));
1191 #ifdef DEBUG_QUEUING_LOCKS
1192       if (enqueued)
1193         TRACE_LOCK(gtid + 1, "acq enq: (-1,0)->(tid,tid)");
1194 #endif
1195     } break;
1196 
1197     default: {
1198       tail = *tail_id_p;
1199       KMP_DEBUG_ASSERT(tail != gtid + 1);
1200 
1201 #ifdef DEBUG_QUEUING_LOCKS
1202       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1203 #endif
1204 
1205       if (tail == 0) {
1206         enqueued = FALSE;
1207       } else {
1208         need_mf = 0;
1209         /* try (h,t) or (h,h)->(h,tid) */
1210         enqueued = KMP_COMPARE_AND_STORE_ACQ32(tail_id_p, tail, gtid + 1);
1211 
1212 #ifdef DEBUG_QUEUING_LOCKS
1213         if (enqueued)
1214           TRACE_LOCK(gtid + 1, "acq enq: (h,t)->(h,tid)");
1215 #endif
1216       }
1217     } break;
1218 
1219     case 0: /* empty queue */
1220     {
1221       kmp_int32 grabbed_lock;
1222 
1223 #ifdef DEBUG_QUEUING_LOCKS
1224       tail = *tail_id_p;
1225       TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
1226 #endif
1227       /* try (0,0)->(-1,0) */
1228 
1229       /* only legal transition out of head = 0 is head = -1 with no change to
1230        * tail */
1231       grabbed_lock = KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1);
1232 
1233       if (grabbed_lock) {
1234 
1235         *spin_here_p = FALSE;
1236 
1237         KA_TRACE(
1238             1000,
1239             ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: no queuing\n",
1240              lck, gtid));
1241 #ifdef DEBUG_QUEUING_LOCKS
1242         TRACE_LOCK_HT(gtid + 1, "acq exit: ", head, 0);
1243 #endif
1244 
1245 #if OMPT_SUPPORT
1246         if (ompt_enabled.enabled && prev_state != omp_state_undefined) {
1247           /* change the state before clearing wait_id */
1248           this_thr->th.ompt_thread_info.state = prev_state;
1249           this_thr->th.ompt_thread_info.wait_id = 0;
1250         }
1251 #endif
1252 
1253         KMP_FSYNC_ACQUIRED(lck);
1254         return KMP_LOCK_ACQUIRED_FIRST; /* lock holder cannot be on queue */
1255       }
1256       enqueued = FALSE;
1257     } break;
1258     }
1259 
1260 #if OMPT_SUPPORT
1261     if (ompt_enabled.enabled && prev_state == omp_state_undefined) {
1262       /* this thread will spin; set wait_id before entering wait state */
1263       prev_state = this_thr->th.ompt_thread_info.state;
1264       this_thr->th.ompt_thread_info.wait_id = (uint64_t)lck;
1265       this_thr->th.ompt_thread_info.state = omp_state_wait_lock;
1266     }
1267 #endif
1268 
1269     if (enqueued) {
1270       if (tail > 0) {
1271         kmp_info_t *tail_thr = __kmp_thread_from_gtid(tail - 1);
1272         KMP_ASSERT(tail_thr != NULL);
1273         tail_thr->th.th_next_waiting = gtid + 1;
1274         /* corresponding wait for this write in release code */
1275       }
1276       KA_TRACE(1000,
1277                ("__kmp_acquire_queuing_lock: lck:%p, T#%d waiting for lock\n",
1278                 lck, gtid));
1279 
1280       /* ToDo: May want to consider using __kmp_wait_sleep  or something that
1281          sleeps for throughput only here. */
1282       KMP_MB();
1283       KMP_WAIT_YIELD(spin_here_p, FALSE, KMP_EQ, lck);
1284 
1285 #ifdef DEBUG_QUEUING_LOCKS
1286       TRACE_LOCK(gtid + 1, "acq spin");
1287 
1288       if (this_thr->th.th_next_waiting != 0)
1289         __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1290 #endif
1291       KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1292       KA_TRACE(1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: after "
1293                       "waiting on queue\n",
1294                       lck, gtid));
1295 
1296 #ifdef DEBUG_QUEUING_LOCKS
1297       TRACE_LOCK(gtid + 1, "acq exit 2");
1298 #endif
1299 
1300 #if OMPT_SUPPORT
1301       /* change the state before clearing wait_id */
1302       this_thr->th.ompt_thread_info.state = prev_state;
1303       this_thr->th.ompt_thread_info.wait_id = 0;
1304 #endif
1305 
1306       /* got lock, we were dequeued by the thread that released lock */
1307       return KMP_LOCK_ACQUIRED_FIRST;
1308     }
1309 
1310     /* Yield if number of threads > number of logical processors */
1311     /* ToDo: Not sure why this should only be in oversubscription case,
1312        maybe should be traditional YIELD_INIT/YIELD_WHEN loop */
1313     KMP_YIELD(TCR_4(__kmp_nth) >
1314               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
1315 #ifdef DEBUG_QUEUING_LOCKS
1316     TRACE_LOCK(gtid + 1, "acq retry");
1317 #endif
1318   }
1319   KMP_ASSERT2(0, "should not get here");
1320   return KMP_LOCK_ACQUIRED_FIRST;
1321 }
1322 
1323 int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1324   KMP_DEBUG_ASSERT(gtid >= 0);
1325 
1326   int retval = __kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
1327   ANNOTATE_QUEUING_ACQUIRED(lck);
1328   return retval;
1329 }
1330 
1331 static int __kmp_acquire_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1332                                                   kmp_int32 gtid) {
1333   char const *const func = "omp_set_lock";
1334   if (lck->lk.initialized != lck) {
1335     KMP_FATAL(LockIsUninitialized, func);
1336   }
1337   if (__kmp_is_queuing_lock_nestable(lck)) {
1338     KMP_FATAL(LockNestableUsedAsSimple, func);
1339   }
1340   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1341     KMP_FATAL(LockIsAlreadyOwned, func);
1342   }
1343 
1344   __kmp_acquire_queuing_lock(lck, gtid);
1345 
1346   lck->lk.owner_id = gtid + 1;
1347   return KMP_LOCK_ACQUIRED_FIRST;
1348 }
1349 
1350 int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1351   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1352   kmp_int32 head;
1353 #ifdef KMP_DEBUG
1354   kmp_info_t *this_thr;
1355 #endif
1356 
1357   KA_TRACE(1000, ("__kmp_test_queuing_lock: T#%d entering\n", gtid));
1358   KMP_DEBUG_ASSERT(gtid >= 0);
1359 #ifdef KMP_DEBUG
1360   this_thr = __kmp_thread_from_gtid(gtid);
1361   KMP_DEBUG_ASSERT(this_thr != NULL);
1362   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
1363 #endif
1364 
1365   head = *head_id_p;
1366 
1367   if (head == 0) { /* nobody on queue, nobody holding */
1368     /* try (0,0)->(-1,0) */
1369     if (KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1)) {
1370       KA_TRACE(1000,
1371                ("__kmp_test_queuing_lock: T#%d exiting: holding lock\n", gtid));
1372       KMP_FSYNC_ACQUIRED(lck);
1373       ANNOTATE_QUEUING_ACQUIRED(lck);
1374       return TRUE;
1375     }
1376   }
1377 
1378   KA_TRACE(1000,
1379            ("__kmp_test_queuing_lock: T#%d exiting: without lock\n", gtid));
1380   return FALSE;
1381 }
1382 
1383 static int __kmp_test_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1384                                                kmp_int32 gtid) {
1385   char const *const func = "omp_test_lock";
1386   if (lck->lk.initialized != lck) {
1387     KMP_FATAL(LockIsUninitialized, func);
1388   }
1389   if (__kmp_is_queuing_lock_nestable(lck)) {
1390     KMP_FATAL(LockNestableUsedAsSimple, func);
1391   }
1392 
1393   int retval = __kmp_test_queuing_lock(lck, gtid);
1394 
1395   if (retval) {
1396     lck->lk.owner_id = gtid + 1;
1397   }
1398   return retval;
1399 }
1400 
1401 int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1402   kmp_info_t *this_thr;
1403   volatile kmp_int32 *head_id_p = &lck->lk.head_id;
1404   volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
1405 
1406   KA_TRACE(1000,
1407            ("__kmp_release_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
1408   KMP_DEBUG_ASSERT(gtid >= 0);
1409   this_thr = __kmp_thread_from_gtid(gtid);
1410   KMP_DEBUG_ASSERT(this_thr != NULL);
1411 #ifdef DEBUG_QUEUING_LOCKS
1412   TRACE_LOCK(gtid + 1, "rel ent");
1413 
1414   if (this_thr->th.th_spin_here)
1415     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1416   if (this_thr->th.th_next_waiting != 0)
1417     __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
1418 #endif
1419   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
1420   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
1421 
1422   KMP_FSYNC_RELEASING(lck);
1423   ANNOTATE_QUEUING_RELEASED(lck);
1424 
1425   while (1) {
1426     kmp_int32 dequeued;
1427     kmp_int32 head;
1428     kmp_int32 tail;
1429 
1430     head = *head_id_p;
1431 
1432 #ifdef DEBUG_QUEUING_LOCKS
1433     tail = *tail_id_p;
1434     TRACE_LOCK_HT(gtid + 1, "rel read: ", head, tail);
1435     if (head == 0)
1436       __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1437 #endif
1438     KMP_DEBUG_ASSERT(head !=
1439                      0); /* holding the lock, head must be -1 or queue head */
1440 
1441     if (head == -1) { /* nobody on queue */
1442       /* try (-1,0)->(0,0) */
1443       if (KMP_COMPARE_AND_STORE_REL32(head_id_p, -1, 0)) {
1444         KA_TRACE(
1445             1000,
1446             ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: queue empty\n",
1447              lck, gtid));
1448 #ifdef DEBUG_QUEUING_LOCKS
1449         TRACE_LOCK_HT(gtid + 1, "rel exit: ", 0, 0);
1450 #endif
1451 
1452 #if OMPT_SUPPORT
1453 /* nothing to do - no other thread is trying to shift blame */
1454 #endif
1455         return KMP_LOCK_RELEASED;
1456       }
1457       dequeued = FALSE;
1458     } else {
1459       tail = *tail_id_p;
1460       if (head == tail) { /* only one thread on the queue */
1461 #ifdef DEBUG_QUEUING_LOCKS
1462         if (head <= 0)
1463           __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1464 #endif
1465         KMP_DEBUG_ASSERT(head > 0);
1466 
1467         /* try (h,h)->(-1,0) */
1468         dequeued = KMP_COMPARE_AND_STORE_REL64(
1469             RCAST(volatile kmp_int64 *, tail_id_p), KMP_PACK_64(head, head),
1470             KMP_PACK_64(-1, 0));
1471 #ifdef DEBUG_QUEUING_LOCKS
1472         TRACE_LOCK(gtid + 1, "rel deq: (h,h)->(-1,0)");
1473 #endif
1474 
1475       } else {
1476         volatile kmp_int32 *waiting_id_p;
1477         kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
1478         KMP_DEBUG_ASSERT(head_thr != NULL);
1479         waiting_id_p = &head_thr->th.th_next_waiting;
1480 
1481 /* Does this require synchronous reads? */
1482 #ifdef DEBUG_QUEUING_LOCKS
1483         if (head <= 0 || tail <= 0)
1484           __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1485 #endif
1486         KMP_DEBUG_ASSERT(head > 0 && tail > 0);
1487 
1488         /* try (h,t)->(h',t) or (t,t) */
1489         KMP_MB();
1490         /* make sure enqueuing thread has time to update next waiting thread
1491          * field */
1492         *head_id_p = KMP_WAIT_YIELD((volatile kmp_uint32 *)waiting_id_p, 0,
1493                                     KMP_NEQ, NULL);
1494 #ifdef DEBUG_QUEUING_LOCKS
1495         TRACE_LOCK(gtid + 1, "rel deq: (h,t)->(h',t)");
1496 #endif
1497         dequeued = TRUE;
1498       }
1499     }
1500 
1501     if (dequeued) {
1502       kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
1503       KMP_DEBUG_ASSERT(head_thr != NULL);
1504 
1505 /* Does this require synchronous reads? */
1506 #ifdef DEBUG_QUEUING_LOCKS
1507       if (head <= 0 || tail <= 0)
1508         __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
1509 #endif
1510       KMP_DEBUG_ASSERT(head > 0 && tail > 0);
1511 
      /* For clean code only. The waiting thread is not actually released
         until th_spin_here is cleared below; that ordering prevents a race
         with the acquire code. */
1514       head_thr->th.th_next_waiting = 0;
1515 #ifdef DEBUG_QUEUING_LOCKS
1516       TRACE_LOCK_T(gtid + 1, "rel nw=0 for t=", head);
1517 #endif
1518 
1519       KMP_MB();
1520       /* reset spin value */
1521       head_thr->th.th_spin_here = FALSE;
1522 
1523       KA_TRACE(1000, ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: after "
1524                       "dequeuing\n",
1525                       lck, gtid));
1526 #ifdef DEBUG_QUEUING_LOCKS
1527       TRACE_LOCK(gtid + 1, "rel exit 2");
1528 #endif
1529       return KMP_LOCK_RELEASED;
1530     }
1531 /* KMP_CPU_PAUSE(); don't want to make releasing thread hold up acquiring
1532    threads */
1533 
1534 #ifdef DEBUG_QUEUING_LOCKS
1535     TRACE_LOCK(gtid + 1, "rel retry");
1536 #endif
1537 
1538   } /* while */
1539   KMP_ASSERT2(0, "should not get here");
1540   return KMP_LOCK_RELEASED;
1541 }
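
/* Illustrative sketch (not part of the runtime) of the single-waiter dequeue
   above: head_id and tail_id are packed into one 64-bit word so that both can
   be retired with a single CAS. For head == tail == 5 (T#4 is the only
   waiter), the release conceptually attempts:

     // expected: both halves name the single waiter
     kmp_int64 expected = KMP_PACK_64(5, 5);
     // desired: that waiter becomes the owner and the queue empties
     kmp_int64 desired = KMP_PACK_64(-1, 0);

   If another thread enqueues itself in the meantime, (head, tail) no longer
   matches and the CAS fails, so the enclosing while loop re-reads the queue
   state and retries. */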
1542 
1543 static int __kmp_release_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1544                                                   kmp_int32 gtid) {
1545   char const *const func = "omp_unset_lock";
1546   KMP_MB(); /* in case another processor initialized lock */
1547   if (lck->lk.initialized != lck) {
1548     KMP_FATAL(LockIsUninitialized, func);
1549   }
1550   if (__kmp_is_queuing_lock_nestable(lck)) {
1551     KMP_FATAL(LockNestableUsedAsSimple, func);
1552   }
1553   if (__kmp_get_queuing_lock_owner(lck) == -1) {
1554     KMP_FATAL(LockUnsettingFree, func);
1555   }
1556   if (__kmp_get_queuing_lock_owner(lck) != gtid) {
1557     KMP_FATAL(LockUnsettingSetByAnother, func);
1558   }
1559   lck->lk.owner_id = 0;
1560   return __kmp_release_queuing_lock(lck, gtid);
1561 }
1562 
1563 void __kmp_init_queuing_lock(kmp_queuing_lock_t *lck) {
1564   lck->lk.location = NULL;
1565   lck->lk.head_id = 0;
1566   lck->lk.tail_id = 0;
1567   lck->lk.next_ticket = 0;
1568   lck->lk.now_serving = 0;
1569   lck->lk.owner_id = 0; // no thread owns the lock.
1570   lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
1571   lck->lk.initialized = lck;
1572 
1573   KA_TRACE(1000, ("__kmp_init_queuing_lock: lock %p initialized\n", lck));
1574 }
1575 
1576 static void __kmp_init_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1577   __kmp_init_queuing_lock(lck);
1578 }
1579 
1580 void __kmp_destroy_queuing_lock(kmp_queuing_lock_t *lck) {
1581   lck->lk.initialized = NULL;
1582   lck->lk.location = NULL;
1583   lck->lk.head_id = 0;
1584   lck->lk.tail_id = 0;
1585   lck->lk.next_ticket = 0;
1586   lck->lk.now_serving = 0;
1587   lck->lk.owner_id = 0;
1588   lck->lk.depth_locked = -1;
1589 }
1590 
1591 static void __kmp_destroy_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1592   char const *const func = "omp_destroy_lock";
1593   if (lck->lk.initialized != lck) {
1594     KMP_FATAL(LockIsUninitialized, func);
1595   }
1596   if (__kmp_is_queuing_lock_nestable(lck)) {
1597     KMP_FATAL(LockNestableUsedAsSimple, func);
1598   }
1599   if (__kmp_get_queuing_lock_owner(lck) != -1) {
1600     KMP_FATAL(LockStillOwned, func);
1601   }
1602   __kmp_destroy_queuing_lock(lck);
1603 }
1604 
1605 // nested queuing locks
1606 
1607 int __kmp_acquire_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1608   KMP_DEBUG_ASSERT(gtid >= 0);
1609 
1610   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1611     lck->lk.depth_locked += 1;
1612     return KMP_LOCK_ACQUIRED_NEXT;
1613   } else {
1614     __kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
1615     ANNOTATE_QUEUING_ACQUIRED(lck);
1616     KMP_MB();
1617     lck->lk.depth_locked = 1;
1618     KMP_MB();
1619     lck->lk.owner_id = gtid + 1;
1620     return KMP_LOCK_ACQUIRED_FIRST;
1621   }
1622 }
1623 
1624 static int
1625 __kmp_acquire_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1626                                               kmp_int32 gtid) {
1627   char const *const func = "omp_set_nest_lock";
1628   if (lck->lk.initialized != lck) {
1629     KMP_FATAL(LockIsUninitialized, func);
1630   }
1631   if (!__kmp_is_queuing_lock_nestable(lck)) {
1632     KMP_FATAL(LockSimpleUsedAsNestable, func);
1633   }
1634   return __kmp_acquire_nested_queuing_lock(lck, gtid);
1635 }
1636 
1637 int __kmp_test_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1638   int retval;
1639 
1640   KMP_DEBUG_ASSERT(gtid >= 0);
1641 
1642   if (__kmp_get_queuing_lock_owner(lck) == gtid) {
1643     retval = ++lck->lk.depth_locked;
1644   } else if (!__kmp_test_queuing_lock(lck, gtid)) {
1645     retval = 0;
1646   } else {
1647     KMP_MB();
1648     retval = lck->lk.depth_locked = 1;
1649     KMP_MB();
1650     lck->lk.owner_id = gtid + 1;
1651   }
1652   return retval;
1653 }
1654 
1655 static int __kmp_test_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1656                                                       kmp_int32 gtid) {
1657   char const *const func = "omp_test_nest_lock";
1658   if (lck->lk.initialized != lck) {
1659     KMP_FATAL(LockIsUninitialized, func);
1660   }
1661   if (!__kmp_is_queuing_lock_nestable(lck)) {
1662     KMP_FATAL(LockSimpleUsedAsNestable, func);
1663   }
1664   return __kmp_test_nested_queuing_lock(lck, gtid);
1665 }
1666 
1667 int __kmp_release_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
1668   KMP_DEBUG_ASSERT(gtid >= 0);
1669 
1670   KMP_MB();
1671   if (--(lck->lk.depth_locked) == 0) {
1672     KMP_MB();
1673     lck->lk.owner_id = 0;
1674     __kmp_release_queuing_lock(lck, gtid);
1675     return KMP_LOCK_RELEASED;
1676   }
1677   return KMP_LOCK_STILL_HELD;
1678 }
1679 
1680 static int
1681 __kmp_release_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
1682                                               kmp_int32 gtid) {
1683   char const *const func = "omp_unset_nest_lock";
1684   KMP_MB(); /* in case another processor initialized lock */
1685   if (lck->lk.initialized != lck) {
1686     KMP_FATAL(LockIsUninitialized, func);
1687   }
1688   if (!__kmp_is_queuing_lock_nestable(lck)) {
1689     KMP_FATAL(LockSimpleUsedAsNestable, func);
1690   }
1691   if (__kmp_get_queuing_lock_owner(lck) == -1) {
1692     KMP_FATAL(LockUnsettingFree, func);
1693   }
1694   if (__kmp_get_queuing_lock_owner(lck) != gtid) {
1695     KMP_FATAL(LockUnsettingSetByAnother, func);
1696   }
1697   return __kmp_release_nested_queuing_lock(lck, gtid);
1698 }
1699 
1700 void __kmp_init_nested_queuing_lock(kmp_queuing_lock_t *lck) {
1701   __kmp_init_queuing_lock(lck);
1702   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
1703 }
1704 
1705 static void
1706 __kmp_init_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1707   __kmp_init_nested_queuing_lock(lck);
1708 }
1709 
1710 void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck) {
1711   __kmp_destroy_queuing_lock(lck);
1712   lck->lk.depth_locked = 0;
1713 }
1714 
1715 static void
1716 __kmp_destroy_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
1717   char const *const func = "omp_destroy_nest_lock";
1718   if (lck->lk.initialized != lck) {
1719     KMP_FATAL(LockIsUninitialized, func);
1720   }
1721   if (!__kmp_is_queuing_lock_nestable(lck)) {
1722     KMP_FATAL(LockSimpleUsedAsNestable, func);
1723   }
1724   if (__kmp_get_queuing_lock_owner(lck) != -1) {
1725     KMP_FATAL(LockStillOwned, func);
1726   }
1727   __kmp_destroy_nested_queuing_lock(lck);
1728 }
1729 
1730 // access functions to fields which don't exist for all lock kinds.
1731 
1732 static int __kmp_is_queuing_lock_initialized(kmp_queuing_lock_t *lck) {
1733   return lck == lck->lk.initialized;
1734 }
1735 
1736 static const ident_t *__kmp_get_queuing_lock_location(kmp_queuing_lock_t *lck) {
1737   return lck->lk.location;
1738 }
1739 
1740 static void __kmp_set_queuing_lock_location(kmp_queuing_lock_t *lck,
1741                                             const ident_t *loc) {
1742   lck->lk.location = loc;
1743 }
1744 
1745 static kmp_lock_flags_t __kmp_get_queuing_lock_flags(kmp_queuing_lock_t *lck) {
1746   return lck->lk.flags;
1747 }
1748 
1749 static void __kmp_set_queuing_lock_flags(kmp_queuing_lock_t *lck,
1750                                          kmp_lock_flags_t flags) {
1751   lck->lk.flags = flags;
1752 }
1753 
1754 #if KMP_USE_ADAPTIVE_LOCKS
1755 
1756 /* RTM Adaptive locks */
1757 
1758 #if KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300
1759 
1760 #include <immintrin.h>
1761 #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT)
1762 
1763 #else
1764 
1765 // Values from the status register after failed speculation.
1766 #define _XBEGIN_STARTED (~0u)
1767 #define _XABORT_EXPLICIT (1 << 0)
1768 #define _XABORT_RETRY (1 << 1)
1769 #define _XABORT_CONFLICT (1 << 2)
1770 #define _XABORT_CAPACITY (1 << 3)
1771 #define _XABORT_DEBUG (1 << 4)
1772 #define _XABORT_NESTED (1 << 5)
1773 #define _XABORT_CODE(x) ((unsigned char)(((x) >> 24) & 0xFF))
1774 
1775 // Aborts for which it's worth trying again immediately
1776 #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT)
1777 
1778 #define STRINGIZE_INTERNAL(arg) #arg
1779 #define STRINGIZE(arg) STRINGIZE_INTERNAL(arg)
1780 
1781 // Access to RTM instructions
/* A version of XBegin which returns -1 when speculation starts successfully,
   and the value of EAX on an abort. This is the same definition as the
   compiler intrinsic that will be supported at some point. */
1785 static __inline int _xbegin() {
1786   int res = -1;
1787 
1788 #if KMP_OS_WINDOWS
1789 #if KMP_ARCH_X86_64
1790   _asm {
1791         _emit 0xC7
1792         _emit 0xF8
1793         _emit 2
1794         _emit 0
1795         _emit 0
1796         _emit 0
1797         jmp   L2
1798         mov   res, eax
1799     L2:
1800   }
1801 #else /* IA32 */
1802   _asm {
1803         _emit 0xC7
1804         _emit 0xF8
1805         _emit 2
1806         _emit 0
1807         _emit 0
1808         _emit 0
1809         jmp   L2
1810         mov   res, eax
1811     L2:
1812   }
1813 #endif // KMP_ARCH_X86_64
1814 #else
1815   /* Note that %eax must be noted as killed (clobbered), because the XSR is
1816      returned in %eax(%rax) on abort.  Other register values are restored, so
1817      don't need to be killed.
1818 
1819      We must also mark 'res' as an input and an output, since otherwise
1820      'res=-1' may be dropped as being dead, whereas we do need the assignment on
1821      the successful (i.e., non-abort) path. */
1822   __asm__ volatile("1: .byte  0xC7; .byte 0xF8;\n"
1823                    "   .long  1f-1b-6\n"
1824                    "    jmp   2f\n"
1825                    "1:  movl  %%eax,%0\n"
1826                    "2:"
1827                    : "+r"(res)::"memory", "%eax");
1828 #endif // KMP_OS_WINDOWS
1829   return res;
1830 }
1831 
1832 /* Transaction end */
1833 static __inline void _xend() {
1834 #if KMP_OS_WINDOWS
1835   __asm {
1836         _emit 0x0f
1837         _emit 0x01
1838         _emit 0xd5
1839   }
1840 #else
1841   __asm__ volatile(".byte 0x0f; .byte 0x01; .byte 0xd5" ::: "memory");
1842 #endif
1843 }
1844 
/* This is a macro; the argument must be a single-byte constant that can be
   evaluated by the inline assembler, since it is emitted as a byte into the
   assembly code. */
1848 // clang-format off
1849 #if KMP_OS_WINDOWS
1850 #define _xabort(ARG) _asm _emit 0xc6 _asm _emit 0xf8 _asm _emit ARG
1851 #else
1852 #define _xabort(ARG)                                                           \
1853   __asm__ volatile(".byte 0xC6; .byte 0xF8; .byte " STRINGIZE(ARG):::"memory");
1854 #endif
1855 // clang-format on
1856 #endif // KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300
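
/* Illustrative sketch (not part of the runtime) of the canonical RTM pattern
   the primitives above are used for: start a transaction, validate the guarded
   state inside it, and fall back to a real lock after too many aborts.

     int try_speculative_acquire(volatile kmp_uint32 *lock_word) {
       kmp_uint32 status = _xbegin();
       if (status == _XBEGIN_STARTED) {
         if (*lock_word != 0) // someone holds the real lock:
           _xabort(0x01);     //   abort with an explicit code
         return 1;            // speculating; a later _xend() commits
       }
       // status holds the abort reason (_XABORT_* bits). Retry only when
       // (status & SOFT_ABORT_MASK) is set; otherwise take the real lock.
       return 0;
     }
*/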
1857 
// Statistics are collected for testing purposes
1859 #if KMP_DEBUG_ADAPTIVE_LOCKS
1860 
1861 // We accumulate speculative lock statistics when the lock is destroyed. We
1862 // keep locks that haven't been destroyed in the liveLocks list so that we can
1863 // grab their statistics too.
1864 static kmp_adaptive_lock_statistics_t destroyedStats;
1865 
1866 // To hold the list of live locks.
1867 static kmp_adaptive_lock_info_t liveLocks;
1868 
1869 // A lock so we can safely update the list of locks.
1870 static kmp_bootstrap_lock_t chain_lock;
1871 
1872 // Initialize the list of stats.
1873 void __kmp_init_speculative_stats() {
1874   kmp_adaptive_lock_info_t *lck = &liveLocks;
1875 
1876   memset((void *)&(lck->stats), 0, sizeof(lck->stats));
1877   lck->stats.next = lck;
1878   lck->stats.prev = lck;
1879 
1880   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1881   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1882 
1883   __kmp_init_bootstrap_lock(&chain_lock);
1884 }
1885 
1886 // Insert the lock into the circular list
1887 static void __kmp_remember_lock(kmp_adaptive_lock_info_t *lck) {
1888   __kmp_acquire_bootstrap_lock(&chain_lock);
1889 
1890   lck->stats.next = liveLocks.stats.next;
1891   lck->stats.prev = &liveLocks;
1892 
1893   liveLocks.stats.next = lck;
1894   lck->stats.next->stats.prev = lck;
1895 
1896   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1897   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1898 
1899   __kmp_release_bootstrap_lock(&chain_lock);
1900 }
1901 
1902 static void __kmp_forget_lock(kmp_adaptive_lock_info_t *lck) {
1903   KMP_ASSERT(lck->stats.next->stats.prev == lck);
1904   KMP_ASSERT(lck->stats.prev->stats.next == lck);
1905 
1906   kmp_adaptive_lock_info_t *n = lck->stats.next;
1907   kmp_adaptive_lock_info_t *p = lck->stats.prev;
1908 
1909   n->stats.prev = p;
1910   p->stats.next = n;
1911 }
1912 
1913 static void __kmp_zero_speculative_stats(kmp_adaptive_lock_info_t *lck) {
1914   memset((void *)&lck->stats, 0, sizeof(lck->stats));
1915   __kmp_remember_lock(lck);
1916 }
1917 
1918 static void __kmp_add_stats(kmp_adaptive_lock_statistics_t *t,
1919                             kmp_adaptive_lock_info_t *lck) {
1920   kmp_adaptive_lock_statistics_t volatile *s = &lck->stats;
1921 
1922   t->nonSpeculativeAcquireAttempts += lck->acquire_attempts;
1923   t->successfulSpeculations += s->successfulSpeculations;
1924   t->hardFailedSpeculations += s->hardFailedSpeculations;
1925   t->softFailedSpeculations += s->softFailedSpeculations;
1926   t->nonSpeculativeAcquires += s->nonSpeculativeAcquires;
1927   t->lemmingYields += s->lemmingYields;
1928 }
1929 
1930 static void __kmp_accumulate_speculative_stats(kmp_adaptive_lock_info_t *lck) {
1931   kmp_adaptive_lock_statistics_t *t = &destroyedStats;
1932 
1933   __kmp_acquire_bootstrap_lock(&chain_lock);
1934 
  __kmp_add_stats(t, lck);
1936   __kmp_forget_lock(lck);
1937 
1938   __kmp_release_bootstrap_lock(&chain_lock);
1939 }
1940 
1941 static float percent(kmp_uint32 count, kmp_uint32 total) {
1942   return (total == 0) ? 0.0 : (100.0 * count) / total;
1943 }
1944 
1945 static FILE *__kmp_open_stats_file() {
1946   if (strcmp(__kmp_speculative_statsfile, "-") == 0)
1947     return stdout;
1948 
1949   size_t buffLen = KMP_STRLEN(__kmp_speculative_statsfile) + 20;
1950   char buffer[buffLen];
1951   KMP_SNPRINTF(&buffer[0], buffLen, __kmp_speculative_statsfile,
1952                (kmp_int32)getpid());
1953   FILE *result = fopen(&buffer[0], "w");
1954 
1955   // Maybe we should issue a warning here...
1956   return result ? result : stdout;
1957 }
1958 
1959 void __kmp_print_speculative_stats() {
1960   if (__kmp_user_lock_kind != lk_adaptive)
1961     return;
1962 
1963   FILE *statsFile = __kmp_open_stats_file();
1964 
1965   kmp_adaptive_lock_statistics_t total = destroyedStats;
1966   kmp_adaptive_lock_info_t *lck;
1967 
1968   for (lck = liveLocks.stats.next; lck != &liveLocks; lck = lck->stats.next) {
1969     __kmp_add_stats(&total, lck);
1970   }
1971   kmp_adaptive_lock_statistics_t *t = &total;
1972   kmp_uint32 totalSections =
1973       t->nonSpeculativeAcquires + t->successfulSpeculations;
1974   kmp_uint32 totalSpeculations = t->successfulSpeculations +
1975                                  t->hardFailedSpeculations +
1976                                  t->softFailedSpeculations;
1977 
1978   fprintf(statsFile, "Speculative lock statistics (all approximate!)\n");
1979   fprintf(statsFile, " Lock parameters: \n"
1980                      "   max_soft_retries               : %10d\n"
1981                      "   max_badness                    : %10d\n",
1982           __kmp_adaptive_backoff_params.max_soft_retries,
1983           __kmp_adaptive_backoff_params.max_badness);
1984   fprintf(statsFile, " Non-speculative acquire attempts : %10d\n",
1985           t->nonSpeculativeAcquireAttempts);
1986   fprintf(statsFile, " Total critical sections          : %10d\n",
1987           totalSections);
1988   fprintf(statsFile, " Successful speculations          : %10d (%5.1f%%)\n",
1989           t->successfulSpeculations,
1990           percent(t->successfulSpeculations, totalSections));
1991   fprintf(statsFile, " Non-speculative acquires         : %10d (%5.1f%%)\n",
1992           t->nonSpeculativeAcquires,
1993           percent(t->nonSpeculativeAcquires, totalSections));
1994   fprintf(statsFile, " Lemming yields                   : %10d\n\n",
1995           t->lemmingYields);
1996 
1997   fprintf(statsFile, " Speculative acquire attempts     : %10d\n",
1998           totalSpeculations);
1999   fprintf(statsFile, " Successes                        : %10d (%5.1f%%)\n",
2000           t->successfulSpeculations,
2001           percent(t->successfulSpeculations, totalSpeculations));
2002   fprintf(statsFile, " Soft failures                    : %10d (%5.1f%%)\n",
2003           t->softFailedSpeculations,
2004           percent(t->softFailedSpeculations, totalSpeculations));
2005   fprintf(statsFile, " Hard failures                    : %10d (%5.1f%%)\n",
2006           t->hardFailedSpeculations,
2007           percent(t->hardFailedSpeculations, totalSpeculations));
2008 
2009   if (statsFile != stdout)
2010     fclose(statsFile);
2011 }
2012 
2013 #define KMP_INC_STAT(lck, stat) (lck->lk.adaptive.stats.stat++)
2014 #else
2015 #define KMP_INC_STAT(lck, stat)
2016 
2017 #endif // KMP_DEBUG_ADAPTIVE_LOCKS
2018 
2019 static inline bool __kmp_is_unlocked_queuing_lock(kmp_queuing_lock_t *lck) {
2020   // It is enough to check that the head_id is zero.
2021   // We don't also need to check the tail.
2022   bool res = lck->lk.head_id == 0;
2023 
2024 // We need a fence here, since we must ensure that no memory operations
2025 // from later in this thread float above that read.
2026 #if KMP_COMPILER_ICC
2027   _mm_mfence();
2028 #else
2029   __sync_synchronize();
2030 #endif
2031 
2032   return res;
2033 }
2034 
2035 // Functions for manipulating the badness
2036 static __inline void
2037 __kmp_update_badness_after_success(kmp_adaptive_lock_t *lck) {
2038   // Reset the badness to zero so we eagerly try to speculate again
2039   lck->lk.adaptive.badness = 0;
2040   KMP_INC_STAT(lck, successfulSpeculations);
2041 }
2042 
2043 // Create a bit mask with one more set bit.
2044 static __inline void __kmp_step_badness(kmp_adaptive_lock_t *lck) {
2045   kmp_uint32 newBadness = (lck->lk.adaptive.badness << 1) | 1;
2046   if (newBadness > lck->lk.adaptive.max_badness) {
2047     return;
2048   } else {
2049     lck->lk.adaptive.badness = newBadness;
2050   }
2051 }
2052 
2053 // Check whether speculation should be attempted.
2054 static __inline int __kmp_should_speculate(kmp_adaptive_lock_t *lck,
2055                                            kmp_int32 gtid) {
2056   kmp_uint32 badness = lck->lk.adaptive.badness;
2057   kmp_uint32 attempts = lck->lk.adaptive.acquire_attempts;
2058   int res = (attempts & badness) == 0;
2059   return res;
2060 }
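
/* Worked example (illustrative): badness is a mask of low bits, so speculation
   is attempted only on every (badness + 1)-th acquire attempt:

     badness = 0b000 -> every attempt may speculate
     badness = 0b001 -> attempts 0, 2, 4, ... (every 2nd)
     badness = 0b111 -> attempts 0, 8, 16, ... (every 8th)

   Each failed round of speculation shifts one more bit into the mask
   (__kmp_step_badness, capped at max_badness); a successful speculation
   resets it to zero (__kmp_update_badness_after_success). */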
2061 
2062 // Attempt to acquire only the speculative lock.
2063 // Does not back off to the non-speculative lock.
2064 static int __kmp_test_adaptive_lock_only(kmp_adaptive_lock_t *lck,
2065                                          kmp_int32 gtid) {
2066   int retries = lck->lk.adaptive.max_soft_retries;
2067 
  // We don't explicitly count the start of speculation; rather, we record the
  // results (success, hard fail, soft fail). The sum of all of those is the
  // total number of times we started speculation, since all speculations must
  // end in one of those ways.
2072   do {
2073     kmp_uint32 status = _xbegin();
2074     // Switch this in to disable actual speculation but exercise at least some
2075     // of the rest of the code. Useful for debugging...
2076     // kmp_uint32 status = _XABORT_NESTED;
2077 
2078     if (status == _XBEGIN_STARTED) {
2079       /* We have successfully started speculation. Check that no-one acquired
2080          the lock for real between when we last looked and now. This also gets
2081          the lock cache line into our read-set, which we need so that we'll
2082          abort if anyone later claims it for real. */
2083       if (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2084         // Lock is now visibly acquired, so someone beat us to it. Abort the
2085         // transaction so we'll restart from _xbegin with the failure status.
2086         _xabort(0x01);
2087         KMP_ASSERT2(0, "should not get here");
2088       }
2089       return 1; // Lock has been acquired (speculatively)
2090     } else {
2091       // We have aborted, update the statistics
2092       if (status & SOFT_ABORT_MASK) {
2093         KMP_INC_STAT(lck, softFailedSpeculations);
2094         // and loop round to retry.
2095       } else {
2096         KMP_INC_STAT(lck, hardFailedSpeculations);
2097         // Give up if we had a hard failure.
2098         break;
2099       }
2100     }
2101   } while (retries--); // Loop while we have retries, and didn't fail hard.
2102 
  // Either we had a hard failure or we didn't succeed softly after the full
  // set of attempts, so increase the badness to make speculation less eager.
2105   __kmp_step_badness(lck);
2106   return 0;
2107 }
2108 
2109 // Attempt to acquire the speculative lock, or back off to the non-speculative
2110 // one if the speculative lock cannot be acquired.
2111 // We can succeed speculatively, non-speculatively, or fail.
2112 static int __kmp_test_adaptive_lock(kmp_adaptive_lock_t *lck, kmp_int32 gtid) {
2113   // First try to acquire the lock speculatively
2114   if (__kmp_should_speculate(lck, gtid) &&
2115       __kmp_test_adaptive_lock_only(lck, gtid))
2116     return 1;
2117 
2118   // Speculative acquisition failed, so try to acquire it non-speculatively.
2119   // Count the non-speculative acquire attempt
2120   lck->lk.adaptive.acquire_attempts++;
2121 
2122   // Use base, non-speculative lock.
2123   if (__kmp_test_queuing_lock(GET_QLK_PTR(lck), gtid)) {
2124     KMP_INC_STAT(lck, nonSpeculativeAcquires);
2125     return 1; // Lock is acquired (non-speculatively)
2126   } else {
2127     return 0; // Failed to acquire the lock, it's already visibly locked.
2128   }
2129 }
2130 
2131 static int __kmp_test_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2132                                                 kmp_int32 gtid) {
2133   char const *const func = "omp_test_lock";
2134   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2135     KMP_FATAL(LockIsUninitialized, func);
2136   }
2137 
2138   int retval = __kmp_test_adaptive_lock(lck, gtid);
2139 
2140   if (retval) {
2141     lck->lk.qlk.owner_id = gtid + 1;
2142   }
2143   return retval;
2144 }
2145 
// Block until we can acquire a speculative, adaptive lock. We check whether we
// should be trying to speculate. If we should be, we check the real lock to
// see if it is free, and, if not, pause without attempting to acquire it until
// it is. Then we try the speculative acquire. This means that although we
// suffer from lemmings a little (because we can't acquire the lock
// speculatively until the queue of waiting threads has cleared), we don't get
// into a state where we can never acquire the lock speculatively (because we
// force the queue to clear by preventing new arrivals from entering the
// queue). This does mean that when we're trying to break lemmings, the lock is
// no longer fair. However, OpenMP makes no guarantee that its locks are fair,
// so this isn't a real problem.
2157 static void __kmp_acquire_adaptive_lock(kmp_adaptive_lock_t *lck,
2158                                         kmp_int32 gtid) {
2159   if (__kmp_should_speculate(lck, gtid)) {
2160     if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2161       if (__kmp_test_adaptive_lock_only(lck, gtid))
2162         return;
2163       // We tried speculation and failed, so give up.
2164     } else {
      // We can't try speculation until the lock is free, so we pause here
      // (without suspending on the queuing lock) to allow it to drain, then
      // try again. All other threads will also see the same result from
      // __kmp_should_speculate, so they will be doing the same if they try to
      // claim the lock from now on.
2170       while (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
2171         KMP_INC_STAT(lck, lemmingYields);
2172         __kmp_yield(TRUE);
2173       }
2174 
2175       if (__kmp_test_adaptive_lock_only(lck, gtid))
2176         return;
2177     }
2178   }
2179 
2180   // Speculative acquisition failed, so acquire it non-speculatively.
2181   // Count the non-speculative acquire attempt
2182   lck->lk.adaptive.acquire_attempts++;
2183 
2184   __kmp_acquire_queuing_lock_timed_template<FALSE>(GET_QLK_PTR(lck), gtid);
2185   // We have acquired the base lock, so count that.
2186   KMP_INC_STAT(lck, nonSpeculativeAcquires);
2187   ANNOTATE_QUEUING_ACQUIRED(lck);
2188 }
2189 
2190 static void __kmp_acquire_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2191                                                     kmp_int32 gtid) {
2192   char const *const func = "omp_set_lock";
2193   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2194     KMP_FATAL(LockIsUninitialized, func);
2195   }
2196   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == gtid) {
2197     KMP_FATAL(LockIsAlreadyOwned, func);
2198   }
2199 
2200   __kmp_acquire_adaptive_lock(lck, gtid);
2201 
2202   lck->lk.qlk.owner_id = gtid + 1;
2203 }
2204 
2205 static int __kmp_release_adaptive_lock(kmp_adaptive_lock_t *lck,
2206                                        kmp_int32 gtid) {
2207   if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(
2208           lck))) { // If the lock doesn't look claimed we must be speculating.
2209     // (Or the user's code is buggy and they're releasing without locking;
2210     // if we had XTEST we'd be able to check that case...)
2211     _xend(); // Exit speculation
2212     __kmp_update_badness_after_success(lck);
2213   } else { // Since the lock *is* visibly locked we're not speculating,
2214     // so should use the underlying lock's release scheme.
2215     __kmp_release_queuing_lock(GET_QLK_PTR(lck), gtid);
2216   }
2217   return KMP_LOCK_RELEASED;
2218 }
2219 
2220 static int __kmp_release_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
2221                                                    kmp_int32 gtid) {
2222   char const *const func = "omp_unset_lock";
2223   KMP_MB(); /* in case another processor initialized lock */
2224   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2225     KMP_FATAL(LockIsUninitialized, func);
2226   }
2227   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == -1) {
2228     KMP_FATAL(LockUnsettingFree, func);
2229   }
2230   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != gtid) {
2231     KMP_FATAL(LockUnsettingSetByAnother, func);
2232   }
2233   lck->lk.qlk.owner_id = 0;
2234   __kmp_release_adaptive_lock(lck, gtid);
2235   return KMP_LOCK_RELEASED;
2236 }
2237 
2238 static void __kmp_init_adaptive_lock(kmp_adaptive_lock_t *lck) {
2239   __kmp_init_queuing_lock(GET_QLK_PTR(lck));
2240   lck->lk.adaptive.badness = 0;
2241   lck->lk.adaptive.acquire_attempts = 0; // nonSpeculativeAcquireAttempts = 0;
2242   lck->lk.adaptive.max_soft_retries =
2243       __kmp_adaptive_backoff_params.max_soft_retries;
2244   lck->lk.adaptive.max_badness = __kmp_adaptive_backoff_params.max_badness;
2245 #if KMP_DEBUG_ADAPTIVE_LOCKS
2246   __kmp_zero_speculative_stats(&lck->lk.adaptive);
2247 #endif
2248   KA_TRACE(1000, ("__kmp_init_adaptive_lock: lock %p initialized\n", lck));
2249 }
2250 
2251 static void __kmp_init_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
2252   __kmp_init_adaptive_lock(lck);
2253 }
2254 
2255 static void __kmp_destroy_adaptive_lock(kmp_adaptive_lock_t *lck) {
2256 #if KMP_DEBUG_ADAPTIVE_LOCKS
2257   __kmp_accumulate_speculative_stats(&lck->lk.adaptive);
2258 #endif
2259   __kmp_destroy_queuing_lock(GET_QLK_PTR(lck));
2260   // Nothing needed for the speculative part.
2261 }
2262 
2263 static void __kmp_destroy_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
2264   char const *const func = "omp_destroy_lock";
2265   if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
2266     KMP_FATAL(LockIsUninitialized, func);
2267   }
2268   if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != -1) {
2269     KMP_FATAL(LockStillOwned, func);
2270   }
2271   __kmp_destroy_adaptive_lock(lck);
2272 }
2273 
2274 #endif // KMP_USE_ADAPTIVE_LOCKS
2275 
2276 /* ------------------------------------------------------------------------ */
2277 /* DRDPA ticket locks                                                */
2278 /* "DRDPA" means Dynamically Reconfigurable Distributed Polling Area */
2279 
2280 static kmp_int32 __kmp_get_drdpa_lock_owner(kmp_drdpa_lock_t *lck) {
2281   return TCR_4(lck->lk.owner_id) - 1;
2282 }
2283 
2284 static inline bool __kmp_is_drdpa_lock_nestable(kmp_drdpa_lock_t *lck) {
2285   return lck->lk.depth_locked != -1;
2286 }
2287 
2288 __forceinline static int
2289 __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2290   kmp_uint64 ticket =
2291       KMP_TEST_THEN_INC64(RCAST(volatile kmp_int64 *, &lck->lk.next_ticket));
2292   kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
2293   volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls;
2294 
2295 #ifdef USE_LOCK_PROFILE
2296   if (TCR_8(polls[ticket & mask].poll) != ticket)
2297     __kmp_printf("LOCK CONTENTION: %p\n", lck);
2298 /* else __kmp_printf( "." );*/
2299 #endif /* USE_LOCK_PROFILE */
2300 
2301   // Now spin-wait, but reload the polls pointer and mask, in case the
2302   // polling area has been reconfigured.  Unless it is reconfigured, the
2303   // reloads stay in L1 cache and are cheap.
2304   //
2305   // Keep this code in sync with KMP_WAIT_YIELD, in kmp_dispatch.cpp !!!
2306   //
2307   // The current implementation of KMP_WAIT_YIELD doesn't allow for mask
2308   // and poll to be re-read every spin iteration.
2309   kmp_uint32 spins;
2310 
2311   KMP_FSYNC_PREPARE(lck);
2312   KMP_INIT_YIELD(spins);
2313   while (TCR_8(polls[ticket & mask].poll) < ticket) { // volatile load
2314     // If we are oversubscribed,
2315     // or have waited a bit (and KMP_LIBRARY=turnaround), then yield.
2316     // CPU Pause is in the macros for yield.
2317     //
2318     KMP_YIELD(TCR_4(__kmp_nth) >
2319               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
2320     KMP_YIELD_SPIN(spins);
2321 
2322     // Re-read the mask and the poll pointer from the lock structure.
2323     //
2324     // Make certain that "mask" is read before "polls" !!!
2325     //
    // If another thread reconfigures the polling area and updates these
    // values, and we get the new value of mask and the old polls pointer, we
    // could access memory beyond the end of the old polling area.
2329     mask = TCR_8(lck->lk.mask); // volatile load
2330     polls = lck->lk.polls; // volatile load
2331   }
2332 
2333   // Critical section starts here
2334   KMP_FSYNC_ACQUIRED(lck);
2335   KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld acquired lock %p\n",
2336                   ticket, lck));
2337   lck->lk.now_serving = ticket; // non-volatile store
2338 
2339   // Deallocate a garbage polling area if we know that we are the last
2340   // thread that could possibly access it.
2341   //
2342   // The >= check is in case __kmp_test_drdpa_lock() allocated the cleanup
2343   // ticket.
2344   if ((lck->lk.old_polls != NULL) && (ticket >= lck->lk.cleanup_ticket)) {
2345     __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.old_polls));
2346     lck->lk.old_polls = NULL;
2347     lck->lk.cleanup_ticket = 0;
2348   }
2349 
2350   // Check to see if we should reconfigure the polling area.
2351   // If there is still a garbage polling area to be deallocated from a
2352   // previous reconfiguration, let a later thread reconfigure it.
2353   if (lck->lk.old_polls == NULL) {
2354     bool reconfigure = false;
2355     volatile struct kmp_base_drdpa_lock::kmp_lock_poll *old_polls = polls;
2356     kmp_uint32 num_polls = TCR_4(lck->lk.num_polls);
2357 
2358     if (TCR_4(__kmp_nth) >
2359         (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
2360       // We are in oversubscription mode.  Contract the polling area
2361       // down to a single location, if that hasn't been done already.
2362       if (num_polls > 1) {
2363         reconfigure = true;
2364         num_polls = TCR_4(lck->lk.num_polls);
2365         mask = 0;
2366         num_polls = 1;
2367         polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
2368             __kmp_allocate(num_polls * sizeof(*polls));
2369         polls[0].poll = ticket;
2370       }
2371     } else {
2372       // We are in under/fully subscribed mode.  Check the number of
2373       // threads waiting on the lock.  The size of the polling area
2374       // should be at least the number of threads waiting.
2375       kmp_uint64 num_waiting = TCR_8(lck->lk.next_ticket) - ticket - 1;
2376       if (num_waiting > num_polls) {
2377         kmp_uint32 old_num_polls = num_polls;
2378         reconfigure = true;
2379         do {
2380           mask = (mask << 1) | 1;
2381           num_polls *= 2;
2382         } while (num_polls <= num_waiting);
2383 
2384         // Allocate the new polling area, and copy the relevant portion
2385         // of the old polling area to the new area.  __kmp_allocate()
2386         // zeroes the memory it allocates, and most of the old area is
2387         // just zero padding, so we only copy the release counters.
2388         polls = (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)
2389             __kmp_allocate(num_polls * sizeof(*polls));
2390         kmp_uint32 i;
2391         for (i = 0; i < old_num_polls; i++) {
2392           polls[i].poll = old_polls[i].poll;
2393         }
2394       }
2395     }
2396 
2397     if (reconfigure) {
2398       // Now write the updated fields back to the lock structure.
2399       //
2400       // Make certain that "polls" is written before "mask" !!!
2401       //
      // If another thread picks up the new value of mask and the old polls
      // pointer, it could access memory beyond the end of the old polling
      // area.
2405       //
2406       // On x86, we need memory fences.
2407       KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld reconfiguring "
2408                       "lock %p to %d polls\n",
2409                       ticket, lck, num_polls));
2410 
2411       lck->lk.old_polls = old_polls; // non-volatile store
2412       lck->lk.polls = polls; // volatile store
2413 
2414       KMP_MB();
2415 
2416       lck->lk.num_polls = num_polls; // non-volatile store
2417       lck->lk.mask = mask; // volatile store
2418 
2419       KMP_MB();
2420 
2421       // Only after the new polling area and mask have been flushed
2422       // to main memory can we update the cleanup ticket field.
2423       //
2424       // volatile load / non-volatile store
2425       lck->lk.cleanup_ticket = TCR_8(lck->lk.next_ticket);
2426     }
2427   }
2428   return KMP_LOCK_ACQUIRED_FIRST;
2429 }
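
/* Illustrative sketch (not part of the runtime): each waiter spins on its own
   slot, polls[ticket & mask], so a release touches only one poll location.
   num_polls is kept a power of two with mask = num_polls - 1, e.g.:

     num_polls = 4, mask = 0b011 -> tickets 0..3 use slots 0..3, ticket 4 -> 0
     num_polls = 8, mask = 0b111 -> tickets 0..7 use slots 0..7, ticket 8 -> 0

   so "ticket & mask" is the cheap equivalent of "ticket % num_polls", and
   growing the polling area just doubles num_polls and adds one bit to mask. */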
2430 
2431 int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2432   int retval = __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
2433   ANNOTATE_DRDPA_ACQUIRED(lck);
2434   return retval;
2435 }
2436 
2437 static int __kmp_acquire_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2438                                                 kmp_int32 gtid) {
2439   char const *const func = "omp_set_lock";
2440   if (lck->lk.initialized != lck) {
2441     KMP_FATAL(LockIsUninitialized, func);
2442   }
2443   if (__kmp_is_drdpa_lock_nestable(lck)) {
2444     KMP_FATAL(LockNestableUsedAsSimple, func);
2445   }
2446   if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) == gtid)) {
2447     KMP_FATAL(LockIsAlreadyOwned, func);
2448   }
2449 
2450   __kmp_acquire_drdpa_lock(lck, gtid);
2451 
2452   lck->lk.owner_id = gtid + 1;
2453   return KMP_LOCK_ACQUIRED_FIRST;
2454 }
2455 
2456 int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2457   // First get a ticket, then read the polls pointer and the mask.
2458   // The polls pointer must be read before the mask!!! (See above)
2459   kmp_uint64 ticket = TCR_8(lck->lk.next_ticket); // volatile load
2460   volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls;
2461   kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
2462   if (TCR_8(polls[ticket & mask].poll) == ticket) {
2463     kmp_uint64 next_ticket = ticket + 1;
2464     if (KMP_COMPARE_AND_STORE_ACQ64(&lck->lk.next_ticket, ticket,
2465                                     next_ticket)) {
2466       KMP_FSYNC_ACQUIRED(lck);
2467       KA_TRACE(1000, ("__kmp_test_drdpa_lock: ticket #%lld acquired lock %p\n",
2468                       ticket, lck));
2469       lck->lk.now_serving = ticket; // non-volatile store
2470 
2471       // Since no threads are waiting, there is no possibility that we would
2472       // want to reconfigure the polling area.  We might have the cleanup ticket
2473       // value (which says that it is now safe to deallocate old_polls), but
      // we'll let a later thread that calls __kmp_acquire_lock do that - this
      // routine isn't supposed to block, and we would risk blocking if we
      // called __kmp_free() to do the deallocation.
2477       return TRUE;
2478     }
2479   }
2480   return FALSE;
2481 }
2482 
2483 static int __kmp_test_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2484                                              kmp_int32 gtid) {
2485   char const *const func = "omp_test_lock";
2486   if (lck->lk.initialized != lck) {
2487     KMP_FATAL(LockIsUninitialized, func);
2488   }
2489   if (__kmp_is_drdpa_lock_nestable(lck)) {
2490     KMP_FATAL(LockNestableUsedAsSimple, func);
2491   }
2492 
2493   int retval = __kmp_test_drdpa_lock(lck, gtid);
2494 
2495   if (retval) {
2496     lck->lk.owner_id = gtid + 1;
2497   }
2498   return retval;
2499 }
2500 
2501 int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2502   // Read the ticket value from the lock data struct, then the polls pointer and
2503   // the mask.  The polls pointer must be read before the mask!!! (See above)
2504   kmp_uint64 ticket = lck->lk.now_serving + 1; // non-volatile load
2505   volatile struct kmp_base_drdpa_lock::kmp_lock_poll *polls = lck->lk.polls;
2506   kmp_uint64 mask = TCR_8(lck->lk.mask); // volatile load
2507   KA_TRACE(1000, ("__kmp_release_drdpa_lock: ticket #%lld released lock %p\n",
2508                   ticket - 1, lck));
2509   KMP_FSYNC_RELEASING(lck);
2510   ANNOTATE_DRDPA_RELEASED(lck);
2511   KMP_ST_REL64(&(polls[ticket & mask].poll), ticket); // volatile store
2512   return KMP_LOCK_RELEASED;
2513 }
2514 
2515 static int __kmp_release_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2516                                                 kmp_int32 gtid) {
2517   char const *const func = "omp_unset_lock";
2518   KMP_MB(); /* in case another processor initialized lock */
2519   if (lck->lk.initialized != lck) {
2520     KMP_FATAL(LockIsUninitialized, func);
2521   }
2522   if (__kmp_is_drdpa_lock_nestable(lck)) {
2523     KMP_FATAL(LockNestableUsedAsSimple, func);
2524   }
2525   if (__kmp_get_drdpa_lock_owner(lck) == -1) {
2526     KMP_FATAL(LockUnsettingFree, func);
2527   }
2528   if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) >= 0) &&
2529       (__kmp_get_drdpa_lock_owner(lck) != gtid)) {
2530     KMP_FATAL(LockUnsettingSetByAnother, func);
2531   }
2532   lck->lk.owner_id = 0;
2533   return __kmp_release_drdpa_lock(lck, gtid);
2534 }
2535 
2536 void __kmp_init_drdpa_lock(kmp_drdpa_lock_t *lck) {
2537   lck->lk.location = NULL;
2538   lck->lk.mask = 0;
2539   lck->lk.num_polls = 1;
2540   lck->lk.polls =
2541       (volatile struct kmp_base_drdpa_lock::kmp_lock_poll *)__kmp_allocate(
2542           lck->lk.num_polls * sizeof(*(lck->lk.polls)));
2543   lck->lk.cleanup_ticket = 0;
2544   lck->lk.old_polls = NULL;
2545   lck->lk.next_ticket = 0;
2546   lck->lk.now_serving = 0;
2547   lck->lk.owner_id = 0; // no thread owns the lock.
2548   lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
2549   lck->lk.initialized = lck;
2550 
2551   KA_TRACE(1000, ("__kmp_init_drdpa_lock: lock %p initialized\n", lck));
2552 }
2553 
2554 static void __kmp_init_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2555   __kmp_init_drdpa_lock(lck);
2556 }
2557 
2558 void __kmp_destroy_drdpa_lock(kmp_drdpa_lock_t *lck) {
2559   lck->lk.initialized = NULL;
2560   lck->lk.location = NULL;
2561   if (lck->lk.polls != NULL) {
2562     __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.polls));
2563     lck->lk.polls = NULL;
2564   }
2565   if (lck->lk.old_polls != NULL) {
2566     __kmp_free(CCAST(kmp_base_drdpa_lock::kmp_lock_poll *, lck->lk.old_polls));
2567     lck->lk.old_polls = NULL;
2568   }
2569   lck->lk.mask = 0;
2570   lck->lk.num_polls = 0;
2571   lck->lk.cleanup_ticket = 0;
2572   lck->lk.next_ticket = 0;
2573   lck->lk.now_serving = 0;
2574   lck->lk.owner_id = 0;
2575   lck->lk.depth_locked = -1;
2576 }
2577 
2578 static void __kmp_destroy_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2579   char const *const func = "omp_destroy_lock";
2580   if (lck->lk.initialized != lck) {
2581     KMP_FATAL(LockIsUninitialized, func);
2582   }
2583   if (__kmp_is_drdpa_lock_nestable(lck)) {
2584     KMP_FATAL(LockNestableUsedAsSimple, func);
2585   }
2586   if (__kmp_get_drdpa_lock_owner(lck) != -1) {
2587     KMP_FATAL(LockStillOwned, func);
2588   }
2589   __kmp_destroy_drdpa_lock(lck);
2590 }
2591 
2592 // nested drdpa ticket locks
2593 
2594 int __kmp_acquire_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2595   KMP_DEBUG_ASSERT(gtid >= 0);
2596 
2597   if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
2598     lck->lk.depth_locked += 1;
2599     return KMP_LOCK_ACQUIRED_NEXT;
2600   } else {
2601     __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
2602     ANNOTATE_DRDPA_ACQUIRED(lck);
2603     KMP_MB();
2604     lck->lk.depth_locked = 1;
2605     KMP_MB();
2606     lck->lk.owner_id = gtid + 1;
2607     return KMP_LOCK_ACQUIRED_FIRST;
2608   }
2609 }
2610 
2611 static void __kmp_acquire_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2612                                                         kmp_int32 gtid) {
2613   char const *const func = "omp_set_nest_lock";
2614   if (lck->lk.initialized != lck) {
2615     KMP_FATAL(LockIsUninitialized, func);
2616   }
2617   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2618     KMP_FATAL(LockSimpleUsedAsNestable, func);
2619   }
2620   __kmp_acquire_nested_drdpa_lock(lck, gtid);
2621 }
2622 
2623 int __kmp_test_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2624   int retval;
2625 
2626   KMP_DEBUG_ASSERT(gtid >= 0);
2627 
2628   if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
2629     retval = ++lck->lk.depth_locked;
2630   } else if (!__kmp_test_drdpa_lock(lck, gtid)) {
2631     retval = 0;
2632   } else {
2633     KMP_MB();
2634     retval = lck->lk.depth_locked = 1;
2635     KMP_MB();
2636     lck->lk.owner_id = gtid + 1;
2637   }
2638   return retval;
2639 }
2640 
2641 static int __kmp_test_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2642                                                     kmp_int32 gtid) {
2643   char const *const func = "omp_test_nest_lock";
2644   if (lck->lk.initialized != lck) {
2645     KMP_FATAL(LockIsUninitialized, func);
2646   }
2647   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2648     KMP_FATAL(LockSimpleUsedAsNestable, func);
2649   }
2650   return __kmp_test_nested_drdpa_lock(lck, gtid);
2651 }
2652 
2653 int __kmp_release_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
2654   KMP_DEBUG_ASSERT(gtid >= 0);
2655 
2656   KMP_MB();
2657   if (--(lck->lk.depth_locked) == 0) {
2658     KMP_MB();
2659     lck->lk.owner_id = 0;
2660     __kmp_release_drdpa_lock(lck, gtid);
2661     return KMP_LOCK_RELEASED;
2662   }
2663   return KMP_LOCK_STILL_HELD;
2664 }
2665 
2666 static int __kmp_release_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
2667                                                        kmp_int32 gtid) {
2668   char const *const func = "omp_unset_nest_lock";
2669   KMP_MB(); /* in case another processor initialized lock */
2670   if (lck->lk.initialized != lck) {
2671     KMP_FATAL(LockIsUninitialized, func);
2672   }
2673   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2674     KMP_FATAL(LockSimpleUsedAsNestable, func);
2675   }
2676   if (__kmp_get_drdpa_lock_owner(lck) == -1) {
2677     KMP_FATAL(LockUnsettingFree, func);
2678   }
2679   if (__kmp_get_drdpa_lock_owner(lck) != gtid) {
2680     KMP_FATAL(LockUnsettingSetByAnother, func);
2681   }
2682   return __kmp_release_nested_drdpa_lock(lck, gtid);
2683 }
2684 
2685 void __kmp_init_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
2686   __kmp_init_drdpa_lock(lck);
2687   lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
2688 }
2689 
2690 static void __kmp_init_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2691   __kmp_init_nested_drdpa_lock(lck);
2692 }
2693 
2694 void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
2695   __kmp_destroy_drdpa_lock(lck);
2696   lck->lk.depth_locked = 0;
2697 }
2698 
2699 static void __kmp_destroy_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
2700   char const *const func = "omp_destroy_nest_lock";
2701   if (lck->lk.initialized != lck) {
2702     KMP_FATAL(LockIsUninitialized, func);
2703   }
2704   if (!__kmp_is_drdpa_lock_nestable(lck)) {
2705     KMP_FATAL(LockSimpleUsedAsNestable, func);
2706   }
2707   if (__kmp_get_drdpa_lock_owner(lck) != -1) {
2708     KMP_FATAL(LockStillOwned, func);
2709   }
2710   __kmp_destroy_nested_drdpa_lock(lck);
2711 }
2712 
2713 // access functions to fields which don't exist for all lock kinds.
2714 
2715 static int __kmp_is_drdpa_lock_initialized(kmp_drdpa_lock_t *lck) {
2716   return lck == lck->lk.initialized;
2717 }
2718 
2719 static const ident_t *__kmp_get_drdpa_lock_location(kmp_drdpa_lock_t *lck) {
2720   return lck->lk.location;
2721 }
2722 
2723 static void __kmp_set_drdpa_lock_location(kmp_drdpa_lock_t *lck,
2724                                           const ident_t *loc) {
2725   lck->lk.location = loc;
2726 }
2727 
2728 static kmp_lock_flags_t __kmp_get_drdpa_lock_flags(kmp_drdpa_lock_t *lck) {
2729   return lck->lk.flags;
2730 }
2731 
2732 static void __kmp_set_drdpa_lock_flags(kmp_drdpa_lock_t *lck,
2733                                        kmp_lock_flags_t flags) {
2734   lck->lk.flags = flags;
2735 }
2736 
2737 // Time stamp counter
2738 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
2739 #define __kmp_tsc() __kmp_hardware_timestamp()
2740 // Runtime's default backoff parameters
2741 kmp_backoff_t __kmp_spin_backoff_params = {1, 4096, 100};
2742 #else
2743 // Use nanoseconds for other platforms
2744 extern kmp_uint64 __kmp_now_nsec();
2745 kmp_backoff_t __kmp_spin_backoff_params = {1, 256, 100};
2746 #define __kmp_tsc() __kmp_now_nsec()
2747 #endif
2748 
// A useful predicate for dealing with timestamps that may wrap.
// Is a before b? Since the timestamps may wrap, this is asking whether it's
// shorter to go clockwise from a to b around the clock-face, or anti-clockwise.
// Times where going clockwise is the shorter distance are in the future,
// others are in the past. E.g., with a = MAX-1 and b = MAX+1 (= 0), a > b is
// true, yet a has not reached b; the signed difference (signed(a) = -2,
// signed(b) = 0) captures the actual ordering.
2756 static inline bool before(kmp_uint64 a, kmp_uint64 b) {
2757   return ((kmp_int64)b - (kmp_int64)a) > 0;
2758 }
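
/* Worked example (illustrative): near the wrap point an unsigned comparison
   gives the wrong answer while the signed difference does not. With 64-bit
   timestamps,

     a = 0xFFFFFFFFFFFFFFFE  (MAX - 1)
     b = 0x0000000000000000  (MAX + 1, wrapped)

   a > b is true even though a has not reached b, whereas
   (kmp_int64)b - (kmp_int64)a == 0 - (-2) == 2 > 0, so before(a, b) correctly
   reports that a is still before b. */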
2759 
2760 // Truncated binary exponential backoff function
2761 void __kmp_spin_backoff(kmp_backoff_t *boff) {
  // We could flatten this loop, but making it a nested loop gives better
  // results.
2763   kmp_uint32 i;
2764   for (i = boff->step; i > 0; i--) {
2765     kmp_uint64 goal = __kmp_tsc() + boff->min_tick;
2766     do {
2767       KMP_CPU_PAUSE();
2768     } while (before(__kmp_tsc(), goal));
2769   }
2770   boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1);
2771 }
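
/* Illustrative example (assuming the initializer above is {step, max_backoff,
   min_tick}): with the x86 defaults, successive calls grow the step as

     1 -> 3 -> 7 -> 15 -> ... -> 2047 -> 4095 -> 4095 -> ...

   because step = (step << 1 | 1) & (max_backoff - 1) saturates at
   max_backoff - 1 when max_backoff is a power of two. Each call therefore
   spins roughly twice as long as the previous one (about step * min_tick
   time-stamp units) until the cap is reached. */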
2772 
2773 #if KMP_USE_DYNAMIC_LOCK
2774 
// Direct lock initializer. It simply writes a tag into the low 8 bits of the
// lock word.
2777 static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck,
2778                                    kmp_dyna_lockseq_t seq) {
2779   TCW_4(*lck, KMP_GET_D_TAG(seq));
2780   KA_TRACE(
2781       20,
2782       ("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq));
2783 }
2784 
2785 #if KMP_USE_TSX
2786 
2787 // HLE lock functions - imported from the testbed runtime.
2788 #define HLE_ACQUIRE ".byte 0xf2;"
2789 #define HLE_RELEASE ".byte 0xf3;"
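
/* Illustrative note (not part of the runtime): 0xf2 and 0xf3 are the XACQUIRE
   and XRELEASE prefixes. On HLE-capable hardware the prefixed xchg below
   elides the lock write and starts a transaction, and the prefixed store of
   KMP_LOCK_FREE(hle) commits it; on older hardware the prefixes are simply
   ignored and the same code degrades to an ordinary test-and-set lock. */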
2790 
2791 static inline kmp_uint32 swap4(kmp_uint32 volatile *p, kmp_uint32 v) {
2792   __asm__ volatile(HLE_ACQUIRE "xchg %1,%0" : "+r"(v), "+m"(*p) : : "memory");
2793   return v;
2794 }
2795 
2796 static void __kmp_destroy_hle_lock(kmp_dyna_lock_t *lck) { TCW_4(*lck, 0); }
2797 
2798 static void __kmp_acquire_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2799   // Use gtid for KMP_LOCK_BUSY if necessary
2800   if (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle)) {
2801     int delay = 1;
2802     do {
2803       while (*(kmp_uint32 volatile *)lck != KMP_LOCK_FREE(hle)) {
2804         for (int i = delay; i != 0; --i)
2805           KMP_CPU_PAUSE();
2806         delay = ((delay << 1) | 1) & 7;
2807       }
2808     } while (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle));
2809   }
2810 }
2811 
2812 static void __kmp_acquire_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2813                                                kmp_int32 gtid) {
2814   __kmp_acquire_hle_lock(lck, gtid); // TODO: add checks
2815 }
2816 
2817 static int __kmp_release_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2818   __asm__ volatile(HLE_RELEASE "movl %1,%0"
2819                    : "=m"(*lck)
2820                    : "r"(KMP_LOCK_FREE(hle))
2821                    : "memory");
2822   return KMP_LOCK_RELEASED;
2823 }
2824 
2825 static int __kmp_release_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2826                                               kmp_int32 gtid) {
2827   return __kmp_release_hle_lock(lck, gtid); // TODO: add checks
2828 }
2829 
2830 static int __kmp_test_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
2831   return swap4(lck, KMP_LOCK_BUSY(1, hle)) == KMP_LOCK_FREE(hle);
2832 }
2833 
2834 static int __kmp_test_hle_lock_with_checks(kmp_dyna_lock_t *lck,
2835                                            kmp_int32 gtid) {
2836   return __kmp_test_hle_lock(lck, gtid); // TODO: add checks
2837 }
2838 
2839 static void __kmp_init_rtm_lock(kmp_queuing_lock_t *lck) {
2840   __kmp_init_queuing_lock(lck);
2841 }
2842 
2843 static void __kmp_destroy_rtm_lock(kmp_queuing_lock_t *lck) {
2844   __kmp_destroy_queuing_lock(lck);
2845 }
2846 
2847 static void __kmp_acquire_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
2848   unsigned retries = 3, status;
2849   do {
2850     status = _xbegin();
2851     if (status == _XBEGIN_STARTED) {
2852       if (__kmp_is_unlocked_queuing_lock(lck))
2853         return;
2854       _xabort(0xff);
2855     }
2856     if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) {
2857       // Wait until lock becomes free
2858       while (!__kmp_is_unlocked_queuing_lock(lck))
2859         __kmp_yield(TRUE);
2860     } else if (!(status & _XABORT_RETRY))
2861       break;
2862   } while (retries--);
2863 
  // Fall back to the non-speculative queuing lock.
2865   __kmp_acquire_queuing_lock(lck, gtid);
2866 }
2867 
2868 static void __kmp_acquire_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
2869                                                kmp_int32 gtid) {
2870   __kmp_acquire_rtm_lock(lck, gtid);
2871 }
2872 
2873 static int __kmp_release_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
2874   if (__kmp_is_unlocked_queuing_lock(lck)) {
2875     // Releasing from speculation
2876     _xend();
2877   } else {
2878     // Releasing from a real lock
2879     __kmp_release_queuing_lock(lck, gtid);
2880   }
2881   return KMP_LOCK_RELEASED;
2882 }
2883 
2884 static int __kmp_release_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
2885                                               kmp_int32 gtid) {
2886   return __kmp_release_rtm_lock(lck, gtid);
2887 }
2888 
2889 static int __kmp_test_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
2890   unsigned retries = 3, status;
2891   do {
2892     status = _xbegin();
2893     if (status == _XBEGIN_STARTED && __kmp_is_unlocked_queuing_lock(lck)) {
2894       return 1;
2895     }
2896     if (!(status & _XABORT_RETRY))
2897       break;
2898   } while (retries--);
2899 
2900   return (__kmp_is_unlocked_queuing_lock(lck)) ? 1 : 0;
2901 }
2902 
2903 static int __kmp_test_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
2904                                            kmp_int32 gtid) {
2905   return __kmp_test_rtm_lock(lck, gtid);
2906 }
2907 
2908 #endif // KMP_USE_TSX
2909 
2910 // Entry functions for indirect locks (first element of direct lock jump tables)
2911 static void __kmp_init_indirect_lock(kmp_dyna_lock_t *l,
2912                                      kmp_dyna_lockseq_t tag);
2913 static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock);
2914 static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2915 static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2916 static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
2917 static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2918                                                kmp_int32);
2919 static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2920                                                  kmp_int32);
2921 static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
2922                                                 kmp_int32);
2923 
// Jump tables for the direct lock functions
// Only fill in the odd entries; that avoids the need to shift out the low bit
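
/* Illustrative note (assumption: simplified view of the tag encoding): a
   direct-lock tag has its low bit set, so it can index these tables directly.
   Schematically,

     table[0]              -> indirect-lock entry point
     table[odd tag]        -> the direct-lock function for that tag
     table[even index > 0] -> unused padding

   which is what makes shifting the low bit out of the tag unnecessary on
   every lock operation. */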
2926 
// init functions
#define expand(l, op) 0, __kmp_init_direct_lock,
void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t) = {
    __kmp_init_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, init)};
#undef expand

// destroy functions
#define expand(l, op) 0, (void (*)(kmp_dyna_lock_t *))__kmp_##op##_##l##_lock,
void (*__kmp_direct_destroy[])(kmp_dyna_lock_t *) = {
    __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)};
#undef expand

// set/acquire functions
#define expand(l, op)                                                          \
  0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
static int (*direct_set[])(kmp_dyna_lock_t *, kmp_int32) = {
    __kmp_set_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, acquire)};
#undef expand
#define expand(l, op)                                                          \
  0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks,
static int (*direct_set_check[])(kmp_dyna_lock_t *, kmp_int32) = {
    __kmp_set_indirect_lock_with_checks, 0,
    KMP_FOREACH_D_LOCK(expand, acquire)};
#undef expand

// unset/release and test functions
#define expand(l, op)                                                          \
  0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
static int (*direct_unset[])(kmp_dyna_lock_t *, kmp_int32) = {
    __kmp_unset_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, release)};
static int (*direct_test[])(kmp_dyna_lock_t *, kmp_int32) = {
    __kmp_test_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, test)};
#undef expand
#define expand(l, op)                                                          \
  0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks,
static int (*direct_unset_check[])(kmp_dyna_lock_t *, kmp_int32) = {
    __kmp_unset_indirect_lock_with_checks, 0,
    KMP_FOREACH_D_LOCK(expand, release)};
static int (*direct_test_check[])(kmp_dyna_lock_t *, kmp_int32) = {
    __kmp_test_indirect_lock_with_checks, 0, KMP_FOREACH_D_LOCK(expand, test)};
#undef expand

// Exposes only one set of jump tables (*lock or *lock_with_checks).
int (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32) = 0;
int (*(*__kmp_direct_unset))(kmp_dyna_lock_t *, kmp_int32) = 0;
int (*(*__kmp_direct_test))(kmp_dyna_lock_t *, kmp_int32) = 0;

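// A minimal usage sketch (assuming the KMP_EXTRACT_D_TAG helper from
// kmp_lock.h, which is not part of this file): once
// __kmp_init_dynamic_user_locks() has pointed __kmp_direct_set at either
// direct_set or direct_set_check, a caller acquires a dynamically dispatched
// lock with something like
//
//   __kmp_direct_set[KMP_EXTRACT_D_TAG(lock)](lock, gtid);
//
// where an even lock word is expected to yield tag 0 and fall through to
// __kmp_set_indirect_lock.
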
// Jump tables for the indirect lock functions
#define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock,
void (*__kmp_indirect_init[])(kmp_user_lock_p) = {
    KMP_FOREACH_I_LOCK(expand, init)};
void (*__kmp_indirect_destroy[])(kmp_user_lock_p) = {
    KMP_FOREACH_I_LOCK(expand, destroy)};
#undef expand

// set/acquire functions
#define expand(l, op)                                                          \
  (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
static int (*indirect_set[])(kmp_user_lock_p,
                             kmp_int32) = {KMP_FOREACH_I_LOCK(expand, acquire)};
#undef expand
#define expand(l, op)                                                          \
  (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
static int (*indirect_set_check[])(kmp_user_lock_p, kmp_int32) = {
    KMP_FOREACH_I_LOCK(expand, acquire)};
#undef expand

// unset/release and test functions
#define expand(l, op)                                                          \
  (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
static int (*indirect_unset[])(kmp_user_lock_p, kmp_int32) = {
    KMP_FOREACH_I_LOCK(expand, release)};
static int (*indirect_test[])(kmp_user_lock_p,
                              kmp_int32) = {KMP_FOREACH_I_LOCK(expand, test)};
#undef expand
#define expand(l, op)                                                          \
  (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
static int (*indirect_unset_check[])(kmp_user_lock_p, kmp_int32) = {
    KMP_FOREACH_I_LOCK(expand, release)};
static int (*indirect_test_check[])(kmp_user_lock_p, kmp_int32) = {
    KMP_FOREACH_I_LOCK(expand, test)};
#undef expand

// Exposes only one set of jump tables (*lock or *lock_with_checks).
int (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32) = 0;
int (*(*__kmp_indirect_unset))(kmp_user_lock_p, kmp_int32) = 0;
int (*(*__kmp_indirect_test))(kmp_user_lock_p, kmp_int32) = 0;

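// For the indirect side, this file dispatches through the selected table via
// the KMP_I_LOCK_FUNC helper (see __kmp_set_indirect_lock below), roughly
// KMP_I_LOCK_FUNC(l, set)(l->lock, gtid), which indexes the table by the
// lock's indirect tag (l->type).
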
// Lock index table.
kmp_indirect_lock_table_t __kmp_i_lock_table;

// Size of indirect locks.
static kmp_uint32 __kmp_indirect_lock_size[KMP_NUM_I_LOCKS] = {0};

// Jump tables for lock accessor/modifier.
void (*__kmp_indirect_set_location[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
                                                     const ident_t *) = {0};
void (*__kmp_indirect_set_flags[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
                                                  kmp_lock_flags_t) = {0};
const ident_t *(*__kmp_indirect_get_location[KMP_NUM_I_LOCKS])(
    kmp_user_lock_p) = {0};
kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])(
    kmp_user_lock_p) = {0};

// Use different lock pools for different lock types.
static kmp_indirect_lock_t *__kmp_indirect_lock_pool[KMP_NUM_I_LOCKS] = {0};

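// Each per-tag pool is a singly linked list threaded through the base lock's
// storage: __kmp_destroy_indirect_lock() pushes a destroyed lock by saving the
// old head in l->lock->pool.next (and the table index in pool.index), and
// __kmp_allocate_indirect_lock() pops from the head when it reuses a lock.
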
// User lock allocator for dynamically dispatched indirect locks. Every entry of
// the indirect lock table holds the address and type of the allocated indirect
// lock (kmp_indirect_lock_t), and the size of the table doubles when it is
// full. A destroyed indirect lock object is returned to the reusable pool of
// locks, unique to each lock type.
kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock,
                                                  kmp_int32 gtid,
                                                  kmp_indirect_locktag_t tag) {
  kmp_indirect_lock_t *lck;
  kmp_lock_index_t idx;

  __kmp_acquire_lock(&__kmp_global_lock, gtid);

  if (__kmp_indirect_lock_pool[tag] != NULL) {
    // Reuse the allocated and destroyed lock object
    lck = __kmp_indirect_lock_pool[tag];
    if (OMP_LOCK_T_SIZE < sizeof(void *))
      idx = lck->lock->pool.index;
    __kmp_indirect_lock_pool[tag] = (kmp_indirect_lock_t *)lck->lock->pool.next;
    KA_TRACE(20, ("__kmp_allocate_indirect_lock: reusing an existing lock %p\n",
                  lck));
  } else {
    idx = __kmp_i_lock_table.next;
    // Check capacity and double the size if it is full
    if (idx == __kmp_i_lock_table.size) {
      // Double up the space for block pointers
      int row = __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK;
      kmp_indirect_lock_t **new_table = (kmp_indirect_lock_t **)__kmp_allocate(
          2 * row * sizeof(kmp_indirect_lock_t *));
      KMP_MEMCPY(new_table, __kmp_i_lock_table.table,
                 row * sizeof(kmp_indirect_lock_t *));
      kmp_indirect_lock_t **old_table = __kmp_i_lock_table.table;
      __kmp_i_lock_table.table = new_table;
      __kmp_free(old_table);
      // Allocate new objects in the new blocks
      for (int i = row; i < 2 * row; ++i)
        *(__kmp_i_lock_table.table + i) = (kmp_indirect_lock_t *)__kmp_allocate(
            KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
      __kmp_i_lock_table.size = 2 * idx;
    }
    __kmp_i_lock_table.next++;
    lck = KMP_GET_I_LOCK(idx);
    // Allocate a new base lock object
    lck->lock = (kmp_user_lock_p)__kmp_allocate(__kmp_indirect_lock_size[tag]);
    KA_TRACE(20,
             ("__kmp_allocate_indirect_lock: allocated a new lock %p\n", lck));
  }

  __kmp_release_lock(&__kmp_global_lock, gtid);

  lck->type = tag;

  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
    *((kmp_lock_index_t *)user_lock) = idx
                                       << 1; // indirect lock word must be even
  } else {
    *((kmp_indirect_lock_t **)user_lock) = lck;
  }

  return lck;
}

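// Sketch of the round trip for the small-lock-word case (OMP_LOCK_T_SIZE <
// sizeof(void *)): the allocator above stores (idx << 1) in *user_lock, and
// the lookup below recovers the object with roughly
//
//   lck = KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(user_lock));
//
// i.e. KMP_EXTRACT_I_INDEX is expected to undo the shift and KMP_GET_I_LOCK
// indexes the chunked __kmp_i_lock_table. On platforms where the lock word is
// pointer sized, *user_lock simply holds the kmp_indirect_lock_t pointer.
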
// User lock lookup for dynamically dispatched locks.
static __forceinline kmp_indirect_lock_t *
__kmp_lookup_indirect_lock(void **user_lock, const char *func) {
  if (__kmp_env_consistency_check) {
    kmp_indirect_lock_t *lck = NULL;
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
    if (OMP_LOCK_T_SIZE < sizeof(void *)) {
      kmp_lock_index_t idx = KMP_EXTRACT_I_INDEX(user_lock);
      if (idx >= __kmp_i_lock_table.size) {
        KMP_FATAL(LockIsUninitialized, func);
      }
      lck = KMP_GET_I_LOCK(idx);
    } else {
      lck = *((kmp_indirect_lock_t **)user_lock);
    }
    if (lck == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
    return lck;
  } else {
    if (OMP_LOCK_T_SIZE < sizeof(void *)) {
      return KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(user_lock));
    } else {
      return *((kmp_indirect_lock_t **)user_lock);
    }
  }
}

static void __kmp_init_indirect_lock(kmp_dyna_lock_t *lock,
                                     kmp_dyna_lockseq_t seq) {
#if KMP_USE_ADAPTIVE_LOCKS
  if (seq == lockseq_adaptive && !__kmp_cpuinfo.rtm) {
    KMP_WARNING(AdaptiveNotSupported, "kmp_lockseq_t", "adaptive");
    seq = lockseq_queuing;
  }
#endif
#if KMP_USE_TSX
  if (seq == lockseq_rtm && !__kmp_cpuinfo.rtm) {
    seq = lockseq_queuing;
  }
#endif
  kmp_indirect_locktag_t tag = KMP_GET_I_TAG(seq);
  kmp_indirect_lock_t *l =
      __kmp_allocate_indirect_lock((void **)lock, __kmp_entry_gtid(), tag);
  KMP_I_LOCK_FUNC(l, init)(l->lock);
  KA_TRACE(
      20, ("__kmp_init_indirect_lock: initialized indirect lock with type#%d\n",
           seq));
}

static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock) {
  kmp_uint32 gtid = __kmp_entry_gtid();
  kmp_indirect_lock_t *l =
      __kmp_lookup_indirect_lock((void **)lock, "omp_destroy_lock");
  KMP_I_LOCK_FUNC(l, destroy)(l->lock);
  kmp_indirect_locktag_t tag = l->type;

  __kmp_acquire_lock(&__kmp_global_lock, gtid);

  // Use the base lock's space to keep the pool chain.
  l->lock->pool.next = (kmp_user_lock_p)__kmp_indirect_lock_pool[tag];
  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
    l->lock->pool.index = KMP_EXTRACT_I_INDEX(lock);
  }
  __kmp_indirect_lock_pool[tag] = l;

  __kmp_release_lock(&__kmp_global_lock, gtid);
}

static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
  kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
  return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
}

static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
  kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
  return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
}

static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
  kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
  return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
}

static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
                                               kmp_int32 gtid) {
  kmp_indirect_lock_t *l =
      __kmp_lookup_indirect_lock((void **)lock, "omp_set_lock");
  return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
}

static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
                                                 kmp_int32 gtid) {
  kmp_indirect_lock_t *l =
      __kmp_lookup_indirect_lock((void **)lock, "omp_unset_lock");
  return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
}

static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
                                                kmp_int32 gtid) {
  kmp_indirect_lock_t *l =
      __kmp_lookup_indirect_lock((void **)lock, "omp_test_lock");
  return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
}

kmp_dyna_lockseq_t __kmp_user_lock_seq = lockseq_queuing;

// This is used only in kmp_error.cpp when consistency checking is on.
kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p lck, kmp_uint32 seq) {
  switch (seq) {
  case lockseq_tas:
  case lockseq_nested_tas:
    return __kmp_get_tas_lock_owner((kmp_tas_lock_t *)lck);
#if KMP_USE_FUTEX
  case lockseq_futex:
  case lockseq_nested_futex:
    return __kmp_get_futex_lock_owner((kmp_futex_lock_t *)lck);
#endif
  case lockseq_ticket:
  case lockseq_nested_ticket:
    return __kmp_get_ticket_lock_owner((kmp_ticket_lock_t *)lck);
  case lockseq_queuing:
  case lockseq_nested_queuing:
#if KMP_USE_ADAPTIVE_LOCKS
  case lockseq_adaptive:
#endif
    return __kmp_get_queuing_lock_owner((kmp_queuing_lock_t *)lck);
  case lockseq_drdpa:
  case lockseq_nested_drdpa:
    return __kmp_get_drdpa_lock_owner((kmp_drdpa_lock_t *)lck);
  default:
    return 0;
  }
}

// Initializes data for dynamic user locks.
void __kmp_init_dynamic_user_locks() {
  // Initialize jump table for the lock functions
  if (__kmp_env_consistency_check) {
    __kmp_direct_set = direct_set_check;
    __kmp_direct_unset = direct_unset_check;
    __kmp_direct_test = direct_test_check;
    __kmp_indirect_set = indirect_set_check;
    __kmp_indirect_unset = indirect_unset_check;
    __kmp_indirect_test = indirect_test_check;
  } else {
    __kmp_direct_set = direct_set;
    __kmp_direct_unset = direct_unset;
    __kmp_direct_test = direct_test;
    __kmp_indirect_set = indirect_set;
    __kmp_indirect_unset = indirect_unset;
    __kmp_indirect_test = indirect_test;
  }
  // If the user locks have already been initialized, then return. Allow the
  // switch between different KMP_CONSISTENCY_CHECK values, but do not allocate
  // new lock tables if they have already been allocated.
  if (__kmp_init_user_locks)
    return;

  // Initialize lock index table
  __kmp_i_lock_table.size = KMP_I_LOCK_CHUNK;
  __kmp_i_lock_table.table =
      (kmp_indirect_lock_t **)__kmp_allocate(sizeof(kmp_indirect_lock_t *));
  *(__kmp_i_lock_table.table) = (kmp_indirect_lock_t *)__kmp_allocate(
      KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
  __kmp_i_lock_table.next = 0;

  // Indirect lock size
  __kmp_indirect_lock_size[locktag_ticket] = sizeof(kmp_ticket_lock_t);
  __kmp_indirect_lock_size[locktag_queuing] = sizeof(kmp_queuing_lock_t);
#if KMP_USE_ADAPTIVE_LOCKS
  __kmp_indirect_lock_size[locktag_adaptive] = sizeof(kmp_adaptive_lock_t);
#endif
  __kmp_indirect_lock_size[locktag_drdpa] = sizeof(kmp_drdpa_lock_t);
#if KMP_USE_TSX
  __kmp_indirect_lock_size[locktag_rtm] = sizeof(kmp_queuing_lock_t);
#endif
  __kmp_indirect_lock_size[locktag_nested_tas] = sizeof(kmp_tas_lock_t);
#if KMP_USE_FUTEX
  __kmp_indirect_lock_size[locktag_nested_futex] = sizeof(kmp_futex_lock_t);
#endif
  __kmp_indirect_lock_size[locktag_nested_ticket] = sizeof(kmp_ticket_lock_t);
  __kmp_indirect_lock_size[locktag_nested_queuing] = sizeof(kmp_queuing_lock_t);
  __kmp_indirect_lock_size[locktag_nested_drdpa] = sizeof(kmp_drdpa_lock_t);

// Initialize lock accessor/modifier
#define fill_jumps(table, expand, sep)                                         \
  {                                                                            \
    table[locktag##sep##ticket] = expand(ticket);                              \
    table[locktag##sep##queuing] = expand(queuing);                            \
    table[locktag##sep##drdpa] = expand(drdpa);                                \
  }

#if KMP_USE_ADAPTIVE_LOCKS
#define fill_table(table, expand)                                              \
  {                                                                            \
    fill_jumps(table, expand, _);                                              \
    table[locktag_adaptive] = expand(queuing);                                 \
    fill_jumps(table, expand, _nested_);                                       \
  }
#else
#define fill_table(table, expand)                                              \
  {                                                                            \
    fill_jumps(table, expand, _);                                              \
    fill_jumps(table, expand, _nested_);                                       \
  }
#endif // KMP_USE_ADAPTIVE_LOCKS

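// For reference, a hand expansion of one of the fill_table() uses below
// (illustrative only; the adaptive entry appears only when
// KMP_USE_ADAPTIVE_LOCKS is on):
//
//   __kmp_indirect_set_location[locktag_ticket] =
//       (void (*)(kmp_user_lock_p, const ident_t *))
//           __kmp_set_ticket_lock_location;
//   __kmp_indirect_set_location[locktag_queuing] = ...;
//   __kmp_indirect_set_location[locktag_drdpa] = ...;
//   __kmp_indirect_set_location[locktag_adaptive] = ...; // queuing variant
//   __kmp_indirect_set_location[locktag_nested_ticket] = ...;
//   // ... and likewise for the remaining nested tags.
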
#define expand(l)                                                              \
  (void (*)(kmp_user_lock_p, const ident_t *)) __kmp_set_##l##_lock_location
  fill_table(__kmp_indirect_set_location, expand);
#undef expand
#define expand(l)                                                              \
  (void (*)(kmp_user_lock_p, kmp_lock_flags_t)) __kmp_set_##l##_lock_flags
  fill_table(__kmp_indirect_set_flags, expand);
#undef expand
#define expand(l)                                                              \
  (const ident_t *(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_location
  fill_table(__kmp_indirect_get_location, expand);
#undef expand
#define expand(l)                                                              \
  (kmp_lock_flags_t(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_flags
  fill_table(__kmp_indirect_get_flags, expand);
#undef expand

  __kmp_init_user_locks = TRUE;
}

// Clean up the lock table.
void __kmp_cleanup_indirect_user_locks() {
  kmp_lock_index_t i;
  int k;

  // Clean up locks in the pools first (they were already destroyed before going
  // into the pools).
  for (k = 0; k < KMP_NUM_I_LOCKS; ++k) {
    kmp_indirect_lock_t *l = __kmp_indirect_lock_pool[k];
    while (l != NULL) {
      kmp_indirect_lock_t *ll = l;
      l = (kmp_indirect_lock_t *)l->lock->pool.next;
      KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: freeing %p from pool\n",
                    ll));
      __kmp_free(ll->lock);
      ll->lock = NULL;
    }
    __kmp_indirect_lock_pool[k] = NULL;
  }
  // Clean up the remaining undestroyed locks.
  for (i = 0; i < __kmp_i_lock_table.next; i++) {
    kmp_indirect_lock_t *l = KMP_GET_I_LOCK(i);
    if (l->lock != NULL) {
      // Locks not destroyed explicitly need to be destroyed here.
      KMP_I_LOCK_FUNC(l, destroy)(l->lock);
      KA_TRACE(
          20,
          ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p from table\n",
           l));
      __kmp_free(l->lock);
    }
  }
  // Free the table
  for (i = 0; i < __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK; i++)
    __kmp_free(__kmp_i_lock_table.table[i]);
  __kmp_free(__kmp_i_lock_table.table);

  __kmp_init_user_locks = FALSE;
}

enum kmp_lock_kind __kmp_user_lock_kind = lk_default;
int __kmp_num_locks_in_block = 1; // FIXME - tune this value

#else // KMP_USE_DYNAMIC_LOCK

/* user locks
 * They are implemented as a table of function pointers which are set to the
 * lock functions of the appropriate kind, once that has been determined. */

enum kmp_lock_kind __kmp_user_lock_kind = lk_default;

size_t __kmp_base_user_lock_size = 0;
size_t __kmp_user_lock_size = 0;

kmp_int32 (*__kmp_get_user_lock_owner_)(kmp_user_lock_p lck) = NULL;
int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
                                            kmp_int32 gtid) = NULL;

int (*__kmp_test_user_lock_with_checks_)(kmp_user_lock_p lck,
                                         kmp_int32 gtid) = NULL;
int (*__kmp_release_user_lock_with_checks_)(kmp_user_lock_p lck,
                                            kmp_int32 gtid) = NULL;
void (*__kmp_init_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
void (*__kmp_destroy_user_lock_)(kmp_user_lock_p lck) = NULL;
void (*__kmp_destroy_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
                                                   kmp_int32 gtid) = NULL;

int (*__kmp_test_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
                                                kmp_int32 gtid) = NULL;
int (*__kmp_release_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
                                                   kmp_int32 gtid) = NULL;
void (*__kmp_init_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
void (*__kmp_destroy_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;

int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck) = NULL;
const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck) = NULL;
void (*__kmp_set_user_lock_location_)(kmp_user_lock_p lck,
                                      const ident_t *loc) = NULL;
kmp_lock_flags_t (*__kmp_get_user_lock_flags_)(kmp_user_lock_p lck) = NULL;
void (*__kmp_set_user_lock_flags_)(kmp_user_lock_p lck,
                                   kmp_lock_flags_t flags) = NULL;

void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind) {
  switch (user_lock_kind) {
  case lk_default:
  default:
    KMP_ASSERT(0);

  case lk_tas: {
    __kmp_base_user_lock_size = sizeof(kmp_base_tas_lock_t);
    __kmp_user_lock_size = sizeof(kmp_tas_lock_t);

    __kmp_get_user_lock_owner_ =
        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_tas_lock_owner);

    if (__kmp_env_consistency_check) {
      KMP_BIND_USER_LOCK_WITH_CHECKS(tas);
      KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(tas);
    } else {
      KMP_BIND_USER_LOCK(tas);
      KMP_BIND_NESTED_USER_LOCK(tas);
    }

    __kmp_destroy_user_lock_ =
        (void (*)(kmp_user_lock_p))(&__kmp_destroy_tas_lock);

    __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL;

    __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL;

    __kmp_set_user_lock_location_ =
        (void (*)(kmp_user_lock_p, const ident_t *))NULL;

    __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL;

    __kmp_set_user_lock_flags_ =
        (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL;
  } break;

#if KMP_USE_FUTEX

  case lk_futex: {
    __kmp_base_user_lock_size = sizeof(kmp_base_futex_lock_t);
    __kmp_user_lock_size = sizeof(kmp_futex_lock_t);

    __kmp_get_user_lock_owner_ =
        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_futex_lock_owner);

    if (__kmp_env_consistency_check) {
      KMP_BIND_USER_LOCK_WITH_CHECKS(futex);
      KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(futex);
    } else {
      KMP_BIND_USER_LOCK(futex);
      KMP_BIND_NESTED_USER_LOCK(futex);
    }

    __kmp_destroy_user_lock_ =
        (void (*)(kmp_user_lock_p))(&__kmp_destroy_futex_lock);

    __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL;

    __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL;

    __kmp_set_user_lock_location_ =
        (void (*)(kmp_user_lock_p, const ident_t *))NULL;

    __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL;

    __kmp_set_user_lock_flags_ =
        (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL;
  } break;

#endif // KMP_USE_FUTEX

  case lk_ticket: {
    __kmp_base_user_lock_size = sizeof(kmp_base_ticket_lock_t);
    __kmp_user_lock_size = sizeof(kmp_ticket_lock_t);

    __kmp_get_user_lock_owner_ =
        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_owner);

    if (__kmp_env_consistency_check) {
      KMP_BIND_USER_LOCK_WITH_CHECKS(ticket);
      KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(ticket);
    } else {
      KMP_BIND_USER_LOCK(ticket);
      KMP_BIND_NESTED_USER_LOCK(ticket);
    }

    __kmp_destroy_user_lock_ =
        (void (*)(kmp_user_lock_p))(&__kmp_destroy_ticket_lock);

    __kmp_is_user_lock_initialized_ =
        (int (*)(kmp_user_lock_p))(&__kmp_is_ticket_lock_initialized);

    __kmp_get_user_lock_location_ =
        (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_location);

    __kmp_set_user_lock_location_ = (void (*)(
        kmp_user_lock_p, const ident_t *))(&__kmp_set_ticket_lock_location);

    __kmp_get_user_lock_flags_ =
        (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_flags);

    __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
        &__kmp_set_ticket_lock_flags);
  } break;

  case lk_queuing: {
    __kmp_base_user_lock_size = sizeof(kmp_base_queuing_lock_t);
    __kmp_user_lock_size = sizeof(kmp_queuing_lock_t);

    __kmp_get_user_lock_owner_ =
        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner);

    if (__kmp_env_consistency_check) {
      KMP_BIND_USER_LOCK_WITH_CHECKS(queuing);
      KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(queuing);
    } else {
      KMP_BIND_USER_LOCK(queuing);
      KMP_BIND_NESTED_USER_LOCK(queuing);
    }

    __kmp_destroy_user_lock_ =
        (void (*)(kmp_user_lock_p))(&__kmp_destroy_queuing_lock);

    __kmp_is_user_lock_initialized_ =
        (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized);

    __kmp_get_user_lock_location_ =
        (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location);

    __kmp_set_user_lock_location_ = (void (*)(
        kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location);

    __kmp_get_user_lock_flags_ =
        (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags);

    __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
        &__kmp_set_queuing_lock_flags);
  } break;

#if KMP_USE_ADAPTIVE_LOCKS
  case lk_adaptive: {
    __kmp_base_user_lock_size = sizeof(kmp_base_adaptive_lock_t);
    __kmp_user_lock_size = sizeof(kmp_adaptive_lock_t);

    __kmp_get_user_lock_owner_ =
        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner);

    if (__kmp_env_consistency_check) {
      KMP_BIND_USER_LOCK_WITH_CHECKS(adaptive);
    } else {
      KMP_BIND_USER_LOCK(adaptive);
    }

    __kmp_destroy_user_lock_ =
        (void (*)(kmp_user_lock_p))(&__kmp_destroy_adaptive_lock);

    __kmp_is_user_lock_initialized_ =
        (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized);

    __kmp_get_user_lock_location_ =
        (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location);

    __kmp_set_user_lock_location_ = (void (*)(
        kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location);

    __kmp_get_user_lock_flags_ =
        (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags);

    __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
        &__kmp_set_queuing_lock_flags);

  } break;
#endif // KMP_USE_ADAPTIVE_LOCKS

  case lk_drdpa: {
    __kmp_base_user_lock_size = sizeof(kmp_base_drdpa_lock_t);
    __kmp_user_lock_size = sizeof(kmp_drdpa_lock_t);

    __kmp_get_user_lock_owner_ =
        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_owner);

    if (__kmp_env_consistency_check) {
      KMP_BIND_USER_LOCK_WITH_CHECKS(drdpa);
      KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(drdpa);
    } else {
      KMP_BIND_USER_LOCK(drdpa);
      KMP_BIND_NESTED_USER_LOCK(drdpa);
    }

    __kmp_destroy_user_lock_ =
        (void (*)(kmp_user_lock_p))(&__kmp_destroy_drdpa_lock);

    __kmp_is_user_lock_initialized_ =
        (int (*)(kmp_user_lock_p))(&__kmp_is_drdpa_lock_initialized);

    __kmp_get_user_lock_location_ =
        (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_location);

    __kmp_set_user_lock_location_ = (void (*)(
        kmp_user_lock_p, const ident_t *))(&__kmp_set_drdpa_lock_location);

    __kmp_get_user_lock_flags_ =
        (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_flags);

    __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
        &__kmp_set_drdpa_lock_flags);
  } break;
  }
}

// ----------------------------------------------------------------------------
// User lock table & lock allocation

kmp_lock_table_t __kmp_user_lock_table = {1, 0, NULL};
kmp_user_lock_p __kmp_lock_pool = NULL;

// Lock block-allocation support.
kmp_block_of_locks *__kmp_lock_blocks = NULL;
int __kmp_num_locks_in_block = 1; // FIXME - tune this value

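// Append lck to the global user lock table, growing the table (and chaining
// the old allocation off element [0]) when it is full; returns the new entry's
// index. The caller must hold __kmp_global_lock.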
static kmp_lock_index_t __kmp_lock_table_insert(kmp_user_lock_p lck) {
  // Assume that kmp_global_lock is held upon entry/exit.
  kmp_lock_index_t index;
  if (__kmp_user_lock_table.used >= __kmp_user_lock_table.allocated) {
    kmp_lock_index_t size;
    kmp_user_lock_p *table;
    // Reallocate lock table.
    if (__kmp_user_lock_table.allocated == 0) {
      size = 1024;
    } else {
      size = __kmp_user_lock_table.allocated * 2;
    }
    table = (kmp_user_lock_p *)__kmp_allocate(sizeof(kmp_user_lock_p) * size);
    KMP_MEMCPY(table + 1, __kmp_user_lock_table.table + 1,
               sizeof(kmp_user_lock_p) * (__kmp_user_lock_table.used - 1));
    table[0] = (kmp_user_lock_p)__kmp_user_lock_table.table;
    // We cannot free the previous table now, since it may be in use by other
    // threads. So save the pointer to the previous table in the first
    // element of the new table. All the tables will be organized into a list,
    // and can be freed when the library is shutting down.
    __kmp_user_lock_table.table = table;
    __kmp_user_lock_table.allocated = size;
  }
  KMP_DEBUG_ASSERT(__kmp_user_lock_table.used <
                   __kmp_user_lock_table.allocated);
  index = __kmp_user_lock_table.used;
  __kmp_user_lock_table.table[index] = lck;
  ++__kmp_user_lock_table.used;
  return index;
}

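// Carve the next lock out of the current block, allocating a fresh block of
// __kmp_num_locks_in_block locks (with its kmp_block_of_locks header placed
// after the lock storage) when the current one is exhausted. The caller must
// hold __kmp_global_lock.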
static kmp_user_lock_p __kmp_lock_block_allocate() {
  // Assume that kmp_global_lock is held upon entry/exit.
  static int last_index = 0;
  if ((last_index >= __kmp_num_locks_in_block) || (__kmp_lock_blocks == NULL)) {
    // Restart the index.
    last_index = 0;
    // Need to allocate a new block.
    KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0);
    size_t space_for_locks = __kmp_user_lock_size * __kmp_num_locks_in_block;
    char *buffer =
        (char *)__kmp_allocate(space_for_locks + sizeof(kmp_block_of_locks));
    // Set up the new block.
    kmp_block_of_locks *new_block =
        (kmp_block_of_locks *)(&buffer[space_for_locks]);
    new_block->next_block = __kmp_lock_blocks;
    new_block->locks = (void *)buffer;
    // Publish the new block.
    KMP_MB();
    __kmp_lock_blocks = new_block;
  }
  kmp_user_lock_p ret = (kmp_user_lock_p)(&(
      ((char *)(__kmp_lock_blocks->locks))[last_index * __kmp_user_lock_size]));
  last_index++;
  return ret;
}

// Get memory for a lock. It may be freshly allocated memory or reused memory
// from the lock pool.
kmp_user_lock_p __kmp_user_lock_allocate(void **user_lock, kmp_int32 gtid,
                                         kmp_lock_flags_t flags) {
  kmp_user_lock_p lck;
  kmp_lock_index_t index;
  KMP_DEBUG_ASSERT(user_lock);

  __kmp_acquire_lock(&__kmp_global_lock, gtid);

  if (__kmp_lock_pool == NULL) {
    // Lock pool is empty. Allocate new memory.

    // ANNOTATION: Found no good way to express the synchronization
    // between allocation and usage, so ignore the allocation
    ANNOTATE_IGNORE_WRITES_BEGIN();
    if (__kmp_num_locks_in_block <= 1) { // Tune this cutoff point.
      lck = (kmp_user_lock_p)__kmp_allocate(__kmp_user_lock_size);
    } else {
      lck = __kmp_lock_block_allocate();
    }
    ANNOTATE_IGNORE_WRITES_END();

    // Insert the lock in the table so that it can be freed in __kmp_cleanup,
    // and so the debugger has info on all allocated locks.
    index = __kmp_lock_table_insert(lck);
  } else {
    // Pick up a lock from the pool.
    lck = __kmp_lock_pool;
    index = __kmp_lock_pool->pool.index;
    __kmp_lock_pool = __kmp_lock_pool->pool.next;
  }

  // We could potentially differentiate between nested and regular locks
  // here, and do the lock table lookup for regular locks only.
  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
    *((kmp_lock_index_t *)user_lock) = index;
  } else {
    *((kmp_user_lock_p *)user_lock) = lck;
  }

  // Mark the lock if it is a critical section lock.
  __kmp_set_user_lock_flags(lck, flags);

  __kmp_release_lock(&__kmp_global_lock, gtid); // AC: TODO move this line up

  return lck;
}

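// Note on the pool representation (see also __kmp_user_lock_allocate above):
// a freed lock is pushed onto __kmp_lock_pool through lck->pool.next, and when
// OMP_LOCK_T_SIZE < sizeof(void *) its table index is remembered in
// lck->pool.index so the same index can be handed back out on reuse.
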
// Return the lock's memory to the pool for reuse.
void __kmp_user_lock_free(void **user_lock, kmp_int32 gtid,
                          kmp_user_lock_p lck) {
  KMP_DEBUG_ASSERT(user_lock != NULL);
  KMP_DEBUG_ASSERT(lck != NULL);

  __kmp_acquire_lock(&__kmp_global_lock, gtid);

  lck->pool.next = __kmp_lock_pool;
  __kmp_lock_pool = lck;
  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
    kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
    KMP_DEBUG_ASSERT(0 < index && index <= __kmp_user_lock_table.used);
    lck->pool.index = index;
  }

  __kmp_release_lock(&__kmp_global_lock, gtid);
}

kmp_user_lock_p __kmp_lookup_user_lock(void **user_lock, char const *func) {
  kmp_user_lock_p lck = NULL;

  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
    kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
    if (__kmp_env_consistency_check) {
      if (!(0 < index && index < __kmp_user_lock_table.used)) {
        KMP_FATAL(LockIsUninitialized, func);
      }
    }
    KMP_DEBUG_ASSERT(0 < index && index < __kmp_user_lock_table.used);
    KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0);
    lck = __kmp_user_lock_table.table[index];
  } else {
    lck = *((kmp_user_lock_p *)user_lock);
  }

  if (__kmp_env_consistency_check) {
    if (lck == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  return lck;
}

void __kmp_cleanup_user_locks(void) {
  // Reset the lock pool. Don't worry about the locks in the pool--we will free
  // them when iterating through the lock table (it includes all the locks,
  // dead or alive).
  __kmp_lock_pool = NULL;

#define IS_CRITICAL(lck)                                                       \
  ((__kmp_get_user_lock_flags_ != NULL) &&                                     \
   ((*__kmp_get_user_lock_flags_)(lck)&kmp_lf_critical_section))

  // Loop through the lock table, freeing all locks.
  // Do not free item [0]; it is reserved for the lock tables list.
  //
  // FIXME - we are iterating through a list of (pointers to) objects of type
  // union kmp_user_lock, but we have no way of knowing whether the base type is
  // currently "pool" or whatever the global user lock type is.
  //
  // We are relying on the fact that for all of the user lock types
  // (except "tas"), the first field in the lock struct is the "initialized"
  // field, which is set to the address of the lock object itself when
  // the lock is initialized.  When the union is of type "pool", the
  // first field is a pointer to the next object in the free list, which
  // will not be the same address as the object itself.
  //
  // This means that the check (*__kmp_is_user_lock_initialized_)(lck) will fail
  // for "pool" objects on the free list.  This must happen as the "location"
  // field of real user locks overlaps the "index" field of "pool" objects.
  //
  // It would be better to run through the free list, and remove all "pool"
  // objects from the lock table before executing this loop.  However,
  // "pool" objects do not always have their index field set (only on
  // lin_32e), and I don't want to search the lock table for the address
  // of every "pool" object on the free list.
  while (__kmp_user_lock_table.used > 1) {
    const ident *loc;

    // Reduce __kmp_user_lock_table.used before freeing the lock,
    // so that the state of the locks stays consistent.
    kmp_user_lock_p lck =
        __kmp_user_lock_table.table[--__kmp_user_lock_table.used];

    if ((__kmp_is_user_lock_initialized_ != NULL) &&
        (*__kmp_is_user_lock_initialized_)(lck)) {
      // Issue a warning if: KMP_CONSISTENCY_CHECK AND lock is initialized AND
      // it is NOT a critical section (user is not responsible for destroying
      // criticals) AND we know the source location to report.
      if (__kmp_env_consistency_check && (!IS_CRITICAL(lck)) &&
          ((loc = __kmp_get_user_lock_location(lck)) != NULL) &&
          (loc->psource != NULL)) {
        kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 0);
        KMP_WARNING(CnsLockNotDestroyed, str_loc.file, str_loc.line);
        __kmp_str_loc_free(&str_loc);
      }

#ifdef KMP_DEBUG
      if (IS_CRITICAL(lck)) {
        KA_TRACE(
            20,
            ("__kmp_cleanup_user_locks: free critical section lock %p (%p)\n",
             lck, *(void **)lck));
      } else {
        KA_TRACE(20, ("__kmp_cleanup_user_locks: free lock %p (%p)\n", lck,
                      *(void **)lck));
      }
#endif // KMP_DEBUG

      // Cleanup internal lock dynamic resources (for drdpa locks particularly).
      __kmp_destroy_user_lock(lck);
    }

    // Free the lock if block allocation of locks is not used.
    if (__kmp_lock_blocks == NULL) {
      __kmp_free(lck);
    }
  }

#undef IS_CRITICAL

  // Delete lock table(s).
  kmp_user_lock_p *table_ptr = __kmp_user_lock_table.table;
  __kmp_user_lock_table.table = NULL;
  __kmp_user_lock_table.allocated = 0;

  while (table_ptr != NULL) {
    // In the first element we saved the pointer to the previous
    // (smaller) lock table.
    kmp_user_lock_p *next = (kmp_user_lock_p *)(table_ptr[0]);
    __kmp_free(table_ptr);
    table_ptr = next;
  }

  // Free buffers allocated for blocks of locks.
  kmp_block_of_locks_t *block_ptr = __kmp_lock_blocks;
  __kmp_lock_blocks = NULL;

  while (block_ptr != NULL) {
    kmp_block_of_locks_t *next = block_ptr->next_block;
    __kmp_free(block_ptr->locks);
    // *block_ptr itself was allocated at the end of the locks vector.
    block_ptr = next;
  }

  TCW_4(__kmp_init_user_locks, FALSE);
}

#endif // KMP_USE_DYNAMIC_LOCK