1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
// Size constant (512). It is not referenced in the portion of the file visible
// here; presumably a message-buffer length -- TODO confirm against its users.
#define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43   } else if (__kmp_ignore_mppbeg() == FALSE) {
44     // By default __kmp_ignore_mppbeg() returns TRUE.
45     __kmp_internal_begin();
46     KC_TRACE(10, ("__kmpc_begin: called\n"));
47   }
48 }
49 
50 /*!
51  * @ingroup STARTUP_SHUTDOWN
52  * @param loc source location information
53  *
54  * Shutdown the runtime library. This is also optional, and even if called will
55  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
56  * zero.
57  */
58 void __kmpc_end(ident_t *loc) {
59   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
60   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
61   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62   // returns FALSE and __kmpc_end() will unregister this root (it can cause
63   // library shut down).
64   if (__kmp_ignore_mppend() == FALSE) {
65     KC_TRACE(10, ("__kmpc_end: called\n"));
66     KA_TRACE(30, ("__kmpc_end\n"));
67 
68     __kmp_internal_end_thread(-1);
69   }
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71   // Normal exit process on Windows does not allow worker threads of the final
72   // parallel region to finish reporting their events, so shutting down the
73   // library here fixes the issue at least for the cases where __kmpc_end() is
74   // placed properly.
75   if (ompt_enabled.enabled)
76     __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
78 }
79 
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84 
85 This function can be called in any context.
86 
87 If the runtime has ony been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the primary thread is necessarily thread zero).
92 
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
97 */
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99   kmp_int32 gtid = __kmp_entry_gtid();
100 
101   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102 
103   return gtid;
104 }
105 
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110 
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
115 reflects the total over all such calls. Similarly the runtime maintains
116 underlying threads even when they are not active (since the cost of creating
117 and destroying OS threads is high), this call counts all such threads even if
118 they are not waiting for work.
119 */
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121   KC_TRACE(10,
122            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123 
124   return TCR_4(__kmp_all_nth);
125 }
126 
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135   return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137 
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145 
146   return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148 
149 /*!
150  * @ingroup DEPRECATED
151  * @param loc location description
152  *
153  * This function need not be called. It always returns TRUE.
154  */
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157 
158   return TRUE;
159 
160 #else
161 
162   const char *semi2;
163   const char *semi3;
164   int line_no;
165 
166   if (__kmp_par_range == 0) {
167     return TRUE;
168   }
169   semi2 = loc->psource;
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2 + 1, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   if (__kmp_par_range_filename[0]) {
182     const char *name = semi2 - 1;
183     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184       name--;
185     }
186     if ((*name == '/') || (*name == ';')) {
187       name++;
188     }
189     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190       return __kmp_par_range < 0;
191     }
192   }
193   semi3 = strchr(semi2 + 1, ';');
194   if (__kmp_par_range_routine[0]) {
195     if ((semi3 != NULL) && (semi3 > semi2) &&
196         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197       return __kmp_par_range < 0;
198     }
199   }
200   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202       return __kmp_par_range > 0;
203     }
204     return __kmp_par_range < 0;
205   }
206   return TRUE;
207 
208 #endif /* KMP_DEBUG */
209 }
210 
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218   return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220 
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226 
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
229 */
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                              kmp_int32 num_threads) {
232   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                 global_tid, num_threads));
234   __kmp_assert_valid_gtid(global_tid);
235   __kmp_push_num_threads(loc, global_tid, num_threads);
236 }
237 
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240   /* the num_threads are automatically popped */
241 }
242 
243 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
244                            kmp_int32 proc_bind) {
245   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
246                 proc_bind));
247   __kmp_assert_valid_gtid(global_tid);
248   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
249 }
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   // If we were in a serial region, then stop the serial timer, record
266   // the event, and start parallel region timer
267   stats_state_e previous_state = KMP_GET_THREAD_STATE();
268   if (previous_state == stats_state_e::SERIAL_REGION) {
269     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
270   } else {
271     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
272   }
273   int inParallel = __kmpc_in_parallel(loc);
274   if (inParallel) {
275     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
276   } else {
277     KMP_COUNT_BLOCK(OMP_PARALLEL);
278   }
279 #endif
280 
281   // maybe to save thr_state is enough here
282   {
283     va_list ap;
284     va_start(ap, microtask);
285 
286 #if OMPT_SUPPORT
287     ompt_frame_t *ompt_frame;
288     if (ompt_enabled.enabled) {
289       kmp_info_t *master_th = __kmp_threads[gtid];
290       kmp_team_t *parent_team = master_th->th.th_team;
291       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
292       if (lwt)
293         ompt_frame = &(lwt->ompt_task_info.frame);
294       else {
295         int tid = __kmp_tid_from_gtid(gtid);
296         ompt_frame = &(
297             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
298       }
299       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
300     }
301     OMPT_STORE_RETURN_ADDRESS(gtid);
302 #endif
303 
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_FORKING();
306 #endif
307     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
308                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
309                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
310                     kmp_va_addr_of(ap));
311 #if INCLUDE_SSC_MARKS
312     SSC_MARK_JOINING();
313 #endif
314     __kmp_join_call(loc, gtid
315 #if OMPT_SUPPORT
316                     ,
317                     fork_context_intel
318 #endif
319     );
320 
321     va_end(ap);
322   }
323 
324 #if KMP_STATS_ENABLED
325   if (previous_state == stats_state_e::SERIAL_REGION) {
326     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
327     KMP_SET_THREAD_STATE(previous_state);
328   } else {
329     KMP_POP_PARTITIONED_TIMER();
330   }
331 #endif // KMP_STATS_ENABLED
332 }
333 
334 /*!
335 @ingroup PARALLEL
336 @param loc source location information
337 @param global_tid global thread number
338 @param num_teams number of teams requested for the teams construct
339 @param num_threads number of threads per team requested for the teams construct
340 
341 Set the number of teams to be used by the teams construct.
342 This call is only required if the teams construct has a `num_teams` clause
343 or a `thread_limit` clause (or both).
344 */
345 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
346                            kmp_int32 num_teams, kmp_int32 num_threads) {
347   KA_TRACE(20,
348            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
349             global_tid, num_teams, num_threads));
350   __kmp_assert_valid_gtid(global_tid);
351   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
352 }
353 
354 /*!
355 @ingroup PARALLEL
356 @param loc source location information
357 @param global_tid global thread number
358 @param num_teams_lo lower bound on number of teams requested for the teams
359 construct
360 @param num_teams_up upper bound on number of teams requested for the teams
361 construct
362 @param num_threads number of threads per team requested for the teams construct
363 
364 Set the number of teams to be used by the teams construct. The number of initial
365 teams cretaed will be greater than or equal to the lower bound and less than or
366 equal to the upper bound.
367 This call is only required if the teams construct has a `num_teams` clause
368 or a `thread_limit` clause (or both).
369 */
370 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
371                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
372                               kmp_int32 num_threads) {
373   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
374                 " num_teams_ub=%d num_threads=%d\n",
375                 global_tid, num_teams_lb, num_teams_ub, num_threads));
376   __kmp_assert_valid_gtid(global_tid);
377   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
378                           num_threads);
379 }
380 
381 /*!
382 @ingroup PARALLEL
383 @param loc  source location information
384 @param argc  total number of arguments in the ellipsis
385 @param microtask  pointer to callback routine consisting of outlined teams
386 construct
387 @param ...  pointers to shared variables that aren't global
388 
389 Do the actual fork and call the microtask in the relevant number of threads.
390 */
391 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
392                        ...) {
393   int gtid = __kmp_entry_gtid();
394   kmp_info_t *this_thr = __kmp_threads[gtid];
395   va_list ap;
396   va_start(ap, microtask);
397 
398 #if KMP_STATS_ENABLED
399   KMP_COUNT_BLOCK(OMP_TEAMS);
400   stats_state_e previous_state = KMP_GET_THREAD_STATE();
401   if (previous_state == stats_state_e::SERIAL_REGION) {
402     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
403   } else {
404     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
405   }
406 #endif
407 
408   // remember teams entry point and nesting level
409   this_thr->th.th_teams_microtask = microtask;
410   this_thr->th.th_teams_level =
411       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
412 
413 #if OMPT_SUPPORT
414   kmp_team_t *parent_team = this_thr->th.th_team;
415   int tid = __kmp_tid_from_gtid(gtid);
416   if (ompt_enabled.enabled) {
417     parent_team->t.t_implicit_task_taskdata[tid]
418         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
419   }
420   OMPT_STORE_RETURN_ADDRESS(gtid);
421 #endif
422 
423   // check if __kmpc_push_num_teams called, set default number of teams
424   // otherwise
425   if (this_thr->th.th_teams_size.nteams == 0) {
426     __kmp_push_num_teams(loc, gtid, 0, 0);
427   }
428   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
429   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
430   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
431 
432   __kmp_fork_call(
433       loc, gtid, fork_context_intel, argc,
434       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
435       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
436   __kmp_join_call(loc, gtid
437 #if OMPT_SUPPORT
438                   ,
439                   fork_context_intel
440 #endif
441   );
442 
443   // Pop current CG root off list
444   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
445   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
446   this_thr->th.th_cg_roots = tmp->up;
447   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
448                  " to node %p. cg_nthreads was %d\n",
449                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
450   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
451   int i = tmp->cg_nthreads--;
452   if (i == 1) { // check is we are the last thread in CG (not always the case)
453     __kmp_free(tmp);
454   }
455   // Restore current task's thread_limit from CG root
456   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
457   this_thr->th.th_current_task->td_icvs.thread_limit =
458       this_thr->th.th_cg_roots->cg_thread_limit;
459 
460   this_thr->th.th_teams_microtask = NULL;
461   this_thr->th.th_teams_level = 0;
462   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
463   va_end(ap);
464 #if KMP_STATS_ENABLED
465   if (previous_state == stats_state_e::SERIAL_REGION) {
466     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
467     KMP_SET_THREAD_STATE(previous_state);
468   } else {
469     KMP_POP_PARTITIONED_TIMER();
470   }
471 #endif // KMP_STATS_ENABLED
472 }
473 
474 // I don't think this function should ever have been exported.
475 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
476 // openmp code ever called it, but it's been exported from the RTL for so
477 // long that I'm afraid to remove the definition.
478 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
479 
480 /*!
481 @ingroup PARALLEL
482 @param loc  source location information
483 @param global_tid  global thread number
484 
485 Enter a serialized parallel construct. This interface is used to handle a
486 conditional parallel region, like this,
487 @code
488 #pragma omp parallel if (condition)
489 @endcode
490 when the condition is false.
491 */
492 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
493   // The implementation is now in kmp_runtime.cpp so that it can share static
494   // functions with kmp_fork_call since the tasks to be done are similar in
495   // each case.
496   __kmp_assert_valid_gtid(global_tid);
497 #if OMPT_SUPPORT
498   OMPT_STORE_RETURN_ADDRESS(global_tid);
499 #endif
500   __kmp_serialized_parallel(loc, global_tid);
501 }
502 
503 /*!
504 @ingroup PARALLEL
505 @param loc  source location information
506 @param global_tid  global thread number
507 
508 Leave a serialized parallel construct.
509 */
510 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
511   kmp_internal_control_t *top;
512   kmp_info_t *this_thr;
513   kmp_team_t *serial_team;
514 
515   KC_TRACE(10,
516            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
517 
518   /* skip all this code for autopar serialized loops since it results in
519      unacceptable overhead */
520   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
521     return;
522 
523   // Not autopar code
524   __kmp_assert_valid_gtid(global_tid);
525   if (!TCR_4(__kmp_init_parallel))
526     __kmp_parallel_initialize();
527 
528   __kmp_resume_if_soft_paused();
529 
530   this_thr = __kmp_threads[global_tid];
531   serial_team = this_thr->th.th_serial_team;
532 
533   kmp_task_team_t *task_team = this_thr->th.th_task_team;
534   // we need to wait for the proxy tasks before finishing the thread
535   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
536     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
537 
538   KMP_MB();
539   KMP_DEBUG_ASSERT(serial_team);
540   KMP_ASSERT(serial_team->t.t_serialized);
541   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
542   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
543   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
544   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
545 
546 #if OMPT_SUPPORT
547   if (ompt_enabled.enabled &&
548       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
549     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
550     if (ompt_enabled.ompt_callback_implicit_task) {
551       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
552           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
553           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
554     }
555 
556     // reset clear the task id only after unlinking the task
557     ompt_data_t *parent_task_data;
558     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
559 
560     if (ompt_enabled.ompt_callback_parallel_end) {
561       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
562           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
563           ompt_parallel_invoker_program | ompt_parallel_team,
564           OMPT_LOAD_RETURN_ADDRESS(global_tid));
565     }
566     __ompt_lw_taskteam_unlink(this_thr);
567     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
568   }
569 #endif
570 
571   /* If necessary, pop the internal control stack values and replace the team
572    * values */
573   top = serial_team->t.t_control_stack_top;
574   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
575     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
576     serial_team->t.t_control_stack_top = top->next;
577     __kmp_free(top);
578   }
579 
580   // if( serial_team -> t.t_serialized > 1 )
581   serial_team->t.t_level--;
582 
583   /* pop dispatch buffers stack */
584   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
585   {
586     dispatch_private_info_t *disp_buffer =
587         serial_team->t.t_dispatch->th_disp_buffer;
588     serial_team->t.t_dispatch->th_disp_buffer =
589         serial_team->t.t_dispatch->th_disp_buffer->next;
590     __kmp_free(disp_buffer);
591   }
592   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
593 
594   --serial_team->t.t_serialized;
595   if (serial_team->t.t_serialized == 0) {
596 
597     /* return to the parallel section */
598 
599 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
600     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
601       __kmp_clear_x87_fpu_status_word();
602       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
603       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
604     }
605 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
606 
607     this_thr->th.th_team = serial_team->t.t_parent;
608     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
609 
610     /* restore values cached in the thread */
611     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
612     this_thr->th.th_team_master =
613         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
614     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
615 
616     /* TODO the below shouldn't need to be adjusted for serialized teams */
617     this_thr->th.th_dispatch =
618         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
619 
620     __kmp_pop_current_task_from_thread(this_thr);
621 
622     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
623     this_thr->th.th_current_task->td_flags.executing = 1;
624 
625     if (__kmp_tasking_mode != tskm_immediate_exec) {
626       // Copy the task team from the new child / old parent team to the thread.
627       this_thr->th.th_task_team =
628           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
629       KA_TRACE(20,
630                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
631                 "team %p\n",
632                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
633     }
634   } else {
635     if (__kmp_tasking_mode != tskm_immediate_exec) {
636       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
637                     "depth of serial team %p to %d\n",
638                     global_tid, serial_team, serial_team->t.t_serialized));
639     }
640   }
641 
642   if (__kmp_env_consistency_check)
643     __kmp_pop_parallel(global_tid, NULL);
644 #if OMPT_SUPPORT
645   if (ompt_enabled.enabled)
646     this_thr->th.ompt_thread_info.state =
647         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
648                                            : ompt_state_work_parallel);
649 #endif
650 }
651 
652 /*!
653 @ingroup SYNCHRONIZATION
654 @param loc  source location information.
655 
656 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
657 depending on the memory ordering convention obeyed by the compiler
658 even that may not be necessary).
659 */
660 void __kmpc_flush(ident_t *loc) {
661   KC_TRACE(10, ("__kmpc_flush: called\n"));
662 
663   /* need explicit __mf() here since use volatile instead in library */
664   KMP_MB(); /* Flush all pending memory write invalidates.  */
665 
666 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
667 #if KMP_MIC
668 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
669 // We shouldn't need it, though, since the ABI rules require that
670 // * If the compiler generates NGO stores it also generates the fence
671 // * If users hand-code NGO stores they should insert the fence
672 // therefore no incomplete unordered stores should be visible.
673 #else
674   // C74404
675   // This is to address non-temporal store instructions (sfence needed).
676   // The clflush instruction is addressed either (mfence needed).
677   // Probably the non-temporal load monvtdqa instruction should also be
678   // addressed.
679   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
680   if (!__kmp_cpuinfo.initialized) {
681     __kmp_query_cpuid(&__kmp_cpuinfo);
682   }
683   if (!__kmp_cpuinfo.sse2) {
684     // CPU cannot execute SSE2 instructions.
685   } else {
686 #if KMP_COMPILER_ICC
687     _mm_mfence();
688 #elif KMP_COMPILER_MSVC
689     MemoryBarrier();
690 #else
691     __sync_synchronize();
692 #endif // KMP_COMPILER_ICC
693   }
694 #endif // KMP_MIC
695 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
696        KMP_ARCH_RISCV64)
697 // Nothing to see here move along
698 #elif KMP_ARCH_PPC64
699 // Nothing needed here (we have a real MB above).
700 #else
701 #error Unknown or unsupported architecture
702 #endif
703 
704 #if OMPT_SUPPORT && OMPT_OPTIONAL
705   if (ompt_enabled.ompt_callback_flush) {
706     ompt_callbacks.ompt_callback(ompt_callback_flush)(
707         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
708   }
709 #endif
710 }
711 
712 /* -------------------------------------------------------------------------- */
713 /*!
714 @ingroup SYNCHRONIZATION
715 @param loc source location information
716 @param global_tid thread id.
717 
718 Execute a barrier.
719 */
720 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
721   KMP_COUNT_BLOCK(OMP_BARRIER);
722   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
723   __kmp_assert_valid_gtid(global_tid);
724 
725   if (!TCR_4(__kmp_init_parallel))
726     __kmp_parallel_initialize();
727 
728   __kmp_resume_if_soft_paused();
729 
730   if (__kmp_env_consistency_check) {
731     if (loc == 0) {
732       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
733     }
734     __kmp_check_barrier(global_tid, ct_barrier, loc);
735   }
736 
737 #if OMPT_SUPPORT
738   ompt_frame_t *ompt_frame;
739   if (ompt_enabled.enabled) {
740     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
741     if (ompt_frame->enter_frame.ptr == NULL)
742       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
743   }
744   OMPT_STORE_RETURN_ADDRESS(global_tid);
745 #endif
746   __kmp_threads[global_tid]->th.th_ident = loc;
747   // TODO: explicit barrier_wait_id:
748   //   this function is called when 'barrier' directive is present or
749   //   implicit barrier at the end of a worksharing construct.
750   // 1) better to add a per-thread barrier counter to a thread data structure
751   // 2) set to 0 when a new team is created
752   // 4) no sync is required
753 
754   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
755 #if OMPT_SUPPORT && OMPT_OPTIONAL
756   if (ompt_enabled.enabled) {
757     ompt_frame->enter_frame = ompt_data_none;
758   }
759 #endif
760 }
761 
762 /* The BARRIER for a MASTER section is always explicit   */
763 /*!
764 @ingroup WORK_SHARING
765 @param loc  source location information.
766 @param global_tid  global thread number .
767 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
768 */
769 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
770   int status = 0;
771 
772   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
773   __kmp_assert_valid_gtid(global_tid);
774 
775   if (!TCR_4(__kmp_init_parallel))
776     __kmp_parallel_initialize();
777 
778   __kmp_resume_if_soft_paused();
779 
780   if (KMP_MASTER_GTID(global_tid)) {
781     KMP_COUNT_BLOCK(OMP_MASTER);
782     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
783     status = 1;
784   }
785 
786 #if OMPT_SUPPORT && OMPT_OPTIONAL
787   if (status) {
788     if (ompt_enabled.ompt_callback_masked) {
789       kmp_info_t *this_thr = __kmp_threads[global_tid];
790       kmp_team_t *team = this_thr->th.th_team;
791 
792       int tid = __kmp_tid_from_gtid(global_tid);
793       ompt_callbacks.ompt_callback(ompt_callback_masked)(
794           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
795           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
796           OMPT_GET_RETURN_ADDRESS(0));
797     }
798   }
799 #endif
800 
801   if (__kmp_env_consistency_check) {
802 #if KMP_USE_DYNAMIC_LOCK
803     if (status)
804       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
805     else
806       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
807 #else
808     if (status)
809       __kmp_push_sync(global_tid, ct_master, loc, NULL);
810     else
811       __kmp_check_sync(global_tid, ct_master, loc, NULL);
812 #endif
813   }
814 
815   return status;
816 }
817 
818 /*!
819 @ingroup WORK_SHARING
820 @param loc  source location information.
821 @param global_tid  global thread number .
822 
823 Mark the end of a <tt>master</tt> region. This should only be called by the
824 thread that executes the <tt>master</tt> region.
825 */
826 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
827   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
828   __kmp_assert_valid_gtid(global_tid);
829   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
830   KMP_POP_PARTITIONED_TIMER();
831 
832 #if OMPT_SUPPORT && OMPT_OPTIONAL
833   kmp_info_t *this_thr = __kmp_threads[global_tid];
834   kmp_team_t *team = this_thr->th.th_team;
835   if (ompt_enabled.ompt_callback_masked) {
836     int tid = __kmp_tid_from_gtid(global_tid);
837     ompt_callbacks.ompt_callback(ompt_callback_masked)(
838         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
839         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
840         OMPT_GET_RETURN_ADDRESS(0));
841   }
842 #endif
843 
844   if (__kmp_env_consistency_check) {
845     if (KMP_MASTER_GTID(global_tid))
846       __kmp_pop_sync(global_tid, ct_master, loc);
847   }
848 }
849 
850 /*!
851 @ingroup WORK_SHARING
852 @param loc  source location information.
853 @param global_tid  global thread number.
854 @param filter result of evaluating filter clause on thread global_tid, or zero
855 if no filter clause present
856 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
857 */
858 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
859   int status = 0;
860   int tid;
861   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
862   __kmp_assert_valid_gtid(global_tid);
863 
864   if (!TCR_4(__kmp_init_parallel))
865     __kmp_parallel_initialize();
866 
867   __kmp_resume_if_soft_paused();
868 
869   tid = __kmp_tid_from_gtid(global_tid);
870   if (tid == filter) {
871     KMP_COUNT_BLOCK(OMP_MASKED);
872     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
873     status = 1;
874   }
875 
876 #if OMPT_SUPPORT && OMPT_OPTIONAL
877   if (status) {
878     if (ompt_enabled.ompt_callback_masked) {
879       kmp_info_t *this_thr = __kmp_threads[global_tid];
880       kmp_team_t *team = this_thr->th.th_team;
881       ompt_callbacks.ompt_callback(ompt_callback_masked)(
882           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
883           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
884           OMPT_GET_RETURN_ADDRESS(0));
885     }
886   }
887 #endif
888 
889   if (__kmp_env_consistency_check) {
890 #if KMP_USE_DYNAMIC_LOCK
891     if (status)
892       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
893     else
894       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
895 #else
896     if (status)
897       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
898     else
899       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
900 #endif
901   }
902 
903   return status;
904 }
905 
906 /*!
907 @ingroup WORK_SHARING
908 @param loc  source location information.
909 @param global_tid  global thread number .
910 
911 Mark the end of a <tt>masked</tt> region. This should only be called by the
912 thread that executes the <tt>masked</tt> region.
913 */
914 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
915   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
916   __kmp_assert_valid_gtid(global_tid);
917   KMP_POP_PARTITIONED_TIMER();
918 
919 #if OMPT_SUPPORT && OMPT_OPTIONAL
920   kmp_info_t *this_thr = __kmp_threads[global_tid];
921   kmp_team_t *team = this_thr->th.th_team;
922   if (ompt_enabled.ompt_callback_masked) {
923     int tid = __kmp_tid_from_gtid(global_tid);
924     ompt_callbacks.ompt_callback(ompt_callback_masked)(
925         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
926         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
927         OMPT_GET_RETURN_ADDRESS(0));
928   }
929 #endif
930 
931   if (__kmp_env_consistency_check) {
932     __kmp_pop_sync(global_tid, ct_masked, loc);
933   }
934 }
935 
936 /*!
937 @ingroup WORK_SHARING
938 @param loc  source location information.
939 @param gtid  global thread number.
940 
941 Start execution of an <tt>ordered</tt> construct.
942 */
943 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
944   int cid = 0;
945   kmp_info_t *th;
946   KMP_DEBUG_ASSERT(__kmp_init_serial);
947 
948   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
949   __kmp_assert_valid_gtid(gtid);
950 
951   if (!TCR_4(__kmp_init_parallel))
952     __kmp_parallel_initialize();
953 
954   __kmp_resume_if_soft_paused();
955 
956 #if USE_ITT_BUILD
957   __kmp_itt_ordered_prep(gtid);
958 // TODO: ordered_wait_id
959 #endif /* USE_ITT_BUILD */
960 
961   th = __kmp_threads[gtid];
962 
963 #if OMPT_SUPPORT && OMPT_OPTIONAL
964   kmp_team_t *team;
965   ompt_wait_id_t lck;
966   void *codeptr_ra;
967   OMPT_STORE_RETURN_ADDRESS(gtid);
968   if (ompt_enabled.enabled) {
969     team = __kmp_team_from_gtid(gtid);
970     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
971     /* OMPT state update */
972     th->th.ompt_thread_info.wait_id = lck;
973     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
974 
975     /* OMPT event callback */
976     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
977     if (ompt_enabled.ompt_callback_mutex_acquire) {
978       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
979           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
980           codeptr_ra);
981     }
982   }
983 #endif
984 
985   if (th->th.th_dispatch->th_deo_fcn != 0)
986     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
987   else
988     __kmp_parallel_deo(&gtid, &cid, loc);
989 
990 #if OMPT_SUPPORT && OMPT_OPTIONAL
991   if (ompt_enabled.enabled) {
992     /* OMPT state update */
993     th->th.ompt_thread_info.state = ompt_state_work_parallel;
994     th->th.ompt_thread_info.wait_id = 0;
995 
996     /* OMPT event callback */
997     if (ompt_enabled.ompt_callback_mutex_acquired) {
998       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
999           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1000     }
1001   }
1002 #endif
1003 
1004 #if USE_ITT_BUILD
1005   __kmp_itt_ordered_start(gtid);
1006 #endif /* USE_ITT_BUILD */
1007 }
1008 
1009 /*!
1010 @ingroup WORK_SHARING
1011 @param loc  source location information.
1012 @param gtid  global thread number.
1013 
1014 End execution of an <tt>ordered</tt> construct.
1015 */
1016 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1017   int cid = 0;
1018   kmp_info_t *th;
1019 
1020   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1021   __kmp_assert_valid_gtid(gtid);
1022 
1023 #if USE_ITT_BUILD
1024   __kmp_itt_ordered_end(gtid);
1025 // TODO: ordered_wait_id
1026 #endif /* USE_ITT_BUILD */
1027 
1028   th = __kmp_threads[gtid];
1029 
1030   if (th->th.th_dispatch->th_dxo_fcn != 0)
1031     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1032   else
1033     __kmp_parallel_dxo(&gtid, &cid, loc);
1034 
1035 #if OMPT_SUPPORT && OMPT_OPTIONAL
1036   OMPT_STORE_RETURN_ADDRESS(gtid);
1037   if (ompt_enabled.ompt_callback_mutex_released) {
1038     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1039         ompt_mutex_ordered,
1040         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1041             ->t.t_ordered.dt.t_value,
1042         OMPT_LOAD_RETURN_ADDRESS(gtid));
1043   }
1044 #endif
1045 }
1046 
1047 #if KMP_USE_DYNAMIC_LOCK
1048 
1049 static __forceinline void
1050 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1051                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1052   // Pointer to the allocated indirect lock is written to crit, while indexing
1053   // is ignored.
1054   void *idx;
1055   kmp_indirect_lock_t **lck;
1056   lck = (kmp_indirect_lock_t **)crit;
1057   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1058   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1059   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1060   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1061   KA_TRACE(20,
1062            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1063 #if USE_ITT_BUILD
1064   __kmp_itt_critical_creating(ilk->lock, loc);
1065 #endif
1066   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1067   if (status == 0) {
1068 #if USE_ITT_BUILD
1069     __kmp_itt_critical_destroyed(ilk->lock);
1070 #endif
1071     // We don't really need to destroy the unclaimed lock here since it will be
1072     // cleaned up at program exit.
1073     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1074   }
1075   KMP_DEBUG_ASSERT(*lck != NULL);
1076 }
1077 
// Fast-path acquire tas lock.
// Expands to a block that views `lock` as a kmp_tas_lock_t and tries to move
// lk.poll from the free value to a busy value derived from gtid + 1, using a
// relaxed load followed by an acquire compare-and-store. If that first attempt
// fails it retries in a loop: each iteration yields (KMP_YIELD(TRUE) when
// __kmp_nth exceeds __kmp_avail_proc, or __kmp_xproc if that is zero;
// KMP_YIELD_SPIN(spins) otherwise), then calls __kmp_spin_backoff() before
// testing the lock again. KMP_FSYNC_PREPARE is issued once before the retry
// loop and KMP_FSYNC_ACQUIRED once the lock is held.
// The expansion declares locals (l, tas_free, tas_busy, spins, backoff) and
// uses `lock` and `gtid` unparenthesized, so pass simple expressions.
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
      kmp_uint32 spins;                                                        \
      KMP_FSYNC_PREPARE(l);                                                    \
      KMP_INIT_YIELD(spins);                                                   \
      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
      do {                                                                     \
        if (TCR_4(__kmp_nth) >                                                 \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
          KMP_YIELD(TRUE);                                                     \
        } else {                                                               \
          KMP_YIELD_SPIN(spins);                                               \
        }                                                                      \
        __kmp_spin_backoff(&backoff);                                          \
      } while (                                                                \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(l);                                                     \
  }
1104 
// Fast-path test tas lock.
// Single, non-blocking attempt: `rc` is set to the result of "lk.poll reads as
// free (relaxed load) AND the acquire compare-and-store from free to the busy
// value for gtid + 1 succeeds". No spinning or yielding is done on failure.
// `rc` must be an assignable lvalue; `lock` and `gtid` are used
// unparenthesized.
#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
  }
1114 
// Fast-path release tas lock.
// Stores the free value into lk.poll with a release store. The `gtid`
// argument is accepted but not used by the expansion.
#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1118 
1119 #if KMP_USE_FUTEX
1120 
1121 #include <sys/syscall.h>
1122 #include <unistd.h>
1123 #ifndef FUTEX_WAIT
1124 #define FUTEX_WAIT 0
1125 #endif
1126 #ifndef FUTEX_WAKE
1127 #define FUTEX_WAKE 1
1128 #endif
1129 
// Fast-path acquire futex lock.
// Expands to a block that loops until a compare-and-store moves lk.poll from
// the free value to the busy value for gtid_code, where gtid_code starts as
// (gtid + 1) << 1. Each time the lock is found held:
//   - if the low bit of the stripped poll value is clear, the macro tries to
//     set it with another compare-and-store (the release macro issues
//     FUTEX_WAKE only when that bit is set, so it presumably marks "waiters
//     present"); when that attempt's result is zero the loop restarts;
//   - it then blocks in futex(FUTEX_WAIT) on lk.poll with the expected value
//     poll_val; a non-zero syscall result restarts the loop;
//   - after a successful wait, gtid_code gets its low bit set, so the value
//     this thread eventually stores also carries that bit.
// KMP_FSYNC_PREPARE is issued before the loop, KMP_FSYNC_ACQUIRED after it.
// NOTE(review): the loop condition compares the result of
// KMP_COMPARE_AND_STORE_RET32 against KMP_LOCK_FREE(futex), while the inner
// attempt tests the same macro's result for zero - confirm the intended
// return convention.
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
    KMP_MB();                                                                  \
    KMP_FSYNC_PREPARE(ftx);                                                    \
    kmp_int32 poll_val;                                                        \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
      if (!cond) {                                                             \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
                                         poll_val |                            \
                                             KMP_LOCK_BUSY(1, futex))) {       \
          continue;                                                            \
        }                                                                      \
        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
      }                                                                        \
      kmp_int32 rc;                                                            \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
                        NULL, NULL, 0)) != 0) {                                \
        continue;                                                              \
      }                                                                        \
      gtid_code |= 1;                                                          \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(ftx);                                                   \
  }
1159 
// Fast-path test futex lock.
// Single, non-blocking attempt: one acquire compare-and-store tries to move
// lk.poll from the free value to the busy value for (gtid + 1) << 1. On
// success KMP_FSYNC_ACQUIRED is issued and `rc` is set to TRUE; otherwise `rc`
// is set to FALSE. `rc` must be an assignable lvalue.
// The shift operand is parenthesized explicitly, matching
// KMP_ACQUIRE_FUTEX_LOCK, instead of relying on '+' binding tighter than '<<'.
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
      KMP_FSYNC_ACQUIRED(ftx);                                                 \
      rc = TRUE;                                                               \
    } else {                                                                   \
      rc = FALSE;                                                              \
    }                                                                          \
  }
1172 
// Fast-path release futex lock.
// Atomically exchanges lk.poll with the free value. If the low bit of the
// stripped previous value was set, it issues futex(FUTEX_WAKE) on lk.poll with
// a count of KMP_LOCK_BUSY(1, futex). KMP_FSYNC_RELEASING is issued before the
// exchange, memory barriers (KMP_MB) surround the sequence, and the macro ends
// with KMP_YIELD_OVERSUB(). The `gtid` argument is not used by the expansion.
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    KMP_MB();                                                                  \
    KMP_FSYNC_RELEASING(ftx);                                                  \
    kmp_int32 poll_val =                                                       \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
    }                                                                          \
    KMP_MB();                                                                  \
    KMP_YIELD_OVERSUB();                                                       \
  }
1188 
1189 #endif // KMP_USE_FUTEX
1190 
1191 #else // KMP_USE_DYNAMIC_LOCK
1192 
1193 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1194                                                       ident_t const *loc,
1195                                                       kmp_int32 gtid) {
1196   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1197 
1198   // Because of the double-check, the following load doesn't need to be volatile
1199   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1200 
1201   if (lck == NULL) {
1202     void *idx;
1203 
1204     // Allocate & initialize the lock.
1205     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1206     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1207     __kmp_init_user_lock_with_checks(lck);
1208     __kmp_set_user_lock_location(lck, loc);
1209 #if USE_ITT_BUILD
1210     __kmp_itt_critical_creating(lck);
1211 // __kmp_itt_critical_creating() should be called *before* the first usage
1212 // of underlying lock. It is the only place where we can guarantee it. There
1213 // are chances the lock will destroyed with no usage, but it is not a
1214 // problem, because this is not real event seen by user but rather setting
1215 // name for object (lock). See more details in kmp_itt.h.
1216 #endif /* USE_ITT_BUILD */
1217 
1218     // Use a cmpxchg instruction to slam the start of the critical section with
1219     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1220     // and use the lock that the other thread allocated.
1221     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1222 
1223     if (status == 0) {
1224 // Deallocate the lock and reload the value.
1225 #if USE_ITT_BUILD
1226       __kmp_itt_critical_destroyed(lck);
1227 // Let ITT know the lock is destroyed and the same memory location may be reused
1228 // for another purpose.
1229 #endif /* USE_ITT_BUILD */
1230       __kmp_destroy_user_lock_with_checks(lck);
1231       __kmp_user_lock_free(&idx, gtid, lck);
1232       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1233       KMP_DEBUG_ASSERT(lck != NULL);
1234     }
1235   }
1236   return lck;
1237 }
1238 
1239 #endif // KMP_USE_DYNAMIC_LOCK
1240 
1241 /*!
1242 @ingroup WORK_SHARING
1243 @param loc  source location information.
1244 @param global_tid  global thread number.
1245 @param crit identity of the critical section. This could be a pointer to a lock
1246 associated with the critical section, or some other suitably unique value.
1247 
1248 Enter code protected by a `critical` construct.
1249 This function blocks until the executing thread can enter the critical section.
1250 */
1251 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1252                      kmp_critical_name *crit) {
1253 #if KMP_USE_DYNAMIC_LOCK
1254 #if OMPT_SUPPORT && OMPT_OPTIONAL
1255   OMPT_STORE_RETURN_ADDRESS(global_tid);
1256 #endif // OMPT_SUPPORT
1257   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1258 #else
1259   KMP_COUNT_BLOCK(OMP_CRITICAL);
1260 #if OMPT_SUPPORT && OMPT_OPTIONAL
1261   ompt_state_t prev_state = ompt_state_undefined;
1262   ompt_thread_info_t ti;
1263 #endif
1264   kmp_user_lock_p lck;
1265 
1266   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1267   __kmp_assert_valid_gtid(global_tid);
1268 
1269   // TODO: add THR_OVHD_STATE
1270 
1271   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1272   KMP_CHECK_USER_LOCK_INIT();
1273 
1274   if ((__kmp_user_lock_kind == lk_tas) &&
1275       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1276     lck = (kmp_user_lock_p)crit;
1277   }
1278 #if KMP_USE_FUTEX
1279   else if ((__kmp_user_lock_kind == lk_futex) &&
1280            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1281     lck = (kmp_user_lock_p)crit;
1282   }
1283 #endif
1284   else { // ticket, queuing or drdpa
1285     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1286   }
1287 
1288   if (__kmp_env_consistency_check)
1289     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1290 
1291     // since the critical directive binds to all threads, not just the current
1292     // team we have to check this even if we are in a serialized team.
1293     // also, even if we are the uber thread, we still have to conduct the lock,
1294     // as we have to contend with sibling threads.
1295 
1296 #if USE_ITT_BUILD
1297   __kmp_itt_critical_acquiring(lck);
1298 #endif /* USE_ITT_BUILD */
1299 #if OMPT_SUPPORT && OMPT_OPTIONAL
1300   OMPT_STORE_RETURN_ADDRESS(gtid);
1301   void *codeptr_ra = NULL;
1302   if (ompt_enabled.enabled) {
1303     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1304     /* OMPT state update */
1305     prev_state = ti.state;
1306     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1307     ti.state = ompt_state_wait_critical;
1308 
1309     /* OMPT event callback */
1310     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1311     if (ompt_enabled.ompt_callback_mutex_acquire) {
1312       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1313           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1314           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1315     }
1316   }
1317 #endif
1318   // Value of 'crit' should be good for using as a critical_id of the critical
1319   // section directive.
1320   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1321 
1322 #if USE_ITT_BUILD
1323   __kmp_itt_critical_acquired(lck);
1324 #endif /* USE_ITT_BUILD */
1325 #if OMPT_SUPPORT && OMPT_OPTIONAL
1326   if (ompt_enabled.enabled) {
1327     /* OMPT state update */
1328     ti.state = prev_state;
1329     ti.wait_id = 0;
1330 
1331     /* OMPT event callback */
1332     if (ompt_enabled.ompt_callback_mutex_acquired) {
1333       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1334           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1335     }
1336   }
1337 #endif
1338   KMP_POP_PARTITIONED_TIMER();
1339 
1340   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1341   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1342 #endif // KMP_USE_DYNAMIC_LOCK
1343 }
1344 
1345 #if KMP_USE_DYNAMIC_LOCK
1346 
1347 // Converts the given hint to an internal lock implementation
1348 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1349 #if KMP_USE_TSX
1350 #define KMP_TSX_LOCK(seq) lockseq_##seq
1351 #else
1352 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1353 #endif
1354 
1355 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1356 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1357 #else
1358 #define KMP_CPUINFO_RTM 0
1359 #endif
1360 
1361   // Hints that do not require further logic
1362   if (hint & kmp_lock_hint_hle)
1363     return KMP_TSX_LOCK(hle);
1364   if (hint & kmp_lock_hint_rtm)
1365     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1366   if (hint & kmp_lock_hint_adaptive)
1367     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1368 
1369   // Rule out conflicting hints first by returning the default lock
1370   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1371     return __kmp_user_lock_seq;
1372   if ((hint & omp_lock_hint_speculative) &&
1373       (hint & omp_lock_hint_nonspeculative))
1374     return __kmp_user_lock_seq;
1375 
1376   // Do not even consider speculation when it appears to be contended
1377   if (hint & omp_lock_hint_contended)
1378     return lockseq_queuing;
1379 
1380   // Uncontended lock without speculation
1381   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1382     return lockseq_tas;
1383 
1384   // Use RTM lock for speculation
1385   if (hint & omp_lock_hint_speculative)
1386     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1387 
1388   return __kmp_user_lock_seq;
1389 }
1390 
1391 #if OMPT_SUPPORT && OMPT_OPTIONAL
1392 #if KMP_USE_DYNAMIC_LOCK
1393 static kmp_mutex_impl_t
1394 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1395   if (user_lock) {
1396     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1397     case 0:
1398       break;
1399 #if KMP_USE_FUTEX
1400     case locktag_futex:
1401       return kmp_mutex_impl_queuing;
1402 #endif
1403     case locktag_tas:
1404       return kmp_mutex_impl_spin;
1405 #if KMP_USE_TSX
1406     case locktag_hle:
1407     case locktag_rtm_spin:
1408       return kmp_mutex_impl_speculative;
1409 #endif
1410     default:
1411       return kmp_mutex_impl_none;
1412     }
1413     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1414   }
1415   KMP_ASSERT(ilock);
1416   switch (ilock->type) {
1417 #if KMP_USE_TSX
1418   case locktag_adaptive:
1419   case locktag_rtm_queuing:
1420     return kmp_mutex_impl_speculative;
1421 #endif
1422   case locktag_nested_tas:
1423     return kmp_mutex_impl_spin;
1424 #if KMP_USE_FUTEX
1425   case locktag_nested_futex:
1426 #endif
1427   case locktag_ticket:
1428   case locktag_queuing:
1429   case locktag_drdpa:
1430   case locktag_nested_ticket:
1431   case locktag_nested_queuing:
1432   case locktag_nested_drdpa:
1433     return kmp_mutex_impl_queuing;
1434   default:
1435     return kmp_mutex_impl_none;
1436   }
1437 }
1438 #else
1439 // For locks without dynamic binding
1440 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1441   switch (__kmp_user_lock_kind) {
1442   case lk_tas:
1443     return kmp_mutex_impl_spin;
1444 #if KMP_USE_FUTEX
1445   case lk_futex:
1446 #endif
1447   case lk_ticket:
1448   case lk_queuing:
1449   case lk_drdpa:
1450     return kmp_mutex_impl_queuing;
1451 #if KMP_USE_TSX
1452   case lk_hle:
1453   case lk_rtm_queuing:
1454   case lk_rtm_spin:
1455   case lk_adaptive:
1456     return kmp_mutex_impl_speculative;
1457 #endif
1458   default:
1459     return kmp_mutex_impl_none;
1460   }
1461 }
1462 #endif // KMP_USE_DYNAMIC_LOCK
1463 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1464 
1465 /*!
1466 @ingroup WORK_SHARING
1467 @param loc  source location information.
1468 @param global_tid  global thread number.
1469 @param crit identity of the critical section. This could be a pointer to a lock
1470 associated with the critical section, or some other suitably unique value.
1471 @param hint the lock hint.
1472 
1473 Enter code protected by a `critical` construct with a hint. The hint value is
1474 used to suggest a lock implementation. This function blocks until the executing
1475 thread can enter the critical section unless the hint suggests use of
1476 speculative execution and the hardware supports it.
1477 */
1478 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1479                                kmp_critical_name *crit, uint32_t hint) {
1480   KMP_COUNT_BLOCK(OMP_CRITICAL);
1481   kmp_user_lock_p lck;
1482 #if OMPT_SUPPORT && OMPT_OPTIONAL
1483   ompt_state_t prev_state = ompt_state_undefined;
1484   ompt_thread_info_t ti;
1485   // This is the case, if called from __kmpc_critical:
1486   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1487   if (!codeptr)
1488     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1489 #endif
1490 
1491   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1492   __kmp_assert_valid_gtid(global_tid);
1493 
1494   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1495   // Check if it is initialized.
1496   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1497   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1498   if (*lk == 0) {
1499     if (KMP_IS_D_LOCK(lockseq)) {
1500       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1501                                   KMP_GET_D_TAG(lockseq));
1502     } else {
1503       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1504     }
1505   }
1506   // Branch for accessing the actual lock object and set operation. This
1507   // branching is inevitable since this lock initialization does not follow the
1508   // normal dispatch path (lock table is not used).
1509   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1510     lck = (kmp_user_lock_p)lk;
1511     if (__kmp_env_consistency_check) {
1512       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1513                       __kmp_map_hint_to_lock(hint));
1514     }
1515 #if USE_ITT_BUILD
1516     __kmp_itt_critical_acquiring(lck);
1517 #endif
1518 #if OMPT_SUPPORT && OMPT_OPTIONAL
1519     if (ompt_enabled.enabled) {
1520       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1521       /* OMPT state update */
1522       prev_state = ti.state;
1523       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1524       ti.state = ompt_state_wait_critical;
1525 
1526       /* OMPT event callback */
1527       if (ompt_enabled.ompt_callback_mutex_acquire) {
1528         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1529             ompt_mutex_critical, (unsigned int)hint,
1530             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1531             codeptr);
1532       }
1533     }
1534 #endif
1535 #if KMP_USE_INLINED_TAS
1536     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1537       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1538     } else
1539 #elif KMP_USE_INLINED_FUTEX
1540     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1541       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1542     } else
1543 #endif
1544     {
1545       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1546     }
1547   } else {
1548     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1549     lck = ilk->lock;
1550     if (__kmp_env_consistency_check) {
1551       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1552                       __kmp_map_hint_to_lock(hint));
1553     }
1554 #if USE_ITT_BUILD
1555     __kmp_itt_critical_acquiring(lck);
1556 #endif
1557 #if OMPT_SUPPORT && OMPT_OPTIONAL
1558     if (ompt_enabled.enabled) {
1559       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1560       /* OMPT state update */
1561       prev_state = ti.state;
1562       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1563       ti.state = ompt_state_wait_critical;
1564 
1565       /* OMPT event callback */
1566       if (ompt_enabled.ompt_callback_mutex_acquire) {
1567         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1568             ompt_mutex_critical, (unsigned int)hint,
1569             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1570             codeptr);
1571       }
1572     }
1573 #endif
1574     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1575   }
1576   KMP_POP_PARTITIONED_TIMER();
1577 
1578 #if USE_ITT_BUILD
1579   __kmp_itt_critical_acquired(lck);
1580 #endif /* USE_ITT_BUILD */
1581 #if OMPT_SUPPORT && OMPT_OPTIONAL
1582   if (ompt_enabled.enabled) {
1583     /* OMPT state update */
1584     ti.state = prev_state;
1585     ti.wait_id = 0;
1586 
1587     /* OMPT event callback */
1588     if (ompt_enabled.ompt_callback_mutex_acquired) {
1589       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1590           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1591     }
1592   }
1593 #endif
1594 
1595   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1596   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1597 } // __kmpc_critical_with_hint
1598 
1599 #endif // KMP_USE_DYNAMIC_LOCK
1600 
1601 /*!
1602 @ingroup WORK_SHARING
1603 @param loc  source location information.
1604 @param global_tid  global thread number .
1605 @param crit identity of the critical section. This could be a pointer to a lock
1606 associated with the critical section, or some other suitably unique value.
1607 
1608 Leave a critical section, releasing any lock that was held during its execution.
1609 */
1610 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1611                          kmp_critical_name *crit) {
1612   kmp_user_lock_p lck;
1613 
1614   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1615 
1616 #if KMP_USE_DYNAMIC_LOCK
1617   int locktag = KMP_EXTRACT_D_TAG(crit);
1618   if (locktag) {
1619     lck = (kmp_user_lock_p)crit;
1620     KMP_ASSERT(lck != NULL);
1621     if (__kmp_env_consistency_check) {
1622       __kmp_pop_sync(global_tid, ct_critical, loc);
1623     }
1624 #if USE_ITT_BUILD
1625     __kmp_itt_critical_releasing(lck);
1626 #endif
1627 #if KMP_USE_INLINED_TAS
1628     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1629       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1630     } else
1631 #elif KMP_USE_INLINED_FUTEX
1632     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1633       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1634     } else
1635 #endif
1636     {
1637       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1638     }
1639   } else {
1640     kmp_indirect_lock_t *ilk =
1641         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1642     KMP_ASSERT(ilk != NULL);
1643     lck = ilk->lock;
1644     if (__kmp_env_consistency_check) {
1645       __kmp_pop_sync(global_tid, ct_critical, loc);
1646     }
1647 #if USE_ITT_BUILD
1648     __kmp_itt_critical_releasing(lck);
1649 #endif
1650     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1651   }
1652 
1653 #else // KMP_USE_DYNAMIC_LOCK
1654 
1655   if ((__kmp_user_lock_kind == lk_tas) &&
1656       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1657     lck = (kmp_user_lock_p)crit;
1658   }
1659 #if KMP_USE_FUTEX
1660   else if ((__kmp_user_lock_kind == lk_futex) &&
1661            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1662     lck = (kmp_user_lock_p)crit;
1663   }
1664 #endif
1665   else { // ticket, queuing or drdpa
1666     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1667   }
1668 
1669   KMP_ASSERT(lck != NULL);
1670 
1671   if (__kmp_env_consistency_check)
1672     __kmp_pop_sync(global_tid, ct_critical, loc);
1673 
1674 #if USE_ITT_BUILD
1675   __kmp_itt_critical_releasing(lck);
1676 #endif /* USE_ITT_BUILD */
1677   // Value of 'crit' should be good for using as a critical_id of the critical
1678   // section directive.
1679   __kmp_release_user_lock_with_checks(lck, global_tid);
1680 
1681 #endif // KMP_USE_DYNAMIC_LOCK
1682 
1683 #if OMPT_SUPPORT && OMPT_OPTIONAL
1684   /* OMPT release event triggers after lock is released; place here to trigger
1685    * for all #if branches */
1686   OMPT_STORE_RETURN_ADDRESS(global_tid);
1687   if (ompt_enabled.ompt_callback_mutex_released) {
1688     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1689         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1690         OMPT_LOAD_RETURN_ADDRESS(0));
1691   }
1692 #endif
1693 
1694   KMP_POP_PARTITIONED_TIMER();
1695   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1696 }
1697 
1698 /*!
1699 @ingroup SYNCHRONIZATION
1700 @param loc source location information
1701 @param global_tid thread id.
1702 @return one if the thread should execute the master block, zero otherwise
1703 
1704 Start execution of a combined barrier and master. The barrier is executed inside
1705 this function.
1706 */
1707 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1708   int status;
1709   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1710   __kmp_assert_valid_gtid(global_tid);
1711 
1712   if (!TCR_4(__kmp_init_parallel))
1713     __kmp_parallel_initialize();
1714 
1715   __kmp_resume_if_soft_paused();
1716 
1717   if (__kmp_env_consistency_check)
1718     __kmp_check_barrier(global_tid, ct_barrier, loc);
1719 
1720 #if OMPT_SUPPORT
1721   ompt_frame_t *ompt_frame;
1722   if (ompt_enabled.enabled) {
1723     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1724     if (ompt_frame->enter_frame.ptr == NULL)
1725       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1726   }
1727   OMPT_STORE_RETURN_ADDRESS(global_tid);
1728 #endif
1729 #if USE_ITT_NOTIFY
1730   __kmp_threads[global_tid]->th.th_ident = loc;
1731 #endif
1732   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1733 #if OMPT_SUPPORT && OMPT_OPTIONAL
1734   if (ompt_enabled.enabled) {
1735     ompt_frame->enter_frame = ompt_data_none;
1736   }
1737 #endif
1738 
1739   return (status != 0) ? 0 : 1;
1740 }
1741 
1742 /*!
1743 @ingroup SYNCHRONIZATION
1744 @param loc source location information
1745 @param global_tid thread id.
1746 
1747 Complete the execution of a combined barrier and master. This function should
1748 only be called at the completion of the <tt>master</tt> code. Other threads will
1749 still be waiting at the barrier and this call releases them.
1750 */
1751 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1752   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1753   __kmp_assert_valid_gtid(global_tid);
1754   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1755 }
1756 
1757 /*!
1758 @ingroup SYNCHRONIZATION
1759 @param loc source location information
1760 @param global_tid thread id.
1761 @return one if the thread should execute the master block, zero otherwise
1762 
1763 Start execution of a combined barrier and master(nowait) construct.
1764 The barrier is executed inside this function.
1765 There is no equivalent "end" function, since the
1766 */
1767 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1768   kmp_int32 ret;
1769   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1770   __kmp_assert_valid_gtid(global_tid);
1771 
1772   if (!TCR_4(__kmp_init_parallel))
1773     __kmp_parallel_initialize();
1774 
1775   __kmp_resume_if_soft_paused();
1776 
1777   if (__kmp_env_consistency_check) {
1778     if (loc == 0) {
1779       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1780     }
1781     __kmp_check_barrier(global_tid, ct_barrier, loc);
1782   }
1783 
1784 #if OMPT_SUPPORT
1785   ompt_frame_t *ompt_frame;
1786   if (ompt_enabled.enabled) {
1787     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1788     if (ompt_frame->enter_frame.ptr == NULL)
1789       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1790   }
1791   OMPT_STORE_RETURN_ADDRESS(global_tid);
1792 #endif
1793 #if USE_ITT_NOTIFY
1794   __kmp_threads[global_tid]->th.th_ident = loc;
1795 #endif
1796   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1797 #if OMPT_SUPPORT && OMPT_OPTIONAL
1798   if (ompt_enabled.enabled) {
1799     ompt_frame->enter_frame = ompt_data_none;
1800   }
1801 #endif
1802 
1803   ret = __kmpc_master(loc, global_tid);
1804 
1805   if (__kmp_env_consistency_check) {
1806     /*  there's no __kmpc_end_master called; so the (stats) */
1807     /*  actions of __kmpc_end_master are done here          */
1808     if (ret) {
1809       /* only one thread should do the pop since only */
1810       /* one did the push (see __kmpc_master())       */
1811       __kmp_pop_sync(global_tid, ct_master, loc);
1812     }
1813   }
1814 
1815   return (ret);
1816 }
1817 
1818 /* The BARRIER for a SINGLE process section is always explicit   */
1819 /*!
1820 @ingroup WORK_SHARING
1821 @param loc  source location information
1822 @param global_tid  global thread number
1823 @return One if this thread should execute the single construct, zero otherwise.
1824 
1825 Test whether to execute a <tt>single</tt> construct.
1826 There are no implicit barriers in the two "single" calls, rather the compiler
1827 should introduce an explicit barrier if it is required.
1828 */
1829 
1830 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1831   __kmp_assert_valid_gtid(global_tid);
1832   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1833 
1834   if (rc) {
1835     // We are going to execute the single statement, so we should count it.
1836     KMP_COUNT_BLOCK(OMP_SINGLE);
1837     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1838   }
1839 
1840 #if OMPT_SUPPORT && OMPT_OPTIONAL
1841   kmp_info_t *this_thr = __kmp_threads[global_tid];
1842   kmp_team_t *team = this_thr->th.th_team;
1843   int tid = __kmp_tid_from_gtid(global_tid);
1844 
1845   if (ompt_enabled.enabled) {
1846     if (rc) {
1847       if (ompt_enabled.ompt_callback_work) {
1848         ompt_callbacks.ompt_callback(ompt_callback_work)(
1849             ompt_work_single_executor, ompt_scope_begin,
1850             &(team->t.ompt_team_info.parallel_data),
1851             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1852             1, OMPT_GET_RETURN_ADDRESS(0));
1853       }
1854     } else {
1855       if (ompt_enabled.ompt_callback_work) {
1856         ompt_callbacks.ompt_callback(ompt_callback_work)(
1857             ompt_work_single_other, ompt_scope_begin,
1858             &(team->t.ompt_team_info.parallel_data),
1859             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1860             1, OMPT_GET_RETURN_ADDRESS(0));
1861         ompt_callbacks.ompt_callback(ompt_callback_work)(
1862             ompt_work_single_other, ompt_scope_end,
1863             &(team->t.ompt_team_info.parallel_data),
1864             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1865             1, OMPT_GET_RETURN_ADDRESS(0));
1866       }
1867     }
1868   }
1869 #endif
1870 
1871   return rc;
1872 }
1873 
1874 /*!
1875 @ingroup WORK_SHARING
1876 @param loc  source location information
1877 @param global_tid  global thread number
1878 
1879 Mark the end of a <tt>single</tt> construct.  This function should
1880 only be called by the thread that executed the block of code protected
1881 by the `single` construct.
1882 */
1883 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1884   __kmp_assert_valid_gtid(global_tid);
1885   __kmp_exit_single(global_tid);
1886   KMP_POP_PARTITIONED_TIMER();
1887 
1888 #if OMPT_SUPPORT && OMPT_OPTIONAL
1889   kmp_info_t *this_thr = __kmp_threads[global_tid];
1890   kmp_team_t *team = this_thr->th.th_team;
1891   int tid = __kmp_tid_from_gtid(global_tid);
1892 
1893   if (ompt_enabled.ompt_callback_work) {
1894     ompt_callbacks.ompt_callback(ompt_callback_work)(
1895         ompt_work_single_executor, ompt_scope_end,
1896         &(team->t.ompt_team_info.parallel_data),
1897         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1898         OMPT_GET_RETURN_ADDRESS(0));
1899   }
1900 #endif
1901 }
1902 
1903 /*!
1904 @ingroup WORK_SHARING
1905 @param loc Source location
1906 @param global_tid Global thread id
1907 
1908 Mark the end of a statically scheduled loop.
1909 */
1910 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1911   KMP_POP_PARTITIONED_TIMER();
1912   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1913 
1914 #if OMPT_SUPPORT && OMPT_OPTIONAL
1915   if (ompt_enabled.ompt_callback_work) {
1916     ompt_work_t ompt_work_type = ompt_work_loop;
1917     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1918     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1919     // Determine workshare type
1920     if (loc != NULL) {
1921       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1922         ompt_work_type = ompt_work_loop;
1923       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1924         ompt_work_type = ompt_work_sections;
1925       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1926         ompt_work_type = ompt_work_distribute;
1927       } else {
1928         // use default set above.
1929         // a warning about this case is provided in __kmpc_for_static_init
1930       }
1931       KMP_DEBUG_ASSERT(ompt_work_type);
1932     }
1933     ompt_callbacks.ompt_callback(ompt_callback_work)(
1934         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1935         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1936   }
1937 #endif
1938   if (__kmp_env_consistency_check)
1939     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1940 }
1941 
1942 // User routines which take C-style arguments (call by value)
1943 // different from the Fortran equivalent routines
1944 
1945 void ompc_set_num_threads(int arg) {
1946   // !!!!! TODO: check the per-task binding
1947   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1948 }
1949 
1950 void ompc_set_dynamic(int flag) {
1951   kmp_info_t *thread;
1952 
1953   /* For the thread-private implementation of the internal controls */
1954   thread = __kmp_entry_thread();
1955 
1956   __kmp_save_internal_controls(thread);
1957 
1958   set__dynamic(thread, flag ? true : false);
1959 }
1960 
1961 void ompc_set_nested(int flag) {
1962   kmp_info_t *thread;
1963 
1964   /* For the thread-private internal controls implementation */
1965   thread = __kmp_entry_thread();
1966 
1967   __kmp_save_internal_controls(thread);
1968 
1969   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1970 }
1971 
1972 void ompc_set_max_active_levels(int max_active_levels) {
1973   /* TO DO */
1974   /* we want per-task implementation of this internal control */
1975 
1976   /* For the per-thread internal controls implementation */
1977   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1978 }
1979 
1980 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1981   // !!!!! TODO: check the per-task binding
1982   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1983 }
1984 
1985 int ompc_get_ancestor_thread_num(int level) {
1986   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1987 }
1988 
1989 int ompc_get_team_size(int level) {
1990   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1991 }
1992 
1993 /* OpenMP 5.0 Affinity Format API */
1994 
1995 void ompc_set_affinity_format(char const *format) {
1996   if (!__kmp_init_serial) {
1997     __kmp_serial_initialize();
1998   }
1999   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2000                          format, KMP_STRLEN(format) + 1);
2001 }
2002 
2003 size_t ompc_get_affinity_format(char *buffer, size_t size) {
2004   size_t format_size;
2005   if (!__kmp_init_serial) {
2006     __kmp_serial_initialize();
2007   }
2008   format_size = KMP_STRLEN(__kmp_affinity_format);
2009   if (buffer && size) {
2010     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2011                            format_size + 1);
2012   }
2013   return format_size;
2014 }
2015 
2016 void ompc_display_affinity(char const *format) {
2017   int gtid;
2018   if (!TCR_4(__kmp_init_middle)) {
2019     __kmp_middle_initialize();
2020   }
2021   gtid = __kmp_get_gtid();
2022   __kmp_aux_display_affinity(gtid, format);
2023 }
2024 
2025 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
2026                              char const *format) {
2027   int gtid;
2028   size_t num_required;
2029   kmp_str_buf_t capture_buf;
2030   if (!TCR_4(__kmp_init_middle)) {
2031     __kmp_middle_initialize();
2032   }
2033   gtid = __kmp_get_gtid();
2034   __kmp_str_buf_init(&capture_buf);
2035   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2036   if (buffer && buf_size) {
2037     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2038                            capture_buf.used + 1);
2039   }
2040   __kmp_str_buf_free(&capture_buf);
2041   return num_required;
2042 }
2043 
2044 void kmpc_set_stacksize(int arg) {
2045   // __kmp_aux_set_stacksize initializes the library if needed
2046   __kmp_aux_set_stacksize(arg);
2047 }
2048 
2049 void kmpc_set_stacksize_s(size_t arg) {
2050   // __kmp_aux_set_stacksize initializes the library if needed
2051   __kmp_aux_set_stacksize(arg);
2052 }
2053 
2054 void kmpc_set_blocktime(int arg) {
2055   int gtid, tid;
2056   kmp_info_t *thread;
2057 
2058   gtid = __kmp_entry_gtid();
2059   tid = __kmp_tid_from_gtid(gtid);
2060   thread = __kmp_thread_from_gtid(gtid);
2061 
2062   __kmp_aux_set_blocktime(arg, thread, tid);
2063 }
2064 
2065 void kmpc_set_library(int arg) {
2066   // __kmp_user_set_library initializes the library if needed
2067   __kmp_user_set_library((enum library_type)arg);
2068 }
2069 
2070 void kmpc_set_defaults(char const *str) {
2071   // __kmp_aux_set_defaults initializes the library if needed
2072   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2073 }
2074 
2075 void kmpc_set_disp_num_buffers(int arg) {
2076   // ignore after initialization because some teams have already
2077   // allocated dispatch buffers
2078   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2079       arg <= KMP_MAX_DISP_NUM_BUFF) {
2080     __kmp_dispatch_num_buffers = arg;
2081   }
2082 }
2083 
// Forward to __kmp_aux_set_affinity_mask_proc() after making sure middle
// initialization has run. Returns -1 in stub builds or when affinity is not
// supported.
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();
  return __kmp_aux_set_affinity_mask_proc(proc, mask);
#else
  return -1;
#endif
}
2094 
// Forward to __kmp_aux_unset_affinity_mask_proc() after making sure middle
// initialization has run. Returns -1 in stub builds or when affinity is not
// supported.
int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();
  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#else
  return -1;
#endif
}
2105 
// Forward to __kmp_aux_get_affinity_mask_proc() after making sure middle
// initialization has run. Returns -1 in stub builds or when affinity is not
// supported.
int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();
  return __kmp_aux_get_affinity_mask_proc(proc, mask);
#else
  return -1;
#endif
}
2116 
2117 /* -------------------------------------------------------------------------- */
2118 /*!
2119 @ingroup THREADPRIVATE
2120 @param loc       source location information
2121 @param gtid      global thread number
2122 @param cpy_size  size of the cpy_data buffer
2123 @param cpy_data  pointer to data to be copied
2124 @param cpy_func  helper function to call for copying data
2125 @param didit     flag variable: 1=single thread; 0=not single thread
2126 
2127 __kmpc_copyprivate implements the interface for the private data broadcast
2128 needed for the copyprivate clause associated with a single region in an
2129 OpenMP<sup>*</sup> program (both C and Fortran).
2130 All threads participating in the parallel region call this routine.
2131 One of the threads (called the single thread) should have the <tt>didit</tt>
2132 variable set to 1 and all other threads should have that variable set to 0.
2133 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2134 
2135 The OpenMP specification forbids the use of nowait on the single region when a
2136 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2137 barrier internally to avoid race conditions, so the code generation for the
2138 single region should avoid generating a barrier after the call to @ref
2139 __kmpc_copyprivate.
2140 
2141 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2142 The <tt>loc</tt> parameter is a pointer to source location information.
2143 
2144 Internal implementation: The single thread will first copy its descriptor
2145 address (cpy_data) to a team-private location, then the other threads will each
2146 call the function pointed to by the parameter cpy_func, which carries out the
2147 copy by copying the data using the cpy_data buffer.
2148 
2149 The cpy_func routine used for the copy and the contents of the data area defined
2150 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2151 to be done. For instance, the cpy_data buffer can hold the actual data to be
2152 copied or it may hold a list of pointers to the data. The cpy_func routine must
2153 interpret the cpy_data buffer appropriately.
2154 
2155 The interface to cpy_func is as follows:
2156 @code
2157 void cpy_func( void *destination, void *source )
2158 @endcode
2159 where void *destination is the cpy_data pointer for the thread being copied to
2160 and void *source is the cpy_data pointer for the thread being copied from.
2161 */
// Broadcast the single thread's cpy_data pointer through the team's
// t_copypriv_data slot: the didit thread stores its pointer, all threads meet
// at a barrier, every other thread calls cpy_func(own cpy_data, stored
// pointer), and a second barrier ends the operation. cpy_size is not used in
// this function.
void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
                        void *cpy_data, void (*cpy_func)(void *, void *),
                        kmp_int32 didit) {
  void **data_ptr;
  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
  __kmp_assert_valid_gtid(gtid);

  KMP_MB();

  // Team-wide slot through which the source pointer is handed over.
  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following two barriers into some kind of split barrier

  // Only the thread that executed the single region publishes its buffer.
  if (didit)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    // Record the runtime entry frame for the tool unless one is already set.
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  // First barrier: nobody reads *data_ptr before this point.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Every thread except the publisher copies from the published buffer.
  if (!didit)
    (*cpy_func)(cpy_data, *data_ptr);

  // Consider next barrier a user-visible barrier for barrier region boundaries
  // Nesting checks are already handled by the single construct checks
  // NOTE(review): the extra scope presumably exists so that
  // OMPT_STORE_RETURN_ADDRESS can be expanded a second time -- confirm against
  // the macro definition.
  {
#if OMPT_SUPPORT
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
#if USE_ITT_NOTIFY
    // TODO: check if it is needed (e.g. tasks can overwrite the location)
    __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
// tasks can overwrite the location)
#endif
    // Second barrier: presumably keeps the publisher from moving on while
    // other threads are still copying -- TODO confirm.
    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      // Leaving the runtime: clear the entry frame again.
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif
  }
}
2220 
2221 /* -------------------------------------------------------------------------- */
2222 
// Short aliases for the "with_checks" user-lock routines (plain and nested
// variants of init / acquire / release / test / destroy). In the code below
// they are used on the !KMP_USE_DYNAMIC_LOCK paths, e.g. INIT_LOCK in
// __kmpc_init_lock and INIT_NESTED_LOCK in __kmpc_init_nest_lock.
#define INIT_LOCK __kmp_init_user_lock_with_checks
#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK_TIMED                                              \
  __kmp_acquire_nested_user_lock_with_checks_timed
#define RELEASE_LOCK __kmp_release_user_lock_with_checks
#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
#define TEST_LOCK __kmp_test_user_lock_with_checks
#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2236 
2237 // TODO: Make check abort messages use location info & pass it into
2238 // with_checks routines
2239 
2240 #if KMP_USE_DYNAMIC_LOCK
2241 
2242 // internal lock initializer
2243 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2244                                                     kmp_dyna_lockseq_t seq) {
2245   if (KMP_IS_D_LOCK(seq)) {
2246     KMP_INIT_D_LOCK(lock, seq);
2247 #if USE_ITT_BUILD
2248     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2249 #endif
2250   } else {
2251     KMP_INIT_I_LOCK(lock, seq);
2252 #if USE_ITT_BUILD
2253     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2254     __kmp_itt_lock_creating(ilk->lock, loc);
2255 #endif
2256   }
2257 }
2258 
2259 // internal nest lock initializer
2260 static __forceinline void
2261 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2262                                kmp_dyna_lockseq_t seq) {
2263 #if KMP_USE_TSX
2264   // Don't have nested lock implementation for speculative locks
2265   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2266       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2267     seq = __kmp_user_lock_seq;
2268 #endif
2269   switch (seq) {
2270   case lockseq_tas:
2271     seq = lockseq_nested_tas;
2272     break;
2273 #if KMP_USE_FUTEX
2274   case lockseq_futex:
2275     seq = lockseq_nested_futex;
2276     break;
2277 #endif
2278   case lockseq_ticket:
2279     seq = lockseq_nested_ticket;
2280     break;
2281   case lockseq_queuing:
2282     seq = lockseq_nested_queuing;
2283     break;
2284   case lockseq_drdpa:
2285     seq = lockseq_nested_drdpa;
2286     break;
2287   default:
2288     seq = lockseq_nested_queuing;
2289   }
2290   KMP_INIT_I_LOCK(lock, seq);
2291 #if USE_ITT_BUILD
2292   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2293   __kmp_itt_lock_creating(ilk->lock, loc);
2294 #endif
2295 }
2296 
2297 /* initialize the lock with a hint */
2298 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2299                                 uintptr_t hint) {
2300   KMP_DEBUG_ASSERT(__kmp_init_serial);
2301   if (__kmp_env_consistency_check && user_lock == NULL) {
2302     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2303   }
2304 
2305   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2306 
2307 #if OMPT_SUPPORT && OMPT_OPTIONAL
2308   // This is the case, if called from omp_init_lock_with_hint:
2309   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2310   if (!codeptr)
2311     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2312   if (ompt_enabled.ompt_callback_lock_init) {
2313     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2314         ompt_mutex_lock, (omp_lock_hint_t)hint,
2315         __ompt_get_mutex_impl_type(user_lock),
2316         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2317   }
2318 #endif
2319 }
2320 
2321 /* initialize the lock with a hint */
2322 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2323                                      void **user_lock, uintptr_t hint) {
2324   KMP_DEBUG_ASSERT(__kmp_init_serial);
2325   if (__kmp_env_consistency_check && user_lock == NULL) {
2326     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2327   }
2328 
2329   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2330 
2331 #if OMPT_SUPPORT && OMPT_OPTIONAL
2332   // This is the case, if called from omp_init_lock_with_hint:
2333   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2334   if (!codeptr)
2335     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2336   if (ompt_enabled.ompt_callback_lock_init) {
2337     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2338         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2339         __ompt_get_mutex_impl_type(user_lock),
2340         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2341   }
2342 #endif
2343 }
2344 
2345 #endif // KMP_USE_DYNAMIC_LOCK
2346 
/* Initialize a (non-nestable) lock without a hint.
   With KMP_USE_DYNAMIC_LOCK the lock kind comes from __kmp_user_lock_seq;
   otherwise it comes from __kmp_user_lock_kind, and the lock either uses the
   user's storage directly or is allocated separately. In both configurations
   an OMPT lock-init event is reported if a tool registered the callback. */
void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // A NULL lock pointer is only diagnosed when consistency checking is on.
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_lock");
  }
  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored for this thread if there is one (presumably
  // stored by an outer entry point -- TODO confirm); otherwise fall back to
  // this function's own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // A NULL lock pointer is only diagnosed when consistency checking is on.
  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // TAS (and futex) locks whose poll word fits in OMP_LOCK_T_SIZE use the
  // user's storage directly; every other kind is allocated separately.
  // lck is only named inside sizeof here, so it is not read uninitialized.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }
  INIT_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored for this thread if there is one (presumably
  // stored by an outer entry point -- TODO confirm); otherwise fall back to
  // this function's own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_lock
2418 
/* Initialize a nestable lock without a hint.
   With KMP_USE_DYNAMIC_LOCK the lock kind comes from __kmp_user_lock_seq;
   otherwise it comes from __kmp_user_lock_kind, and the lock either uses the
   user's storage directly or is allocated separately. In both configurations
   an OMPT lock-init event is reported if a tool registered the callback. */
void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // A NULL lock pointer is only diagnosed when consistency checking is on.
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
  }
  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored for this thread if there is one (presumably
  // stored by an outer entry point -- TODO confirm); otherwise fall back to
  // this function's own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_nest_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // A NULL lock pointer is only diagnosed when consistency checking is on.
  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // TAS (and futex) locks whose poll word plus nesting depth fit in
  // OMP_NEST_LOCK_T_SIZE use the user's storage directly; every other kind is
  // allocated separately. lck is only named inside sizeof here, so it is not
  // read uninitialized.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }

  INIT_NESTED_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored for this thread if there is one (presumably
  // stored by an outer entry point -- TODO confirm); otherwise fall back to
  // this function's own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_nest_lock
2493 
2494 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2495 #if KMP_USE_DYNAMIC_LOCK
2496 
2497 #if USE_ITT_BUILD
2498   kmp_user_lock_p lck;
2499   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2500     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2501   } else {
2502     lck = (kmp_user_lock_p)user_lock;
2503   }
2504   __kmp_itt_lock_destroyed(lck);
2505 #endif
2506 #if OMPT_SUPPORT && OMPT_OPTIONAL
2507   // This is the case, if called from omp_init_lock_with_hint:
2508   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2509   if (!codeptr)
2510     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2511   if (ompt_enabled.ompt_callback_lock_destroy) {
2512     kmp_user_lock_p lck;
2513     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2514       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2515     } else {
2516       lck = (kmp_user_lock_p)user_lock;
2517     }
2518     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2519         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2520   }
2521 #endif
2522   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2523 #else
2524   kmp_user_lock_p lck;
2525 
2526   if ((__kmp_user_lock_kind == lk_tas) &&
2527       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2528     lck = (kmp_user_lock_p)user_lock;
2529   }
2530 #if KMP_USE_FUTEX
2531   else if ((__kmp_user_lock_kind == lk_futex) &&
2532            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2533     lck = (kmp_user_lock_p)user_lock;
2534   }
2535 #endif
2536   else {
2537     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2538   }
2539 
2540 #if OMPT_SUPPORT && OMPT_OPTIONAL
2541   // This is the case, if called from omp_init_lock_with_hint:
2542   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2543   if (!codeptr)
2544     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2545   if (ompt_enabled.ompt_callback_lock_destroy) {
2546     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2547         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2548   }
2549 #endif
2550 
2551 #if USE_ITT_BUILD
2552   __kmp_itt_lock_destroyed(lck);
2553 #endif /* USE_ITT_BUILD */
2554   DESTROY_LOCK(lck);
2555 
2556   if ((__kmp_user_lock_kind == lk_tas) &&
2557       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2558     ;
2559   }
2560 #if KMP_USE_FUTEX
2561   else if ((__kmp_user_lock_kind == lk_futex) &&
2562            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2563     ;
2564   }
2565 #endif
2566   else {
2567     __kmp_user_lock_free(user_lock, gtid, lck);
2568   }
2569 #endif // KMP_USE_DYNAMIC_LOCK
2570 } // __kmpc_destroy_lock
2571 
2572 /* destroy the lock */
2573 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2574 #if KMP_USE_DYNAMIC_LOCK
2575 
2576 #if USE_ITT_BUILD
2577   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2578   __kmp_itt_lock_destroyed(ilk->lock);
2579 #endif
2580 #if OMPT_SUPPORT && OMPT_OPTIONAL
2581   // This is the case, if called from omp_init_lock_with_hint:
2582   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2583   if (!codeptr)
2584     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2585   if (ompt_enabled.ompt_callback_lock_destroy) {
2586     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2587         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2588   }
2589 #endif
2590   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2591 
2592 #else // KMP_USE_DYNAMIC_LOCK
2593 
2594   kmp_user_lock_p lck;
2595 
2596   if ((__kmp_user_lock_kind == lk_tas) &&
2597       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2598        OMP_NEST_LOCK_T_SIZE)) {
2599     lck = (kmp_user_lock_p)user_lock;
2600   }
2601 #if KMP_USE_FUTEX
2602   else if ((__kmp_user_lock_kind == lk_futex) &&
2603            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2604             OMP_NEST_LOCK_T_SIZE)) {
2605     lck = (kmp_user_lock_p)user_lock;
2606   }
2607 #endif
2608   else {
2609     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2610   }
2611 
2612 #if OMPT_SUPPORT && OMPT_OPTIONAL
2613   // This is the case, if called from omp_init_lock_with_hint:
2614   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2615   if (!codeptr)
2616     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2617   if (ompt_enabled.ompt_callback_lock_destroy) {
2618     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2619         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2620   }
2621 #endif
2622 
2623 #if USE_ITT_BUILD
2624   __kmp_itt_lock_destroyed(lck);
2625 #endif /* USE_ITT_BUILD */
2626 
2627   DESTROY_NESTED_LOCK(lck);
2628 
2629   if ((__kmp_user_lock_kind == lk_tas) &&
2630       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2631        OMP_NEST_LOCK_T_SIZE)) {
2632     ;
2633   }
2634 #if KMP_USE_FUTEX
2635   else if ((__kmp_user_lock_kind == lk_futex) &&
2636            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2637             OMP_NEST_LOCK_T_SIZE)) {
2638     ;
2639   }
2640 #endif
2641   else {
2642     __kmp_user_lock_free(user_lock, gtid, lck);
2643   }
2644 #endif // KMP_USE_DYNAMIC_LOCK
2645 } // __kmpc_destroy_nest_lock
2646 
2647 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2648   KMP_COUNT_BLOCK(OMP_set_lock);
2649 #if KMP_USE_DYNAMIC_LOCK
2650   int tag = KMP_EXTRACT_D_TAG(user_lock);
2651 #if USE_ITT_BUILD
2652   __kmp_itt_lock_acquiring(
2653       (kmp_user_lock_p)
2654           user_lock); // itt function will get to the right lock object.
2655 #endif
2656 #if OMPT_SUPPORT && OMPT_OPTIONAL
2657   // This is the case, if called from omp_init_lock_with_hint:
2658   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2659   if (!codeptr)
2660     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2661   if (ompt_enabled.ompt_callback_mutex_acquire) {
2662     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2663         ompt_mutex_lock, omp_lock_hint_none,
2664         __ompt_get_mutex_impl_type(user_lock),
2665         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2666   }
2667 #endif
2668 #if KMP_USE_INLINED_TAS
2669   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2670     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2671   } else
2672 #elif KMP_USE_INLINED_FUTEX
2673   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2674     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2675   } else
2676 #endif
2677   {
2678     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2679   }
2680 #if USE_ITT_BUILD
2681   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2682 #endif
2683 #if OMPT_SUPPORT && OMPT_OPTIONAL
2684   if (ompt_enabled.ompt_callback_mutex_acquired) {
2685     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2686         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2687   }
2688 #endif
2689 
2690 #else // KMP_USE_DYNAMIC_LOCK
2691 
2692   kmp_user_lock_p lck;
2693 
2694   if ((__kmp_user_lock_kind == lk_tas) &&
2695       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2696     lck = (kmp_user_lock_p)user_lock;
2697   }
2698 #if KMP_USE_FUTEX
2699   else if ((__kmp_user_lock_kind == lk_futex) &&
2700            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2701     lck = (kmp_user_lock_p)user_lock;
2702   }
2703 #endif
2704   else {
2705     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2706   }
2707 
2708 #if USE_ITT_BUILD
2709   __kmp_itt_lock_acquiring(lck);
2710 #endif /* USE_ITT_BUILD */
2711 #if OMPT_SUPPORT && OMPT_OPTIONAL
2712   // This is the case, if called from omp_init_lock_with_hint:
2713   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2714   if (!codeptr)
2715     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2716   if (ompt_enabled.ompt_callback_mutex_acquire) {
2717     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2718         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2719         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2720   }
2721 #endif
2722 
2723   ACQUIRE_LOCK(lck, gtid);
2724 
2725 #if USE_ITT_BUILD
2726   __kmp_itt_lock_acquired(lck);
2727 #endif /* USE_ITT_BUILD */
2728 
2729 #if OMPT_SUPPORT && OMPT_OPTIONAL
2730   if (ompt_enabled.ompt_callback_mutex_acquired) {
2731     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2732         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2733   }
2734 #endif
2735 
2736 #endif // KMP_USE_DYNAMIC_LOCK
2737 }
2738 
2739 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2740 #if KMP_USE_DYNAMIC_LOCK
2741 
2742 #if USE_ITT_BUILD
2743   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2744 #endif
2745 #if OMPT_SUPPORT && OMPT_OPTIONAL
2746   // This is the case, if called from omp_init_lock_with_hint:
2747   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2748   if (!codeptr)
2749     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2750   if (ompt_enabled.enabled) {
2751     if (ompt_enabled.ompt_callback_mutex_acquire) {
2752       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2753           ompt_mutex_nest_lock, omp_lock_hint_none,
2754           __ompt_get_mutex_impl_type(user_lock),
2755           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2756     }
2757   }
2758 #endif
2759   int acquire_status =
2760       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2761   (void)acquire_status;
2762 #if USE_ITT_BUILD
2763   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2764 #endif
2765 
2766 #if OMPT_SUPPORT && OMPT_OPTIONAL
2767   if (ompt_enabled.enabled) {
2768     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2769       if (ompt_enabled.ompt_callback_mutex_acquired) {
2770         // lock_first
2771         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2772             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2773             codeptr);
2774       }
2775     } else {
2776       if (ompt_enabled.ompt_callback_nest_lock) {
2777         // lock_next
2778         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2779             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2780       }
2781     }
2782   }
2783 #endif
2784 
2785 #else // KMP_USE_DYNAMIC_LOCK
2786   int acquire_status;
2787   kmp_user_lock_p lck;
2788 
2789   if ((__kmp_user_lock_kind == lk_tas) &&
2790       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2791        OMP_NEST_LOCK_T_SIZE)) {
2792     lck = (kmp_user_lock_p)user_lock;
2793   }
2794 #if KMP_USE_FUTEX
2795   else if ((__kmp_user_lock_kind == lk_futex) &&
2796            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2797             OMP_NEST_LOCK_T_SIZE)) {
2798     lck = (kmp_user_lock_p)user_lock;
2799   }
2800 #endif
2801   else {
2802     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2803   }
2804 
2805 #if USE_ITT_BUILD
2806   __kmp_itt_lock_acquiring(lck);
2807 #endif /* USE_ITT_BUILD */
2808 #if OMPT_SUPPORT && OMPT_OPTIONAL
2809   // This is the case, if called from omp_init_lock_with_hint:
2810   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2811   if (!codeptr)
2812     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2813   if (ompt_enabled.enabled) {
2814     if (ompt_enabled.ompt_callback_mutex_acquire) {
2815       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2816           ompt_mutex_nest_lock, omp_lock_hint_none,
2817           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2818           codeptr);
2819     }
2820   }
2821 #endif
2822 
2823   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2824 
2825 #if USE_ITT_BUILD
2826   __kmp_itt_lock_acquired(lck);
2827 #endif /* USE_ITT_BUILD */
2828 
2829 #if OMPT_SUPPORT && OMPT_OPTIONAL
2830   if (ompt_enabled.enabled) {
2831     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2832       if (ompt_enabled.ompt_callback_mutex_acquired) {
2833         // lock_first
2834         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2835             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2836       }
2837     } else {
2838       if (ompt_enabled.ompt_callback_nest_lock) {
2839         // lock_next
2840         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2841             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2842       }
2843     }
2844   }
2845 #endif
2846 
2847 #endif // KMP_USE_DYNAMIC_LOCK
2848 }
2849 
2850 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2851 #if KMP_USE_DYNAMIC_LOCK
2852 
2853   int tag = KMP_EXTRACT_D_TAG(user_lock);
2854 #if USE_ITT_BUILD
2855   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2856 #endif
2857 #if KMP_USE_INLINED_TAS
2858   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2859     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2860   } else
2861 #elif KMP_USE_INLINED_FUTEX
2862   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2863     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2864   } else
2865 #endif
2866   {
2867     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2868   }
2869 
2870 #if OMPT_SUPPORT && OMPT_OPTIONAL
2871   // This is the case, if called from omp_init_lock_with_hint:
2872   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2873   if (!codeptr)
2874     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2875   if (ompt_enabled.ompt_callback_mutex_released) {
2876     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2877         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2878   }
2879 #endif
2880 
2881 #else // KMP_USE_DYNAMIC_LOCK
2882 
2883   kmp_user_lock_p lck;
2884 
2885   /* Can't use serial interval since not block structured */
2886   /* release the lock */
2887 
2888   if ((__kmp_user_lock_kind == lk_tas) &&
2889       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2890 #if KMP_OS_LINUX &&                                                            \
2891     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2892 // "fast" path implemented to fix customer performance issue
2893 #if USE_ITT_BUILD
2894     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2895 #endif /* USE_ITT_BUILD */
2896     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2897     KMP_MB();
2898 
2899 #if OMPT_SUPPORT && OMPT_OPTIONAL
2900     // This is the case, if called from omp_init_lock_with_hint:
2901     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2902     if (!codeptr)
2903       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2904     if (ompt_enabled.ompt_callback_mutex_released) {
2905       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2906           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2907     }
2908 #endif
2909 
2910     return;
2911 #else
2912     lck = (kmp_user_lock_p)user_lock;
2913 #endif
2914   }
2915 #if KMP_USE_FUTEX
2916   else if ((__kmp_user_lock_kind == lk_futex) &&
2917            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2918     lck = (kmp_user_lock_p)user_lock;
2919   }
2920 #endif
2921   else {
2922     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2923   }
2924 
2925 #if USE_ITT_BUILD
2926   __kmp_itt_lock_releasing(lck);
2927 #endif /* USE_ITT_BUILD */
2928 
2929   RELEASE_LOCK(lck, gtid);
2930 
2931 #if OMPT_SUPPORT && OMPT_OPTIONAL
2932   // This is the case, if called from omp_init_lock_with_hint:
2933   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2934   if (!codeptr)
2935     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2936   if (ompt_enabled.ompt_callback_mutex_released) {
2937     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2938         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2939   }
2940 #endif
2941 
2942 #endif // KMP_USE_DYNAMIC_LOCK
2943 }
2944 
2945 /* release the lock */
2946 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2947 #if KMP_USE_DYNAMIC_LOCK
2948 
2949 #if USE_ITT_BUILD
2950   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2951 #endif
2952   int release_status =
2953       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2954   (void)release_status;
2955 
2956 #if OMPT_SUPPORT && OMPT_OPTIONAL
2957   // This is the case, if called from omp_init_lock_with_hint:
2958   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2959   if (!codeptr)
2960     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2961   if (ompt_enabled.enabled) {
2962     if (release_status == KMP_LOCK_RELEASED) {
2963       if (ompt_enabled.ompt_callback_mutex_released) {
2964         // release_lock_last
2965         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2966             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2967             codeptr);
2968       }
2969     } else if (ompt_enabled.ompt_callback_nest_lock) {
2970       // release_lock_prev
2971       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2972           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2973     }
2974   }
2975 #endif
2976 
2977 #else // KMP_USE_DYNAMIC_LOCK
2978 
2979   kmp_user_lock_p lck;
2980 
2981   /* Can't use serial interval since not block structured */
2982 
2983   if ((__kmp_user_lock_kind == lk_tas) &&
2984       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2985        OMP_NEST_LOCK_T_SIZE)) {
2986 #if KMP_OS_LINUX &&                                                            \
2987     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2988     // "fast" path implemented to fix customer performance issue
2989     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2990 #if USE_ITT_BUILD
2991     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2992 #endif /* USE_ITT_BUILD */
2993 
2994 #if OMPT_SUPPORT && OMPT_OPTIONAL
2995     int release_status = KMP_LOCK_STILL_HELD;
2996 #endif
2997 
2998     if (--(tl->lk.depth_locked) == 0) {
2999       TCW_4(tl->lk.poll, 0);
3000 #if OMPT_SUPPORT && OMPT_OPTIONAL
3001       release_status = KMP_LOCK_RELEASED;
3002 #endif
3003     }
3004     KMP_MB();
3005 
3006 #if OMPT_SUPPORT && OMPT_OPTIONAL
3007     // This is the case, if called from omp_init_lock_with_hint:
3008     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3009     if (!codeptr)
3010       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3011     if (ompt_enabled.enabled) {
3012       if (release_status == KMP_LOCK_RELEASED) {
3013         if (ompt_enabled.ompt_callback_mutex_released) {
3014           // release_lock_last
3015           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3016               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3017         }
3018       } else if (ompt_enabled.ompt_callback_nest_lock) {
3019         // release_lock_previous
3020         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3021             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3022       }
3023     }
3024 #endif
3025 
3026     return;
3027 #else
3028     lck = (kmp_user_lock_p)user_lock;
3029 #endif
3030   }
3031 #if KMP_USE_FUTEX
3032   else if ((__kmp_user_lock_kind == lk_futex) &&
3033            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3034             OMP_NEST_LOCK_T_SIZE)) {
3035     lck = (kmp_user_lock_p)user_lock;
3036   }
3037 #endif
3038   else {
3039     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3040   }
3041 
3042 #if USE_ITT_BUILD
3043   __kmp_itt_lock_releasing(lck);
3044 #endif /* USE_ITT_BUILD */
3045 
3046   int release_status;
3047   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3048 #if OMPT_SUPPORT && OMPT_OPTIONAL
3049   // This is the case, if called from omp_init_lock_with_hint:
3050   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3051   if (!codeptr)
3052     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3053   if (ompt_enabled.enabled) {
3054     if (release_status == KMP_LOCK_RELEASED) {
3055       if (ompt_enabled.ompt_callback_mutex_released) {
3056         // release_lock_last
3057         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3058             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3059       }
3060     } else if (ompt_enabled.ompt_callback_nest_lock) {
3061       // release_lock_previous
3062       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3063           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3064     }
3065   }
3066 #endif
3067 
3068 #endif // KMP_USE_DYNAMIC_LOCK
3069 }
3070 
3071 /* try to acquire the lock */
3072 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3073   KMP_COUNT_BLOCK(OMP_test_lock);
3074 
3075 #if KMP_USE_DYNAMIC_LOCK
3076   int rc;
3077   int tag = KMP_EXTRACT_D_TAG(user_lock);
3078 #if USE_ITT_BUILD
3079   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3080 #endif
3081 #if OMPT_SUPPORT && OMPT_OPTIONAL
3082   // This is the case, if called from omp_init_lock_with_hint:
3083   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3084   if (!codeptr)
3085     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3086   if (ompt_enabled.ompt_callback_mutex_acquire) {
3087     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3088         ompt_mutex_lock, omp_lock_hint_none,
3089         __ompt_get_mutex_impl_type(user_lock),
3090         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3091   }
3092 #endif
3093 #if KMP_USE_INLINED_TAS
3094   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3095     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3096   } else
3097 #elif KMP_USE_INLINED_FUTEX
3098   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3099     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3100   } else
3101 #endif
3102   {
3103     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3104   }
3105   if (rc) {
3106 #if USE_ITT_BUILD
3107     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3108 #endif
3109 #if OMPT_SUPPORT && OMPT_OPTIONAL
3110     if (ompt_enabled.ompt_callback_mutex_acquired) {
3111       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3112           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3113     }
3114 #endif
3115     return FTN_TRUE;
3116   } else {
3117 #if USE_ITT_BUILD
3118     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3119 #endif
3120     return FTN_FALSE;
3121   }
3122 
3123 #else // KMP_USE_DYNAMIC_LOCK
3124 
3125   kmp_user_lock_p lck;
3126   int rc;
3127 
3128   if ((__kmp_user_lock_kind == lk_tas) &&
3129       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3130     lck = (kmp_user_lock_p)user_lock;
3131   }
3132 #if KMP_USE_FUTEX
3133   else if ((__kmp_user_lock_kind == lk_futex) &&
3134            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3135     lck = (kmp_user_lock_p)user_lock;
3136   }
3137 #endif
3138   else {
3139     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3140   }
3141 
3142 #if USE_ITT_BUILD
3143   __kmp_itt_lock_acquiring(lck);
3144 #endif /* USE_ITT_BUILD */
3145 #if OMPT_SUPPORT && OMPT_OPTIONAL
3146   // This is the case, if called from omp_init_lock_with_hint:
3147   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3148   if (!codeptr)
3149     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3150   if (ompt_enabled.ompt_callback_mutex_acquire) {
3151     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3152         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3153         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3154   }
3155 #endif
3156 
3157   rc = TEST_LOCK(lck, gtid);
3158 #if USE_ITT_BUILD
3159   if (rc) {
3160     __kmp_itt_lock_acquired(lck);
3161   } else {
3162     __kmp_itt_lock_cancelled(lck);
3163   }
3164 #endif /* USE_ITT_BUILD */
3165 #if OMPT_SUPPORT && OMPT_OPTIONAL
3166   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3167     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3168         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3169   }
3170 #endif
3171 
3172   return (rc ? FTN_TRUE : FTN_FALSE);
3173 
3174   /* Can't use serial interval since not block structured */
3175 
3176 #endif // KMP_USE_DYNAMIC_LOCK
3177 }
3178 
3179 /* try to acquire the lock */
3180 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3181 #if KMP_USE_DYNAMIC_LOCK
3182   int rc;
3183 #if USE_ITT_BUILD
3184   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3185 #endif
3186 #if OMPT_SUPPORT && OMPT_OPTIONAL
3187   // This is the case, if called from omp_init_lock_with_hint:
3188   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3189   if (!codeptr)
3190     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3191   if (ompt_enabled.ompt_callback_mutex_acquire) {
3192     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3193         ompt_mutex_nest_lock, omp_lock_hint_none,
3194         __ompt_get_mutex_impl_type(user_lock),
3195         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3196   }
3197 #endif
3198   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3199 #if USE_ITT_BUILD
3200   if (rc) {
3201     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3202   } else {
3203     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3204   }
3205 #endif
3206 #if OMPT_SUPPORT && OMPT_OPTIONAL
3207   if (ompt_enabled.enabled && rc) {
3208     if (rc == 1) {
3209       if (ompt_enabled.ompt_callback_mutex_acquired) {
3210         // lock_first
3211         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3212             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3213             codeptr);
3214       }
3215     } else {
3216       if (ompt_enabled.ompt_callback_nest_lock) {
3217         // lock_next
3218         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3219             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3220       }
3221     }
3222   }
3223 #endif
3224   return rc;
3225 
3226 #else // KMP_USE_DYNAMIC_LOCK
3227 
3228   kmp_user_lock_p lck;
3229   int rc;
3230 
3231   if ((__kmp_user_lock_kind == lk_tas) &&
3232       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3233        OMP_NEST_LOCK_T_SIZE)) {
3234     lck = (kmp_user_lock_p)user_lock;
3235   }
3236 #if KMP_USE_FUTEX
3237   else if ((__kmp_user_lock_kind == lk_futex) &&
3238            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3239             OMP_NEST_LOCK_T_SIZE)) {
3240     lck = (kmp_user_lock_p)user_lock;
3241   }
3242 #endif
3243   else {
3244     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3245   }
3246 
3247 #if USE_ITT_BUILD
3248   __kmp_itt_lock_acquiring(lck);
3249 #endif /* USE_ITT_BUILD */
3250 
3251 #if OMPT_SUPPORT && OMPT_OPTIONAL
3252   // This is the case, if called from omp_init_lock_with_hint:
3253   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3254   if (!codeptr)
3255     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3256   if (ompt_enabled.enabled) &&
3257         ompt_enabled.ompt_callback_mutex_acquire) {
3258       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3259           ompt_mutex_nest_lock, omp_lock_hint_none,
3260           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3261           codeptr);
3262     }
3263 #endif
3264 
3265   rc = TEST_NESTED_LOCK(lck, gtid);
3266 #if USE_ITT_BUILD
3267   if (rc) {
3268     __kmp_itt_lock_acquired(lck);
3269   } else {
3270     __kmp_itt_lock_cancelled(lck);
3271   }
3272 #endif /* USE_ITT_BUILD */
3273 #if OMPT_SUPPORT && OMPT_OPTIONAL
3274   if (ompt_enabled.enabled && rc) {
3275     if (rc == 1) {
3276       if (ompt_enabled.ompt_callback_mutex_acquired) {
3277         // lock_first
3278         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3279             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3280       }
3281     } else {
3282       if (ompt_enabled.ompt_callback_nest_lock) {
3283         // lock_next
3284         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3285             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3286       }
3287     }
3288   }
3289 #endif
3290   return rc;
3291 
3292   /* Can't use serial interval since not block structured */
3293 
3294 #endif // KMP_USE_DYNAMIC_LOCK
3295 }
3296 
// Interface to the fast scalable reduction routines

// The selected reduction method is kept in a thread-local structure so that
// it can be used across functions: it will be read back in the
// __kmpc_end_reduce* functions.
// An alternative would be to re-determine the method a second time in the
// __kmpc_end_reduce* functions (a new prototype would then be required).
// AT: which solution is better?

// Store `rmethod` as the packed reduction method of thread `gtid`.
#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))

// Read the packed reduction method stored for thread `gtid`.
#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)

// For a description of the packed_reduction_method variable, see the macros
// in kmp.h
3312 
3313 // used in a critical section reduce block
3314 static __forceinline void
3315 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3316                                           kmp_critical_name *crit) {
3317 
3318   // this lock was visible to a customer and to the threading profile tool as a
3319   // serial overhead span (although it's used for an internal purpose only)
3320   //            why was it visible in previous implementation?
3321   //            should we keep it visible in new reduce block?
3322   kmp_user_lock_p lck;
3323 
3324 #if KMP_USE_DYNAMIC_LOCK
3325 
3326   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3327   // Check if it is initialized.
3328   if (*lk == 0) {
3329     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3330       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3331                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3332     } else {
3333       __kmp_init_indirect_csptr(crit, loc, global_tid,
3334                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3335     }
3336   }
3337   // Branch for accessing the actual lock object and set operation. This
3338   // branching is inevitable since this lock initialization does not follow the
3339   // normal dispatch path (lock table is not used).
3340   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3341     lck = (kmp_user_lock_p)lk;
3342     KMP_DEBUG_ASSERT(lck != NULL);
3343     if (__kmp_env_consistency_check) {
3344       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3345     }
3346     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3347   } else {
3348     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3349     lck = ilk->lock;
3350     KMP_DEBUG_ASSERT(lck != NULL);
3351     if (__kmp_env_consistency_check) {
3352       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3353     }
3354     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3355   }
3356 
3357 #else // KMP_USE_DYNAMIC_LOCK
3358 
3359   // We know that the fast reduction code is only emitted by Intel compilers
3360   // with 32 byte critical sections. If there isn't enough space, then we
3361   // have to use a pointer.
3362   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3363     lck = (kmp_user_lock_p)crit;
3364   } else {
3365     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3366   }
3367   KMP_DEBUG_ASSERT(lck != NULL);
3368 
3369   if (__kmp_env_consistency_check)
3370     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3371 
3372   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3373 
3374 #endif // KMP_USE_DYNAMIC_LOCK
3375 }
3376 
3377 // used in a critical section reduce block
3378 static __forceinline void
3379 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3380                                         kmp_critical_name *crit) {
3381 
3382   kmp_user_lock_p lck;
3383 
3384 #if KMP_USE_DYNAMIC_LOCK
3385 
3386   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3387     lck = (kmp_user_lock_p)crit;
3388     if (__kmp_env_consistency_check)
3389       __kmp_pop_sync(global_tid, ct_critical, loc);
3390     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3391   } else {
3392     kmp_indirect_lock_t *ilk =
3393         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3394     if (__kmp_env_consistency_check)
3395       __kmp_pop_sync(global_tid, ct_critical, loc);
3396     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3397   }
3398 
3399 #else // KMP_USE_DYNAMIC_LOCK
3400 
3401   // We know that the fast reduction code is only emitted by Intel compilers
3402   // with 32 byte critical sections. If there isn't enough space, then we have
3403   // to use a pointer.
3404   if (__kmp_base_user_lock_size > 32) {
3405     lck = *((kmp_user_lock_p *)crit);
3406     KMP_ASSERT(lck != NULL);
3407   } else {
3408     lck = (kmp_user_lock_p)crit;
3409   }
3410 
3411   if (__kmp_env_consistency_check)
3412     __kmp_pop_sync(global_tid, ct_critical, loc);
3413 
3414   __kmp_release_user_lock_with_checks(lck, global_tid);
3415 
3416 #endif // KMP_USE_DYNAMIC_LOCK
3417 } // __kmp_end_critical_section_reduce_block
3418 
3419 static __forceinline int
3420 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3421                                      int *task_state) {
3422   kmp_team_t *team;
3423 
3424   // Check if we are inside the teams construct?
3425   if (th->th.th_teams_microtask) {
3426     *team_p = team = th->th.th_team;
3427     if (team->t.t_level == th->th.th_teams_level) {
3428       // This is reduction at teams construct.
3429       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3430       // Let's swap teams temporarily for the reduction.
3431       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3432       th->th.th_team = team->t.t_parent;
3433       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3434       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3435       *task_state = th->th.th_task_state;
3436       th->th.th_task_state = 0;
3437 
3438       return 1;
3439     }
3440   }
3441   return 0;
3442 }
3443 
3444 static __forceinline void
3445 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3446   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3447   th->th.th_info.ds.ds_tid = 0;
3448   th->th.th_team = team;
3449   th->th.th_team_nproc = team->t.t_nproc;
3450   th->th.th_task_team = team->t.t_task_team[task_state];
3451   __kmp_type_convert(task_state, &(th->th.th_task_state));
3452 }
3453 
3454 /* 2.a.i. Reduce Block without a terminating barrier */
3455 /*!
3456 @ingroup SYNCHRONIZATION
3457 @param loc source location information
3458 @param global_tid global thread number
3459 @param num_vars number of items (variables) to be reduced
3460 @param reduce_size size of data in bytes to be reduced
3461 @param reduce_data pointer to data to be reduced
3462 @param reduce_func callback function providing reduction operation on two
3463 operands and returning result of reduction in lhs_data
3464 @param lck pointer to the unique lock data structure
3465 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3466 threads if atomic reduction needed
3467 
3468 The nowait version is used for a reduce clause with the nowait argument.
3469 */
3470 kmp_int32
3471 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3472                      size_t reduce_size, void *reduce_data,
3473                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3474                      kmp_critical_name *lck) {
3475 
3476   KMP_COUNT_BLOCK(REDUCE_nowait);
3477   int retval = 0;
3478   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3479   kmp_info_t *th;
3480   kmp_team_t *team;
3481   int teams_swapped = 0, task_state;
3482   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3483   __kmp_assert_valid_gtid(global_tid);
3484 
3485   // why do we need this initialization here at all?
3486   // Reduction clause can not be used as a stand-alone directive.
3487 
3488   // do not call __kmp_serial_initialize(), it will be called by
3489   // __kmp_parallel_initialize() if needed
3490   // possible detection of false-positive race by the threadchecker ???
3491   if (!TCR_4(__kmp_init_parallel))
3492     __kmp_parallel_initialize();
3493 
3494   __kmp_resume_if_soft_paused();
3495 
3496 // check correctness of reduce block nesting
3497 #if KMP_USE_DYNAMIC_LOCK
3498   if (__kmp_env_consistency_check)
3499     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3500 #else
3501   if (__kmp_env_consistency_check)
3502     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3503 #endif
3504 
3505   th = __kmp_thread_from_gtid(global_tid);
3506   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3507 
3508   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3509   // the value should be kept in a variable
3510   // the variable should be either a construct-specific or thread-specific
3511   // property, not a team specific property
3512   //     (a thread can reach the next reduce block on the next construct, reduce
3513   //     method may differ on the next construct)
3514   // an ident_t "loc" parameter could be used as a construct-specific property
3515   // (what if loc == 0?)
3516   //     (if both construct-specific and team-specific variables were shared,
3517   //     then unness extra syncs should be needed)
3518   // a thread-specific variable is better regarding two issues above (next
3519   // construct and extra syncs)
3520   // a thread-specific "th_local.reduction_method" variable is used currently
3521   // each thread executes 'determine' and 'set' lines (no need to execute by one
3522   // thread, to avoid unness extra syncs)
3523 
3524   packed_reduction_method = __kmp_determine_reduction_method(
3525       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3526   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3527 
3528   OMPT_REDUCTION_DECL(th, global_tid);
3529   if (packed_reduction_method == critical_reduce_block) {
3530 
3531     OMPT_REDUCTION_BEGIN;
3532 
3533     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3534     retval = 1;
3535 
3536   } else if (packed_reduction_method == empty_reduce_block) {
3537 
3538     OMPT_REDUCTION_BEGIN;
3539 
3540     // usage: if team size == 1, no synchronization is required ( Intel
3541     // platforms only )
3542     retval = 1;
3543 
3544   } else if (packed_reduction_method == atomic_reduce_block) {
3545 
3546     retval = 2;
3547 
3548     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3549     // won't be called by the code gen)
3550     //     (it's not quite good, because the checking block has been closed by
3551     //     this 'pop',
3552     //      but atomic operation has not been executed yet, will be executed
3553     //      slightly later, literally on next instruction)
3554     if (__kmp_env_consistency_check)
3555       __kmp_pop_sync(global_tid, ct_reduce, loc);
3556 
3557   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3558                                    tree_reduce_block)) {
3559 
3560 // AT: performance issue: a real barrier here
3561 // AT: (if primary thread is slow, other threads are blocked here waiting for
3562 //      the primary thread to come and release them)
3563 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3564 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3565 //      be confusing to a customer)
3566 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3567 // might go faster and be more in line with sense of NOWAIT
3568 // AT: TO DO: do epcc test and compare times
3569 
3570 // this barrier should be invisible to a customer and to the threading profile
3571 // tool (it's neither a terminating barrier nor customer's code, it's
3572 // used for an internal purpose)
3573 #if OMPT_SUPPORT
3574     // JP: can this barrier potentially leed to task scheduling?
3575     // JP: as long as there is a barrier in the implementation, OMPT should and
3576     // will provide the barrier events
3577     //         so we set-up the necessary frame/return addresses.
3578     ompt_frame_t *ompt_frame;
3579     if (ompt_enabled.enabled) {
3580       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3581       if (ompt_frame->enter_frame.ptr == NULL)
3582         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3583     }
3584     OMPT_STORE_RETURN_ADDRESS(global_tid);
3585 #endif
3586 #if USE_ITT_NOTIFY
3587     __kmp_threads[global_tid]->th.th_ident = loc;
3588 #endif
3589     retval =
3590         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3591                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3592     retval = (retval != 0) ? (0) : (1);
3593 #if OMPT_SUPPORT && OMPT_OPTIONAL
3594     if (ompt_enabled.enabled) {
3595       ompt_frame->enter_frame = ompt_data_none;
3596     }
3597 #endif
3598 
3599     // all other workers except primary thread should do this pop here
3600     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3601     if (__kmp_env_consistency_check) {
3602       if (retval == 0) {
3603         __kmp_pop_sync(global_tid, ct_reduce, loc);
3604       }
3605     }
3606 
3607   } else {
3608 
3609     // should never reach this block
3610     KMP_ASSERT(0); // "unexpected method"
3611   }
3612   if (teams_swapped) {
3613     __kmp_restore_swapped_teams(th, team, task_state);
3614   }
3615   KA_TRACE(
3616       10,
3617       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3618        global_tid, packed_reduction_method, retval));
3619 
3620   return retval;
3621 }
3622 
3623 /*!
3624 @ingroup SYNCHRONIZATION
3625 @param loc source location information
3626 @param global_tid global thread id.
3627 @param lck pointer to the unique lock data structure
3628 
3629 Finish the execution of a reduce nowait.
3630 */
3631 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3632                               kmp_critical_name *lck) {
3633 
3634   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3635 
3636   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3637   __kmp_assert_valid_gtid(global_tid);
3638 
3639   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3640 
3641   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3642 
3643   if (packed_reduction_method == critical_reduce_block) {
3644 
3645     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3646     OMPT_REDUCTION_END;
3647 
3648   } else if (packed_reduction_method == empty_reduce_block) {
3649 
3650     // usage: if team size == 1, no synchronization is required ( on Intel
3651     // platforms only )
3652 
3653     OMPT_REDUCTION_END;
3654 
3655   } else if (packed_reduction_method == atomic_reduce_block) {
3656 
3657     // neither primary thread nor other workers should get here
3658     //     (code gen does not generate this call in case 2: atomic reduce block)
3659     // actually it's better to remove this elseif at all;
3660     // after removal this value will checked by the 'else' and will assert
3661 
3662   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3663                                    tree_reduce_block)) {
3664 
3665     // only primary thread gets here
3666     // OMPT: tree reduction is annotated in the barrier code
3667 
3668   } else {
3669 
3670     // should never reach this block
3671     KMP_ASSERT(0); // "unexpected method"
3672   }
3673 
3674   if (__kmp_env_consistency_check)
3675     __kmp_pop_sync(global_tid, ct_reduce, loc);
3676 
3677   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3678                 global_tid, packed_reduction_method));
3679 
3680   return;
3681 }
3682 
3683 /* 2.a.ii. Reduce Block with a terminating barrier */
3684 
3685 /*!
3686 @ingroup SYNCHRONIZATION
3687 @param loc source location information
3688 @param global_tid global thread number
3689 @param num_vars number of items (variables) to be reduced
3690 @param reduce_size size of data in bytes to be reduced
3691 @param reduce_data pointer to data to be reduced
3692 @param reduce_func callback function providing reduction operation on two
3693 operands and returning result of reduction in lhs_data
3694 @param lck pointer to the unique lock data structure
3695 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3696 threads if atomic reduction needed
3697 
3698 A blocking reduce that includes an implicit barrier.
3699 */
3700 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3701                         size_t reduce_size, void *reduce_data,
3702                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3703                         kmp_critical_name *lck) {
3704   KMP_COUNT_BLOCK(REDUCE_wait);
3705   int retval = 0;
3706   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3707   kmp_info_t *th;
3708   kmp_team_t *team;
3709   int teams_swapped = 0, task_state;
3710 
3711   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3712   __kmp_assert_valid_gtid(global_tid);
3713 
3714   // why do we need this initialization here at all?
3715   // Reduction clause can not be a stand-alone directive.
3716 
3717   // do not call __kmp_serial_initialize(), it will be called by
3718   // __kmp_parallel_initialize() if needed
3719   // possible detection of false-positive race by the threadchecker ???
3720   if (!TCR_4(__kmp_init_parallel))
3721     __kmp_parallel_initialize();
3722 
3723   __kmp_resume_if_soft_paused();
3724 
3725 // check correctness of reduce block nesting
3726 #if KMP_USE_DYNAMIC_LOCK
3727   if (__kmp_env_consistency_check)
3728     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3729 #else
3730   if (__kmp_env_consistency_check)
3731     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3732 #endif
3733 
3734   th = __kmp_thread_from_gtid(global_tid);
3735   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3736 
3737   packed_reduction_method = __kmp_determine_reduction_method(
3738       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3739   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3740 
3741   OMPT_REDUCTION_DECL(th, global_tid);
3742 
3743   if (packed_reduction_method == critical_reduce_block) {
3744 
3745     OMPT_REDUCTION_BEGIN;
3746     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3747     retval = 1;
3748 
3749   } else if (packed_reduction_method == empty_reduce_block) {
3750 
3751     OMPT_REDUCTION_BEGIN;
3752     // usage: if team size == 1, no synchronization is required ( Intel
3753     // platforms only )
3754     retval = 1;
3755 
3756   } else if (packed_reduction_method == atomic_reduce_block) {
3757 
3758     retval = 2;
3759 
3760   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3761                                    tree_reduce_block)) {
3762 
3763 // case tree_reduce_block:
3764 // this barrier should be visible to a customer and to the threading profile
3765 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3766 #if OMPT_SUPPORT
3767     ompt_frame_t *ompt_frame;
3768     if (ompt_enabled.enabled) {
3769       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3770       if (ompt_frame->enter_frame.ptr == NULL)
3771         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3772     }
3773     OMPT_STORE_RETURN_ADDRESS(global_tid);
3774 #endif
3775 #if USE_ITT_NOTIFY
3776     __kmp_threads[global_tid]->th.th_ident =
3777         loc; // needed for correct notification of frames
3778 #endif
3779     retval =
3780         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3781                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3782     retval = (retval != 0) ? (0) : (1);
3783 #if OMPT_SUPPORT && OMPT_OPTIONAL
3784     if (ompt_enabled.enabled) {
3785       ompt_frame->enter_frame = ompt_data_none;
3786     }
3787 #endif
3788 
3789     // all other workers except primary thread should do this pop here
3790     // (none of other workers except primary will enter __kmpc_end_reduce())
3791     if (__kmp_env_consistency_check) {
3792       if (retval == 0) { // 0: all other workers; 1: primary thread
3793         __kmp_pop_sync(global_tid, ct_reduce, loc);
3794       }
3795     }
3796 
3797   } else {
3798 
3799     // should never reach this block
3800     KMP_ASSERT(0); // "unexpected method"
3801   }
3802   if (teams_swapped) {
3803     __kmp_restore_swapped_teams(th, team, task_state);
3804   }
3805 
3806   KA_TRACE(10,
3807            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3808             global_tid, packed_reduction_method, retval));
3809   return retval;
3810 }
3811 
3812 /*!
3813 @ingroup SYNCHRONIZATION
3814 @param loc source location information
3815 @param global_tid global thread id.
3816 @param lck pointer to the unique lock data structure
3817 
3818 Finish the execution of a blocking reduce.
3819 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3820 start function.
3821 */
3822 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3823                        kmp_critical_name *lck) {
3824 
3825   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3826   kmp_info_t *th;
3827   kmp_team_t *team;
3828   int teams_swapped = 0, task_state;
3829 
3830   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3831   __kmp_assert_valid_gtid(global_tid);
3832 
3833   th = __kmp_thread_from_gtid(global_tid);
3834   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3835 
3836   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3837 
3838   // this barrier should be visible to a customer and to the threading profile
3839   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3840   OMPT_REDUCTION_DECL(th, global_tid);
3841 
3842   if (packed_reduction_method == critical_reduce_block) {
3843     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3844 
3845     OMPT_REDUCTION_END;
3846 
3847 // TODO: implicit barrier: should be exposed
3848 #if OMPT_SUPPORT
3849     ompt_frame_t *ompt_frame;
3850     if (ompt_enabled.enabled) {
3851       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3852       if (ompt_frame->enter_frame.ptr == NULL)
3853         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3854     }
3855     OMPT_STORE_RETURN_ADDRESS(global_tid);
3856 #endif
3857 #if USE_ITT_NOTIFY
3858     __kmp_threads[global_tid]->th.th_ident = loc;
3859 #endif
3860     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3861 #if OMPT_SUPPORT && OMPT_OPTIONAL
3862     if (ompt_enabled.enabled) {
3863       ompt_frame->enter_frame = ompt_data_none;
3864     }
3865 #endif
3866 
3867   } else if (packed_reduction_method == empty_reduce_block) {
3868 
3869     OMPT_REDUCTION_END;
3870 
3871 // usage: if team size==1, no synchronization is required (Intel platforms only)
3872 
3873 // TODO: implicit barrier: should be exposed
3874 #if OMPT_SUPPORT
3875     ompt_frame_t *ompt_frame;
3876     if (ompt_enabled.enabled) {
3877       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3878       if (ompt_frame->enter_frame.ptr == NULL)
3879         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3880     }
3881     OMPT_STORE_RETURN_ADDRESS(global_tid);
3882 #endif
3883 #if USE_ITT_NOTIFY
3884     __kmp_threads[global_tid]->th.th_ident = loc;
3885 #endif
3886     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3887 #if OMPT_SUPPORT && OMPT_OPTIONAL
3888     if (ompt_enabled.enabled) {
3889       ompt_frame->enter_frame = ompt_data_none;
3890     }
3891 #endif
3892 
3893   } else if (packed_reduction_method == atomic_reduce_block) {
3894 
3895 #if OMPT_SUPPORT
3896     ompt_frame_t *ompt_frame;
3897     if (ompt_enabled.enabled) {
3898       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3899       if (ompt_frame->enter_frame.ptr == NULL)
3900         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3901     }
3902     OMPT_STORE_RETURN_ADDRESS(global_tid);
3903 #endif
3904 // TODO: implicit barrier: should be exposed
3905 #if USE_ITT_NOTIFY
3906     __kmp_threads[global_tid]->th.th_ident = loc;
3907 #endif
3908     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3909 #if OMPT_SUPPORT && OMPT_OPTIONAL
3910     if (ompt_enabled.enabled) {
3911       ompt_frame->enter_frame = ompt_data_none;
3912     }
3913 #endif
3914 
3915   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3916                                    tree_reduce_block)) {
3917 
3918     // only primary thread executes here (primary releases all other workers)
3919     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3920                             global_tid);
3921 
3922   } else {
3923 
3924     // should never reach this block
3925     KMP_ASSERT(0); // "unexpected method"
3926   }
3927   if (teams_swapped) {
3928     __kmp_restore_swapped_teams(th, team, task_state);
3929   }
3930 
3931   if (__kmp_env_consistency_check)
3932     __kmp_pop_sync(global_tid, ct_reduce, loc);
3933 
3934   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3935                 global_tid, packed_reduction_method));
3936 
3937   return;
3938 }
3939 
3940 #undef __KMP_GET_REDUCTION_METHOD
3941 #undef __KMP_SET_REDUCTION_METHOD
3942 
3943 /* end of interface to fast scalable reduce routines */
3944 
3945 kmp_uint64 __kmpc_get_taskid() {
3946 
3947   kmp_int32 gtid;
3948   kmp_info_t *thread;
3949 
3950   gtid = __kmp_get_gtid();
3951   if (gtid < 0) {
3952     return 0;
3953   }
3954   thread = __kmp_thread_from_gtid(gtid);
3955   return thread->th.th_current_task->td_task_id;
3956 
3957 } // __kmpc_get_taskid
3958 
3959 kmp_uint64 __kmpc_get_parent_taskid() {
3960 
3961   kmp_int32 gtid;
3962   kmp_info_t *thread;
3963   kmp_taskdata_t *parent_task;
3964 
3965   gtid = __kmp_get_gtid();
3966   if (gtid < 0) {
3967     return 0;
3968   }
3969   thread = __kmp_thread_from_gtid(gtid);
3970   parent_task = thread->th.th_current_task->td_parent;
3971   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3972 
3973 } // __kmpc_get_parent_taskid
3974 
3975 /*!
3976 @ingroup WORK_SHARING
3977 @param loc  source location information.
3978 @param gtid  global thread number.
3979 @param num_dims  number of associated doacross loops.
3980 @param dims  info on loops bounds.
3981 
3982 Initialize doacross loop information.
3983 Expect compiler send us inclusive bounds,
3984 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3985 */
3986 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3987                           const struct kmp_dim *dims) {
3988   __kmp_assert_valid_gtid(gtid);
3989   int j, idx;
3990   kmp_int64 last, trace_count;
3991   kmp_info_t *th = __kmp_threads[gtid];
3992   kmp_team_t *team = th->th.th_team;
3993   kmp_uint32 *flags;
3994   kmp_disp_t *pr_buf = th->th.th_dispatch;
3995   dispatch_shared_info_t *sh_buf;
3996 
3997   KA_TRACE(
3998       20,
3999       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4000        gtid, num_dims, !team->t.t_serialized));
4001   KMP_DEBUG_ASSERT(dims != NULL);
4002   KMP_DEBUG_ASSERT(num_dims > 0);
4003 
4004   if (team->t.t_serialized) {
4005     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4006     return; // no dependencies if team is serialized
4007   }
4008   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4009   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4010   // the next loop
4011   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4012 
4013   // Save bounds info into allocated private buffer
4014   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4015   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4016       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4017   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4018   pr_buf->th_doacross_info[0] =
4019       (kmp_int64)num_dims; // first element is number of dimensions
4020   // Save also address of num_done in order to access it later without knowing
4021   // the buffer index
4022   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4023   pr_buf->th_doacross_info[2] = dims[0].lo;
4024   pr_buf->th_doacross_info[3] = dims[0].up;
4025   pr_buf->th_doacross_info[4] = dims[0].st;
4026   last = 5;
4027   for (j = 1; j < num_dims; ++j) {
4028     kmp_int64
4029         range_length; // To keep ranges of all dimensions but the first dims[0]
4030     if (dims[j].st == 1) { // most common case
4031       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4032       range_length = dims[j].up - dims[j].lo + 1;
4033     } else {
4034       if (dims[j].st > 0) {
4035         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4036         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4037       } else { // negative increment
4038         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4039         range_length =
4040             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4041       }
4042     }
4043     pr_buf->th_doacross_info[last++] = range_length;
4044     pr_buf->th_doacross_info[last++] = dims[j].lo;
4045     pr_buf->th_doacross_info[last++] = dims[j].up;
4046     pr_buf->th_doacross_info[last++] = dims[j].st;
4047   }
4048 
4049   // Compute total trip count.
4050   // Start with range of dims[0] which we don't need to keep in the buffer.
4051   if (dims[0].st == 1) { // most common case
4052     trace_count = dims[0].up - dims[0].lo + 1;
4053   } else if (dims[0].st > 0) {
4054     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4055     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4056   } else { // negative increment
4057     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4058     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4059   }
4060   for (j = 1; j < num_dims; ++j) {
4061     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4062   }
4063   KMP_DEBUG_ASSERT(trace_count > 0);
4064 
4065   // Check if shared buffer is not occupied by other loop (idx -
4066   // __kmp_dispatch_num_buffers)
4067   if (idx != sh_buf->doacross_buf_idx) {
4068     // Shared buffer is occupied, wait for it to be free
4069     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4070                  __kmp_eq_4, NULL);
4071   }
4072 #if KMP_32_BIT_ARCH
4073   // Check if we are the first thread. After the CAS the first thread gets 0,
4074   // others get 1 if initialization is in progress, allocated pointer otherwise.
4075   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4076   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4077       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4078 #else
4079   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4080       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4081 #endif
4082   if (flags == NULL) {
4083     // we are the first thread, allocate the array of flags
4084     size_t size =
4085         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4086     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4087     KMP_MB();
4088     sh_buf->doacross_flags = flags;
4089   } else if (flags == (kmp_uint32 *)1) {
4090 #if KMP_32_BIT_ARCH
4091     // initialization is still in progress, need to wait
4092     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4093 #else
4094     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4095 #endif
4096       KMP_YIELD(TRUE);
4097     KMP_MB();
4098   } else {
4099     KMP_MB();
4100   }
4101   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4102   pr_buf->th_doacross_flags =
4103       sh_buf->doacross_flags; // save private copy in order to not
4104   // touch shared buffer on each iteration
4105   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4106 }
4107 
4108 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4109   __kmp_assert_valid_gtid(gtid);
4110   kmp_int64 shft;
4111   size_t num_dims, i;
4112   kmp_uint32 flag;
4113   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4114   kmp_info_t *th = __kmp_threads[gtid];
4115   kmp_team_t *team = th->th.th_team;
4116   kmp_disp_t *pr_buf;
4117   kmp_int64 lo, up, st;
4118 
4119   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4120   if (team->t.t_serialized) {
4121     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4122     return; // no dependencies if team is serialized
4123   }
4124 
4125   // calculate sequential iteration number and check out-of-bounds condition
4126   pr_buf = th->th.th_dispatch;
4127   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4128   num_dims = (size_t)pr_buf->th_doacross_info[0];
4129   lo = pr_buf->th_doacross_info[2];
4130   up = pr_buf->th_doacross_info[3];
4131   st = pr_buf->th_doacross_info[4];
4132 #if OMPT_SUPPORT && OMPT_OPTIONAL
4133   ompt_dependence_t deps[num_dims];
4134 #endif
4135   if (st == 1) { // most common case
4136     if (vec[0] < lo || vec[0] > up) {
4137       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4138                     "bounds [%lld,%lld]\n",
4139                     gtid, vec[0], lo, up));
4140       return;
4141     }
4142     iter_number = vec[0] - lo;
4143   } else if (st > 0) {
4144     if (vec[0] < lo || vec[0] > up) {
4145       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4146                     "bounds [%lld,%lld]\n",
4147                     gtid, vec[0], lo, up));
4148       return;
4149     }
4150     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4151   } else { // negative increment
4152     if (vec[0] > lo || vec[0] < up) {
4153       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4154                     "bounds [%lld,%lld]\n",
4155                     gtid, vec[0], lo, up));
4156       return;
4157     }
4158     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4159   }
4160 #if OMPT_SUPPORT && OMPT_OPTIONAL
4161   deps[0].variable.value = iter_number;
4162   deps[0].dependence_type = ompt_dependence_type_sink;
4163 #endif
4164   for (i = 1; i < num_dims; ++i) {
4165     kmp_int64 iter, ln;
4166     size_t j = i * 4;
4167     ln = pr_buf->th_doacross_info[j + 1];
4168     lo = pr_buf->th_doacross_info[j + 2];
4169     up = pr_buf->th_doacross_info[j + 3];
4170     st = pr_buf->th_doacross_info[j + 4];
4171     if (st == 1) {
4172       if (vec[i] < lo || vec[i] > up) {
4173         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4174                       "bounds [%lld,%lld]\n",
4175                       gtid, vec[i], lo, up));
4176         return;
4177       }
4178       iter = vec[i] - lo;
4179     } else if (st > 0) {
4180       if (vec[i] < lo || vec[i] > up) {
4181         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4182                       "bounds [%lld,%lld]\n",
4183                       gtid, vec[i], lo, up));
4184         return;
4185       }
4186       iter = (kmp_uint64)(vec[i] - lo) / st;
4187     } else { // st < 0
4188       if (vec[i] > lo || vec[i] < up) {
4189         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4190                       "bounds [%lld,%lld]\n",
4191                       gtid, vec[i], lo, up));
4192         return;
4193       }
4194       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4195     }
4196     iter_number = iter + ln * iter_number;
4197 #if OMPT_SUPPORT && OMPT_OPTIONAL
4198     deps[i].variable.value = iter;
4199     deps[i].dependence_type = ompt_dependence_type_sink;
4200 #endif
4201   }
4202   shft = iter_number % 32; // use 32-bit granularity
4203   iter_number >>= 5; // divided by 32
4204   flag = 1 << shft;
4205   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4206     KMP_YIELD(TRUE);
4207   }
4208   KMP_MB();
4209 #if OMPT_SUPPORT && OMPT_OPTIONAL
4210   if (ompt_enabled.ompt_callback_dependences) {
4211     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4212         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4213   }
4214 #endif
4215   KA_TRACE(20,
4216            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4217             gtid, (iter_number << 5) + shft));
4218 }
4219 
4220 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4221   __kmp_assert_valid_gtid(gtid);
4222   kmp_int64 shft;
4223   size_t num_dims, i;
4224   kmp_uint32 flag;
4225   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4226   kmp_info_t *th = __kmp_threads[gtid];
4227   kmp_team_t *team = th->th.th_team;
4228   kmp_disp_t *pr_buf;
4229   kmp_int64 lo, st;
4230 
4231   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4232   if (team->t.t_serialized) {
4233     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4234     return; // no dependencies if team is serialized
4235   }
4236 
4237   // calculate sequential iteration number (same as in "wait" but no
4238   // out-of-bounds checks)
4239   pr_buf = th->th.th_dispatch;
4240   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4241   num_dims = (size_t)pr_buf->th_doacross_info[0];
4242   lo = pr_buf->th_doacross_info[2];
4243   st = pr_buf->th_doacross_info[4];
4244 #if OMPT_SUPPORT && OMPT_OPTIONAL
4245   ompt_dependence_t deps[num_dims];
4246 #endif
4247   if (st == 1) { // most common case
4248     iter_number = vec[0] - lo;
4249   } else if (st > 0) {
4250     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4251   } else { // negative increment
4252     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4253   }
4254 #if OMPT_SUPPORT && OMPT_OPTIONAL
4255   deps[0].variable.value = iter_number;
4256   deps[0].dependence_type = ompt_dependence_type_source;
4257 #endif
4258   for (i = 1; i < num_dims; ++i) {
4259     kmp_int64 iter, ln;
4260     size_t j = i * 4;
4261     ln = pr_buf->th_doacross_info[j + 1];
4262     lo = pr_buf->th_doacross_info[j + 2];
4263     st = pr_buf->th_doacross_info[j + 4];
4264     if (st == 1) {
4265       iter = vec[i] - lo;
4266     } else if (st > 0) {
4267       iter = (kmp_uint64)(vec[i] - lo) / st;
4268     } else { // st < 0
4269       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4270     }
4271     iter_number = iter + ln * iter_number;
4272 #if OMPT_SUPPORT && OMPT_OPTIONAL
4273     deps[i].variable.value = iter;
4274     deps[i].dependence_type = ompt_dependence_type_source;
4275 #endif
4276   }
4277 #if OMPT_SUPPORT && OMPT_OPTIONAL
4278   if (ompt_enabled.ompt_callback_dependences) {
4279     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4280         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4281   }
4282 #endif
4283   shft = iter_number % 32; // use 32-bit granularity
4284   iter_number >>= 5; // divided by 32
4285   flag = 1 << shft;
4286   KMP_MB();
4287   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4288     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4289   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4290                 (iter_number << 5) + shft));
4291 }
4292 
4293 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4294   __kmp_assert_valid_gtid(gtid);
4295   kmp_int32 num_done;
4296   kmp_info_t *th = __kmp_threads[gtid];
4297   kmp_team_t *team = th->th.th_team;
4298   kmp_disp_t *pr_buf = th->th.th_dispatch;
4299 
4300   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4301   if (team->t.t_serialized) {
4302     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4303     return; // nothing to do
4304   }
4305   num_done =
4306       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4307   if (num_done == th->th.th_team_nproc) {
4308     // we are the last thread, need to free shared resources
4309     int idx = pr_buf->th_doacross_buf_idx - 1;
4310     dispatch_shared_info_t *sh_buf =
4311         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4312     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4313                      (kmp_int64)&sh_buf->doacross_num_done);
4314     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4315     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4316     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4317     sh_buf->doacross_flags = NULL;
4318     sh_buf->doacross_num_done = 0;
4319     sh_buf->doacross_buf_idx +=
4320         __kmp_dispatch_num_buffers; // free buffer for future re-use
4321   }
4322   // free private resources (need to keep buffer index forever)
4323   pr_buf->th_doacross_flags = NULL;
4324   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4325   pr_buf->th_doacross_info = NULL;
4326   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4327 }
4328 
/* omp_alloc/omp_calloc/omp_realloc/omp_free are only defined for C/C++, not
   for Fortran */
4330 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4331   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4332 }
4333 
4334 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4335   return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
4336 }
4337 
4338 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4339                   omp_allocator_handle_t free_allocator) {
4340   return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4341                         free_allocator);
4342 }
4343 
4344 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4345   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4346 }
4347 
4348 int __kmpc_get_target_offload(void) {
4349   if (!__kmp_init_serial) {
4350     __kmp_serial_initialize();
4351   }
4352   return __kmp_target_offload;
4353 }
4354 
4355 int __kmpc_pause_resource(kmp_pause_status_t level) {
4356   if (!__kmp_init_serial) {
4357     return 1; // Can't pause if runtime is not initialized
4358   }
4359   return __kmp_pause_resource(level);
4360 }
4361 
4362 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4363   if (!__kmp_init_serial)
4364     __kmp_serial_initialize();
4365 
4366   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4367 
4368 #if OMPT_SUPPORT
4369   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4370     ompt_callbacks.ompt_callback(ompt_callback_error)(
4371         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4372         OMPT_GET_RETURN_ADDRESS(0));
4373   }
4374 #endif // OMPT_SUPPORT
4375 
4376   char *src_loc;
4377   if (loc && loc->psource) {
4378     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4379     src_loc =
4380         __kmp_str_format("%s:%s:%s", str_loc.file, str_loc.line, str_loc.col);
4381     __kmp_str_loc_free(&str_loc);
4382   } else {
4383     src_loc = __kmp_str_format("unknown");
4384   }
4385 
4386   if (severity == severity_warning)
4387     KMP_WARNING(UserDirectedWarning, src_loc, message);
4388   else
4389     KMP_FATAL(UserDirectedError, src_loc, message);
4390 
4391   __kmp_str_free(&src_loc);
4392 }
4393