1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is a no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43   } else if (__kmp_ignore_mppbeg() == FALSE) {
44     // By default __kmp_ignore_mppbeg() returns TRUE.
45     __kmp_internal_begin();
46     KC_TRACE(10, ("__kmpc_begin: called\n"));
47   }
48 }
49 
50 /*!
51  * @ingroup STARTUP_SHUTDOWN
52  * @param loc source location information
53  *
 * Shut down the runtime library. This call is also optional; even if it is
 * made it will do nothing unless the `KMP_IGNORE_MPPEND` environment variable
 * is set to zero.
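 *
 * As a purely illustrative sketch (not a required usage pattern), an explicit
 * bracket of the runtime might look like this, where `loc` refers to a
 * compiler-generated ident_t describing the call site:
 * @code
 * __kmpc_begin(&loc, 0); // optional explicit startup (flags currently ignored)
 * // ... code that uses the OpenMP runtime ...
 * __kmpc_end(&loc);      // optional; a no-op unless KMP_IGNORE_MPPEND is 0
 * @endcode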
57  */
58 void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
  // root (which can cause library shutdown).
64   if (__kmp_ignore_mppend() == FALSE) {
65     KC_TRACE(10, ("__kmpc_end: called\n"));
66     KA_TRACE(30, ("__kmpc_end\n"));
67 
68     __kmp_internal_end_thread(-1);
69   }
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71   // Normal exit process on Windows does not allow worker threads of the final
72   // parallel region to finish reporting their events, so shutting down the
73   // library here fixes the issue at least for the cases where __kmpc_end() is
74   // placed properly.
75   if (ompt_enabled.enabled)
76     __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
78 }
79 
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84 
85 This function can be called in any context.
86 
If the runtime has only been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the primary thread is necessarily thread zero).
92 
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
97 */
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99   kmp_int32 gtid = __kmp_entry_gtid();
100 
101   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102 
103   return gtid;
104 }
105 
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110 
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, the runtime maintains
underlying threads even when they are not active (since the cost of creating
and destroying OS threads is high), so this call counts all such threads,
whether or not they are currently waiting for work.
119 */
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121   KC_TRACE(10,
122            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123 
124   return TCR_4(__kmp_all_nth);
125 }
126 
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135   return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137 
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145 
146   return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148 
149 /*!
150  * @ingroup DEPRECATED
151  * @param loc location description
152  *
153  * This function need not be called. It always returns TRUE.
154  */
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157 
158   return TRUE;
159 
160 #else
161 
162   const char *semi2;
163   const char *semi3;
164   int line_no;
165 
166   if (__kmp_par_range == 0) {
167     return TRUE;
168   }
169   semi2 = loc->psource;
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2 + 1, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   if (__kmp_par_range_filename[0]) {
182     const char *name = semi2 - 1;
183     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184       name--;
185     }
186     if ((*name == '/') || (*name == ';')) {
187       name++;
188     }
189     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190       return __kmp_par_range < 0;
191     }
192   }
193   semi3 = strchr(semi2 + 1, ';');
194   if (__kmp_par_range_routine[0]) {
195     if ((semi3 != NULL) && (semi3 > semi2) &&
196         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197       return __kmp_par_range < 0;
198     }
199   }
200   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202       return __kmp_par_range > 0;
203     }
204     return __kmp_par_range < 0;
205   }
206   return TRUE;
207 
208 #endif /* KMP_DEBUG */
209 }
210 
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218   return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220 
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226 
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
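
As an illustrative sketch only (names such as `outlined` are placeholders, not
what any particular compiler actually emits), a construct like
@code
#pragma omp parallel num_threads(4)
{ body(); }
@endcode
might be lowered to
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined);
@endcode
where `outlined` wraps the body of the parallel region.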
229 */
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                              kmp_int32 num_threads) {
232   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                 global_tid, num_threads));
234   __kmp_assert_valid_gtid(global_tid);
235   __kmp_push_num_threads(loc, global_tid, num_threads);
236 }
237 
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240   /* the num_threads are automatically popped */
241 }
242 
243 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
244                            kmp_int32 proc_bind) {
245   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
246                 proc_bind));
247   __kmp_assert_valid_gtid(global_tid);
248   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
249 }
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
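
As an illustrative sketch only (names are placeholders, not what any particular
compiler actually emits), a construct such as
@code
#pragma omp parallel shared(x)
{ use(&x); }
@endcode
might be lowered to an outlined microtask plus a fork call:
@code
void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *x) { use(x); }
...
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);
@endcode
Each thread of the new team invokes `outlined` with its own global and bound
thread ids, followed by the shared-variable pointers passed in the ellipsis.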
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   // If we were in a serial region, then stop the serial timer, record
266   // the event, and start parallel region timer
267   stats_state_e previous_state = KMP_GET_THREAD_STATE();
268   if (previous_state == stats_state_e::SERIAL_REGION) {
269     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
270   } else {
271     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
272   }
273   int inParallel = __kmpc_in_parallel(loc);
274   if (inParallel) {
275     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
276   } else {
277     KMP_COUNT_BLOCK(OMP_PARALLEL);
278   }
279 #endif
280 
  // Maybe saving thr_state is enough here.
282   {
283     va_list ap;
284     va_start(ap, microtask);
285 
286 #if OMPT_SUPPORT
287     ompt_frame_t *ompt_frame;
288     if (ompt_enabled.enabled) {
289       kmp_info_t *master_th = __kmp_threads[gtid];
290       kmp_team_t *parent_team = master_th->th.th_team;
291       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
292       if (lwt)
293         ompt_frame = &(lwt->ompt_task_info.frame);
294       else {
295         int tid = __kmp_tid_from_gtid(gtid);
296         ompt_frame = &(
297             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
298       }
299       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
300     }
301     OMPT_STORE_RETURN_ADDRESS(gtid);
302 #endif
303 
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_FORKING();
306 #endif
307     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
308                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
309                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
310                     kmp_va_addr_of(ap));
311 #if INCLUDE_SSC_MARKS
312     SSC_MARK_JOINING();
313 #endif
314     __kmp_join_call(loc, gtid
315 #if OMPT_SUPPORT
316                     ,
317                     fork_context_intel
318 #endif
319     );
320 
321     va_end(ap);
322   }
323 
324 #if KMP_STATS_ENABLED
325   if (previous_state == stats_state_e::SERIAL_REGION) {
326     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
327     KMP_SET_THREAD_STATE(previous_state);
328   } else {
329     KMP_POP_PARTITIONED_TIMER();
330   }
331 #endif // KMP_STATS_ENABLED
332 }
333 
334 /*!
335 @ingroup PARALLEL
336 @param loc source location information
337 @param global_tid global thread number
338 @param num_teams number of teams requested for the teams construct
339 @param num_threads number of threads per team requested for the teams construct
340 
341 Set the number of teams to be used by the teams construct.
342 This call is only required if the teams construct has a `num_teams` clause
343 or a `thread_limit` clause (or both).
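
As an illustrative sketch only (placeholder names), a compiler might lower
@code
#pragma omp teams num_teams(4) thread_limit(8)
@endcode
into
@code
__kmpc_push_num_teams(&loc, gtid, 4, 8);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams);
@endcode
where `outlined_teams` stands for the outlined teams region.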
344 */
345 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
346                            kmp_int32 num_teams, kmp_int32 num_threads) {
347   KA_TRACE(20,
348            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
349             global_tid, num_teams, num_threads));
350   __kmp_assert_valid_gtid(global_tid);
351   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
352 }
353 
354 /*!
355 @ingroup PARALLEL
356 @param loc source location information
357 @param global_tid global thread number
@param num_teams_lb lower bound on number of teams requested for the teams
construct
@param num_teams_ub upper bound on number of teams requested for the teams
construct
362 @param num_threads number of threads per team requested for the teams construct
363 
364 Set the number of teams to be used by the teams construct. The number of initial
teams created will be greater than or equal to the lower bound and less than or
366 equal to the upper bound.
367 This call is only required if the teams construct has a `num_teams` clause
368 or a `thread_limit` clause (or both).
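
As an illustrative sketch only (placeholder names), an OpenMP 5.1 range such as
@code
#pragma omp teams num_teams(4:16)
@endcode
might be lowered to
@code
__kmpc_push_num_teams_51(&loc, gtid, 4, 16, 0);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams);
@endcode
where, in this sketch, 0 is passed for `num_threads` to indicate that no
`thread_limit` was requested.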
369 */
370 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
371                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
372                               kmp_int32 num_threads) {
373   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
374                 " num_teams_ub=%d num_threads=%d\n",
375                 global_tid, num_teams_lb, num_teams_ub, num_threads));
376   __kmp_assert_valid_gtid(global_tid);
377   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
378                           num_threads);
379 }
380 
381 /*!
382 @ingroup PARALLEL
383 @param loc  source location information
384 @param argc  total number of arguments in the ellipsis
385 @param microtask  pointer to callback routine consisting of outlined teams
386 construct
387 @param ...  pointers to shared variables that aren't global
388 
389 Do the actual fork and call the microtask in the relevant number of threads.
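
A minimal, hedged sketch of the emitted call (placeholder names; any
`num_teams`/`thread_limit` push via __kmpc_push_num_teams() would come first):
@code
__kmpc_fork_teams(&loc, 1, (kmpc_micro)outlined_teams, &shared_var);
@endcode
The runtime then creates the league of teams and runs the outlined teams
region once in each team.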
390 */
391 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
392                        ...) {
393   int gtid = __kmp_entry_gtid();
394   kmp_info_t *this_thr = __kmp_threads[gtid];
395   va_list ap;
396   va_start(ap, microtask);
397 
398 #if KMP_STATS_ENABLED
399   KMP_COUNT_BLOCK(OMP_TEAMS);
400   stats_state_e previous_state = KMP_GET_THREAD_STATE();
401   if (previous_state == stats_state_e::SERIAL_REGION) {
402     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
403   } else {
404     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
405   }
406 #endif
407 
408   // remember teams entry point and nesting level
409   this_thr->th.th_teams_microtask = microtask;
410   this_thr->th.th_teams_level =
411       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
412 
413 #if OMPT_SUPPORT
414   kmp_team_t *parent_team = this_thr->th.th_team;
415   int tid = __kmp_tid_from_gtid(gtid);
416   if (ompt_enabled.enabled) {
417     parent_team->t.t_implicit_task_taskdata[tid]
418         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
419   }
420   OMPT_STORE_RETURN_ADDRESS(gtid);
421 #endif
422 
  // Check if __kmpc_push_num_teams was called; otherwise set the default
  // number of teams.
425   if (this_thr->th.th_teams_size.nteams == 0) {
426     __kmp_push_num_teams(loc, gtid, 0, 0);
427   }
428   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
429   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
430   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
431 
432   __kmp_fork_call(
433       loc, gtid, fork_context_intel, argc,
434       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
435       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
436   __kmp_join_call(loc, gtid
437 #if OMPT_SUPPORT
438                   ,
439                   fork_context_intel
440 #endif
441   );
442 
443   // Pop current CG root off list
444   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
445   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
446   this_thr->th.th_cg_roots = tmp->up;
447   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
448                  " to node %p. cg_nthreads was %d\n",
449                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
450   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
451   int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
453     __kmp_free(tmp);
454   }
455   // Restore current task's thread_limit from CG root
456   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
457   this_thr->th.th_current_task->td_icvs.thread_limit =
458       this_thr->th.th_cg_roots->cg_thread_limit;
459 
460   this_thr->th.th_teams_microtask = NULL;
461   this_thr->th.th_teams_level = 0;
462   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
463   va_end(ap);
464 #if KMP_STATS_ENABLED
465   if (previous_state == stats_state_e::SERIAL_REGION) {
466     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
467     KMP_SET_THREAD_STATE(previous_state);
468   } else {
469     KMP_POP_PARTITIONED_TIMER();
470   }
471 #endif // KMP_STATS_ENABLED
472 }
473 
474 // I don't think this function should ever have been exported.
475 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
476 // openmp code ever called it, but it's been exported from the RTL for so
477 // long that I'm afraid to remove the definition.
478 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
479 
480 /*!
481 @ingroup PARALLEL
482 @param loc  source location information
483 @param global_tid  global thread number
484 
485 Enter a serialized parallel construct. This interface is used to handle a
486 conditional parallel region, like this,
487 @code
488 #pragma omp parallel if (condition)
489 @endcode
490 when the condition is false.
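
As an illustrative sketch only (placeholder names), the false branch might then
be lowered as
@code
__kmpc_serialized_parallel(&loc, gtid);
outlined(&gtid, &bound_tid /*, shared-variable pointers */);
__kmpc_end_serialized_parallel(&loc, gtid);
@endcode
so the region body executes on the encountering thread as a team of one.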
491 */
492 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
493   // The implementation is now in kmp_runtime.cpp so that it can share static
494   // functions with kmp_fork_call since the tasks to be done are similar in
495   // each case.
496   __kmp_assert_valid_gtid(global_tid);
497 #if OMPT_SUPPORT
498   OMPT_STORE_RETURN_ADDRESS(global_tid);
499 #endif
500   __kmp_serialized_parallel(loc, global_tid);
501 }
502 
503 /*!
504 @ingroup PARALLEL
505 @param loc  source location information
506 @param global_tid  global thread number
507 
508 Leave a serialized parallel construct.
509 */
510 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
511   kmp_internal_control_t *top;
512   kmp_info_t *this_thr;
513   kmp_team_t *serial_team;
514 
515   KC_TRACE(10,
516            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
517 
518   /* skip all this code for autopar serialized loops since it results in
519      unacceptable overhead */
520   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
521     return;
522 
523   // Not autopar code
524   __kmp_assert_valid_gtid(global_tid);
525   if (!TCR_4(__kmp_init_parallel))
526     __kmp_parallel_initialize();
527 
528   __kmp_resume_if_soft_paused();
529 
530   this_thr = __kmp_threads[global_tid];
531   serial_team = this_thr->th.th_serial_team;
532 
533   kmp_task_team_t *task_team = this_thr->th.th_task_team;
534   // we need to wait for the proxy tasks before finishing the thread
535   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
536     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
537 
538   KMP_MB();
539   KMP_DEBUG_ASSERT(serial_team);
540   KMP_ASSERT(serial_team->t.t_serialized);
541   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
542   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
543   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
544   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
545 
546 #if OMPT_SUPPORT
547   if (ompt_enabled.enabled &&
548       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
549     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
550     if (ompt_enabled.ompt_callback_implicit_task) {
551       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
552           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
553           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
554     }
555 
    // reset/clear the task id only after unlinking the task
557     ompt_data_t *parent_task_data;
558     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
559 
560     if (ompt_enabled.ompt_callback_parallel_end) {
561       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
562           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
563           ompt_parallel_invoker_program | ompt_parallel_team,
564           OMPT_LOAD_RETURN_ADDRESS(global_tid));
565     }
566     __ompt_lw_taskteam_unlink(this_thr);
567     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
568   }
569 #endif
570 
571   /* If necessary, pop the internal control stack values and replace the team
572    * values */
573   top = serial_team->t.t_control_stack_top;
574   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
575     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
576     serial_team->t.t_control_stack_top = top->next;
577     __kmp_free(top);
578   }
579 
580   // if( serial_team -> t.t_serialized > 1 )
581   serial_team->t.t_level--;
582 
583   /* pop dispatch buffers stack */
584   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
585   {
586     dispatch_private_info_t *disp_buffer =
587         serial_team->t.t_dispatch->th_disp_buffer;
588     serial_team->t.t_dispatch->th_disp_buffer =
589         serial_team->t.t_dispatch->th_disp_buffer->next;
590     __kmp_free(disp_buffer);
591   }
592   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
593 
594   --serial_team->t.t_serialized;
595   if (serial_team->t.t_serialized == 0) {
596 
597     /* return to the parallel section */
598 
599 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
600     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
601       __kmp_clear_x87_fpu_status_word();
602       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
603       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
604     }
605 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
606 
607     this_thr->th.th_team = serial_team->t.t_parent;
608     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
609 
610     /* restore values cached in the thread */
611     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
612     this_thr->th.th_team_master =
613         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
614     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
615 
616     /* TODO the below shouldn't need to be adjusted for serialized teams */
617     this_thr->th.th_dispatch =
618         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
619 
620     __kmp_pop_current_task_from_thread(this_thr);
621 
622     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
623     this_thr->th.th_current_task->td_flags.executing = 1;
624 
625     if (__kmp_tasking_mode != tskm_immediate_exec) {
626       // Copy the task team from the new child / old parent team to the thread.
627       this_thr->th.th_task_team =
628           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
629       KA_TRACE(20,
630                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
631                 "team %p\n",
632                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
633     }
634   } else {
635     if (__kmp_tasking_mode != tskm_immediate_exec) {
636       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
637                     "depth of serial team %p to %d\n",
638                     global_tid, serial_team, serial_team->t.t_serialized));
639     }
640   }
641 
642   if (__kmp_env_consistency_check)
643     __kmp_pop_parallel(global_tid, NULL);
644 #if OMPT_SUPPORT
645   if (ompt_enabled.enabled)
646     this_thr->th.ompt_thread_info.state =
647         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
648                                            : ompt_state_work_parallel);
649 #endif
650 }
651 
652 /*!
653 @ingroup SYNCHRONIZATION
654 @param loc  source location information.
655 
656 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
657 depending on the memory ordering convention obeyed by the compiler
658 even that may not be necessary).
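
A flush directive such as
@code
#pragma omp flush
@endcode
is typically lowered to a single runtime call (sketch only):
@code
__kmpc_flush(&loc);
@endcode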
659 */
660 void __kmpc_flush(ident_t *loc) {
661   KC_TRACE(10, ("__kmpc_flush: called\n"));
662 
  /* need an explicit __mf() here since the library uses volatile instead */
664   KMP_MB(); /* Flush all pending memory write invalidates.  */
665 
666 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
667 #if KMP_MIC
668 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
669 // We shouldn't need it, though, since the ABI rules require that
670 // * If the compiler generates NGO stores it also generates the fence
671 // * If users hand-code NGO stores they should insert the fence
672 // therefore no incomplete unordered stores should be visible.
673 #else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU does not
  // support SSE2.
680   if (!__kmp_cpuinfo.initialized) {
681     __kmp_query_cpuid(&__kmp_cpuinfo);
682   }
683   if (!__kmp_cpuinfo.sse2) {
684     // CPU cannot execute SSE2 instructions.
685   } else {
686 #if KMP_COMPILER_ICC
687     _mm_mfence();
688 #elif KMP_COMPILER_MSVC
689     MemoryBarrier();
690 #else
691     __sync_synchronize();
692 #endif // KMP_COMPILER_ICC
693   }
694 #endif // KMP_MIC
695 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
696        KMP_ARCH_RISCV64)
// Nothing to see here, move along.
698 #elif KMP_ARCH_PPC64
699 // Nothing needed here (we have a real MB above).
700 #else
701 #error Unknown or unsupported architecture
702 #endif
703 
704 #if OMPT_SUPPORT && OMPT_OPTIONAL
705   if (ompt_enabled.ompt_callback_flush) {
706     ompt_callbacks.ompt_callback(ompt_callback_flush)(
707         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
708   }
709 #endif
710 }
711 
712 /* -------------------------------------------------------------------------- */
713 /*!
714 @ingroup SYNCHRONIZATION
715 @param loc source location information
716 @param global_tid thread id.
717 
718 Execute a barrier.
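
Sketch of the typical lowering of an explicit barrier (illustrative only):
@code
#pragma omp barrier
@endcode
becomes
@code
__kmpc_barrier(&loc, gtid);
@endcode
The same entry point is also used for the implicit barrier at the end of a
worksharing construct that has no `nowait` clause.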
719 */
720 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
721   KMP_COUNT_BLOCK(OMP_BARRIER);
722   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
723   __kmp_assert_valid_gtid(global_tid);
724 
725   if (!TCR_4(__kmp_init_parallel))
726     __kmp_parallel_initialize();
727 
728   __kmp_resume_if_soft_paused();
729 
730   if (__kmp_env_consistency_check) {
731     if (loc == 0) {
732       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
733     }
734     __kmp_check_barrier(global_tid, ct_barrier, loc);
735   }
736 
737 #if OMPT_SUPPORT
738   ompt_frame_t *ompt_frame;
739   if (ompt_enabled.enabled) {
740     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
741     if (ompt_frame->enter_frame.ptr == NULL)
742       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
743   }
744   OMPT_STORE_RETURN_ADDRESS(global_tid);
745 #endif
746   __kmp_threads[global_tid]->th.th_ident = loc;
747   // TODO: explicit barrier_wait_id:
748   //   this function is called when 'barrier' directive is present or
749   //   implicit barrier at the end of a worksharing construct.
750   // 1) better to add a per-thread barrier counter to a thread data structure
751   // 2) set to 0 when a new team is created
  // 3) no sync is required
753 
754   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
755 #if OMPT_SUPPORT && OMPT_OPTIONAL
756   if (ompt_enabled.enabled) {
757     ompt_frame->enter_frame = ompt_data_none;
758   }
759 #endif
760 }
761 
762 /* The BARRIER for a MASTER section is always explicit   */
763 /*!
764 @ingroup WORK_SHARING
765 @param loc  source location information.
@param global_tid  global thread number.
767 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
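
Illustrative sketch of how a compiler might guard the block (placeholder names):
@code
if (__kmpc_master(&loc, gtid)) {
  // ... master region body ...
  __kmpc_end_master(&loc, gtid);
}
@endcode
Only the thread for which this call returns 1 executes the body and the
matching __kmpc_end_master() call.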
768 */
769 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
770   int status = 0;
771 
772   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
773   __kmp_assert_valid_gtid(global_tid);
774 
775   if (!TCR_4(__kmp_init_parallel))
776     __kmp_parallel_initialize();
777 
778   __kmp_resume_if_soft_paused();
779 
780   if (KMP_MASTER_GTID(global_tid)) {
781     KMP_COUNT_BLOCK(OMP_MASTER);
782     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
783     status = 1;
784   }
785 
786 #if OMPT_SUPPORT && OMPT_OPTIONAL
787   if (status) {
788     if (ompt_enabled.ompt_callback_masked) {
789       kmp_info_t *this_thr = __kmp_threads[global_tid];
790       kmp_team_t *team = this_thr->th.th_team;
791 
792       int tid = __kmp_tid_from_gtid(global_tid);
793       ompt_callbacks.ompt_callback(ompt_callback_masked)(
794           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
795           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
796           OMPT_GET_RETURN_ADDRESS(0));
797     }
798   }
799 #endif
800 
801   if (__kmp_env_consistency_check) {
802 #if KMP_USE_DYNAMIC_LOCK
803     if (status)
804       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
805     else
806       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
807 #else
808     if (status)
809       __kmp_push_sync(global_tid, ct_master, loc, NULL);
810     else
811       __kmp_check_sync(global_tid, ct_master, loc, NULL);
812 #endif
813   }
814 
815   return status;
816 }
817 
818 /*!
819 @ingroup WORK_SHARING
820 @param loc  source location information.
@param global_tid  global thread number.
822 
823 Mark the end of a <tt>master</tt> region. This should only be called by the
824 thread that executes the <tt>master</tt> region.
825 */
826 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
827   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
828   __kmp_assert_valid_gtid(global_tid);
829   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
830   KMP_POP_PARTITIONED_TIMER();
831 
832 #if OMPT_SUPPORT && OMPT_OPTIONAL
833   kmp_info_t *this_thr = __kmp_threads[global_tid];
834   kmp_team_t *team = this_thr->th.th_team;
835   if (ompt_enabled.ompt_callback_masked) {
836     int tid = __kmp_tid_from_gtid(global_tid);
837     ompt_callbacks.ompt_callback(ompt_callback_masked)(
838         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
839         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
840         OMPT_GET_RETURN_ADDRESS(0));
841   }
842 #endif
843 
844   if (__kmp_env_consistency_check) {
845     if (KMP_MASTER_GTID(global_tid))
846       __kmp_pop_sync(global_tid, ct_master, loc);
847   }
848 }
849 
850 /*!
851 @ingroup WORK_SHARING
852 @param loc  source location information.
853 @param global_tid  global thread number.
854 @param filter result of evaluating filter clause on thread global_tid, or zero
855 if no filter clause present
856 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
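
Illustrative sketch of a lowered <tt>masked filter(expr)</tt> block (placeholder
names):
@code
kmp_int32 filter = expr; // 0 when no filter clause is present
if (__kmpc_masked(&loc, gtid, filter)) {
  // ... masked region body ...
  __kmpc_end_masked(&loc, gtid);
}
@endcode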
857 */
858 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
859   int status = 0;
860   int tid;
861   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
862   __kmp_assert_valid_gtid(global_tid);
863 
864   if (!TCR_4(__kmp_init_parallel))
865     __kmp_parallel_initialize();
866 
867   __kmp_resume_if_soft_paused();
868 
869   tid = __kmp_tid_from_gtid(global_tid);
870   if (tid == filter) {
871     KMP_COUNT_BLOCK(OMP_MASKED);
872     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
873     status = 1;
874   }
875 
876 #if OMPT_SUPPORT && OMPT_OPTIONAL
877   if (status) {
878     if (ompt_enabled.ompt_callback_masked) {
879       kmp_info_t *this_thr = __kmp_threads[global_tid];
880       kmp_team_t *team = this_thr->th.th_team;
881       ompt_callbacks.ompt_callback(ompt_callback_masked)(
882           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
883           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
884           OMPT_GET_RETURN_ADDRESS(0));
885     }
886   }
887 #endif
888 
889   if (__kmp_env_consistency_check) {
890 #if KMP_USE_DYNAMIC_LOCK
891     if (status)
892       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
893     else
894       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
895 #else
896     if (status)
897       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
898     else
899       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
900 #endif
901   }
902 
903   return status;
904 }
905 
906 /*!
907 @ingroup WORK_SHARING
908 @param loc  source location information.
@param global_tid  global thread number.
910 
911 Mark the end of a <tt>masked</tt> region. This should only be called by the
912 thread that executes the <tt>masked</tt> region.
913 */
914 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
915   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
916   __kmp_assert_valid_gtid(global_tid);
917   KMP_POP_PARTITIONED_TIMER();
918 
919 #if OMPT_SUPPORT && OMPT_OPTIONAL
920   kmp_info_t *this_thr = __kmp_threads[global_tid];
921   kmp_team_t *team = this_thr->th.th_team;
922   if (ompt_enabled.ompt_callback_masked) {
923     int tid = __kmp_tid_from_gtid(global_tid);
924     ompt_callbacks.ompt_callback(ompt_callback_masked)(
925         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
926         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
927         OMPT_GET_RETURN_ADDRESS(0));
928   }
929 #endif
930 
931   if (__kmp_env_consistency_check) {
932     __kmp_pop_sync(global_tid, ct_masked, loc);
933   }
934 }
935 
936 /*!
937 @ingroup WORK_SHARING
938 @param loc  source location information.
939 @param gtid  global thread number.
940 
941 Start execution of an <tt>ordered</tt> construct.
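
Illustrative sketch of the lowering of an <tt>ordered</tt> block inside a loop
chunk (placeholder names):
@code
__kmpc_ordered(&loc, gtid);
// ... ordered region body ...
__kmpc_end_ordered(&loc, gtid);
@endcode
The call returns only when it is this thread's turn in the ordered sequence,
as enforced by the dispatch ordering functions of the enclosing loop.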
942 */
943 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
944   int cid = 0;
945   kmp_info_t *th;
946   KMP_DEBUG_ASSERT(__kmp_init_serial);
947 
948   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
949   __kmp_assert_valid_gtid(gtid);
950 
951   if (!TCR_4(__kmp_init_parallel))
952     __kmp_parallel_initialize();
953 
954   __kmp_resume_if_soft_paused();
955 
956 #if USE_ITT_BUILD
957   __kmp_itt_ordered_prep(gtid);
958 // TODO: ordered_wait_id
959 #endif /* USE_ITT_BUILD */
960 
961   th = __kmp_threads[gtid];
962 
963 #if OMPT_SUPPORT && OMPT_OPTIONAL
964   kmp_team_t *team;
965   ompt_wait_id_t lck;
966   void *codeptr_ra;
967   OMPT_STORE_RETURN_ADDRESS(gtid);
968   if (ompt_enabled.enabled) {
969     team = __kmp_team_from_gtid(gtid);
970     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
971     /* OMPT state update */
972     th->th.ompt_thread_info.wait_id = lck;
973     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
974 
975     /* OMPT event callback */
976     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
977     if (ompt_enabled.ompt_callback_mutex_acquire) {
978       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
979           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
980           codeptr_ra);
981     }
982   }
983 #endif
984 
985   if (th->th.th_dispatch->th_deo_fcn != 0)
986     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
987   else
988     __kmp_parallel_deo(&gtid, &cid, loc);
989 
990 #if OMPT_SUPPORT && OMPT_OPTIONAL
991   if (ompt_enabled.enabled) {
992     /* OMPT state update */
993     th->th.ompt_thread_info.state = ompt_state_work_parallel;
994     th->th.ompt_thread_info.wait_id = 0;
995 
996     /* OMPT event callback */
997     if (ompt_enabled.ompt_callback_mutex_acquired) {
998       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
999           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1000     }
1001   }
1002 #endif
1003 
1004 #if USE_ITT_BUILD
1005   __kmp_itt_ordered_start(gtid);
1006 #endif /* USE_ITT_BUILD */
1007 }
1008 
1009 /*!
1010 @ingroup WORK_SHARING
1011 @param loc  source location information.
1012 @param gtid  global thread number.
1013 
1014 End execution of an <tt>ordered</tt> construct.
1015 */
1016 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1017   int cid = 0;
1018   kmp_info_t *th;
1019 
1020   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1021   __kmp_assert_valid_gtid(gtid);
1022 
1023 #if USE_ITT_BUILD
1024   __kmp_itt_ordered_end(gtid);
1025 // TODO: ordered_wait_id
1026 #endif /* USE_ITT_BUILD */
1027 
1028   th = __kmp_threads[gtid];
1029 
1030   if (th->th.th_dispatch->th_dxo_fcn != 0)
1031     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1032   else
1033     __kmp_parallel_dxo(&gtid, &cid, loc);
1034 
1035 #if OMPT_SUPPORT && OMPT_OPTIONAL
1036   OMPT_STORE_RETURN_ADDRESS(gtid);
1037   if (ompt_enabled.ompt_callback_mutex_released) {
1038     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1039         ompt_mutex_ordered,
1040         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1041             ->t.t_ordered.dt.t_value,
1042         OMPT_LOAD_RETURN_ADDRESS(gtid));
1043   }
1044 #endif
1045 }
1046 
1047 #if KMP_USE_DYNAMIC_LOCK
1048 
1049 static __forceinline void
1050 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1051                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1052   // Pointer to the allocated indirect lock is written to crit, while indexing
1053   // is ignored.
1054   void *idx;
1055   kmp_indirect_lock_t **lck;
1056   lck = (kmp_indirect_lock_t **)crit;
1057   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1058   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1059   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1060   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1061   KA_TRACE(20,
1062            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1063 #if USE_ITT_BUILD
1064   __kmp_itt_critical_creating(ilk->lock, loc);
1065 #endif
1066   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1067   if (status == 0) {
1068 #if USE_ITT_BUILD
1069     __kmp_itt_critical_destroyed(ilk->lock);
1070 #endif
1071     // We don't really need to destroy the unclaimed lock here since it will be
1072     // cleaned up at program exit.
1073     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1074   }
1075   KMP_DEBUG_ASSERT(*lck != NULL);
1076 }
1077 
1078 // Fast-path acquire tas lock
1079 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1080   {                                                                            \
1081     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1082     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1083     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1084     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1085         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1086       kmp_uint32 spins;                                                        \
1087       KMP_FSYNC_PREPARE(l);                                                    \
1088       KMP_INIT_YIELD(spins);                                                   \
1089       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1090       do {                                                                     \
1091         if (TCR_4(__kmp_nth) >                                                 \
1092             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1093           KMP_YIELD(TRUE);                                                     \
1094         } else {                                                               \
1095           KMP_YIELD_SPIN(spins);                                               \
1096         }                                                                      \
1097         __kmp_spin_backoff(&backoff);                                          \
1098       } while (                                                                \
1099           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1100           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1101     }                                                                          \
1102     KMP_FSYNC_ACQUIRED(l);                                                     \
1103   }
1104 
1105 // Fast-path test tas lock
1106 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1107   {                                                                            \
1108     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1109     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1110     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1111     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1112          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1113   }
1114 
1115 // Fast-path release tas lock
1116 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1117   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1118 
1119 #if KMP_USE_FUTEX
1120 
1121 #include <sys/syscall.h>
1122 #include <unistd.h>
1123 #ifndef FUTEX_WAIT
1124 #define FUTEX_WAIT 0
1125 #endif
1126 #ifndef FUTEX_WAKE
1127 #define FUTEX_WAKE 1
1128 #endif
1129 
1130 // Fast-path acquire futex lock
1131 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1132   {                                                                            \
1133     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1134     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1135     KMP_MB();                                                                  \
1136     KMP_FSYNC_PREPARE(ftx);                                                    \
1137     kmp_int32 poll_val;                                                        \
1138     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1139                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1140                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1141       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1142       if (!cond) {                                                             \
1143         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1144                                          poll_val |                            \
1145                                              KMP_LOCK_BUSY(1, futex))) {       \
1146           continue;                                                            \
1147         }                                                                      \
1148         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1149       }                                                                        \
1150       kmp_int32 rc;                                                            \
1151       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1152                         NULL, NULL, 0)) != 0) {                                \
1153         continue;                                                              \
1154       }                                                                        \
1155       gtid_code |= 1;                                                          \
1156     }                                                                          \
1157     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1158   }
1159 
1160 // Fast-path test futex lock
1161 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1162   {                                                                            \
1163     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1164     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1165                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1166       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1167       rc = TRUE;                                                               \
1168     } else {                                                                   \
1169       rc = FALSE;                                                              \
1170     }                                                                          \
1171   }
1172 
1173 // Fast-path release futex lock
1174 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1175   {                                                                            \
1176     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1177     KMP_MB();                                                                  \
1178     KMP_FSYNC_RELEASING(ftx);                                                  \
1179     kmp_int32 poll_val =                                                       \
1180         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1181     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1182       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1183               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1184     }                                                                          \
1185     KMP_MB();                                                                  \
1186     KMP_YIELD_OVERSUB();                                                       \
1187   }
1188 
1189 #endif // KMP_USE_FUTEX
1190 
1191 #else // KMP_USE_DYNAMIC_LOCK
1192 
1193 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1194                                                       ident_t const *loc,
1195                                                       kmp_int32 gtid) {
1196   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1197 
1198   // Because of the double-check, the following load doesn't need to be volatile
1199   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1200 
1201   if (lck == NULL) {
1202     void *idx;
1203 
1204     // Allocate & initialize the lock.
1205     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1206     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1207     __kmp_init_user_lock_with_checks(lck);
1208     __kmp_set_user_lock_location(lck, loc);
1209 #if USE_ITT_BUILD
1210     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed with no usage, but that is not
// a problem, because this is not a real event seen by the user but rather
// setting a name for the object (lock). See more details in kmp_itt.h.
1216 #endif /* USE_ITT_BUILD */
1217 
1218     // Use a cmpxchg instruction to slam the start of the critical section with
1219     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1220     // and use the lock that the other thread allocated.
1221     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1222 
1223     if (status == 0) {
1224 // Deallocate the lock and reload the value.
1225 #if USE_ITT_BUILD
1226       __kmp_itt_critical_destroyed(lck);
1227 // Let ITT know the lock is destroyed and the same memory location may be reused
1228 // for another purpose.
1229 #endif /* USE_ITT_BUILD */
1230       __kmp_destroy_user_lock_with_checks(lck);
1231       __kmp_user_lock_free(&idx, gtid, lck);
1232       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1233       KMP_DEBUG_ASSERT(lck != NULL);
1234     }
1235   }
1236   return lck;
1237 }
1238 
1239 #endif // KMP_USE_DYNAMIC_LOCK
1240 
1241 /*!
1242 @ingroup WORK_SHARING
1243 @param loc  source location information.
1244 @param global_tid  global thread number.
1245 @param crit identity of the critical section. This could be a pointer to a lock
1246 associated with the critical section, or some other suitably unique value.
1247 
1248 Enter code protected by a `critical` construct.
1249 This function blocks until the executing thread can enter the critical section.
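
Illustrative sketch only; the lock identity is typically a zero-initialized
kmp_critical_name object with static storage duration that the compiler
associates with the (possibly named) critical section:
@code
static kmp_critical_name crit_lock; // placeholder name
...
__kmpc_critical(&loc, gtid, &crit_lock);
// ... critical section body ...
__kmpc_end_critical(&loc, gtid, &crit_lock);
@endcode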
1250 */
1251 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1252                      kmp_critical_name *crit) {
1253 #if KMP_USE_DYNAMIC_LOCK
1254 #if OMPT_SUPPORT && OMPT_OPTIONAL
1255   OMPT_STORE_RETURN_ADDRESS(global_tid);
1256 #endif // OMPT_SUPPORT
1257   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1258 #else
1259   KMP_COUNT_BLOCK(OMP_CRITICAL);
1260 #if OMPT_SUPPORT && OMPT_OPTIONAL
1261   ompt_state_t prev_state = ompt_state_undefined;
1262   ompt_thread_info_t ti;
1263 #endif
1264   kmp_user_lock_p lck;
1265 
1266   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1267   __kmp_assert_valid_gtid(global_tid);
1268 
1269   // TODO: add THR_OVHD_STATE
1270 
1271   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1272   KMP_CHECK_USER_LOCK_INIT();
1273 
1274   if ((__kmp_user_lock_kind == lk_tas) &&
1275       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1276     lck = (kmp_user_lock_p)crit;
1277   }
1278 #if KMP_USE_FUTEX
1279   else if ((__kmp_user_lock_kind == lk_futex) &&
1280            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1281     lck = (kmp_user_lock_p)crit;
1282   }
1283 #endif
1284   else { // ticket, queuing or drdpa
1285     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1286   }
1287 
1288   if (__kmp_env_consistency_check)
1289     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1290 
    // since the critical directive binds to all threads, not just the current
    // team, we have to check this even if we are in a serialized team.
    // also, even if we are the uber thread, we still have to acquire the lock,
    // as we have to contend with sibling threads.
1295 
1296 #if USE_ITT_BUILD
1297   __kmp_itt_critical_acquiring(lck);
1298 #endif /* USE_ITT_BUILD */
1299 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1301   void *codeptr_ra = NULL;
1302   if (ompt_enabled.enabled) {
1303     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1304     /* OMPT state update */
1305     prev_state = ti.state;
1306     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1307     ti.state = ompt_state_wait_critical;
1308 
1309     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1311     if (ompt_enabled.ompt_callback_mutex_acquire) {
1312       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1313           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1314           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1315     }
1316   }
1317 #endif
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1320   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1321 
1322 #if USE_ITT_BUILD
1323   __kmp_itt_critical_acquired(lck);
1324 #endif /* USE_ITT_BUILD */
1325 #if OMPT_SUPPORT && OMPT_OPTIONAL
1326   if (ompt_enabled.enabled) {
1327     /* OMPT state update */
1328     ti.state = prev_state;
1329     ti.wait_id = 0;
1330 
1331     /* OMPT event callback */
1332     if (ompt_enabled.ompt_callback_mutex_acquired) {
1333       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1334           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1335     }
1336   }
1337 #endif
1338   KMP_POP_PARTITIONED_TIMER();
1339 
1340   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1341   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1342 #endif // KMP_USE_DYNAMIC_LOCK
1343 }
1344 
1345 #if KMP_USE_DYNAMIC_LOCK
1346 
1347 // Converts the given hint to an internal lock implementation
1348 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1349 #if KMP_USE_TSX
1350 #define KMP_TSX_LOCK(seq) lockseq_##seq
1351 #else
1352 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1353 #endif
1354 
1355 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1356 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1357 #else
1358 #define KMP_CPUINFO_RTM 0
1359 #endif
1360 
1361   // Hints that do not require further logic
1362   if (hint & kmp_lock_hint_hle)
1363     return KMP_TSX_LOCK(hle);
1364   if (hint & kmp_lock_hint_rtm)
1365     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1366   if (hint & kmp_lock_hint_adaptive)
1367     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1368 
1369   // Rule out conflicting hints first by returning the default lock
1370   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1371     return __kmp_user_lock_seq;
1372   if ((hint & omp_lock_hint_speculative) &&
1373       (hint & omp_lock_hint_nonspeculative))
1374     return __kmp_user_lock_seq;
1375 
1376   // Do not even consider speculation when it appears to be contended
1377   if (hint & omp_lock_hint_contended)
1378     return lockseq_queuing;
1379 
1380   // Uncontended lock without speculation
1381   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1382     return lockseq_tas;
1383 
1384   // Use RTM lock for speculation
1385   if (hint & omp_lock_hint_speculative)
1386     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1387 
1388   return __kmp_user_lock_seq;
1389 }
1390 
1391 #if OMPT_SUPPORT && OMPT_OPTIONAL
1392 #if KMP_USE_DYNAMIC_LOCK
1393 static kmp_mutex_impl_t
1394 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1395   if (user_lock) {
1396     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1397     case 0:
1398       break;
1399 #if KMP_USE_FUTEX
1400     case locktag_futex:
1401       return kmp_mutex_impl_queuing;
1402 #endif
1403     case locktag_tas:
1404       return kmp_mutex_impl_spin;
1405 #if KMP_USE_TSX
1406     case locktag_hle:
1407     case locktag_rtm_spin:
1408       return kmp_mutex_impl_speculative;
1409 #endif
1410     default:
1411       return kmp_mutex_impl_none;
1412     }
1413     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1414   }
1415   KMP_ASSERT(ilock);
1416   switch (ilock->type) {
1417 #if KMP_USE_TSX
1418   case locktag_adaptive:
1419   case locktag_rtm_queuing:
1420     return kmp_mutex_impl_speculative;
1421 #endif
1422   case locktag_nested_tas:
1423     return kmp_mutex_impl_spin;
1424 #if KMP_USE_FUTEX
1425   case locktag_nested_futex:
1426 #endif
1427   case locktag_ticket:
1428   case locktag_queuing:
1429   case locktag_drdpa:
1430   case locktag_nested_ticket:
1431   case locktag_nested_queuing:
1432   case locktag_nested_drdpa:
1433     return kmp_mutex_impl_queuing;
1434   default:
1435     return kmp_mutex_impl_none;
1436   }
1437 }
1438 #else
1439 // For locks without dynamic binding
1440 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1441   switch (__kmp_user_lock_kind) {
1442   case lk_tas:
1443     return kmp_mutex_impl_spin;
1444 #if KMP_USE_FUTEX
1445   case lk_futex:
1446 #endif
1447   case lk_ticket:
1448   case lk_queuing:
1449   case lk_drdpa:
1450     return kmp_mutex_impl_queuing;
1451 #if KMP_USE_TSX
1452   case lk_hle:
1453   case lk_rtm_queuing:
1454   case lk_rtm_spin:
1455   case lk_adaptive:
1456     return kmp_mutex_impl_speculative;
1457 #endif
1458   default:
1459     return kmp_mutex_impl_none;
1460   }
1461 }
1462 #endif // KMP_USE_DYNAMIC_LOCK
1463 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1464 
1465 /*!
1466 @ingroup WORK_SHARING
1467 @param loc  source location information.
1468 @param global_tid  global thread number.
1469 @param crit identity of the critical section. This could be a pointer to a lock
1470 associated with the critical section, or some other suitably unique value.
1471 @param hint the lock hint.
1472 
1473 Enter code protected by a `critical` construct with a hint. The hint value is
1474 used to suggest a lock implementation. This function blocks until the executing
1475 thread can enter the critical section unless the hint suggests use of
1476 speculative execution and the hardware supports it.
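
A minimal sketch of the call sequence a compiler might generate for a
<tt>critical</tt> construct with a hint (the lock variable name is illustrative
and not part of this interface):
@code
// Lowering of: #pragma omp critical(name) hint(omp_lock_hint_speculative)
static kmp_critical_name crit_name; // compiler-emitted lock word for "name"

__kmpc_critical_with_hint(loc, gtid, &crit_name, omp_lock_hint_speculative);
// ... code protected by the critical construct ...
__kmpc_end_critical(loc, gtid, &crit_name);
@endcode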
1477 */
1478 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1479                                kmp_critical_name *crit, uint32_t hint) {
1480   KMP_COUNT_BLOCK(OMP_CRITICAL);
1481   kmp_user_lock_p lck;
1482 #if OMPT_SUPPORT && OMPT_OPTIONAL
1483   ompt_state_t prev_state = ompt_state_undefined;
1484   ompt_thread_info_t ti;
1485   // This is the case, if called from __kmpc_critical:
1486   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1487   if (!codeptr)
1488     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1489 #endif
1490 
  KC_TRACE(10, ("__kmpc_critical_with_hint: called T#%d\n", global_tid));
1492   __kmp_assert_valid_gtid(global_tid);
1493 
1494   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1495   // Check if it is initialized.
1496   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1497   if (*lk == 0) {
1498     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1499     if (KMP_IS_D_LOCK(lckseq)) {
1500       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1501                                   KMP_GET_D_TAG(lckseq));
1502     } else {
1503       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1504     }
1505   }
1506   // Branch for accessing the actual lock object and set operation. This
1507   // branching is inevitable since this lock initialization does not follow the
1508   // normal dispatch path (lock table is not used).
1509   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1510     lck = (kmp_user_lock_p)lk;
1511     if (__kmp_env_consistency_check) {
1512       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1513                       __kmp_map_hint_to_lock(hint));
1514     }
1515 #if USE_ITT_BUILD
1516     __kmp_itt_critical_acquiring(lck);
1517 #endif
1518 #if OMPT_SUPPORT && OMPT_OPTIONAL
1519     if (ompt_enabled.enabled) {
1520       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1521       /* OMPT state update */
1522       prev_state = ti.state;
1523       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1524       ti.state = ompt_state_wait_critical;
1525 
1526       /* OMPT event callback */
1527       if (ompt_enabled.ompt_callback_mutex_acquire) {
1528         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1529             ompt_mutex_critical, (unsigned int)hint,
1530             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1531             codeptr);
1532       }
1533     }
1534 #endif
1535 #if KMP_USE_INLINED_TAS
1536     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1537       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1538     } else
1539 #elif KMP_USE_INLINED_FUTEX
1540     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1541       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1542     } else
1543 #endif
1544     {
1545       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1546     }
1547   } else {
1548     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1549     lck = ilk->lock;
1550     if (__kmp_env_consistency_check) {
1551       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1552                       __kmp_map_hint_to_lock(hint));
1553     }
1554 #if USE_ITT_BUILD
1555     __kmp_itt_critical_acquiring(lck);
1556 #endif
1557 #if OMPT_SUPPORT && OMPT_OPTIONAL
1558     if (ompt_enabled.enabled) {
1559       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1560       /* OMPT state update */
1561       prev_state = ti.state;
1562       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1563       ti.state = ompt_state_wait_critical;
1564 
1565       /* OMPT event callback */
1566       if (ompt_enabled.ompt_callback_mutex_acquire) {
1567         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1568             ompt_mutex_critical, (unsigned int)hint,
1569             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1570             codeptr);
1571       }
1572     }
1573 #endif
1574     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1575   }
1576   KMP_POP_PARTITIONED_TIMER();
1577 
1578 #if USE_ITT_BUILD
1579   __kmp_itt_critical_acquired(lck);
1580 #endif /* USE_ITT_BUILD */
1581 #if OMPT_SUPPORT && OMPT_OPTIONAL
1582   if (ompt_enabled.enabled) {
1583     /* OMPT state update */
1584     ti.state = prev_state;
1585     ti.wait_id = 0;
1586 
1587     /* OMPT event callback */
1588     if (ompt_enabled.ompt_callback_mutex_acquired) {
1589       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1590           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1591     }
1592   }
1593 #endif
1594 
1595   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical_with_hint: done T#%d\n", global_tid));
1597 } // __kmpc_critical_with_hint
1598 
1599 #endif // KMP_USE_DYNAMIC_LOCK
1600 
1601 /*!
1602 @ingroup WORK_SHARING
1603 @param loc  source location information.
@param global_tid  global thread number.
1605 @param crit identity of the critical section. This could be a pointer to a lock
1606 associated with the critical section, or some other suitably unique value.
1607 
1608 Leave a critical section, releasing any lock that was held during its execution.
1609 */
1610 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1611                          kmp_critical_name *crit) {
1612   kmp_user_lock_p lck;
1613 
1614   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1615 
1616 #if KMP_USE_DYNAMIC_LOCK
1617   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1618     lck = (kmp_user_lock_p)crit;
1619     KMP_ASSERT(lck != NULL);
1620     if (__kmp_env_consistency_check) {
1621       __kmp_pop_sync(global_tid, ct_critical, loc);
1622     }
1623 #if USE_ITT_BUILD
1624     __kmp_itt_critical_releasing(lck);
1625 #endif
1626 #if KMP_USE_INLINED_TAS
1627     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1628       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1629     } else
1630 #elif KMP_USE_INLINED_FUTEX
1631     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1632       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1633     } else
1634 #endif
1635     {
1636       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1637     }
1638   } else {
1639     kmp_indirect_lock_t *ilk =
1640         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1641     KMP_ASSERT(ilk != NULL);
1642     lck = ilk->lock;
1643     if (__kmp_env_consistency_check) {
1644       __kmp_pop_sync(global_tid, ct_critical, loc);
1645     }
1646 #if USE_ITT_BUILD
1647     __kmp_itt_critical_releasing(lck);
1648 #endif
1649     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1650   }
1651 
1652 #else // KMP_USE_DYNAMIC_LOCK
1653 
1654   if ((__kmp_user_lock_kind == lk_tas) &&
1655       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1656     lck = (kmp_user_lock_p)crit;
1657   }
1658 #if KMP_USE_FUTEX
1659   else if ((__kmp_user_lock_kind == lk_futex) &&
1660            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1661     lck = (kmp_user_lock_p)crit;
1662   }
1663 #endif
1664   else { // ticket, queuing or drdpa
1665     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1666   }
1667 
1668   KMP_ASSERT(lck != NULL);
1669 
1670   if (__kmp_env_consistency_check)
1671     __kmp_pop_sync(global_tid, ct_critical, loc);
1672 
1673 #if USE_ITT_BUILD
1674   __kmp_itt_critical_releasing(lck);
1675 #endif /* USE_ITT_BUILD */
1676   // Value of 'crit' should be good for using as a critical_id of the critical
1677   // section directive.
1678   __kmp_release_user_lock_with_checks(lck, global_tid);
1679 
1680 #endif // KMP_USE_DYNAMIC_LOCK
1681 
1682 #if OMPT_SUPPORT && OMPT_OPTIONAL
1683   /* OMPT release event triggers after lock is released; place here to trigger
1684    * for all #if branches */
1685   OMPT_STORE_RETURN_ADDRESS(global_tid);
1686   if (ompt_enabled.ompt_callback_mutex_released) {
1687     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1688         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1689         OMPT_LOAD_RETURN_ADDRESS(0));
1690   }
1691 #endif
1692 
1693   KMP_POP_PARTITIONED_TIMER();
1694   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1695 }
1696 
1697 /*!
1698 @ingroup SYNCHRONIZATION
1699 @param loc source location information
1700 @param global_tid thread id.
1701 @return one if the thread should execute the master block, zero otherwise
1702 
1703 Start execution of a combined barrier and master. The barrier is executed inside
1704 this function.
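
A minimal sketch of the expected lowering (illustrative; loc and gtid stand for
values available in the surrounding generated code):
@code
if (__kmpc_barrier_master(loc, gtid)) {
  // executed only by the thread selected to run the master block
  __kmpc_end_barrier_master(loc, gtid);
}
@endcode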
1705 */
1706 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1707   int status;
1708   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1709   __kmp_assert_valid_gtid(global_tid);
1710 
1711   if (!TCR_4(__kmp_init_parallel))
1712     __kmp_parallel_initialize();
1713 
1714   __kmp_resume_if_soft_paused();
1715 
1716   if (__kmp_env_consistency_check)
1717     __kmp_check_barrier(global_tid, ct_barrier, loc);
1718 
1719 #if OMPT_SUPPORT
1720   ompt_frame_t *ompt_frame;
1721   if (ompt_enabled.enabled) {
1722     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1723     if (ompt_frame->enter_frame.ptr == NULL)
1724       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1725   }
1726   OMPT_STORE_RETURN_ADDRESS(global_tid);
1727 #endif
1728 #if USE_ITT_NOTIFY
1729   __kmp_threads[global_tid]->th.th_ident = loc;
1730 #endif
1731   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1732 #if OMPT_SUPPORT && OMPT_OPTIONAL
1733   if (ompt_enabled.enabled) {
1734     ompt_frame->enter_frame = ompt_data_none;
1735   }
1736 #endif
1737 
1738   return (status != 0) ? 0 : 1;
1739 }
1740 
1741 /*!
1742 @ingroup SYNCHRONIZATION
1743 @param loc source location information
1744 @param global_tid thread id.
1745 
1746 Complete the execution of a combined barrier and master. This function should
1747 only be called at the completion of the <tt>master</tt> code. Other threads will
1748 still be waiting at the barrier and this call releases them.
1749 */
1750 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1751   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1752   __kmp_assert_valid_gtid(global_tid);
1753   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1754 }
1755 
1756 /*!
1757 @ingroup SYNCHRONIZATION
1758 @param loc source location information
1759 @param global_tid thread id.
1760 @return one if the thread should execute the master block, zero otherwise
1761 
1762 Start execution of a combined barrier and master(nowait) construct.
1763 The barrier is executed inside this function.
There is no equivalent "end" function; the compiler does not emit a call to
__kmpc_end_master for this construct, so the actions that __kmpc_end_master
would otherwise perform are carried out inside this function.
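
A minimal sketch of the expected lowering (illustrative; note that no "end"
call follows the master block):
@code
if (__kmpc_barrier_master_nowait(loc, gtid)) {
  // executed only by the thread selected to run the master block;
  // the other threads have already continued past the barrier
}
@endcode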
1765 */
1766 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1767   kmp_int32 ret;
1768   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1769   __kmp_assert_valid_gtid(global_tid);
1770 
1771   if (!TCR_4(__kmp_init_parallel))
1772     __kmp_parallel_initialize();
1773 
1774   __kmp_resume_if_soft_paused();
1775 
1776   if (__kmp_env_consistency_check) {
1777     if (loc == 0) {
1778       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1779     }
1780     __kmp_check_barrier(global_tid, ct_barrier, loc);
1781   }
1782 
1783 #if OMPT_SUPPORT
1784   ompt_frame_t *ompt_frame;
1785   if (ompt_enabled.enabled) {
1786     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1787     if (ompt_frame->enter_frame.ptr == NULL)
1788       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1789   }
1790   OMPT_STORE_RETURN_ADDRESS(global_tid);
1791 #endif
1792 #if USE_ITT_NOTIFY
1793   __kmp_threads[global_tid]->th.th_ident = loc;
1794 #endif
1795   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1796 #if OMPT_SUPPORT && OMPT_OPTIONAL
1797   if (ompt_enabled.enabled) {
1798     ompt_frame->enter_frame = ompt_data_none;
1799   }
1800 #endif
1801 
1802   ret = __kmpc_master(loc, global_tid);
1803 
1804   if (__kmp_env_consistency_check) {
1805     /*  there's no __kmpc_end_master called; so the (stats) */
1806     /*  actions of __kmpc_end_master are done here          */
1807     if (ret) {
1808       /* only one thread should do the pop since only */
1809       /* one did the push (see __kmpc_master())       */
1810       __kmp_pop_sync(global_tid, ct_master, loc);
1811     }
1812   }
1813 
1814   return (ret);
1815 }
1816 
1817 /* The BARRIER for a SINGLE process section is always explicit   */
1818 /*!
1819 @ingroup WORK_SHARING
1820 @param loc  source location information
1821 @param global_tid  global thread number
1822 @return One if this thread should execute the single construct, zero otherwise.
1823 
1824 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if it is required.
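
A minimal sketch of the expected lowering of a <tt>single</tt> construct that
needs a closing barrier (illustrative; loc and gtid come from the surrounding
generated code):
@code
if (__kmpc_single(loc, gtid)) {
  // executed by the one thread that owns the single region
  __kmpc_end_single(loc, gtid);
}
__kmpc_barrier(loc, gtid); // omitted when nowait is specified
@endcode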
1827 */
1828 
1829 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1830   __kmp_assert_valid_gtid(global_tid);
1831   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1832 
1833   if (rc) {
1834     // We are going to execute the single statement, so we should count it.
1835     KMP_COUNT_BLOCK(OMP_SINGLE);
1836     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1837   }
1838 
1839 #if OMPT_SUPPORT && OMPT_OPTIONAL
1840   kmp_info_t *this_thr = __kmp_threads[global_tid];
1841   kmp_team_t *team = this_thr->th.th_team;
1842   int tid = __kmp_tid_from_gtid(global_tid);
1843 
1844   if (ompt_enabled.enabled) {
1845     if (rc) {
1846       if (ompt_enabled.ompt_callback_work) {
1847         ompt_callbacks.ompt_callback(ompt_callback_work)(
1848             ompt_work_single_executor, ompt_scope_begin,
1849             &(team->t.ompt_team_info.parallel_data),
1850             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1851             1, OMPT_GET_RETURN_ADDRESS(0));
1852       }
1853     } else {
1854       if (ompt_enabled.ompt_callback_work) {
1855         ompt_callbacks.ompt_callback(ompt_callback_work)(
1856             ompt_work_single_other, ompt_scope_begin,
1857             &(team->t.ompt_team_info.parallel_data),
1858             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1859             1, OMPT_GET_RETURN_ADDRESS(0));
1860         ompt_callbacks.ompt_callback(ompt_callback_work)(
1861             ompt_work_single_other, ompt_scope_end,
1862             &(team->t.ompt_team_info.parallel_data),
1863             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1864             1, OMPT_GET_RETURN_ADDRESS(0));
1865       }
1866     }
1867   }
1868 #endif
1869 
1870   return rc;
1871 }
1872 
1873 /*!
1874 @ingroup WORK_SHARING
1875 @param loc  source location information
1876 @param global_tid  global thread number
1877 
1878 Mark the end of a <tt>single</tt> construct.  This function should
1879 only be called by the thread that executed the block of code protected
1880 by the `single` construct.
1881 */
1882 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1883   __kmp_assert_valid_gtid(global_tid);
1884   __kmp_exit_single(global_tid);
1885   KMP_POP_PARTITIONED_TIMER();
1886 
1887 #if OMPT_SUPPORT && OMPT_OPTIONAL
1888   kmp_info_t *this_thr = __kmp_threads[global_tid];
1889   kmp_team_t *team = this_thr->th.th_team;
1890   int tid = __kmp_tid_from_gtid(global_tid);
1891 
1892   if (ompt_enabled.ompt_callback_work) {
1893     ompt_callbacks.ompt_callback(ompt_callback_work)(
1894         ompt_work_single_executor, ompt_scope_end,
1895         &(team->t.ompt_team_info.parallel_data),
1896         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1897         OMPT_GET_RETURN_ADDRESS(0));
1898   }
1899 #endif
1900 }
1901 
1902 /*!
1903 @ingroup WORK_SHARING
1904 @param loc Source location
1905 @param global_tid Global thread id
1906 
1907 Mark the end of a statically scheduled loop.
1908 */
1909 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1910   KMP_POP_PARTITIONED_TIMER();
1911   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1912 
1913 #if OMPT_SUPPORT && OMPT_OPTIONAL
1914   if (ompt_enabled.ompt_callback_work) {
1915     ompt_work_t ompt_work_type = ompt_work_loop;
1916     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1917     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1918     // Determine workshare type
1919     if (loc != NULL) {
1920       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1921         ompt_work_type = ompt_work_loop;
1922       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1923         ompt_work_type = ompt_work_sections;
1924       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1925         ompt_work_type = ompt_work_distribute;
1926       } else {
1927         // use default set above.
1928         // a warning about this case is provided in __kmpc_for_static_init
1929       }
1930       KMP_DEBUG_ASSERT(ompt_work_type);
1931     }
1932     ompt_callbacks.ompt_callback(ompt_callback_work)(
1933         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1934         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1935   }
1936 #endif
1937   if (__kmp_env_consistency_check)
1938     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1939 }
1940 
1941 // User routines which take C-style arguments (call by value)
1942 // different from the Fortran equivalent routines
1943 
1944 void ompc_set_num_threads(int arg) {
1945   // !!!!! TODO: check the per-task binding
1946   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1947 }
1948 
1949 void ompc_set_dynamic(int flag) {
1950   kmp_info_t *thread;
1951 
1952   /* For the thread-private implementation of the internal controls */
1953   thread = __kmp_entry_thread();
1954 
1955   __kmp_save_internal_controls(thread);
1956 
1957   set__dynamic(thread, flag ? true : false);
1958 }
1959 
1960 void ompc_set_nested(int flag) {
1961   kmp_info_t *thread;
1962 
1963   /* For the thread-private internal controls implementation */
1964   thread = __kmp_entry_thread();
1965 
1966   __kmp_save_internal_controls(thread);
1967 
1968   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1969 }
1970 
1971 void ompc_set_max_active_levels(int max_active_levels) {
1972   /* TO DO */
1973   /* we want per-task implementation of this internal control */
1974 
1975   /* For the per-thread internal controls implementation */
1976   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1977 }
1978 
1979 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1980   // !!!!! TODO: check the per-task binding
1981   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1982 }
1983 
1984 int ompc_get_ancestor_thread_num(int level) {
1985   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1986 }
1987 
1988 int ompc_get_team_size(int level) {
1989   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1990 }
1991 
1992 /* OpenMP 5.0 Affinity Format API */
1993 
1994 void ompc_set_affinity_format(char const *format) {
1995   if (!__kmp_init_serial) {
1996     __kmp_serial_initialize();
1997   }
1998   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1999                          format, KMP_STRLEN(format) + 1);
2000 }
2001 
2002 size_t ompc_get_affinity_format(char *buffer, size_t size) {
2003   size_t format_size;
2004   if (!__kmp_init_serial) {
2005     __kmp_serial_initialize();
2006   }
2007   format_size = KMP_STRLEN(__kmp_affinity_format);
2008   if (buffer && size) {
2009     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2010                            format_size + 1);
2011   }
2012   return format_size;
2013 }
2014 
2015 void ompc_display_affinity(char const *format) {
2016   int gtid;
2017   if (!TCR_4(__kmp_init_middle)) {
2018     __kmp_middle_initialize();
2019   }
2020   gtid = __kmp_get_gtid();
2021   __kmp_aux_display_affinity(gtid, format);
2022 }
2023 
2024 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
2025                              char const *format) {
2026   int gtid;
2027   size_t num_required;
2028   kmp_str_buf_t capture_buf;
2029   if (!TCR_4(__kmp_init_middle)) {
2030     __kmp_middle_initialize();
2031   }
2032   gtid = __kmp_get_gtid();
2033   __kmp_str_buf_init(&capture_buf);
2034   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2035   if (buffer && buf_size) {
2036     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2037                            capture_buf.used + 1);
2038   }
2039   __kmp_str_buf_free(&capture_buf);
2040   return num_required;
2041 }
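
// A minimal sketch of the buffer-sizing idiom these entry points support
// (illustrative only; the format string and variable names are made up):
//
//   size_t needed = ompc_capture_affinity(NULL, 0, "%P %i %A");
//   char *buf = (char *)malloc(needed + 1);
//   ompc_capture_affinity(buf, needed + 1, "%P %i %A");
//
// Passing a NULL buffer (or a zero size) skips the copy and only reports the
// required length.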
2042 
2043 void kmpc_set_stacksize(int arg) {
2044   // __kmp_aux_set_stacksize initializes the library if needed
2045   __kmp_aux_set_stacksize(arg);
2046 }
2047 
2048 void kmpc_set_stacksize_s(size_t arg) {
2049   // __kmp_aux_set_stacksize initializes the library if needed
2050   __kmp_aux_set_stacksize(arg);
2051 }
2052 
2053 void kmpc_set_blocktime(int arg) {
2054   int gtid, tid;
2055   kmp_info_t *thread;
2056 
2057   gtid = __kmp_entry_gtid();
2058   tid = __kmp_tid_from_gtid(gtid);
2059   thread = __kmp_thread_from_gtid(gtid);
2060 
2061   __kmp_aux_set_blocktime(arg, thread, tid);
2062 }
2063 
2064 void kmpc_set_library(int arg) {
2065   // __kmp_user_set_library initializes the library if needed
2066   __kmp_user_set_library((enum library_type)arg);
2067 }
2068 
2069 void kmpc_set_defaults(char const *str) {
2070   // __kmp_aux_set_defaults initializes the library if needed
2071   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2072 }
2073 
2074 void kmpc_set_disp_num_buffers(int arg) {
2075   // ignore after initialization because some teams have already
2076   // allocated dispatch buffers
2077   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2078       arg <= KMP_MAX_DISP_NUM_BUFF) {
2079     __kmp_dispatch_num_buffers = arg;
2080   }
2081 }
2082 
2083 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2084 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2085   return -1;
2086 #else
2087   if (!TCR_4(__kmp_init_middle)) {
2088     __kmp_middle_initialize();
2089   }
2090   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2091 #endif
2092 }
2093 
2094 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2095 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2096   return -1;
2097 #else
2098   if (!TCR_4(__kmp_init_middle)) {
2099     __kmp_middle_initialize();
2100   }
2101   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2102 #endif
2103 }
2104 
2105 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2106 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2107   return -1;
2108 #else
2109   if (!TCR_4(__kmp_init_middle)) {
2110     __kmp_middle_initialize();
2111   }
2112   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2113 #endif
2114 }
2115 
2116 /* -------------------------------------------------------------------------- */
2117 /*!
2118 @ingroup THREADPRIVATE
2119 @param loc       source location information
2120 @param gtid      global thread number
2121 @param cpy_size  size of the cpy_data buffer
2122 @param cpy_data  pointer to data to be copied
2123 @param cpy_func  helper function to call for copying data
2124 @param didit     flag variable: 1=single thread; 0=not single thread
2125 
2126 __kmpc_copyprivate implements the interface for the private data broadcast
2127 needed for the copyprivate clause associated with a single region in an
2128 OpenMP<sup>*</sup> program (both C and Fortran).
2129 All threads participating in the parallel region call this routine.
2130 One of the threads (called the single thread) should have the <tt>didit</tt>
2131 variable set to 1 and all other threads should have that variable set to 0.
2132 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2133 
2134 The OpenMP specification forbids the use of nowait on the single region when a
2135 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2136 barrier internally to avoid race conditions, so the code generation for the
2137 single region should avoid generating a barrier after the call to @ref
2138 __kmpc_copyprivate.
2139 
2140 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2141 The <tt>loc</tt> parameter is a pointer to source location information.
2142 
Internal implementation: The single thread will first copy its descriptor
address (cpy_data) to a team-private location; then each of the other threads
will call the function pointed to by the parameter cpy_func, which carries out
the copy using the cpy_data buffer.
2147 
2148 The cpy_func routine used for the copy and the contents of the data area defined
2149 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2150 to be done. For instance, the cpy_data buffer can hold the actual data to be
2151 copied or it may hold a list of pointers to the data. The cpy_func routine must
2152 interpret the cpy_data buffer appropriately.
2153 
2154 The interface to cpy_func is as follows:
2155 @code
2156 void cpy_func( void *destination, void *source )
2157 @endcode
2158 where void *destination is the cpy_data pointer for the thread being copied to
2159 and void *source is the cpy_data pointer for the thread being copied from.
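
A minimal sketch of a cpy_func for <tt>copyprivate(a, b)</tt>, assuming the
compiler makes cpy_data a per-thread structure of pointers to the two
variables (all names here are illustrative):
@code
typedef struct { int *a; double *b; } copypriv_t;

void example_cpy_func(void *destination, void *source) {
  copypriv_t *dst = (copypriv_t *)destination;
  copypriv_t *src = (copypriv_t *)source;
  *dst->a = *src->a; // copy the values, not the pointers
  *dst->b = *src->b;
}
@endcode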
2160 */
2161 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2162                         void *cpy_data, void (*cpy_func)(void *, void *),
2163                         kmp_int32 didit) {
2164   void **data_ptr;
2165   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2166   __kmp_assert_valid_gtid(gtid);
2167 
2168   KMP_MB();
2169 
2170   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2171 
2172   if (__kmp_env_consistency_check) {
2173     if (loc == 0) {
2174       KMP_WARNING(ConstructIdentInvalid);
2175     }
2176   }
2177 
2178   // ToDo: Optimize the following two barriers into some kind of split barrier
2179 
2180   if (didit)
2181     *data_ptr = cpy_data;
2182 
2183 #if OMPT_SUPPORT
2184   ompt_frame_t *ompt_frame;
2185   if (ompt_enabled.enabled) {
2186     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2187     if (ompt_frame->enter_frame.ptr == NULL)
2188       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2189   }
2190   OMPT_STORE_RETURN_ADDRESS(gtid);
2191 #endif
2192 /* This barrier is not a barrier region boundary */
2193 #if USE_ITT_NOTIFY
2194   __kmp_threads[gtid]->th.th_ident = loc;
2195 #endif
2196   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2197 
2198   if (!didit)
2199     (*cpy_func)(cpy_data, *data_ptr);
2200 
2201   // Consider next barrier a user-visible barrier for barrier region boundaries
2202   // Nesting checks are already handled by the single construct checks
2203   {
2204 #if OMPT_SUPPORT
2205     OMPT_STORE_RETURN_ADDRESS(gtid);
2206 #endif
2207 #if USE_ITT_NOTIFY
2208     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2209 // tasks can overwrite the location)
2210 #endif
2211     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2212 #if OMPT_SUPPORT && OMPT_OPTIONAL
2213     if (ompt_enabled.enabled) {
2214       ompt_frame->enter_frame = ompt_data_none;
2215     }
2216 #endif
2217   }
2218 }
2219 
2220 /* -------------------------------------------------------------------------- */
2221 
2222 #define INIT_LOCK __kmp_init_user_lock_with_checks
2223 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2224 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2225 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2226 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2227 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2228   __kmp_acquire_nested_user_lock_with_checks_timed
2229 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2230 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2231 #define TEST_LOCK __kmp_test_user_lock_with_checks
2232 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2233 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2234 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2235 
2236 // TODO: Make check abort messages use location info & pass it into
2237 // with_checks routines
2238 
2239 #if KMP_USE_DYNAMIC_LOCK
2240 
2241 // internal lock initializer
2242 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2243                                                     kmp_dyna_lockseq_t seq) {
2244   if (KMP_IS_D_LOCK(seq)) {
2245     KMP_INIT_D_LOCK(lock, seq);
2246 #if USE_ITT_BUILD
2247     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2248 #endif
2249   } else {
2250     KMP_INIT_I_LOCK(lock, seq);
2251 #if USE_ITT_BUILD
2252     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2253     __kmp_itt_lock_creating(ilk->lock, loc);
2254 #endif
2255   }
2256 }
2257 
2258 // internal nest lock initializer
2259 static __forceinline void
2260 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2261                                kmp_dyna_lockseq_t seq) {
2262 #if KMP_USE_TSX
2263   // Don't have nested lock implementation for speculative locks
2264   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2265       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2266     seq = __kmp_user_lock_seq;
2267 #endif
2268   switch (seq) {
2269   case lockseq_tas:
2270     seq = lockseq_nested_tas;
2271     break;
2272 #if KMP_USE_FUTEX
2273   case lockseq_futex:
2274     seq = lockseq_nested_futex;
2275     break;
2276 #endif
2277   case lockseq_ticket:
2278     seq = lockseq_nested_ticket;
2279     break;
2280   case lockseq_queuing:
2281     seq = lockseq_nested_queuing;
2282     break;
2283   case lockseq_drdpa:
2284     seq = lockseq_nested_drdpa;
2285     break;
2286   default:
2287     seq = lockseq_nested_queuing;
2288   }
2289   KMP_INIT_I_LOCK(lock, seq);
2290 #if USE_ITT_BUILD
2291   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2292   __kmp_itt_lock_creating(ilk->lock, loc);
2293 #endif
2294 }
2295 
2296 /* initialize the lock with a hint */
2297 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2298                                 uintptr_t hint) {
2299   KMP_DEBUG_ASSERT(__kmp_init_serial);
2300   if (__kmp_env_consistency_check && user_lock == NULL) {
2301     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2302   }
2303 
2304   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2305 
2306 #if OMPT_SUPPORT && OMPT_OPTIONAL
2307   // This is the case, if called from omp_init_lock_with_hint:
2308   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2309   if (!codeptr)
2310     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2311   if (ompt_enabled.ompt_callback_lock_init) {
2312     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2313         ompt_mutex_lock, (omp_lock_hint_t)hint,
2314         __ompt_get_mutex_impl_type(user_lock),
2315         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2316   }
2317 #endif
2318 }
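
// A minimal sketch of driving these lock entry points directly (illustrative
// only; real callers are compiler-generated or reached through the omp_*
// entry points, and loc/gtid stand for values available in that context):
//
//   omp_lock_t lk;
//   __kmpc_init_lock_with_hint(loc, gtid, (void **)&lk,
//                              omp_lock_hint_contended);
//   __kmpc_set_lock(loc, gtid, (void **)&lk);
//   // ... code protected by the lock ...
//   __kmpc_unset_lock(loc, gtid, (void **)&lk);
//   __kmpc_destroy_lock(loc, gtid, (void **)&lk);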
2319 
2320 /* initialize the lock with a hint */
2321 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2322                                      void **user_lock, uintptr_t hint) {
2323   KMP_DEBUG_ASSERT(__kmp_init_serial);
2324   if (__kmp_env_consistency_check && user_lock == NULL) {
2325     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2326   }
2327 
2328   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2329 
2330 #if OMPT_SUPPORT && OMPT_OPTIONAL
2331   // This is the case, if called from omp_init_lock_with_hint:
2332   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2333   if (!codeptr)
2334     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2335   if (ompt_enabled.ompt_callback_lock_init) {
2336     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2337         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2338         __ompt_get_mutex_impl_type(user_lock),
2339         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2340   }
2341 #endif
2342 }
2343 
2344 #endif // KMP_USE_DYNAMIC_LOCK
2345 
2346 /* initialize the lock */
2347 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2348 #if KMP_USE_DYNAMIC_LOCK
2349 
2350   KMP_DEBUG_ASSERT(__kmp_init_serial);
2351   if (__kmp_env_consistency_check && user_lock == NULL) {
2352     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2353   }
2354   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2355 
2356 #if OMPT_SUPPORT && OMPT_OPTIONAL
2357   // This is the case, if called from omp_init_lock_with_hint:
2358   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2359   if (!codeptr)
2360     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2361   if (ompt_enabled.ompt_callback_lock_init) {
2362     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2363         ompt_mutex_lock, omp_lock_hint_none,
2364         __ompt_get_mutex_impl_type(user_lock),
2365         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2366   }
2367 #endif
2368 
2369 #else // KMP_USE_DYNAMIC_LOCK
2370 
2371   static char const *const func = "omp_init_lock";
2372   kmp_user_lock_p lck;
2373   KMP_DEBUG_ASSERT(__kmp_init_serial);
2374 
2375   if (__kmp_env_consistency_check) {
2376     if (user_lock == NULL) {
2377       KMP_FATAL(LockIsUninitialized, func);
2378     }
2379   }
2380 
2381   KMP_CHECK_USER_LOCK_INIT();
2382 
2383   if ((__kmp_user_lock_kind == lk_tas) &&
2384       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2385     lck = (kmp_user_lock_p)user_lock;
2386   }
2387 #if KMP_USE_FUTEX
2388   else if ((__kmp_user_lock_kind == lk_futex) &&
2389            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2390     lck = (kmp_user_lock_p)user_lock;
2391   }
2392 #endif
2393   else {
2394     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2395   }
2396   INIT_LOCK(lck);
2397   __kmp_set_user_lock_location(lck, loc);
2398 
2399 #if OMPT_SUPPORT && OMPT_OPTIONAL
2400   // This is the case, if called from omp_init_lock_with_hint:
2401   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2402   if (!codeptr)
2403     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2404   if (ompt_enabled.ompt_callback_lock_init) {
2405     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2406         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2407         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2408   }
2409 #endif
2410 
2411 #if USE_ITT_BUILD
2412   __kmp_itt_lock_creating(lck);
2413 #endif /* USE_ITT_BUILD */
2414 
2415 #endif // KMP_USE_DYNAMIC_LOCK
2416 } // __kmpc_init_lock
2417 
2418 /* initialize the lock */
2419 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2420 #if KMP_USE_DYNAMIC_LOCK
2421 
2422   KMP_DEBUG_ASSERT(__kmp_init_serial);
2423   if (__kmp_env_consistency_check && user_lock == NULL) {
2424     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2425   }
2426   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2427 
2428 #if OMPT_SUPPORT && OMPT_OPTIONAL
2429   // This is the case, if called from omp_init_lock_with_hint:
2430   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2431   if (!codeptr)
2432     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2433   if (ompt_enabled.ompt_callback_lock_init) {
2434     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2435         ompt_mutex_nest_lock, omp_lock_hint_none,
2436         __ompt_get_mutex_impl_type(user_lock),
2437         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2438   }
2439 #endif
2440 
2441 #else // KMP_USE_DYNAMIC_LOCK
2442 
2443   static char const *const func = "omp_init_nest_lock";
2444   kmp_user_lock_p lck;
2445   KMP_DEBUG_ASSERT(__kmp_init_serial);
2446 
2447   if (__kmp_env_consistency_check) {
2448     if (user_lock == NULL) {
2449       KMP_FATAL(LockIsUninitialized, func);
2450     }
2451   }
2452 
2453   KMP_CHECK_USER_LOCK_INIT();
2454 
2455   if ((__kmp_user_lock_kind == lk_tas) &&
2456       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2457        OMP_NEST_LOCK_T_SIZE)) {
2458     lck = (kmp_user_lock_p)user_lock;
2459   }
2460 #if KMP_USE_FUTEX
2461   else if ((__kmp_user_lock_kind == lk_futex) &&
2462            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2463             OMP_NEST_LOCK_T_SIZE)) {
2464     lck = (kmp_user_lock_p)user_lock;
2465   }
2466 #endif
2467   else {
2468     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2469   }
2470 
2471   INIT_NESTED_LOCK(lck);
2472   __kmp_set_user_lock_location(lck, loc);
2473 
2474 #if OMPT_SUPPORT && OMPT_OPTIONAL
2475   // This is the case, if called from omp_init_lock_with_hint:
2476   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2477   if (!codeptr)
2478     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2479   if (ompt_enabled.ompt_callback_lock_init) {
2480     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2481         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2482         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2483   }
2484 #endif
2485 
2486 #if USE_ITT_BUILD
2487   __kmp_itt_lock_creating(lck);
2488 #endif /* USE_ITT_BUILD */
2489 
2490 #endif // KMP_USE_DYNAMIC_LOCK
2491 } // __kmpc_init_nest_lock
2492 
2493 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2494 #if KMP_USE_DYNAMIC_LOCK
2495 
2496 #if USE_ITT_BUILD
2497   kmp_user_lock_p lck;
2498   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2499     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2500   } else {
2501     lck = (kmp_user_lock_p)user_lock;
2502   }
2503   __kmp_itt_lock_destroyed(lck);
2504 #endif
2505 #if OMPT_SUPPORT && OMPT_OPTIONAL
2506   // This is the case, if called from omp_init_lock_with_hint:
2507   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2508   if (!codeptr)
2509     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2510   if (ompt_enabled.ompt_callback_lock_destroy) {
2511     kmp_user_lock_p lck;
2512     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2513       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2514     } else {
2515       lck = (kmp_user_lock_p)user_lock;
2516     }
2517     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2518         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2519   }
2520 #endif
2521   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2522 #else
2523   kmp_user_lock_p lck;
2524 
2525   if ((__kmp_user_lock_kind == lk_tas) &&
2526       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2527     lck = (kmp_user_lock_p)user_lock;
2528   }
2529 #if KMP_USE_FUTEX
2530   else if ((__kmp_user_lock_kind == lk_futex) &&
2531            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2532     lck = (kmp_user_lock_p)user_lock;
2533   }
2534 #endif
2535   else {
2536     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2537   }
2538 
2539 #if OMPT_SUPPORT && OMPT_OPTIONAL
2540   // This is the case, if called from omp_init_lock_with_hint:
2541   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2542   if (!codeptr)
2543     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2544   if (ompt_enabled.ompt_callback_lock_destroy) {
2545     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2546         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2547   }
2548 #endif
2549 
2550 #if USE_ITT_BUILD
2551   __kmp_itt_lock_destroyed(lck);
2552 #endif /* USE_ITT_BUILD */
2553   DESTROY_LOCK(lck);
2554 
2555   if ((__kmp_user_lock_kind == lk_tas) &&
2556       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2557     ;
2558   }
2559 #if KMP_USE_FUTEX
2560   else if ((__kmp_user_lock_kind == lk_futex) &&
2561            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2562     ;
2563   }
2564 #endif
2565   else {
2566     __kmp_user_lock_free(user_lock, gtid, lck);
2567   }
2568 #endif // KMP_USE_DYNAMIC_LOCK
2569 } // __kmpc_destroy_lock
2570 
2571 /* destroy the lock */
2572 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2573 #if KMP_USE_DYNAMIC_LOCK
2574 
2575 #if USE_ITT_BUILD
2576   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2577   __kmp_itt_lock_destroyed(ilk->lock);
2578 #endif
2579 #if OMPT_SUPPORT && OMPT_OPTIONAL
2580   // This is the case, if called from omp_init_lock_with_hint:
2581   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2582   if (!codeptr)
2583     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2584   if (ompt_enabled.ompt_callback_lock_destroy) {
2585     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2586         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2587   }
2588 #endif
2589   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2590 
2591 #else // KMP_USE_DYNAMIC_LOCK
2592 
2593   kmp_user_lock_p lck;
2594 
2595   if ((__kmp_user_lock_kind == lk_tas) &&
2596       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2597        OMP_NEST_LOCK_T_SIZE)) {
2598     lck = (kmp_user_lock_p)user_lock;
2599   }
2600 #if KMP_USE_FUTEX
2601   else if ((__kmp_user_lock_kind == lk_futex) &&
2602            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2603             OMP_NEST_LOCK_T_SIZE)) {
2604     lck = (kmp_user_lock_p)user_lock;
2605   }
2606 #endif
2607   else {
2608     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2609   }
2610 
2611 #if OMPT_SUPPORT && OMPT_OPTIONAL
2612   // This is the case, if called from omp_init_lock_with_hint:
2613   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2614   if (!codeptr)
2615     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2616   if (ompt_enabled.ompt_callback_lock_destroy) {
2617     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2618         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2619   }
2620 #endif
2621 
2622 #if USE_ITT_BUILD
2623   __kmp_itt_lock_destroyed(lck);
2624 #endif /* USE_ITT_BUILD */
2625 
2626   DESTROY_NESTED_LOCK(lck);
2627 
2628   if ((__kmp_user_lock_kind == lk_tas) &&
2629       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2630        OMP_NEST_LOCK_T_SIZE)) {
2631     ;
2632   }
2633 #if KMP_USE_FUTEX
2634   else if ((__kmp_user_lock_kind == lk_futex) &&
2635            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2636             OMP_NEST_LOCK_T_SIZE)) {
2637     ;
2638   }
2639 #endif
2640   else {
2641     __kmp_user_lock_free(user_lock, gtid, lck);
2642   }
2643 #endif // KMP_USE_DYNAMIC_LOCK
2644 } // __kmpc_destroy_nest_lock
2645 
2646 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2647   KMP_COUNT_BLOCK(OMP_set_lock);
2648 #if KMP_USE_DYNAMIC_LOCK
2649   int tag = KMP_EXTRACT_D_TAG(user_lock);
2650 #if USE_ITT_BUILD
2651   __kmp_itt_lock_acquiring(
2652       (kmp_user_lock_p)
2653           user_lock); // itt function will get to the right lock object.
2654 #endif
2655 #if OMPT_SUPPORT && OMPT_OPTIONAL
2656   // This is the case, if called from omp_init_lock_with_hint:
2657   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2658   if (!codeptr)
2659     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2660   if (ompt_enabled.ompt_callback_mutex_acquire) {
2661     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2662         ompt_mutex_lock, omp_lock_hint_none,
2663         __ompt_get_mutex_impl_type(user_lock),
2664         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2665   }
2666 #endif
2667 #if KMP_USE_INLINED_TAS
2668   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2669     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2670   } else
2671 #elif KMP_USE_INLINED_FUTEX
2672   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2673     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2674   } else
2675 #endif
2676   {
2677     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2678   }
2679 #if USE_ITT_BUILD
2680   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2681 #endif
2682 #if OMPT_SUPPORT && OMPT_OPTIONAL
2683   if (ompt_enabled.ompt_callback_mutex_acquired) {
2684     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2685         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2686   }
2687 #endif
2688 
2689 #else // KMP_USE_DYNAMIC_LOCK
2690 
2691   kmp_user_lock_p lck;
2692 
2693   if ((__kmp_user_lock_kind == lk_tas) &&
2694       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2695     lck = (kmp_user_lock_p)user_lock;
2696   }
2697 #if KMP_USE_FUTEX
2698   else if ((__kmp_user_lock_kind == lk_futex) &&
2699            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2700     lck = (kmp_user_lock_p)user_lock;
2701   }
2702 #endif
2703   else {
2704     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2705   }
2706 
2707 #if USE_ITT_BUILD
2708   __kmp_itt_lock_acquiring(lck);
2709 #endif /* USE_ITT_BUILD */
2710 #if OMPT_SUPPORT && OMPT_OPTIONAL
2711   // This is the case, if called from omp_init_lock_with_hint:
2712   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2713   if (!codeptr)
2714     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2715   if (ompt_enabled.ompt_callback_mutex_acquire) {
2716     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2717         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2718         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2719   }
2720 #endif
2721 
2722   ACQUIRE_LOCK(lck, gtid);
2723 
2724 #if USE_ITT_BUILD
2725   __kmp_itt_lock_acquired(lck);
2726 #endif /* USE_ITT_BUILD */
2727 
2728 #if OMPT_SUPPORT && OMPT_OPTIONAL
2729   if (ompt_enabled.ompt_callback_mutex_acquired) {
2730     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2731         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2732   }
2733 #endif
2734 
2735 #endif // KMP_USE_DYNAMIC_LOCK
2736 }
2737 
2738 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2739 #if KMP_USE_DYNAMIC_LOCK
2740 
2741 #if USE_ITT_BUILD
2742   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2743 #endif
2744 #if OMPT_SUPPORT && OMPT_OPTIONAL
2745   // This is the case, if called from omp_init_lock_with_hint:
2746   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2747   if (!codeptr)
2748     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2749   if (ompt_enabled.enabled) {
2750     if (ompt_enabled.ompt_callback_mutex_acquire) {
2751       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2752           ompt_mutex_nest_lock, omp_lock_hint_none,
2753           __ompt_get_mutex_impl_type(user_lock),
2754           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2755     }
2756   }
2757 #endif
2758   int acquire_status =
2759       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2760   (void)acquire_status;
2761 #if USE_ITT_BUILD
2762   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2763 #endif
2764 
2765 #if OMPT_SUPPORT && OMPT_OPTIONAL
2766   if (ompt_enabled.enabled) {
2767     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2768       if (ompt_enabled.ompt_callback_mutex_acquired) {
2769         // lock_first
2770         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2771             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2772             codeptr);
2773       }
2774     } else {
2775       if (ompt_enabled.ompt_callback_nest_lock) {
2776         // lock_next
2777         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2778             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2779       }
2780     }
2781   }
2782 #endif
2783 
2784 #else // KMP_USE_DYNAMIC_LOCK
2785   int acquire_status;
2786   kmp_user_lock_p lck;
2787 
2788   if ((__kmp_user_lock_kind == lk_tas) &&
2789       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2790        OMP_NEST_LOCK_T_SIZE)) {
2791     lck = (kmp_user_lock_p)user_lock;
2792   }
2793 #if KMP_USE_FUTEX
2794   else if ((__kmp_user_lock_kind == lk_futex) &&
2795            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2796             OMP_NEST_LOCK_T_SIZE)) {
2797     lck = (kmp_user_lock_p)user_lock;
2798   }
2799 #endif
2800   else {
2801     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2802   }
2803 
2804 #if USE_ITT_BUILD
2805   __kmp_itt_lock_acquiring(lck);
2806 #endif /* USE_ITT_BUILD */
2807 #if OMPT_SUPPORT && OMPT_OPTIONAL
2808   // This is the case, if called from omp_init_lock_with_hint:
2809   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2810   if (!codeptr)
2811     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2812   if (ompt_enabled.enabled) {
2813     if (ompt_enabled.ompt_callback_mutex_acquire) {
2814       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2815           ompt_mutex_nest_lock, omp_lock_hint_none,
2816           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2817           codeptr);
2818     }
2819   }
2820 #endif
2821 
2822   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2823 
2824 #if USE_ITT_BUILD
2825   __kmp_itt_lock_acquired(lck);
2826 #endif /* USE_ITT_BUILD */
2827 
2828 #if OMPT_SUPPORT && OMPT_OPTIONAL
2829   if (ompt_enabled.enabled) {
2830     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2831       if (ompt_enabled.ompt_callback_mutex_acquired) {
2832         // lock_first
2833         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2834             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2835       }
2836     } else {
2837       if (ompt_enabled.ompt_callback_nest_lock) {
2838         // lock_next
2839         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2840             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2841       }
2842     }
2843   }
2844 #endif
2845 
2846 #endif // KMP_USE_DYNAMIC_LOCK
2847 }
2848 
2849 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2850 #if KMP_USE_DYNAMIC_LOCK
2851 
2852   int tag = KMP_EXTRACT_D_TAG(user_lock);
2853 #if USE_ITT_BUILD
2854   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2855 #endif
2856 #if KMP_USE_INLINED_TAS
2857   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2858     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2859   } else
2860 #elif KMP_USE_INLINED_FUTEX
2861   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2862     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2863   } else
2864 #endif
2865   {
2866     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2867   }
2868 
2869 #if OMPT_SUPPORT && OMPT_OPTIONAL
2870   // This is the case, if called from omp_init_lock_with_hint:
2871   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2872   if (!codeptr)
2873     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2874   if (ompt_enabled.ompt_callback_mutex_released) {
2875     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2876         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2877   }
2878 #endif
2879 
2880 #else // KMP_USE_DYNAMIC_LOCK
2881 
2882   kmp_user_lock_p lck;
2883 
2884   /* Can't use serial interval since not block structured */
2885   /* release the lock */
2886 
2887   if ((__kmp_user_lock_kind == lk_tas) &&
2888       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2889 #if KMP_OS_LINUX &&                                                            \
2890     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
    // "fast" path implemented to fix customer performance issue
2892 #if USE_ITT_BUILD
2893     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2894 #endif /* USE_ITT_BUILD */
2895     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2896     KMP_MB();
2897 
2898 #if OMPT_SUPPORT && OMPT_OPTIONAL
2899     // This is the case, if called from omp_init_lock_with_hint:
2900     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2901     if (!codeptr)
2902       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2903     if (ompt_enabled.ompt_callback_mutex_released) {
2904       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2906     }
2907 #endif
2908 
2909     return;
2910 #else
2911     lck = (kmp_user_lock_p)user_lock;
2912 #endif
2913   }
2914 #if KMP_USE_FUTEX
2915   else if ((__kmp_user_lock_kind == lk_futex) &&
2916            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2917     lck = (kmp_user_lock_p)user_lock;
2918   }
2919 #endif
2920   else {
2921     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2922   }
2923 
2924 #if USE_ITT_BUILD
2925   __kmp_itt_lock_releasing(lck);
2926 #endif /* USE_ITT_BUILD */
2927 
2928   RELEASE_LOCK(lck, gtid);
2929 
2930 #if OMPT_SUPPORT && OMPT_OPTIONAL
2931   // This is the case, if called from omp_init_lock_with_hint:
2932   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2933   if (!codeptr)
2934     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2935   if (ompt_enabled.ompt_callback_mutex_released) {
2936     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2937         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2938   }
2939 #endif
2940 
2941 #endif // KMP_USE_DYNAMIC_LOCK
2942 }
2943 
2944 /* release the lock */
2945 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2946 #if KMP_USE_DYNAMIC_LOCK
2947 
2948 #if USE_ITT_BUILD
2949   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2950 #endif
2951   int release_status =
2952       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2953   (void)release_status;
2954 
2955 #if OMPT_SUPPORT && OMPT_OPTIONAL
2956   // This is the case, if called from omp_init_lock_with_hint:
2957   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2958   if (!codeptr)
2959     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2960   if (ompt_enabled.enabled) {
2961     if (release_status == KMP_LOCK_RELEASED) {
2962       if (ompt_enabled.ompt_callback_mutex_released) {
2963         // release_lock_last
2964         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2965             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2966             codeptr);
2967       }
2968     } else if (ompt_enabled.ompt_callback_nest_lock) {
2969       // release_lock_prev
2970       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2971           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2972     }
2973   }
2974 #endif
2975 
2976 #else // KMP_USE_DYNAMIC_LOCK
2977 
2978   kmp_user_lock_p lck;
2979 
2980   /* Can't use serial interval since not block structured */
2981 
2982   if ((__kmp_user_lock_kind == lk_tas) &&
2983       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2984        OMP_NEST_LOCK_T_SIZE)) {
2985 #if KMP_OS_LINUX &&                                                            \
2986     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2987     // "fast" path implemented to fix customer performance issue
2988     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2989 #if USE_ITT_BUILD
2990     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2991 #endif /* USE_ITT_BUILD */
2992 
2993 #if OMPT_SUPPORT && OMPT_OPTIONAL
2994     int release_status = KMP_LOCK_STILL_HELD;
2995 #endif
2996 
2997     if (--(tl->lk.depth_locked) == 0) {
2998       TCW_4(tl->lk.poll, 0);
2999 #if OMPT_SUPPORT && OMPT_OPTIONAL
3000       release_status = KMP_LOCK_RELEASED;
3001 #endif
3002     }
3003     KMP_MB();
3004 
3005 #if OMPT_SUPPORT && OMPT_OPTIONAL
    // This is the case if called from omp_init_lock_with_hint:
3007     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3008     if (!codeptr)
3009       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3010     if (ompt_enabled.enabled) {
3011       if (release_status == KMP_LOCK_RELEASED) {
3012         if (ompt_enabled.ompt_callback_mutex_released) {
3013           // release_lock_last
3014           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3015               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3016         }
3017       } else if (ompt_enabled.ompt_callback_nest_lock) {
3018         // release_lock_previous
3019         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3021       }
3022     }
3023 #endif
3024 
3025     return;
3026 #else
3027     lck = (kmp_user_lock_p)user_lock;
3028 #endif
3029   }
3030 #if KMP_USE_FUTEX
3031   else if ((__kmp_user_lock_kind == lk_futex) &&
3032            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3033             OMP_NEST_LOCK_T_SIZE)) {
3034     lck = (kmp_user_lock_p)user_lock;
3035   }
3036 #endif
3037   else {
3038     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3039   }
3040 
3041 #if USE_ITT_BUILD
3042   __kmp_itt_lock_releasing(lck);
3043 #endif /* USE_ITT_BUILD */
3044 
3045   int release_status;
3046   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3047 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3049   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3050   if (!codeptr)
3051     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3052   if (ompt_enabled.enabled) {
3053     if (release_status == KMP_LOCK_RELEASED) {
3054       if (ompt_enabled.ompt_callback_mutex_released) {
3055         // release_lock_last
3056         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3057             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3058       }
3059     } else if (ompt_enabled.ompt_callback_nest_lock) {
3060       // release_lock_previous
3061       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3063     }
3064   }
3065 #endif
3066 
3067 #endif // KMP_USE_DYNAMIC_LOCK
3068 }
3069 
3070 /* try to acquire the lock */
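// Returns FTN_TRUE if the lock was acquired, FTN_FALSE otherwise. A hedged
// usage sketch of this compiler-facing lock API (the lock storage and the
// loc/gtid values are illustrative only):
//   void *lck_storage;                       // holds the lock object
//   __kmpc_init_lock(&loc, gtid, &lck_storage);
//   if (__kmpc_test_lock(&loc, gtid, &lck_storage)) {
//     /* ... work guarded by the lock ... */
//     __kmpc_unset_lock(&loc, gtid, &lck_storage);
//   }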
3071 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3072   KMP_COUNT_BLOCK(OMP_test_lock);
3073 
3074 #if KMP_USE_DYNAMIC_LOCK
3075   int rc;
3076   int tag = KMP_EXTRACT_D_TAG(user_lock);
3077 #if USE_ITT_BUILD
3078   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3079 #endif
3080 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3082   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3083   if (!codeptr)
3084     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3085   if (ompt_enabled.ompt_callback_mutex_acquire) {
3086     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3087         ompt_mutex_lock, omp_lock_hint_none,
3088         __ompt_get_mutex_impl_type(user_lock),
3089         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3090   }
3091 #endif
3092 #if KMP_USE_INLINED_TAS
3093   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3094     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3095   } else
3096 #elif KMP_USE_INLINED_FUTEX
3097   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3098     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3099   } else
3100 #endif
3101   {
3102     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3103   }
3104   if (rc) {
3105 #if USE_ITT_BUILD
3106     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3107 #endif
3108 #if OMPT_SUPPORT && OMPT_OPTIONAL
3109     if (ompt_enabled.ompt_callback_mutex_acquired) {
3110       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3111           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3112     }
3113 #endif
3114     return FTN_TRUE;
3115   } else {
3116 #if USE_ITT_BUILD
3117     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3118 #endif
3119     return FTN_FALSE;
3120   }
3121 
3122 #else // KMP_USE_DYNAMIC_LOCK
3123 
3124   kmp_user_lock_p lck;
3125   int rc;
3126 
3127   if ((__kmp_user_lock_kind == lk_tas) &&
3128       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3129     lck = (kmp_user_lock_p)user_lock;
3130   }
3131 #if KMP_USE_FUTEX
3132   else if ((__kmp_user_lock_kind == lk_futex) &&
3133            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3134     lck = (kmp_user_lock_p)user_lock;
3135   }
3136 #endif
3137   else {
3138     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3139   }
3140 
3141 #if USE_ITT_BUILD
3142   __kmp_itt_lock_acquiring(lck);
3143 #endif /* USE_ITT_BUILD */
3144 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3146   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3147   if (!codeptr)
3148     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3149   if (ompt_enabled.ompt_callback_mutex_acquire) {
3150     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3151         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3152         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3153   }
3154 #endif
3155 
3156   rc = TEST_LOCK(lck, gtid);
3157 #if USE_ITT_BUILD
3158   if (rc) {
3159     __kmp_itt_lock_acquired(lck);
3160   } else {
3161     __kmp_itt_lock_cancelled(lck);
3162   }
3163 #endif /* USE_ITT_BUILD */
3164 #if OMPT_SUPPORT && OMPT_OPTIONAL
3165   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3166     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3167         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3168   }
3169 #endif
3170 
3171   return (rc ? FTN_TRUE : FTN_FALSE);
3172 
3173   /* Can't use serial interval since not block structured */
3174 
3175 #endif // KMP_USE_DYNAMIC_LOCK
3176 }
3177 
3178 /* try to acquire the lock */
3179 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3180 #if KMP_USE_DYNAMIC_LOCK
3181   int rc;
3182 #if USE_ITT_BUILD
3183   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3184 #endif
3185 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3187   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3188   if (!codeptr)
3189     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3190   if (ompt_enabled.ompt_callback_mutex_acquire) {
3191     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3192         ompt_mutex_nest_lock, omp_lock_hint_none,
3193         __ompt_get_mutex_impl_type(user_lock),
3194         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3195   }
3196 #endif
3197   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3198 #if USE_ITT_BUILD
3199   if (rc) {
3200     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3201   } else {
3202     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3203   }
3204 #endif
3205 #if OMPT_SUPPORT && OMPT_OPTIONAL
3206   if (ompt_enabled.enabled && rc) {
3207     if (rc == 1) {
3208       if (ompt_enabled.ompt_callback_mutex_acquired) {
3209         // lock_first
3210         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3211             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3212             codeptr);
3213       }
3214     } else {
3215       if (ompt_enabled.ompt_callback_nest_lock) {
3216         // lock_next
3217         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3218             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3219       }
3220     }
3221   }
3222 #endif
3223   return rc;
3224 
3225 #else // KMP_USE_DYNAMIC_LOCK
3226 
3227   kmp_user_lock_p lck;
3228   int rc;
3229 
3230   if ((__kmp_user_lock_kind == lk_tas) &&
3231       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3232        OMP_NEST_LOCK_T_SIZE)) {
3233     lck = (kmp_user_lock_p)user_lock;
3234   }
3235 #if KMP_USE_FUTEX
3236   else if ((__kmp_user_lock_kind == lk_futex) &&
3237            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3238             OMP_NEST_LOCK_T_SIZE)) {
3239     lck = (kmp_user_lock_p)user_lock;
3240   }
3241 #endif
3242   else {
3243     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3244   }
3245 
3246 #if USE_ITT_BUILD
3247   __kmp_itt_lock_acquiring(lck);
3248 #endif /* USE_ITT_BUILD */
3249 
3250 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3252   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3253   if (!codeptr)
3254     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
3262 #endif
3263 
3264   rc = TEST_NESTED_LOCK(lck, gtid);
3265 #if USE_ITT_BUILD
3266   if (rc) {
3267     __kmp_itt_lock_acquired(lck);
3268   } else {
3269     __kmp_itt_lock_cancelled(lck);
3270   }
3271 #endif /* USE_ITT_BUILD */
3272 #if OMPT_SUPPORT && OMPT_OPTIONAL
3273   if (ompt_enabled.enabled && rc) {
3274     if (rc == 1) {
3275       if (ompt_enabled.ompt_callback_mutex_acquired) {
3276         // lock_first
3277         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3278             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3279       }
3280     } else {
3281       if (ompt_enabled.ompt_callback_nest_lock) {
3282         // lock_next
3283         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3285       }
3286     }
3287   }
3288 #endif
3289   return rc;
3290 
3291   /* Can't use serial interval since not block structured */
3292 
3293 #endif // KMP_USE_DYNAMIC_LOCK
3294 }
3295 
3296 // Interface to fast scalable reduce methods routines
3297 
// Keep the selected method in a thread-local structure for cross-function
// use: it will be read by the __kmpc_end_reduce* functions.
// Another solution would be to re-determine the method once more in the
// __kmpc_end_reduce* functions (a new prototype would be required then).
// AT: which solution is better?
3303 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3304   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3305 
3306 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3307   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3308 
3309 // description of the packed_reduction_method variable: look at the macros in
3310 // kmp.h
3311 
3312 // used in a critical section reduce block
3313 static __forceinline void
3314 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3315                                           kmp_critical_name *crit) {
3316 
3317   // this lock was visible to a customer and to the threading profile tool as a
3318   // serial overhead span (although it's used for an internal purpose only)
3319   //            why was it visible in previous implementation?
3320   //            should we keep it visible in new reduce block?
3321   kmp_user_lock_p lck;
3322 
3323 #if KMP_USE_DYNAMIC_LOCK
3324 
3325   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3326   // Check if it is initialized.
3327   if (*lk == 0) {
3328     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3329       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3330                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3331     } else {
3332       __kmp_init_indirect_csptr(crit, loc, global_tid,
3333                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3334     }
3335   }
3336   // Branch for accessing the actual lock object and set operation. This
3337   // branching is inevitable since this lock initialization does not follow the
3338   // normal dispatch path (lock table is not used).
3339   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3340     lck = (kmp_user_lock_p)lk;
3341     KMP_DEBUG_ASSERT(lck != NULL);
3342     if (__kmp_env_consistency_check) {
3343       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3344     }
3345     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3346   } else {
3347     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3348     lck = ilk->lock;
3349     KMP_DEBUG_ASSERT(lck != NULL);
3350     if (__kmp_env_consistency_check) {
3351       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3352     }
3353     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3354   }
3355 
3356 #else // KMP_USE_DYNAMIC_LOCK
3357 
3358   // We know that the fast reduction code is only emitted by Intel compilers
3359   // with 32 byte critical sections. If there isn't enough space, then we
3360   // have to use a pointer.
3361   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3362     lck = (kmp_user_lock_p)crit;
3363   } else {
3364     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3365   }
3366   KMP_DEBUG_ASSERT(lck != NULL);
3367 
3368   if (__kmp_env_consistency_check)
3369     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3370 
3371   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3372 
3373 #endif // KMP_USE_DYNAMIC_LOCK
3374 }
3375 
3376 // used in a critical section reduce block
3377 static __forceinline void
3378 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3379                                         kmp_critical_name *crit) {
3380 
3381   kmp_user_lock_p lck;
3382 
3383 #if KMP_USE_DYNAMIC_LOCK
3384 
3385   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3386     lck = (kmp_user_lock_p)crit;
3387     if (__kmp_env_consistency_check)
3388       __kmp_pop_sync(global_tid, ct_critical, loc);
3389     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3390   } else {
3391     kmp_indirect_lock_t *ilk =
3392         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3393     if (__kmp_env_consistency_check)
3394       __kmp_pop_sync(global_tid, ct_critical, loc);
3395     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3396   }
3397 
3398 #else // KMP_USE_DYNAMIC_LOCK
3399 
3400   // We know that the fast reduction code is only emitted by Intel compilers
3401   // with 32 byte critical sections. If there isn't enough space, then we have
3402   // to use a pointer.
3403   if (__kmp_base_user_lock_size > 32) {
3404     lck = *((kmp_user_lock_p *)crit);
3405     KMP_ASSERT(lck != NULL);
3406   } else {
3407     lck = (kmp_user_lock_p)crit;
3408   }
3409 
3410   if (__kmp_env_consistency_check)
3411     __kmp_pop_sync(global_tid, ct_critical, loc);
3412 
3413   __kmp_release_user_lock_with_checks(lck, global_tid);
3414 
3415 #endif // KMP_USE_DYNAMIC_LOCK
3416 } // __kmp_end_critical_section_reduce_block
3417 
3418 static __forceinline int
3419 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3420                                      int *task_state) {
3421   kmp_team_t *team;
3422 
  // Check if we are inside a teams construct.
3424   if (th->th.th_teams_microtask) {
3425     *team_p = team = th->th.th_team;
3426     if (team->t.t_level == th->th.th_teams_level) {
3427       // This is reduction at teams construct.
3428       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3429       // Let's swap teams temporarily for the reduction.
3430       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3431       th->th.th_team = team->t.t_parent;
3432       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3433       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3434       *task_state = th->th.th_task_state;
3435       th->th.th_task_state = 0;
3436 
3437       return 1;
3438     }
3439   }
3440   return 0;
3441 }
3442 
3443 static __forceinline void
3444 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3445   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3446   th->th.th_info.ds.ds_tid = 0;
3447   th->th.th_team = team;
3448   th->th.th_team_nproc = team->t.t_nproc;
3449   th->th.th_task_team = team->t.t_task_team[task_state];
3450   __kmp_type_convert(task_state, &(th->th.th_task_state));
3451 }
3452 
3453 /* 2.a.i. Reduce Block without a terminating barrier */
3454 /*!
3455 @ingroup SYNCHRONIZATION
3456 @param loc source location information
3457 @param global_tid global thread number
3458 @param num_vars number of items (variables) to be reduced
3459 @param reduce_size size of data in bytes to be reduced
3460 @param reduce_data pointer to data to be reduced
3461 @param reduce_func callback function providing reduction operation on two
3462 operands and returning result of reduction in lhs_data
3463 @param lck pointer to the unique lock data structure
3464 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3465 threads if atomic reduction needed
3466 
3467 The nowait version is used for a reduce clause with the nowait argument.
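
A hedged sketch (not verbatim compiler output) of the call sequence a compiler
might emit for a nowait reduction; my_sum, my_reduce_func, my_crit, loc and
gtid are illustrative names only:
@code
  static kmp_critical_name my_crit = {0};
  int ret = __kmpc_reduce_nowait(loc, gtid, 1, sizeof(my_sum), &my_sum,
                                 my_reduce_func, &my_crit);
  if (ret == 1) {
    // combine this thread's partial result into the shared variable, then:
    __kmpc_end_reduce_nowait(loc, gtid, &my_crit);
  } else if (ret == 2) {
    // combine via atomics; __kmpc_end_reduce_nowait() is not generated here
  } // ret == 0: nothing to do, the partial result was combined in the barrier
@endcode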
3468 */
3469 kmp_int32
3470 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3471                      size_t reduce_size, void *reduce_data,
3472                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3473                      kmp_critical_name *lck) {
3474 
3475   KMP_COUNT_BLOCK(REDUCE_nowait);
3476   int retval = 0;
3477   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3478   kmp_info_t *th;
3479   kmp_team_t *team;
3480   int teams_swapped = 0, task_state;
3481   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3482   __kmp_assert_valid_gtid(global_tid);
3483 
3484   // why do we need this initialization here at all?
3485   // Reduction clause can not be used as a stand-alone directive.
3486 
3487   // do not call __kmp_serial_initialize(), it will be called by
3488   // __kmp_parallel_initialize() if needed
3489   // possible detection of false-positive race by the threadchecker ???
3490   if (!TCR_4(__kmp_init_parallel))
3491     __kmp_parallel_initialize();
3492 
3493   __kmp_resume_if_soft_paused();
3494 
3495 // check correctness of reduce block nesting
3496 #if KMP_USE_DYNAMIC_LOCK
3497   if (__kmp_env_consistency_check)
3498     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3499 #else
3500   if (__kmp_env_consistency_check)
3501     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3502 #endif
3503 
3504   th = __kmp_thread_from_gtid(global_tid);
3505   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3506 
  // The packed_reduction_method value will be reused by the
  // __kmpc_end_reduce* functions, so the value should be kept in a variable.
  // The variable should be either a construct-specific or a thread-specific
  // property, not a team-specific one
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on that construct).
  // An ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unnecessary extra syncs would be needed).
  // A thread-specific variable is better regarding the two issues above (next
  // construct and extra syncs); a thread-specific "th_local.reduction_method"
  // variable is used currently.
  // Each thread executes the 'determine' and 'set' lines (there is no need to
  // execute them by one thread only, which would require extra syncs).
3522 
3523   packed_reduction_method = __kmp_determine_reduction_method(
3524       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3525   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3526 
3527   OMPT_REDUCTION_DECL(th, global_tid);
3528   if (packed_reduction_method == critical_reduce_block) {
3529 
3530     OMPT_REDUCTION_BEGIN;
3531 
3532     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3533     retval = 1;
3534 
3535   } else if (packed_reduction_method == empty_reduce_block) {
3536 
3537     OMPT_REDUCTION_BEGIN;
3538 
3539     // usage: if team size == 1, no synchronization is required ( Intel
3540     // platforms only )
3541     retval = 1;
3542 
3543   } else if (packed_reduction_method == atomic_reduce_block) {
3544 
3545     retval = 2;
3546 
    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (this is not ideal: the checking block has already been closed by
    //      this 'pop', but the atomic operation has not been executed yet; it
    //      will be executed slightly later, literally on the next instruction)
3553     if (__kmp_env_consistency_check)
3554       __kmp_pop_sync(global_tid, ct_reduce, loc);
3555 
3556   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3557                                    tree_reduce_block)) {
3558 
// AT: performance issue: a real barrier here
// AT: (if the primary thread is slow, other threads are blocked here waiting
//      for the primary thread to come and release them)
// AT: (it's not what a customer might expect when specifying the NOWAIT
//      clause)
// AT: (specifying NOWAIT won't result in a performance improvement; it will
//      be confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
//     might go faster and be more in line with the sense of NOWAIT
// AT: TO DO: run the EPCC test and compare times
3568 
3569 // this barrier should be invisible to a customer and to the threading profile
3570 // tool (it's neither a terminating barrier nor customer's code, it's
3571 // used for an internal purpose)
3572 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    //     will provide the barrier events, so we set up the necessary
    //     frame/return addresses.
3577     ompt_frame_t *ompt_frame;
3578     if (ompt_enabled.enabled) {
3579       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3580       if (ompt_frame->enter_frame.ptr == NULL)
3581         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3582     }
3583     OMPT_STORE_RETURN_ADDRESS(global_tid);
3584 #endif
3585 #if USE_ITT_NOTIFY
3586     __kmp_threads[global_tid]->th.th_ident = loc;
3587 #endif
3588     retval =
3589         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3590                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3591     retval = (retval != 0) ? (0) : (1);
3592 #if OMPT_SUPPORT && OMPT_OPTIONAL
3593     if (ompt_enabled.enabled) {
3594       ompt_frame->enter_frame = ompt_data_none;
3595     }
3596 #endif
3597 
    // all other workers except the primary thread should do this pop here
    //     (none of the other workers will get to __kmpc_end_reduce_nowait())
3600     if (__kmp_env_consistency_check) {
3601       if (retval == 0) {
3602         __kmp_pop_sync(global_tid, ct_reduce, loc);
3603       }
3604     }
3605 
3606   } else {
3607 
3608     // should never reach this block
3609     KMP_ASSERT(0); // "unexpected method"
3610   }
3611   if (teams_swapped) {
3612     __kmp_restore_swapped_teams(th, team, task_state);
3613   }
3614   KA_TRACE(
3615       10,
3616       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3617        global_tid, packed_reduction_method, retval));
3618 
3619   return retval;
3620 }
3621 
3622 /*!
3623 @ingroup SYNCHRONIZATION
3624 @param loc source location information
3625 @param global_tid global thread id.
3626 @param lck pointer to the unique lock data structure
3627 
3628 Finish the execution of a reduce nowait.
3629 */
3630 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3631                               kmp_critical_name *lck) {
3632 
3633   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3634 
3635   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3636   __kmp_assert_valid_gtid(global_tid);
3637 
3638   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3639 
3640   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3641 
3642   if (packed_reduction_method == critical_reduce_block) {
3643 
3644     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3645     OMPT_REDUCTION_END;
3646 
3647   } else if (packed_reduction_method == empty_reduce_block) {
3648 
3649     // usage: if team size == 1, no synchronization is required ( on Intel
3650     // platforms only )
3651 
3652     OMPT_REDUCTION_END;
3653 
3654   } else if (packed_reduction_method == atomic_reduce_block) {
3655 
    // neither the primary thread nor the other workers should get here
    //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely;
    // after removal this value would be caught by the 'else' and would assert
3660 
3661   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3662                                    tree_reduce_block)) {
3663 
3664     // only primary thread gets here
3665     // OMPT: tree reduction is annotated in the barrier code
3666 
3667   } else {
3668 
3669     // should never reach this block
3670     KMP_ASSERT(0); // "unexpected method"
3671   }
3672 
3673   if (__kmp_env_consistency_check)
3674     __kmp_pop_sync(global_tid, ct_reduce, loc);
3675 
3676   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3677                 global_tid, packed_reduction_method));
3678 
3679   return;
3680 }
3681 
3682 /* 2.a.ii. Reduce Block with a terminating barrier */
3683 
3684 /*!
3685 @ingroup SYNCHRONIZATION
3686 @param loc source location information
3687 @param global_tid global thread number
3688 @param num_vars number of items (variables) to be reduced
3689 @param reduce_size size of data in bytes to be reduced
3690 @param reduce_data pointer to data to be reduced
3691 @param reduce_func callback function providing reduction operation on two
3692 operands and returning result of reduction in lhs_data
3693 @param lck pointer to the unique lock data structure
3694 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3695 threads if atomic reduction needed
3696 
3697 A blocking reduce that includes an implicit barrier.
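
A hedged sketch of the call pattern a compiler might emit; my_sum,
my_reduce_func, my_crit, loc and gtid are illustrative names only:
@code
  static kmp_critical_name my_crit = {0};
  int ret = __kmpc_reduce(loc, gtid, 1, sizeof(my_sum), &my_sum,
                          my_reduce_func, &my_crit);
  if (ret == 1) {
    // combine this thread's partial result, then finish (terminating barrier)
    __kmpc_end_reduce(loc, gtid, &my_crit);
  } else if (ret == 2) {
    // combine via atomics; the blocking form typically still finishes with
    __kmpc_end_reduce(loc, gtid, &my_crit);
  }
@endcode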
3698 */
3699 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3700                         size_t reduce_size, void *reduce_data,
3701                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3702                         kmp_critical_name *lck) {
3703   KMP_COUNT_BLOCK(REDUCE_wait);
3704   int retval = 0;
3705   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3706   kmp_info_t *th;
3707   kmp_team_t *team;
3708   int teams_swapped = 0, task_state;
3709 
3710   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3711   __kmp_assert_valid_gtid(global_tid);
3712 
3713   // why do we need this initialization here at all?
3714   // Reduction clause can not be a stand-alone directive.
3715 
3716   // do not call __kmp_serial_initialize(), it will be called by
3717   // __kmp_parallel_initialize() if needed
3718   // possible detection of false-positive race by the threadchecker ???
3719   if (!TCR_4(__kmp_init_parallel))
3720     __kmp_parallel_initialize();
3721 
3722   __kmp_resume_if_soft_paused();
3723 
3724 // check correctness of reduce block nesting
3725 #if KMP_USE_DYNAMIC_LOCK
3726   if (__kmp_env_consistency_check)
3727     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3728 #else
3729   if (__kmp_env_consistency_check)
3730     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3731 #endif
3732 
3733   th = __kmp_thread_from_gtid(global_tid);
3734   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3735 
3736   packed_reduction_method = __kmp_determine_reduction_method(
3737       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3738   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3739 
3740   OMPT_REDUCTION_DECL(th, global_tid);
3741 
3742   if (packed_reduction_method == critical_reduce_block) {
3743 
3744     OMPT_REDUCTION_BEGIN;
3745     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3746     retval = 1;
3747 
3748   } else if (packed_reduction_method == empty_reduce_block) {
3749 
3750     OMPT_REDUCTION_BEGIN;
3751     // usage: if team size == 1, no synchronization is required ( Intel
3752     // platforms only )
3753     retval = 1;
3754 
3755   } else if (packed_reduction_method == atomic_reduce_block) {
3756 
3757     retval = 2;
3758 
3759   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3760                                    tree_reduce_block)) {
3761 
3762 // case tree_reduce_block:
3763 // this barrier should be visible to a customer and to the threading profile
3764 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3765 #if OMPT_SUPPORT
3766     ompt_frame_t *ompt_frame;
3767     if (ompt_enabled.enabled) {
3768       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3769       if (ompt_frame->enter_frame.ptr == NULL)
3770         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3771     }
3772     OMPT_STORE_RETURN_ADDRESS(global_tid);
3773 #endif
3774 #if USE_ITT_NOTIFY
3775     __kmp_threads[global_tid]->th.th_ident =
3776         loc; // needed for correct notification of frames
3777 #endif
3778     retval =
3779         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3780                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3781     retval = (retval != 0) ? (0) : (1);
3782 #if OMPT_SUPPORT && OMPT_OPTIONAL
3783     if (ompt_enabled.enabled) {
3784       ompt_frame->enter_frame = ompt_data_none;
3785     }
3786 #endif
3787 
    // all other workers except the primary thread should do this pop here
    // (no worker other than the primary will enter __kmpc_end_reduce())
3790     if (__kmp_env_consistency_check) {
3791       if (retval == 0) { // 0: all other workers; 1: primary thread
3792         __kmp_pop_sync(global_tid, ct_reduce, loc);
3793       }
3794     }
3795 
3796   } else {
3797 
3798     // should never reach this block
3799     KMP_ASSERT(0); // "unexpected method"
3800   }
3801   if (teams_swapped) {
3802     __kmp_restore_swapped_teams(th, team, task_state);
3803   }
3804 
3805   KA_TRACE(10,
3806            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3807             global_tid, packed_reduction_method, retval));
3808   return retval;
3809 }
3810 
3811 /*!
3812 @ingroup SYNCHRONIZATION
3813 @param loc source location information
3814 @param global_tid global thread id.
3815 @param lck pointer to the unique lock data structure
3816 
3817 Finish the execution of a blocking reduce.
3818 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3819 start function.
3820 */
3821 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3822                        kmp_critical_name *lck) {
3823 
3824   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3825   kmp_info_t *th;
3826   kmp_team_t *team;
3827   int teams_swapped = 0, task_state;
3828 
3829   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3830   __kmp_assert_valid_gtid(global_tid);
3831 
3832   th = __kmp_thread_from_gtid(global_tid);
3833   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3834 
3835   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3836 
3837   // this barrier should be visible to a customer and to the threading profile
3838   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3839   OMPT_REDUCTION_DECL(th, global_tid);
3840 
3841   if (packed_reduction_method == critical_reduce_block) {
3842     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3843 
3844     OMPT_REDUCTION_END;
3845 
3846 // TODO: implicit barrier: should be exposed
3847 #if OMPT_SUPPORT
3848     ompt_frame_t *ompt_frame;
3849     if (ompt_enabled.enabled) {
3850       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3851       if (ompt_frame->enter_frame.ptr == NULL)
3852         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3853     }
3854     OMPT_STORE_RETURN_ADDRESS(global_tid);
3855 #endif
3856 #if USE_ITT_NOTIFY
3857     __kmp_threads[global_tid]->th.th_ident = loc;
3858 #endif
3859     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3860 #if OMPT_SUPPORT && OMPT_OPTIONAL
3861     if (ompt_enabled.enabled) {
3862       ompt_frame->enter_frame = ompt_data_none;
3863     }
3864 #endif
3865 
3866   } else if (packed_reduction_method == empty_reduce_block) {
3867 
3868     OMPT_REDUCTION_END;
3869 
3870 // usage: if team size==1, no synchronization is required (Intel platforms only)
3871 
3872 // TODO: implicit barrier: should be exposed
3873 #if OMPT_SUPPORT
3874     ompt_frame_t *ompt_frame;
3875     if (ompt_enabled.enabled) {
3876       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3877       if (ompt_frame->enter_frame.ptr == NULL)
3878         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3879     }
3880     OMPT_STORE_RETURN_ADDRESS(global_tid);
3881 #endif
3882 #if USE_ITT_NOTIFY
3883     __kmp_threads[global_tid]->th.th_ident = loc;
3884 #endif
3885     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3886 #if OMPT_SUPPORT && OMPT_OPTIONAL
3887     if (ompt_enabled.enabled) {
3888       ompt_frame->enter_frame = ompt_data_none;
3889     }
3890 #endif
3891 
3892   } else if (packed_reduction_method == atomic_reduce_block) {
3893 
3894 #if OMPT_SUPPORT
3895     ompt_frame_t *ompt_frame;
3896     if (ompt_enabled.enabled) {
3897       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3898       if (ompt_frame->enter_frame.ptr == NULL)
3899         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3900     }
3901     OMPT_STORE_RETURN_ADDRESS(global_tid);
3902 #endif
3903 // TODO: implicit barrier: should be exposed
3904 #if USE_ITT_NOTIFY
3905     __kmp_threads[global_tid]->th.th_ident = loc;
3906 #endif
3907     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3908 #if OMPT_SUPPORT && OMPT_OPTIONAL
3909     if (ompt_enabled.enabled) {
3910       ompt_frame->enter_frame = ompt_data_none;
3911     }
3912 #endif
3913 
3914   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3915                                    tree_reduce_block)) {
3916 
3917     // only primary thread executes here (primary releases all other workers)
3918     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3919                             global_tid);
3920 
3921   } else {
3922 
3923     // should never reach this block
3924     KMP_ASSERT(0); // "unexpected method"
3925   }
3926   if (teams_swapped) {
3927     __kmp_restore_swapped_teams(th, team, task_state);
3928   }
3929 
3930   if (__kmp_env_consistency_check)
3931     __kmp_pop_sync(global_tid, ct_reduce, loc);
3932 
3933   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3934                 global_tid, packed_reduction_method));
3935 
3936   return;
3937 }
3938 
3939 #undef __KMP_GET_REDUCTION_METHOD
3940 #undef __KMP_SET_REDUCTION_METHOD
3941 
3942 /* end of interface to fast scalable reduce routines */
3943 
3944 kmp_uint64 __kmpc_get_taskid() {
3945 
3946   kmp_int32 gtid;
3947   kmp_info_t *thread;
3948 
3949   gtid = __kmp_get_gtid();
3950   if (gtid < 0) {
3951     return 0;
3952   }
3953   thread = __kmp_thread_from_gtid(gtid);
3954   return thread->th.th_current_task->td_task_id;
3955 
3956 } // __kmpc_get_taskid
3957 
3958 kmp_uint64 __kmpc_get_parent_taskid() {
3959 
3960   kmp_int32 gtid;
3961   kmp_info_t *thread;
3962   kmp_taskdata_t *parent_task;
3963 
3964   gtid = __kmp_get_gtid();
3965   if (gtid < 0) {
3966     return 0;
3967   }
3968   thread = __kmp_thread_from_gtid(gtid);
3969   parent_task = thread->th.th_current_task->td_parent;
3970   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3971 
3972 } // __kmpc_get_parent_taskid
3973 
3974 /*!
3975 @ingroup WORK_SHARING
3976 @param loc  source location information.
3977 @param gtid  global thread number.
3978 @param num_dims  number of associated doacross loops.
3979 @param dims  info on loops bounds.
3980 
3981 Initialize doacross loop information.
Expect the compiler to send us inclusive bounds,
e.g. for (i = 2; i < 9; i += 2) lo = 2, up = 8, st = 2.
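
A hedged sketch of how an ordered(1) doacross loop might be lowered onto these
entry points; lb and ub stand for the thread's chunk bounds and all names are
illustrative only:
@code
  // #pragma omp for ordered(1)  --  for (i = 2; i < 9; i += 2)
  struct kmp_dim dims[1];
  dims[0].lo = 2; dims[0].up = 8; dims[0].st = 2; // inclusive bounds
  __kmpc_doacross_init(loc, gtid, 1, dims);
  for (kmp_int64 i = lb; i <= ub; i += 2) {
    kmp_int64 vec = i - 2;
    __kmpc_doacross_wait(loc, gtid, &vec); // ordered depend(sink: i - 2)
    // ... loop body ...
    vec = i;
    __kmpc_doacross_post(loc, gtid, &vec); // ordered depend(source)
  }
  __kmpc_doacross_fini(loc, gtid);
@endcode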
3984 */
3985 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3986                           const struct kmp_dim *dims) {
3987   __kmp_assert_valid_gtid(gtid);
3988   int j, idx;
3989   kmp_int64 last, trace_count;
3990   kmp_info_t *th = __kmp_threads[gtid];
3991   kmp_team_t *team = th->th.th_team;
3992   kmp_uint32 *flags;
3993   kmp_disp_t *pr_buf = th->th.th_dispatch;
3994   dispatch_shared_info_t *sh_buf;
3995 
3996   KA_TRACE(
3997       20,
3998       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3999        gtid, num_dims, !team->t.t_serialized));
4000   KMP_DEBUG_ASSERT(dims != NULL);
4001   KMP_DEBUG_ASSERT(num_dims > 0);
4002 
4003   if (team->t.t_serialized) {
4004     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4005     return; // no dependencies if team is serialized
4006   }
4007   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4008   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4009   // the next loop
4010   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4011 
4012   // Save bounds info into allocated private buffer
4013   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4014   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4015       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4016   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4017   pr_buf->th_doacross_info[0] =
4018       (kmp_int64)num_dims; // first element is number of dimensions
  // Also save the address of num_done in order to access it later without
  // knowing the buffer index
4021   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4022   pr_buf->th_doacross_info[2] = dims[0].lo;
4023   pr_buf->th_doacross_info[3] = dims[0].up;
4024   pr_buf->th_doacross_info[4] = dims[0].st;
4025   last = 5;
4026   for (j = 1; j < num_dims; ++j) {
4027     kmp_int64
4028         range_length; // To keep ranges of all dimensions but the first dims[0]
4029     if (dims[j].st == 1) { // most common case
4030       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4031       range_length = dims[j].up - dims[j].lo + 1;
4032     } else {
4033       if (dims[j].st > 0) {
4034         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4035         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4036       } else { // negative increment
4037         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4038         range_length =
4039             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4040       }
4041     }
4042     pr_buf->th_doacross_info[last++] = range_length;
4043     pr_buf->th_doacross_info[last++] = dims[j].lo;
4044     pr_buf->th_doacross_info[last++] = dims[j].up;
4045     pr_buf->th_doacross_info[last++] = dims[j].st;
4046   }
4047 
4048   // Compute total trip count.
4049   // Start with range of dims[0] which we don't need to keep in the buffer.
4050   if (dims[0].st == 1) { // most common case
4051     trace_count = dims[0].up - dims[0].lo + 1;
4052   } else if (dims[0].st > 0) {
4053     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4054     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4055   } else { // negative increment
4056     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4057     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4058   }
4059   for (j = 1; j < num_dims; ++j) {
4060     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4061   }
4062   KMP_DEBUG_ASSERT(trace_count > 0);
4063 
4064   // Check if shared buffer is not occupied by other loop (idx -
4065   // __kmp_dispatch_num_buffers)
4066   if (idx != sh_buf->doacross_buf_idx) {
4067     // Shared buffer is occupied, wait for it to be free
4068     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4069                  __kmp_eq_4, NULL);
4070   }
4071 #if KMP_32_BIT_ARCH
4072   // Check if we are the first thread. After the CAS the first thread gets 0,
4073   // others get 1 if initialization is in progress, allocated pointer otherwise.
4074   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4075   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4076       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4077 #else
4078   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4079       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4080 #endif
4081   if (flags == NULL) {
4082     // we are the first thread, allocate the array of flags
4083     size_t size =
4084         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4085     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4086     KMP_MB();
4087     sh_buf->doacross_flags = flags;
4088   } else if (flags == (kmp_uint32 *)1) {
4089 #if KMP_32_BIT_ARCH
4090     // initialization is still in progress, need to wait
4091     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4092 #else
4093     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4094 #endif
4095       KMP_YIELD(TRUE);
4096     KMP_MB();
4097   } else {
4098     KMP_MB();
4099   }
4100   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4101   pr_buf->th_doacross_flags =
4102       sh_buf->doacross_flags; // save private copy in order to not
4103   // touch shared buffer on each iteration
4104   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4105 }
4106 
4107 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4108   __kmp_assert_valid_gtid(gtid);
4109   kmp_int64 shft;
4110   size_t num_dims, i;
4111   kmp_uint32 flag;
4112   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4113   kmp_info_t *th = __kmp_threads[gtid];
4114   kmp_team_t *team = th->th.th_team;
4115   kmp_disp_t *pr_buf;
4116   kmp_int64 lo, up, st;
4117 
4118   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4119   if (team->t.t_serialized) {
4120     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4121     return; // no dependencies if team is serialized
4122   }
4123 
4124   // calculate sequential iteration number and check out-of-bounds condition
4125   pr_buf = th->th.th_dispatch;
4126   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4127   num_dims = (size_t)pr_buf->th_doacross_info[0];
4128   lo = pr_buf->th_doacross_info[2];
4129   up = pr_buf->th_doacross_info[3];
4130   st = pr_buf->th_doacross_info[4];
4131 #if OMPT_SUPPORT && OMPT_OPTIONAL
4132   ompt_dependence_t deps[num_dims];
4133 #endif
4134   if (st == 1) { // most common case
4135     if (vec[0] < lo || vec[0] > up) {
4136       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4137                     "bounds [%lld,%lld]\n",
4138                     gtid, vec[0], lo, up));
4139       return;
4140     }
4141     iter_number = vec[0] - lo;
4142   } else if (st > 0) {
4143     if (vec[0] < lo || vec[0] > up) {
4144       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4145                     "bounds [%lld,%lld]\n",
4146                     gtid, vec[0], lo, up));
4147       return;
4148     }
4149     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4150   } else { // negative increment
4151     if (vec[0] > lo || vec[0] < up) {
4152       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4153                     "bounds [%lld,%lld]\n",
4154                     gtid, vec[0], lo, up));
4155       return;
4156     }
4157     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4158   }
4159 #if OMPT_SUPPORT && OMPT_OPTIONAL
4160   deps[0].variable.value = iter_number;
4161   deps[0].dependence_type = ompt_dependence_type_sink;
4162 #endif
4163   for (i = 1; i < num_dims; ++i) {
4164     kmp_int64 iter, ln;
4165     size_t j = i * 4;
4166     ln = pr_buf->th_doacross_info[j + 1];
4167     lo = pr_buf->th_doacross_info[j + 2];
4168     up = pr_buf->th_doacross_info[j + 3];
4169     st = pr_buf->th_doacross_info[j + 4];
4170     if (st == 1) {
4171       if (vec[i] < lo || vec[i] > up) {
4172         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4173                       "bounds [%lld,%lld]\n",
4174                       gtid, vec[i], lo, up));
4175         return;
4176       }
4177       iter = vec[i] - lo;
4178     } else if (st > 0) {
4179       if (vec[i] < lo || vec[i] > up) {
4180         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4181                       "bounds [%lld,%lld]\n",
4182                       gtid, vec[i], lo, up));
4183         return;
4184       }
4185       iter = (kmp_uint64)(vec[i] - lo) / st;
4186     } else { // st < 0
4187       if (vec[i] > lo || vec[i] < up) {
4188         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4189                       "bounds [%lld,%lld]\n",
4190                       gtid, vec[i], lo, up));
4191         return;
4192       }
4193       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4194     }
4195     iter_number = iter + ln * iter_number;
4196 #if OMPT_SUPPORT && OMPT_OPTIONAL
4197     deps[i].variable.value = iter;
4198     deps[i].dependence_type = ompt_dependence_type_sink;
4199 #endif
4200   }
4201   shft = iter_number % 32; // use 32-bit granularity
4202   iter_number >>= 5; // divided by 32
4203   flag = 1 << shft;
4204   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4205     KMP_YIELD(TRUE);
4206   }
4207   KMP_MB();
4208 #if OMPT_SUPPORT && OMPT_OPTIONAL
4209   if (ompt_enabled.ompt_callback_dependences) {
4210     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4211         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4212   }
4213 #endif
4214   KA_TRACE(20,
4215            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4216             gtid, (iter_number << 5) + shft));
4217 }
4218 
4219 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4220   __kmp_assert_valid_gtid(gtid);
4221   kmp_int64 shft;
4222   size_t num_dims, i;
4223   kmp_uint32 flag;
4224   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4225   kmp_info_t *th = __kmp_threads[gtid];
4226   kmp_team_t *team = th->th.th_team;
4227   kmp_disp_t *pr_buf;
4228   kmp_int64 lo, st;
4229 
4230   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4231   if (team->t.t_serialized) {
4232     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4233     return; // no dependencies if team is serialized
4234   }
4235 
4236   // calculate sequential iteration number (same as in "wait" but no
4237   // out-of-bounds checks)
4238   pr_buf = th->th.th_dispatch;
4239   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4240   num_dims = (size_t)pr_buf->th_doacross_info[0];
4241   lo = pr_buf->th_doacross_info[2];
4242   st = pr_buf->th_doacross_info[4];
4243 #if OMPT_SUPPORT && OMPT_OPTIONAL
4244   ompt_dependence_t deps[num_dims];
4245 #endif
4246   if (st == 1) { // most common case
4247     iter_number = vec[0] - lo;
4248   } else if (st > 0) {
4249     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4250   } else { // negative increment
4251     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4252   }
4253 #if OMPT_SUPPORT && OMPT_OPTIONAL
4254   deps[0].variable.value = iter_number;
4255   deps[0].dependence_type = ompt_dependence_type_source;
4256 #endif
4257   for (i = 1; i < num_dims; ++i) {
4258     kmp_int64 iter, ln;
4259     size_t j = i * 4;
4260     ln = pr_buf->th_doacross_info[j + 1];
4261     lo = pr_buf->th_doacross_info[j + 2];
4262     st = pr_buf->th_doacross_info[j + 4];
4263     if (st == 1) {
4264       iter = vec[i] - lo;
4265     } else if (st > 0) {
4266       iter = (kmp_uint64)(vec[i] - lo) / st;
4267     } else { // st < 0
4268       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4269     }
4270     iter_number = iter + ln * iter_number;
4271 #if OMPT_SUPPORT && OMPT_OPTIONAL
4272     deps[i].variable.value = iter;
4273     deps[i].dependence_type = ompt_dependence_type_source;
4274 #endif
4275   }
4276 #if OMPT_SUPPORT && OMPT_OPTIONAL
4277   if (ompt_enabled.ompt_callback_dependences) {
4278     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4279         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4280   }
4281 #endif
4282   shft = iter_number % 32; // use 32-bit granularity
4283   iter_number >>= 5; // divided by 32
4284   flag = 1 << shft;
4285   KMP_MB();
4286   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4287     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4288   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4289                 (iter_number << 5) + shft));
4290 }
4291 
4292 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4293   __kmp_assert_valid_gtid(gtid);
4294   kmp_int32 num_done;
4295   kmp_info_t *th = __kmp_threads[gtid];
4296   kmp_team_t *team = th->th.th_team;
4297   kmp_disp_t *pr_buf = th->th.th_dispatch;
4298 
4299   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4300   if (team->t.t_serialized) {
4301     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4302     return; // nothing to do
4303   }
4304   num_done =
4305       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4306   if (num_done == th->th.th_team_nproc) {
4307     // we are the last thread, need to free shared resources
4308     int idx = pr_buf->th_doacross_buf_idx - 1;
4309     dispatch_shared_info_t *sh_buf =
4310         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4311     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4312                      (kmp_int64)&sh_buf->doacross_num_done);
4313     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4314     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4315     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4316     sh_buf->doacross_flags = NULL;
4317     sh_buf->doacross_num_done = 0;
4318     sh_buf->doacross_buf_idx +=
4319         __kmp_dispatch_num_buffers; // free buffer for future re-use
4320   }
4321   // free private resources (need to keep buffer index forever)
4322   pr_buf->th_doacross_flags = NULL;
4323   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4324   pr_buf->th_doacross_info = NULL;
4325   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4326 }
4327 
4328 /* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */
4329 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4330   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4331 }
4332 
4333 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4334   return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
4335 }
4336 
4337 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4338                   omp_allocator_handle_t free_allocator) {
4339   return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4340                         free_allocator);
4341 }
4342 
4343 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4344   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4345 }
4346 
4347 int __kmpc_get_target_offload(void) {
4348   if (!__kmp_init_serial) {
4349     __kmp_serial_initialize();
4350   }
4351   return __kmp_target_offload;
4352 }
4353 
4354 int __kmpc_pause_resource(kmp_pause_status_t level) {
4355   if (!__kmp_init_serial) {
4356     return 1; // Can't pause if runtime is not initialized
4357   }
4358   return __kmp_pause_resource(level);
4359 }
4360 
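// Entry point behind the OpenMP 5.1 `error` directive. A hedged sketch of the
// lowering a front end might emit (illustrative, not verbatim codegen):
//   #pragma omp error severity(warning) message("check the input")
//     -> __kmpc_error(&loc, severity_warning, "check the input");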
4361 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4362   if (!__kmp_init_serial)
4363     __kmp_serial_initialize();
4364 
4365   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4366 
4367 #if OMPT_SUPPORT
4368   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4369     ompt_callbacks.ompt_callback(ompt_callback_error)(
4370         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4371         OMPT_GET_RETURN_ADDRESS(0));
4372   }
4373 #endif // OMPT_SUPPORT
4374 
4375   char *src_loc;
4376   if (loc && loc->psource) {
4377     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4378     src_loc =
4379         __kmp_str_format("%s:%s:%s", str_loc.file, str_loc.line, str_loc.col);
4380     __kmp_str_loc_free(&str_loc);
4381   } else {
4382     src_loc = __kmp_str_format("unknown");
4383   }
4384 
4385   if (severity == severity_warning)
4386     KMP_WARNING(UserDirectedWarning, src_loc, message);
4387   else
4388     KMP_FATAL(UserDirectedError, src_loc, message);
4389 
4390   __kmp_str_free(&src_loc);
4391 }
4392