1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
28 // flags will be used in future, e.g. to implement openmp_strict library
29 // restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
 * Shut down the runtime library. This is also optional, and even if called it
 * will not do anything unless the `KMP_IGNORE_MPPEND` environment variable is
 * set to zero.
60  */
61 void __kmpc_end(ident_t *loc) {
62   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
63   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
64   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
65   // returns FALSE and __kmpc_end() will unregister this root (it can cause
66   // library shut down).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 #if KMP_OS_WINDOWS && OMPT_SUPPORT
74   // Normal exit process on Windows does not allow worker threads of the final
75   // parallel region to finish reporting their events, so shutting down the
76   // library here fixes the issue at least for the cases where __kmpc_end() is
77   // placed properly.
78   if (ompt_enabled.enabled)
79     __kmp_internal_end_library(__kmp_gtid_get_specific());
80 #endif
81 }
82 
83 /*!
84 @ingroup THREAD_STATES
85 @param loc Source location information.
86 @return The global thread index of the active thread.
87 
88 This function can be called in any context.
89 
If the runtime has only been entered at the outermost level from a
91 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
92 that which would be returned by omp_get_thread_num() in the outermost
93 active parallel construct. (Or zero if there is no active parallel
94 construct, since the master thread is necessarily thread zero).
95 
96 If multiple non-OpenMP threads all enter an OpenMP construct then this
97 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
99 OpenMP thread ids returned by omp_get_thread_num()).
100 */
101 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
102   kmp_int32 gtid = __kmp_entry_gtid();
103 
104   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
105 
106   return gtid;
107 }
108 
109 /*!
110 @ingroup THREAD_STATES
111 @param loc Source location information.
112 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
113 
114 This function can be called in any context.
115 It returns the total number of threads under the control of the OpenMP runtime.
116 That is not a number that can be determined by any OpenMP standard calls, since
117 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, the runtime maintains
underlying threads even when they are not active (since the cost of creating
and destroying OS threads is high); this call counts all such threads even if
121 they are not waiting for work.
122 */
123 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
124   KC_TRACE(10,
125            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
126 
127   return TCR_4(__kmp_all_nth);
128 }
129 
130 /*!
131 @ingroup THREAD_STATES
132 @param loc Source location information.
133 @return The thread number of the calling thread in the innermost active parallel
134 construct.
135 */
136 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
137   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
138   return __kmp_tid_from_gtid(__kmp_entry_gtid());
139 }
140 
141 /*!
142 @ingroup THREAD_STATES
143 @param loc Source location information.
144 @return The number of threads in the innermost active parallel construct.
145 */
146 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
147   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
148 
149   return __kmp_entry_thread()->th.th_team->t.t_nproc;
150 }
151 
152 /*!
153  * @ingroup DEPRECATED
154  * @param loc location description
155  *
156  * This function need not be called. It always returns TRUE.
157  */
158 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
159 #ifndef KMP_DEBUG
160 
161   return TRUE;
162 
163 #else
164 
165   const char *semi2;
166   const char *semi3;
167   int line_no;
168 
169   if (__kmp_par_range == 0) {
170     return TRUE;
171   }
172   semi2 = loc->psource;
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   semi2 = strchr(semi2, ';');
177   if (semi2 == NULL) {
178     return TRUE;
179   }
180   semi2 = strchr(semi2 + 1, ';');
181   if (semi2 == NULL) {
182     return TRUE;
183   }
184   if (__kmp_par_range_filename[0]) {
185     const char *name = semi2 - 1;
186     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
187       name--;
188     }
189     if ((*name == '/') || (*name == ';')) {
190       name++;
191     }
192     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
193       return __kmp_par_range < 0;
194     }
195   }
196   semi3 = strchr(semi2 + 1, ';');
197   if (__kmp_par_range_routine[0]) {
198     if ((semi3 != NULL) && (semi3 > semi2) &&
199         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
200       return __kmp_par_range < 0;
201     }
202   }
  if ((semi3 != NULL) && (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1)) {
204     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
205       return __kmp_par_range > 0;
206     }
207     return __kmp_par_range < 0;
208   }
209   return TRUE;
210 
211 #endif /* KMP_DEBUG */
212 }
213 
214 /*!
215 @ingroup THREAD_STATES
216 @param loc Source location information.
217 @return 1 if this thread is executing inside an active parallel region, zero if
218 not.
219 */
220 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
221   return __kmp_entry_thread()->th.th_root->r.r_active;
222 }
223 
224 /*!
225 @ingroup PARALLEL
226 @param loc source location information
227 @param global_tid global thread number
228 @param num_threads number of threads requested for this parallel construct
229 
230 Set the number of threads to be used by the next fork spawned by this thread.
231 This call is only required if the parallel construct has a `num_threads` clause.
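
For illustration only (the outlined-function name and exact lowering below are
hypothetical; real compilers use their own mangled names), a construct such as
@code
#pragma omp parallel num_threads(4)
{ body(); }
@endcode
might be lowered to something along the lines of
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body); // no shared vars passed
@endcode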
232 */
233 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
234                              kmp_int32 num_threads) {
235   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
236                 global_tid, num_threads));
237 
238   __kmp_push_num_threads(loc, global_tid, num_threads);
239 }
240 
241 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
242   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
243 
244   /* the num_threads are automatically popped */
245 }
246 
247 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
248                            kmp_int32 proc_bind) {
249   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
250                 proc_bind));
251 
252   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
253 }
254 
255 /*!
256 @ingroup PARALLEL
257 @param loc  source location information
258 @param argc  total number of arguments in the ellipsis
259 @param microtask  pointer to callback routine consisting of outlined parallel
260 construct
261 @param ...  pointers to shared variables that aren't global
262 
263 Do the actual fork and call the microtask in the relevant number of threads.
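
As a schematic sketch (identifiers are illustrative, and the exact argument
passing is compiler-specific), a region such as
@code
int x = 0;
#pragma omp parallel shared(x)
{ use(&x); }
@endcode
could be lowered to an outlined microtask plus a single fork call:
@code
// The first two parameters are supplied by the runtime (global and bound tid).
void outlined_body(kmp_int32 *global_tid, kmp_int32 *bound_tid, int *x) {
  use(x);
}
// ... at the parallel construct:
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined_body, &x);
@endcode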
264 */
265 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
266   int gtid = __kmp_entry_gtid();
267 
268 #if (KMP_STATS_ENABLED)
269   // If we were in a serial region, then stop the serial timer, record
270   // the event, and start parallel region timer
271   stats_state_e previous_state = KMP_GET_THREAD_STATE();
272   if (previous_state == stats_state_e::SERIAL_REGION) {
273     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
274   } else {
275     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
276   }
277   int inParallel = __kmpc_in_parallel(loc);
278   if (inParallel) {
279     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
280   } else {
281     KMP_COUNT_BLOCK(OMP_PARALLEL);
282   }
283 #endif
284 
  // maybe saving thr_state is enough here
286   {
287     va_list ap;
288     va_start(ap, microtask);
289 
290 #if OMPT_SUPPORT
291     ompt_frame_t *ompt_frame;
292     if (ompt_enabled.enabled) {
293       kmp_info_t *master_th = __kmp_threads[gtid];
294       kmp_team_t *parent_team = master_th->th.th_team;
295       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
296       if (lwt)
297         ompt_frame = &(lwt->ompt_task_info.frame);
298       else {
299         int tid = __kmp_tid_from_gtid(gtid);
300         ompt_frame = &(
301             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
302       }
303       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
304       OMPT_STORE_RETURN_ADDRESS(gtid);
305     }
306 #endif
307 
308 #if INCLUDE_SSC_MARKS
309     SSC_MARK_FORKING();
310 #endif
311     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
312                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
313                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
314 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
315 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
316                     &ap
317 #else
318                     ap
319 #endif
320                     );
321 #if INCLUDE_SSC_MARKS
322     SSC_MARK_JOINING();
323 #endif
324     __kmp_join_call(loc, gtid
325 #if OMPT_SUPPORT
326                     ,
327                     fork_context_intel
328 #endif
329                     );
330 
331     va_end(ap);
332   }
333 
334 #if KMP_STATS_ENABLED
335   if (previous_state == stats_state_e::SERIAL_REGION) {
336     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
337   } else {
338     KMP_POP_PARTITIONED_TIMER();
339   }
340 #endif // KMP_STATS_ENABLED
341 }
342 
343 /*!
344 @ingroup PARALLEL
345 @param loc source location information
346 @param global_tid global thread number
347 @param num_teams number of teams requested for the teams construct
348 @param num_threads number of threads per team requested for the teams construct
349 
350 Set the number of teams to be used by the teams construct.
351 This call is only required if the teams construct has a `num_teams` clause
352 or a `thread_limit` clause (or both).
353 */
354 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
355                            kmp_int32 num_teams, kmp_int32 num_threads) {
356   KA_TRACE(20,
357            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
358             global_tid, num_teams, num_threads));
359 
360   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
361 }
362 
363 /*!
364 @ingroup PARALLEL
365 @param loc  source location information
366 @param argc  total number of arguments in the ellipsis
367 @param microtask  pointer to callback routine consisting of outlined teams
368 construct
369 @param ...  pointers to shared variables that aren't global
370 
371 Do the actual fork and call the microtask in the relevant number of threads.
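
Schematically (illustrative names only; the precise lowering is
compiler-specific), a construct like
@code
#pragma omp teams num_teams(2) thread_limit(8)
{ body(); }
@endcode
might become
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_teams(&loc, gtid, 2, 8);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams_body);
@endcode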
372 */
373 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
374                        ...) {
375   int gtid = __kmp_entry_gtid();
376   kmp_info_t *this_thr = __kmp_threads[gtid];
377   va_list ap;
378   va_start(ap, microtask);
379 
380 #if KMP_STATS_ENABLED
381   KMP_COUNT_BLOCK(OMP_TEAMS);
382   stats_state_e previous_state = KMP_GET_THREAD_STATE();
383   if (previous_state == stats_state_e::SERIAL_REGION) {
384     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
385   } else {
386     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
387   }
388 #endif
389 
390   // remember teams entry point and nesting level
391   this_thr->th.th_teams_microtask = microtask;
392   this_thr->th.th_teams_level =
393       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
394 
395 #if OMPT_SUPPORT
396   kmp_team_t *parent_team = this_thr->th.th_team;
397   int tid = __kmp_tid_from_gtid(gtid);
398   if (ompt_enabled.enabled) {
399     parent_team->t.t_implicit_task_taskdata[tid]
400         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
401   }
402   OMPT_STORE_RETURN_ADDRESS(gtid);
403 #endif
404 
  // check if __kmpc_push_num_teams was called; if not, set the default number
  // of teams
407   if (this_thr->th.th_teams_size.nteams == 0) {
408     __kmp_push_num_teams(loc, gtid, 0, 0);
409   }
410   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
411   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
412   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
413 
414   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
415                   VOLATILE_CAST(microtask_t)
416                       __kmp_teams_master, // "wrapped" task
417                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
418 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
419                   &ap
420 #else
421                   ap
422 #endif
423                   );
424   __kmp_join_call(loc, gtid
425 #if OMPT_SUPPORT
426                   ,
427                   fork_context_intel
428 #endif
429                   );
430 
431   // Pop current CG root off list
432   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
433   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
434   this_thr->th.th_cg_roots = tmp->up;
435   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
436                  " to node %p. cg_nthreads was %d\n",
437                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
438   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
439   int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
441     __kmp_free(tmp);
442   }
443   // Restore current task's thread_limit from CG root
444   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
445   this_thr->th.th_current_task->td_icvs.thread_limit =
446       this_thr->th.th_cg_roots->cg_thread_limit;
447 
448   this_thr->th.th_teams_microtask = NULL;
449   this_thr->th.th_teams_level = 0;
450   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
451   va_end(ap);
452 #if KMP_STATS_ENABLED
453   if (previous_state == stats_state_e::SERIAL_REGION) {
454     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
455   } else {
456     KMP_POP_PARTITIONED_TIMER();
457   }
458 #endif // KMP_STATS_ENABLED
459 }
460 
461 // I don't think this function should ever have been exported.
462 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
463 // openmp code ever called it, but it's been exported from the RTL for so
464 // long that I'm afraid to remove the definition.
465 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
466 
467 /*!
468 @ingroup PARALLEL
469 @param loc  source location information
470 @param global_tid  global thread number
471 
472 Enter a serialized parallel construct. This interface is used to handle a
473 conditional parallel region, like this,
474 @code
475 #pragma omp parallel if (condition)
476 @endcode
477 when the condition is false.
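
One possible (schematic, non-normative) lowering tests the condition and, in
the serialized branch, still invokes the outlined body directly so the
construct's semantics are preserved; identifiers below are illustrative:
@code
if (condition) {
  __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
} else {
  kmp_int32 gtid = __kmpc_global_thread_num(&loc);
  kmp_int32 bound_tid = 0;
  __kmpc_serialized_parallel(&loc, gtid);
  outlined_body(&gtid, &bound_tid);
  __kmpc_end_serialized_parallel(&loc, gtid);
}
@endcode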
478 */
479 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
480 // The implementation is now in kmp_runtime.cpp so that it can share static
481 // functions with kmp_fork_call since the tasks to be done are similar in
482 // each case.
483 #if OMPT_SUPPORT
484   OMPT_STORE_RETURN_ADDRESS(global_tid);
485 #endif
486   __kmp_serialized_parallel(loc, global_tid);
487 }
488 
489 /*!
490 @ingroup PARALLEL
491 @param loc  source location information
492 @param global_tid  global thread number
493 
494 Leave a serialized parallel construct.
495 */
496 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
497   kmp_internal_control_t *top;
498   kmp_info_t *this_thr;
499   kmp_team_t *serial_team;
500 
501   KC_TRACE(10,
502            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
503 
504   /* skip all this code for autopar serialized loops since it results in
505      unacceptable overhead */
506   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
507     return;
508 
509   // Not autopar code
510   if (!TCR_4(__kmp_init_parallel))
511     __kmp_parallel_initialize();
512 
513   __kmp_resume_if_soft_paused();
514 
515   this_thr = __kmp_threads[global_tid];
516   serial_team = this_thr->th.th_serial_team;
517 
518   kmp_task_team_t *task_team = this_thr->th.th_task_team;
519   // we need to wait for the proxy tasks before finishing the thread
520   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
521     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
522 
523   KMP_MB();
524   KMP_DEBUG_ASSERT(serial_team);
525   KMP_ASSERT(serial_team->t.t_serialized);
526   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
527   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
528   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
529   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
530 
531 #if OMPT_SUPPORT
532   if (ompt_enabled.enabled &&
533       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
534     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
535     if (ompt_enabled.ompt_callback_implicit_task) {
536       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
537           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
538           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
539     }
540 
    // reset/clear the task id only after unlinking the task
542     ompt_data_t *parent_task_data;
543     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
544 
545     if (ompt_enabled.ompt_callback_parallel_end) {
546       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
547           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
548           ompt_parallel_invoker_program | ompt_parallel_team,
549           OMPT_LOAD_RETURN_ADDRESS(global_tid));
550     }
551     __ompt_lw_taskteam_unlink(this_thr);
552     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
553   }
554 #endif
555 
556   /* If necessary, pop the internal control stack values and replace the team
557    * values */
558   top = serial_team->t.t_control_stack_top;
559   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
560     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
561     serial_team->t.t_control_stack_top = top->next;
562     __kmp_free(top);
563   }
564 
565   // if( serial_team -> t.t_serialized > 1 )
566   serial_team->t.t_level--;
567 
568   /* pop dispatch buffers stack */
569   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
570   {
571     dispatch_private_info_t *disp_buffer =
572         serial_team->t.t_dispatch->th_disp_buffer;
573     serial_team->t.t_dispatch->th_disp_buffer =
574         serial_team->t.t_dispatch->th_disp_buffer->next;
575     __kmp_free(disp_buffer);
576   }
577   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
578 
579   --serial_team->t.t_serialized;
580   if (serial_team->t.t_serialized == 0) {
581 
582 /* return to the parallel section */
583 
584 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
585     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
586       __kmp_clear_x87_fpu_status_word();
587       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
588       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
589     }
590 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
591 
592     this_thr->th.th_team = serial_team->t.t_parent;
593     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
594 
595     /* restore values cached in the thread */
596     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
597     this_thr->th.th_team_master =
598         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
599     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
600 
601     /* TODO the below shouldn't need to be adjusted for serialized teams */
602     this_thr->th.th_dispatch =
603         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
604 
605     __kmp_pop_current_task_from_thread(this_thr);
606 
607     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
608     this_thr->th.th_current_task->td_flags.executing = 1;
609 
610     if (__kmp_tasking_mode != tskm_immediate_exec) {
611       // Copy the task team from the new child / old parent team to the thread.
612       this_thr->th.th_task_team =
613           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
614       KA_TRACE(20,
615                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
616                 "team %p\n",
617                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
618     }
619   } else {
620     if (__kmp_tasking_mode != tskm_immediate_exec) {
621       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
622                     "depth of serial team %p to %d\n",
623                     global_tid, serial_team, serial_team->t.t_serialized));
624     }
625   }
626 
627   if (__kmp_env_consistency_check)
628     __kmp_pop_parallel(global_tid, NULL);
629 #if OMPT_SUPPORT
630   if (ompt_enabled.enabled)
631     this_thr->th.ompt_thread_info.state =
632         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
633                                            : ompt_state_work_parallel);
634 #endif
635 }
636 
637 /*!
638 @ingroup SYNCHRONIZATION
639 @param loc  source location information.
640 
641 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
642 depending on the memory ordering convention obeyed by the compiler
643 even that may not be necessary).
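
For reference, a standalone
@code
#pragma omp flush
@endcode
is typically lowered to a single call such as (schematic)
@code
__kmpc_flush(&loc);
@endcode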
644 */
645 void __kmpc_flush(ident_t *loc) {
646   KC_TRACE(10, ("__kmpc_flush: called\n"));
647 
  /* need an explicit __mf() here since the library uses volatile instead */
649   KMP_MB(); /* Flush all pending memory write invalidates.  */
650 
651 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
652 #if KMP_MIC
653 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
654 // We shouldn't need it, though, since the ABI rules require that
655 // * If the compiler generates NGO stores it also generates the fence
656 // * If users hand-code NGO stores they should insert the fence
657 // therefore no incomplete unordered stores should be visible.
658 #else
659   // C74404
660   // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed here (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
664   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
665   if (!__kmp_cpuinfo.initialized) {
666     __kmp_query_cpuid(&__kmp_cpuinfo);
667   }
668   if (!__kmp_cpuinfo.sse2) {
669     // CPU cannot execute SSE2 instructions.
670   } else {
671 #if KMP_COMPILER_ICC
672     _mm_mfence();
673 #elif KMP_COMPILER_MSVC
674     MemoryBarrier();
675 #else
676     __sync_synchronize();
677 #endif // KMP_COMPILER_ICC
678   }
679 #endif // KMP_MIC
680 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
681        KMP_ARCH_RISCV64)
682 // Nothing to see here move along
683 #elif KMP_ARCH_PPC64
684 // Nothing needed here (we have a real MB above).
685 #if KMP_OS_CNK
686   // The flushing thread needs to yield here; this prevents a
687   // busy-waiting thread from saturating the pipeline. flush is
688   // often used in loops like this:
689   // while (!flag) {
690   //   #pragma omp flush(flag)
691   // }
692   // and adding the yield here is good for at least a 10x speedup
693   // when running >2 threads per core (on the NAS LU benchmark).
694   __kmp_yield();
695 #endif
696 #else
697 #error Unknown or unsupported architecture
698 #endif
699 
700 #if OMPT_SUPPORT && OMPT_OPTIONAL
701   if (ompt_enabled.ompt_callback_flush) {
702     ompt_callbacks.ompt_callback(ompt_callback_flush)(
703         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
704   }
705 #endif
706 }
707 
708 /* -------------------------------------------------------------------------- */
709 /*!
710 @ingroup SYNCHRONIZATION
711 @param loc source location information
712 @param global_tid thread id.
713 
714 Execute a barrier.
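
Schematically, an explicit
@code
#pragma omp barrier
@endcode
lowers to a call along the lines of (the gtid is normally cached by the
compiler rather than re-queried):
@code
__kmpc_barrier(&loc, __kmpc_global_thread_num(&loc));
@endcode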
715 */
716 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
717   KMP_COUNT_BLOCK(OMP_BARRIER);
718   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
719 
720   if (!TCR_4(__kmp_init_parallel))
721     __kmp_parallel_initialize();
722 
723   __kmp_resume_if_soft_paused();
724 
725   if (__kmp_env_consistency_check) {
726     if (loc == 0) {
727       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
728     }
729     __kmp_check_barrier(global_tid, ct_barrier, loc);
730   }
731 
732 #if OMPT_SUPPORT
733   ompt_frame_t *ompt_frame;
734   if (ompt_enabled.enabled) {
735     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
736     if (ompt_frame->enter_frame.ptr == NULL)
737       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
738     OMPT_STORE_RETURN_ADDRESS(global_tid);
739   }
740 #endif
741   __kmp_threads[global_tid]->th.th_ident = loc;
742   // TODO: explicit barrier_wait_id:
743   //   this function is called when 'barrier' directive is present or
744   //   implicit barrier at the end of a worksharing construct.
745   // 1) better to add a per-thread barrier counter to a thread data structure
746   // 2) set to 0 when a new team is created
747   // 4) no sync is required
748 
749   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
750 #if OMPT_SUPPORT && OMPT_OPTIONAL
751   if (ompt_enabled.enabled) {
752     ompt_frame->enter_frame = ompt_data_none;
753   }
754 #endif
755 }
756 
757 /* The BARRIER for a MASTER section is always explicit   */
758 /*!
759 @ingroup WORK_SHARING
760 @param loc  source location information.
@param global_tid  global thread number.
762 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
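
A <tt>master</tt> construct is typically lowered as a guarded call pair
(schematic; only the thread that wins the test calls __kmpc_end_master):
@code
if (__kmpc_master(&loc, gtid)) {
  /* ... body of the master region ... */
  __kmpc_end_master(&loc, gtid);
}
@endcode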
763 */
764 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
765   int status = 0;
766 
767   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
768 
769   if (!TCR_4(__kmp_init_parallel))
770     __kmp_parallel_initialize();
771 
772   __kmp_resume_if_soft_paused();
773 
774   if (KMP_MASTER_GTID(global_tid)) {
775     KMP_COUNT_BLOCK(OMP_MASTER);
776     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
777     status = 1;
778   }
779 
780 #if OMPT_SUPPORT && OMPT_OPTIONAL
781   if (status) {
782     if (ompt_enabled.ompt_callback_master) {
783       kmp_info_t *this_thr = __kmp_threads[global_tid];
784       kmp_team_t *team = this_thr->th.th_team;
785 
786       int tid = __kmp_tid_from_gtid(global_tid);
787       ompt_callbacks.ompt_callback(ompt_callback_master)(
788           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
789           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
790           OMPT_GET_RETURN_ADDRESS(0));
791     }
792   }
793 #endif
794 
795   if (__kmp_env_consistency_check) {
796 #if KMP_USE_DYNAMIC_LOCK
797     if (status)
798       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
799     else
800       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
801 #else
802     if (status)
803       __kmp_push_sync(global_tid, ct_master, loc, NULL);
804     else
805       __kmp_check_sync(global_tid, ct_master, loc, NULL);
806 #endif
807   }
808 
809   return status;
810 }
811 
812 /*!
813 @ingroup WORK_SHARING
814 @param loc  source location information.
@param global_tid  global thread number.
816 
817 Mark the end of a <tt>master</tt> region. This should only be called by the
818 thread that executes the <tt>master</tt> region.
819 */
820 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
821   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
822 
823   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
824   KMP_POP_PARTITIONED_TIMER();
825 
826 #if OMPT_SUPPORT && OMPT_OPTIONAL
827   kmp_info_t *this_thr = __kmp_threads[global_tid];
828   kmp_team_t *team = this_thr->th.th_team;
829   if (ompt_enabled.ompt_callback_master) {
830     int tid = __kmp_tid_from_gtid(global_tid);
831     ompt_callbacks.ompt_callback(ompt_callback_master)(
832         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
833         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
834         OMPT_GET_RETURN_ADDRESS(0));
835   }
836 #endif
837 
838   if (__kmp_env_consistency_check) {
839     if (global_tid < 0)
840       KMP_WARNING(ThreadIdentInvalid);
841 
842     if (KMP_MASTER_GTID(global_tid))
843       __kmp_pop_sync(global_tid, ct_master, loc);
844   }
845 }
846 
847 /*!
848 @ingroup WORK_SHARING
849 @param loc  source location information.
850 @param gtid  global thread number.
851 
852 Start execution of an <tt>ordered</tt> construct.
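
Within a loop marked <tt>ordered</tt>, the body of the ordered block is
bracketed by a call pair, roughly (schematic):
@code
__kmpc_ordered(&loc, gtid);
/* ... ordered body, executed in iteration order ... */
__kmpc_end_ordered(&loc, gtid);
@endcode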
853 */
854 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
855   int cid = 0;
856   kmp_info_t *th;
857   KMP_DEBUG_ASSERT(__kmp_init_serial);
858 
859   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
860 
861   if (!TCR_4(__kmp_init_parallel))
862     __kmp_parallel_initialize();
863 
864   __kmp_resume_if_soft_paused();
865 
866 #if USE_ITT_BUILD
867   __kmp_itt_ordered_prep(gtid);
868 // TODO: ordered_wait_id
869 #endif /* USE_ITT_BUILD */
870 
871   th = __kmp_threads[gtid];
872 
873 #if OMPT_SUPPORT && OMPT_OPTIONAL
874   kmp_team_t *team;
875   ompt_wait_id_t lck;
876   void *codeptr_ra;
877   if (ompt_enabled.enabled) {
878     OMPT_STORE_RETURN_ADDRESS(gtid);
879     team = __kmp_team_from_gtid(gtid);
880     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
881     /* OMPT state update */
882     th->th.ompt_thread_info.wait_id = lck;
883     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
884 
885     /* OMPT event callback */
886     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
887     if (ompt_enabled.ompt_callback_mutex_acquire) {
888       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
889           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
890           codeptr_ra);
891     }
892   }
893 #endif
894 
895   if (th->th.th_dispatch->th_deo_fcn != 0)
896     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
897   else
898     __kmp_parallel_deo(&gtid, &cid, loc);
899 
900 #if OMPT_SUPPORT && OMPT_OPTIONAL
901   if (ompt_enabled.enabled) {
902     /* OMPT state update */
903     th->th.ompt_thread_info.state = ompt_state_work_parallel;
904     th->th.ompt_thread_info.wait_id = 0;
905 
906     /* OMPT event callback */
907     if (ompt_enabled.ompt_callback_mutex_acquired) {
908       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
909           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
910     }
911   }
912 #endif
913 
914 #if USE_ITT_BUILD
915   __kmp_itt_ordered_start(gtid);
916 #endif /* USE_ITT_BUILD */
917 }
918 
919 /*!
920 @ingroup WORK_SHARING
921 @param loc  source location information.
922 @param gtid  global thread number.
923 
924 End execution of an <tt>ordered</tt> construct.
925 */
926 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
927   int cid = 0;
928   kmp_info_t *th;
929 
930   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
931 
932 #if USE_ITT_BUILD
933   __kmp_itt_ordered_end(gtid);
934 // TODO: ordered_wait_id
935 #endif /* USE_ITT_BUILD */
936 
937   th = __kmp_threads[gtid];
938 
939   if (th->th.th_dispatch->th_dxo_fcn != 0)
940     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
941   else
942     __kmp_parallel_dxo(&gtid, &cid, loc);
943 
944 #if OMPT_SUPPORT && OMPT_OPTIONAL
945   OMPT_STORE_RETURN_ADDRESS(gtid);
946   if (ompt_enabled.ompt_callback_mutex_released) {
947     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
948         ompt_mutex_ordered,
949         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
950             ->t.t_ordered.dt.t_value,
951         OMPT_LOAD_RETURN_ADDRESS(gtid));
952   }
953 #endif
954 }
955 
956 #if KMP_USE_DYNAMIC_LOCK
957 
958 static __forceinline void
959 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
960                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
961   // Pointer to the allocated indirect lock is written to crit, while indexing
962   // is ignored.
963   void *idx;
964   kmp_indirect_lock_t **lck;
965   lck = (kmp_indirect_lock_t **)crit;
966   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
967   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
968   KMP_SET_I_LOCK_LOCATION(ilk, loc);
969   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
970   KA_TRACE(20,
971            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
972 #if USE_ITT_BUILD
973   __kmp_itt_critical_creating(ilk->lock, loc);
974 #endif
975   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
976   if (status == 0) {
977 #if USE_ITT_BUILD
978     __kmp_itt_critical_destroyed(ilk->lock);
979 #endif
980     // We don't really need to destroy the unclaimed lock here since it will be
981     // cleaned up at program exit.
982     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
983   }
984   KMP_DEBUG_ASSERT(*lck != NULL);
985 }
986 
987 // Fast-path acquire tas lock
988 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
989   {                                                                            \
990     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
991     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
992     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
993     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
994         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
995       kmp_uint32 spins;                                                        \
996       KMP_FSYNC_PREPARE(l);                                                    \
997       KMP_INIT_YIELD(spins);                                                   \
998       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
999       do {                                                                     \
1000         if (TCR_4(__kmp_nth) >                                                 \
1001             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1002           KMP_YIELD(TRUE);                                                     \
1003         } else {                                                               \
1004           KMP_YIELD_SPIN(spins);                                               \
1005         }                                                                      \
1006         __kmp_spin_backoff(&backoff);                                          \
1007       } while (                                                                \
1008           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1009           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1010     }                                                                          \
1011     KMP_FSYNC_ACQUIRED(l);                                                     \
1012   }
1013 
1014 // Fast-path test tas lock
1015 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1016   {                                                                            \
1017     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1018     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1019     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1020     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1021          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1022   }
1023 
1024 // Fast-path release tas lock
1025 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1026   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1027 
1028 #if KMP_USE_FUTEX
1029 
1030 #include <sys/syscall.h>
1031 #include <unistd.h>
1032 #ifndef FUTEX_WAIT
1033 #define FUTEX_WAIT 0
1034 #endif
1035 #ifndef FUTEX_WAKE
1036 #define FUTEX_WAKE 1
1037 #endif
1038 
1039 // Fast-path acquire futex lock
1040 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1041   {                                                                            \
1042     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1043     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1044     KMP_MB();                                                                  \
1045     KMP_FSYNC_PREPARE(ftx);                                                    \
1046     kmp_int32 poll_val;                                                        \
1047     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1048                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1049                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1050       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1051       if (!cond) {                                                             \
1052         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1053                                          poll_val |                            \
1054                                              KMP_LOCK_BUSY(1, futex))) {       \
1055           continue;                                                            \
1056         }                                                                      \
1057         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1058       }                                                                        \
1059       kmp_int32 rc;                                                            \
1060       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1061                         NULL, NULL, 0)) != 0) {                                \
1062         continue;                                                              \
1063       }                                                                        \
1064       gtid_code |= 1;                                                          \
1065     }                                                                          \
1066     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1067   }
1068 
1069 // Fast-path test futex lock
1070 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1071   {                                                                            \
1072     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1073     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
1075       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1076       rc = TRUE;                                                               \
1077     } else {                                                                   \
1078       rc = FALSE;                                                              \
1079     }                                                                          \
1080   }
1081 
1082 // Fast-path release futex lock
1083 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1084   {                                                                            \
1085     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1086     KMP_MB();                                                                  \
1087     KMP_FSYNC_RELEASING(ftx);                                                  \
1088     kmp_int32 poll_val =                                                       \
1089         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1090     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1091       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1092               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1093     }                                                                          \
1094     KMP_MB();                                                                  \
1095     KMP_YIELD_OVERSUB();                                                       \
1096   }
1097 
1098 #endif // KMP_USE_FUTEX
1099 
1100 #else // KMP_USE_DYNAMIC_LOCK
1101 
1102 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1103                                                       ident_t const *loc,
1104                                                       kmp_int32 gtid) {
1105   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1106 
1107   // Because of the double-check, the following load doesn't need to be volatile
1108   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1109 
1110   if (lck == NULL) {
1111     void *idx;
1112 
1113     // Allocate & initialize the lock.
1114     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1115     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1116     __kmp_init_user_lock_with_checks(lck);
1117     __kmp_set_user_lock_location(lck, loc);
1118 #if USE_ITT_BUILD
1119     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed with no usage, but that is not
// a problem, because this is not a real event seen by the user but rather
// setting a name for the object (lock). See more details in kmp_itt.h.
1125 #endif /* USE_ITT_BUILD */
1126 
1127     // Use a cmpxchg instruction to slam the start of the critical section with
1128     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1129     // and use the lock that the other thread allocated.
1130     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1131 
1132     if (status == 0) {
1133 // Deallocate the lock and reload the value.
1134 #if USE_ITT_BUILD
1135       __kmp_itt_critical_destroyed(lck);
1136 // Let ITT know the lock is destroyed and the same memory location may be reused
1137 // for another purpose.
1138 #endif /* USE_ITT_BUILD */
1139       __kmp_destroy_user_lock_with_checks(lck);
1140       __kmp_user_lock_free(&idx, gtid, lck);
1141       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1142       KMP_DEBUG_ASSERT(lck != NULL);
1143     }
1144   }
1145   return lck;
1146 }
1147 
1148 #endif // KMP_USE_DYNAMIC_LOCK
1149 
1150 /*!
1151 @ingroup WORK_SHARING
1152 @param loc  source location information.
@param global_tid  global thread number.
1154 @param crit identity of the critical section. This could be a pointer to a lock
1155 associated with the critical section, or some other suitably unique value.
1156 
1157 Enter code protected by a `critical` construct.
1158 This function blocks until the executing thread can enter the critical section.
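
A named critical section is typically lowered around a zero-initialized
kmp_critical_name object shared by all uses of that name (schematic; the
variable name below is illustrative):
@code
static kmp_critical_name crit_name_lock; // zero-initialized lock storage
...
__kmpc_critical(&loc, gtid, &crit_name_lock);
/* ... protected code ... */
__kmpc_end_critical(&loc, gtid, &crit_name_lock);
@endcode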
1159 */
1160 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1161                      kmp_critical_name *crit) {
1162 #if KMP_USE_DYNAMIC_LOCK
1163 #if OMPT_SUPPORT && OMPT_OPTIONAL
1164   OMPT_STORE_RETURN_ADDRESS(global_tid);
1165 #endif // OMPT_SUPPORT
1166   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1167 #else
1168   KMP_COUNT_BLOCK(OMP_CRITICAL);
1169 #if OMPT_SUPPORT && OMPT_OPTIONAL
1170   ompt_state_t prev_state = ompt_state_undefined;
1171   ompt_thread_info_t ti;
1172 #endif
1173   kmp_user_lock_p lck;
1174 
1175   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1176 
1177   // TODO: add THR_OVHD_STATE
1178 
1179   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1180   KMP_CHECK_USER_LOCK_INIT();
1181 
1182   if ((__kmp_user_lock_kind == lk_tas) &&
1183       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1184     lck = (kmp_user_lock_p)crit;
1185   }
1186 #if KMP_USE_FUTEX
1187   else if ((__kmp_user_lock_kind == lk_futex) &&
1188            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1189     lck = (kmp_user_lock_p)crit;
1190   }
1191 #endif
1192   else { // ticket, queuing or drdpa
1193     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1194   }
1195 
1196   if (__kmp_env_consistency_check)
1197     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1198 
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team.
// Also, even if we are the uber thread, we still have to acquire the lock,
// as we have to contend with sibling threads.
1203 
1204 #if USE_ITT_BUILD
1205   __kmp_itt_critical_acquiring(lck);
1206 #endif /* USE_ITT_BUILD */
1207 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1209   void *codeptr_ra = NULL;
1210   if (ompt_enabled.enabled) {
1211     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1212     /* OMPT state update */
1213     prev_state = ti.state;
1214     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1215     ti.state = ompt_state_wait_critical;
1216 
1217     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1219     if (ompt_enabled.ompt_callback_mutex_acquire) {
1220       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1221           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1222           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1223     }
1224   }
1225 #endif
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1228   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1229 
1230 #if USE_ITT_BUILD
1231   __kmp_itt_critical_acquired(lck);
1232 #endif /* USE_ITT_BUILD */
1233 #if OMPT_SUPPORT && OMPT_OPTIONAL
1234   if (ompt_enabled.enabled) {
1235     /* OMPT state update */
1236     ti.state = prev_state;
1237     ti.wait_id = 0;
1238 
1239     /* OMPT event callback */
1240     if (ompt_enabled.ompt_callback_mutex_acquired) {
1241       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1242           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1243     }
1244   }
1245 #endif
1246   KMP_POP_PARTITIONED_TIMER();
1247 
1248   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1249   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1250 #endif // KMP_USE_DYNAMIC_LOCK
1251 }
1252 
1253 #if KMP_USE_DYNAMIC_LOCK
1254 
1255 // Converts the given hint to an internal lock implementation
1256 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1257 #if KMP_USE_TSX
1258 #define KMP_TSX_LOCK(seq) lockseq_##seq
1259 #else
1260 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1261 #endif
1262 
1263 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1264 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1265 #else
1266 #define KMP_CPUINFO_RTM 0
1267 #endif
1268 
1269   // Hints that do not require further logic
1270   if (hint & kmp_lock_hint_hle)
1271     return KMP_TSX_LOCK(hle);
1272   if (hint & kmp_lock_hint_rtm)
1273     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1274   if (hint & kmp_lock_hint_adaptive)
1275     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1276 
1277   // Rule out conflicting hints first by returning the default lock
1278   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1279     return __kmp_user_lock_seq;
1280   if ((hint & omp_lock_hint_speculative) &&
1281       (hint & omp_lock_hint_nonspeculative))
1282     return __kmp_user_lock_seq;
1283 
1284   // Do not even consider speculation when it appears to be contended
1285   if (hint & omp_lock_hint_contended)
1286     return lockseq_queuing;
1287 
1288   // Uncontended lock without speculation
1289   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1290     return lockseq_tas;
1291 
1292   // HLE lock for speculation
1293   if (hint & omp_lock_hint_speculative)
1294     return KMP_TSX_LOCK(hle);
1295 
1296   return __kmp_user_lock_seq;
1297 }
1298 
1299 #if OMPT_SUPPORT && OMPT_OPTIONAL
1300 #if KMP_USE_DYNAMIC_LOCK
1301 static kmp_mutex_impl_t
1302 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1303   if (user_lock) {
1304     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1305     case 0:
1306       break;
1307 #if KMP_USE_FUTEX
1308     case locktag_futex:
1309       return kmp_mutex_impl_queuing;
1310 #endif
1311     case locktag_tas:
1312       return kmp_mutex_impl_spin;
1313 #if KMP_USE_TSX
1314     case locktag_hle:
1315       return kmp_mutex_impl_speculative;
1316 #endif
1317     default:
1318       return kmp_mutex_impl_none;
1319     }
1320     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1321   }
1322   KMP_ASSERT(ilock);
1323   switch (ilock->type) {
1324 #if KMP_USE_TSX
1325   case locktag_adaptive:
1326   case locktag_rtm:
1327     return kmp_mutex_impl_speculative;
1328 #endif
1329   case locktag_nested_tas:
1330     return kmp_mutex_impl_spin;
1331 #if KMP_USE_FUTEX
1332   case locktag_nested_futex:
1333 #endif
1334   case locktag_ticket:
1335   case locktag_queuing:
1336   case locktag_drdpa:
1337   case locktag_nested_ticket:
1338   case locktag_nested_queuing:
1339   case locktag_nested_drdpa:
1340     return kmp_mutex_impl_queuing;
1341   default:
1342     return kmp_mutex_impl_none;
1343   }
1344 }
1345 #else
1346 // For locks without dynamic binding
1347 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1348   switch (__kmp_user_lock_kind) {
1349   case lk_tas:
1350     return kmp_mutex_impl_spin;
1351 #if KMP_USE_FUTEX
1352   case lk_futex:
1353 #endif
1354   case lk_ticket:
1355   case lk_queuing:
1356   case lk_drdpa:
1357     return kmp_mutex_impl_queuing;
1358 #if KMP_USE_TSX
1359   case lk_hle:
1360   case lk_rtm:
1361   case lk_adaptive:
1362     return kmp_mutex_impl_speculative;
1363 #endif
1364   default:
1365     return kmp_mutex_impl_none;
1366   }
1367 }
1368 #endif // KMP_USE_DYNAMIC_LOCK
1369 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1370 
1371 /*!
1372 @ingroup WORK_SHARING
1373 @param loc  source location information.
1374 @param global_tid  global thread number.
1375 @param crit identity of the critical section. This could be a pointer to a lock
1376 associated with the critical section, or some other suitably unique value.
1377 @param hint the lock hint.
1378 
1379 Enter code protected by a `critical` construct with a hint. The hint value is
1380 used to suggest a lock implementation. This function blocks until the executing
1381 thread can enter the critical section unless the hint suggests use of
1382 speculative execution and the hardware supports it.
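
Schematically, a critical construct carrying a hint clause, e.g.
@code
#pragma omp critical (name) hint(omp_lock_hint_contended)
@endcode
might be lowered to (illustrative identifiers; the evaluated hint value is
passed through as the last argument):
@code
__kmpc_critical_with_hint(&loc, gtid, &crit_name_lock, omp_lock_hint_contended);
/* ... protected code ... */
__kmpc_end_critical(&loc, gtid, &crit_name_lock);
@endcode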
1383 */
1384 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1385                                kmp_critical_name *crit, uint32_t hint) {
1386   KMP_COUNT_BLOCK(OMP_CRITICAL);
1387   kmp_user_lock_p lck;
1388 #if OMPT_SUPPORT && OMPT_OPTIONAL
1389   ompt_state_t prev_state = ompt_state_undefined;
1390   ompt_thread_info_t ti;
1391   // This is the case, if called from __kmpc_critical:
1392   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1393   if (!codeptr)
1394     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1395 #endif
1396 
1397   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1398 
1399   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1400   // Check if it is initialized.
1401   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1402   if (*lk == 0) {
1403     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1404     if (KMP_IS_D_LOCK(lckseq)) {
1405       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1406                                   KMP_GET_D_TAG(lckseq));
1407     } else {
1408       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1409     }
1410   }
1411   // Branch for accessing the actual lock object and set operation. This
1412   // branching is inevitable since this lock initialization does not follow the
1413   // normal dispatch path (lock table is not used).
1414   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1415     lck = (kmp_user_lock_p)lk;
1416     if (__kmp_env_consistency_check) {
1417       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1418                       __kmp_map_hint_to_lock(hint));
1419     }
1420 #if USE_ITT_BUILD
1421     __kmp_itt_critical_acquiring(lck);
1422 #endif
1423 #if OMPT_SUPPORT && OMPT_OPTIONAL
1424     if (ompt_enabled.enabled) {
1425       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1426       /* OMPT state update */
1427       prev_state = ti.state;
1428       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1429       ti.state = ompt_state_wait_critical;
1430 
1431       /* OMPT event callback */
1432       if (ompt_enabled.ompt_callback_mutex_acquire) {
1433         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1434             ompt_mutex_critical, (unsigned int)hint,
1435             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1436             codeptr);
1437       }
1438     }
1439 #endif
1440 #if KMP_USE_INLINED_TAS
1441     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1442       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1443     } else
1444 #elif KMP_USE_INLINED_FUTEX
1445     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1446       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1447     } else
1448 #endif
1449     {
1450       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1451     }
1452   } else {
1453     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1454     lck = ilk->lock;
1455     if (__kmp_env_consistency_check) {
1456       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1457                       __kmp_map_hint_to_lock(hint));
1458     }
1459 #if USE_ITT_BUILD
1460     __kmp_itt_critical_acquiring(lck);
1461 #endif
1462 #if OMPT_SUPPORT && OMPT_OPTIONAL
1463     if (ompt_enabled.enabled) {
1464       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1465       /* OMPT state update */
1466       prev_state = ti.state;
1467       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1468       ti.state = ompt_state_wait_critical;
1469 
1470       /* OMPT event callback */
1471       if (ompt_enabled.ompt_callback_mutex_acquire) {
1472         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1473             ompt_mutex_critical, (unsigned int)hint,
1474             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1475             codeptr);
1476       }
1477     }
1478 #endif
1479     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1480   }
1481   KMP_POP_PARTITIONED_TIMER();
1482 
1483 #if USE_ITT_BUILD
1484   __kmp_itt_critical_acquired(lck);
1485 #endif /* USE_ITT_BUILD */
1486 #if OMPT_SUPPORT && OMPT_OPTIONAL
1487   if (ompt_enabled.enabled) {
1488     /* OMPT state update */
1489     ti.state = prev_state;
1490     ti.wait_id = 0;
1491 
1492     /* OMPT event callback */
1493     if (ompt_enabled.ompt_callback_mutex_acquired) {
1494       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1495           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1496     }
1497   }
1498 #endif
1499 
1500   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1501   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1502 } // __kmpc_critical_with_hint
1503 
1504 #endif // KMP_USE_DYNAMIC_LOCK
1505 
1506 /*!
1507 @ingroup WORK_SHARING
1508 @param loc  source location information.
@param global_tid  global thread number.
1510 @param crit identity of the critical section. This could be a pointer to a lock
1511 associated with the critical section, or some other suitably unique value.
1512 
1513 Leave a critical section, releasing any lock that was held during its execution.
1514 */
1515 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1516                          kmp_critical_name *crit) {
1517   kmp_user_lock_p lck;
1518 
1519   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1520 
1521 #if KMP_USE_DYNAMIC_LOCK
1522   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1523     lck = (kmp_user_lock_p)crit;
1524     KMP_ASSERT(lck != NULL);
1525     if (__kmp_env_consistency_check) {
1526       __kmp_pop_sync(global_tid, ct_critical, loc);
1527     }
1528 #if USE_ITT_BUILD
1529     __kmp_itt_critical_releasing(lck);
1530 #endif
1531 #if KMP_USE_INLINED_TAS
1532     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1533       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1534     } else
1535 #elif KMP_USE_INLINED_FUTEX
1536     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1537       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1538     } else
1539 #endif
1540     {
1541       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1542     }
1543   } else {
1544     kmp_indirect_lock_t *ilk =
1545         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1546     KMP_ASSERT(ilk != NULL);
1547     lck = ilk->lock;
1548     if (__kmp_env_consistency_check) {
1549       __kmp_pop_sync(global_tid, ct_critical, loc);
1550     }
1551 #if USE_ITT_BUILD
1552     __kmp_itt_critical_releasing(lck);
1553 #endif
1554     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1555   }
1556 
1557 #else // KMP_USE_DYNAMIC_LOCK
1558 
1559   if ((__kmp_user_lock_kind == lk_tas) &&
1560       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1561     lck = (kmp_user_lock_p)crit;
1562   }
1563 #if KMP_USE_FUTEX
1564   else if ((__kmp_user_lock_kind == lk_futex) &&
1565            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1566     lck = (kmp_user_lock_p)crit;
1567   }
1568 #endif
1569   else { // ticket, queuing or drdpa
1570     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1571   }
1572 
1573   KMP_ASSERT(lck != NULL);
1574 
1575   if (__kmp_env_consistency_check)
1576     __kmp_pop_sync(global_tid, ct_critical, loc);
1577 
1578 #if USE_ITT_BUILD
1579   __kmp_itt_critical_releasing(lck);
1580 #endif /* USE_ITT_BUILD */
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1583   __kmp_release_user_lock_with_checks(lck, global_tid);
1584 
1585 #endif // KMP_USE_DYNAMIC_LOCK
1586 
1587 #if OMPT_SUPPORT && OMPT_OPTIONAL
1588   /* OMPT release event triggers after lock is released; place here to trigger
1589    * for all #if branches */
1590   OMPT_STORE_RETURN_ADDRESS(global_tid);
1591   if (ompt_enabled.ompt_callback_mutex_released) {
1592     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1593         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1594         OMPT_LOAD_RETURN_ADDRESS(0));
1595   }
1596 #endif
1597 
1598   KMP_POP_PARTITIONED_TIMER();
1599   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1600 }
1601 
1602 /*!
1603 @ingroup SYNCHRONIZATION
1604 @param loc source location information
1605 @param global_tid thread id.
1606 @return one if the thread should execute the master block, zero otherwise
1607 
1608 Start execution of a combined barrier and master. The barrier is executed inside
1609 this function.
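A hedged sketch of the intended call pattern (loc/gtid plumbing elided):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  ... master-only code ...
  __kmpc_end_barrier_master(&loc, gtid); // releases the waiting threads
}
@endcode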
1610 */
1611 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1612   int status;
1613 
1614   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1615 
1616   if (!TCR_4(__kmp_init_parallel))
1617     __kmp_parallel_initialize();
1618 
1619   __kmp_resume_if_soft_paused();
1620 
1621   if (__kmp_env_consistency_check)
1622     __kmp_check_barrier(global_tid, ct_barrier, loc);
1623 
1624 #if OMPT_SUPPORT
1625   ompt_frame_t *ompt_frame;
1626   if (ompt_enabled.enabled) {
1627     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1628     if (ompt_frame->enter_frame.ptr == NULL)
1629       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1630     OMPT_STORE_RETURN_ADDRESS(global_tid);
1631   }
1632 #endif
1633 #if USE_ITT_NOTIFY
1634   __kmp_threads[global_tid]->th.th_ident = loc;
1635 #endif
1636   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1637 #if OMPT_SUPPORT && OMPT_OPTIONAL
1638   if (ompt_enabled.enabled) {
1639     ompt_frame->enter_frame = ompt_data_none;
1640   }
1641 #endif
1642 
1643   return (status != 0) ? 0 : 1;
1644 }
1645 
1646 /*!
1647 @ingroup SYNCHRONIZATION
1648 @param loc source location information
1649 @param global_tid thread id.
1650 
1651 Complete the execution of a combined barrier and master. This function should
1652 only be called at the completion of the <tt>master</tt> code. Other threads will
1653 still be waiting at the barrier and this call releases them.
1654 */
1655 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1656   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1657 
1658   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1659 }
1660 
1661 /*!
1662 @ingroup SYNCHRONIZATION
1663 @param loc source location information
1664 @param global_tid thread id.
1665 @return one if the thread should execute the master block, zero otherwise
1666 
1667 Start execution of a combined barrier and master(nowait) construct.
1668 The barrier is executed inside this function.
There is no equivalent "end" function, since the barrier is not split: every
thread is released from the barrier inside this call, and threads for which
zero is returned simply continue past the master block.
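A hedged usage sketch (loc/gtid plumbing elided):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  ... master-only code ... // no matching "end" call is emitted
}
@endcode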
1670 */
1671 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1672   kmp_int32 ret;
1673 
1674   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1675 
1676   if (!TCR_4(__kmp_init_parallel))
1677     __kmp_parallel_initialize();
1678 
1679   __kmp_resume_if_soft_paused();
1680 
1681   if (__kmp_env_consistency_check) {
1682     if (loc == 0) {
1683       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1684     }
1685     __kmp_check_barrier(global_tid, ct_barrier, loc);
1686   }
1687 
1688 #if OMPT_SUPPORT
1689   ompt_frame_t *ompt_frame;
1690   if (ompt_enabled.enabled) {
1691     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1692     if (ompt_frame->enter_frame.ptr == NULL)
1693       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1694     OMPT_STORE_RETURN_ADDRESS(global_tid);
1695   }
1696 #endif
1697 #if USE_ITT_NOTIFY
1698   __kmp_threads[global_tid]->th.th_ident = loc;
1699 #endif
1700   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1701 #if OMPT_SUPPORT && OMPT_OPTIONAL
1702   if (ompt_enabled.enabled) {
1703     ompt_frame->enter_frame = ompt_data_none;
1704   }
1705 #endif
1706 
1707   ret = __kmpc_master(loc, global_tid);
1708 
1709   if (__kmp_env_consistency_check) {
    /* No __kmpc_end_master is called for this construct, so the   */
    /* corresponding actions of __kmpc_end_master are done here.   */
1712 
1713     if (global_tid < 0) {
1714       KMP_WARNING(ThreadIdentInvalid);
1715     }
1716     if (ret) {
1717       /* only one thread should do the pop since only */
1718       /* one did the push (see __kmpc_master())       */
1719 
1720       __kmp_pop_sync(global_tid, ct_master, loc);
1721     }
1722   }
1723 
1724   return (ret);
1725 }
1726 
1727 /* The BARRIER for a SINGLE process section is always explicit   */
1728 /*!
1729 @ingroup WORK_SHARING
1730 @param loc  source location information
1731 @param global_tid  global thread number
1732 @return One if this thread should execute the single construct, zero otherwise.
1733 
1734 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
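A hedged sketch of a typical lowering (the trailing barrier is emitted only
when the construct has no nowait clause):
@code
if (__kmpc_single(&loc, gtid)) {
  ... single-thread code ...
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // omitted for "single nowait"
@endcode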
1737 */
1738 
1739 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1740   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1741 
1742   if (rc) {
1743     // We are going to execute the single statement, so we should count it.
1744     KMP_COUNT_BLOCK(OMP_SINGLE);
1745     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1746   }
1747 
1748 #if OMPT_SUPPORT && OMPT_OPTIONAL
1749   kmp_info_t *this_thr = __kmp_threads[global_tid];
1750   kmp_team_t *team = this_thr->th.th_team;
1751   int tid = __kmp_tid_from_gtid(global_tid);
1752 
1753   if (ompt_enabled.enabled) {
1754     if (rc) {
1755       if (ompt_enabled.ompt_callback_work) {
1756         ompt_callbacks.ompt_callback(ompt_callback_work)(
1757             ompt_work_single_executor, ompt_scope_begin,
1758             &(team->t.ompt_team_info.parallel_data),
1759             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1760             1, OMPT_GET_RETURN_ADDRESS(0));
1761       }
1762     } else {
1763       if (ompt_enabled.ompt_callback_work) {
1764         ompt_callbacks.ompt_callback(ompt_callback_work)(
1765             ompt_work_single_other, ompt_scope_begin,
1766             &(team->t.ompt_team_info.parallel_data),
1767             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1768             1, OMPT_GET_RETURN_ADDRESS(0));
1769         ompt_callbacks.ompt_callback(ompt_callback_work)(
1770             ompt_work_single_other, ompt_scope_end,
1771             &(team->t.ompt_team_info.parallel_data),
1772             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1773             1, OMPT_GET_RETURN_ADDRESS(0));
1774       }
1775     }
1776   }
1777 #endif
1778 
1779   return rc;
1780 }
1781 
1782 /*!
1783 @ingroup WORK_SHARING
1784 @param loc  source location information
1785 @param global_tid  global thread number
1786 
1787 Mark the end of a <tt>single</tt> construct.  This function should
1788 only be called by the thread that executed the block of code protected
1789 by the `single` construct.
1790 */
1791 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1792   __kmp_exit_single(global_tid);
1793   KMP_POP_PARTITIONED_TIMER();
1794 
1795 #if OMPT_SUPPORT && OMPT_OPTIONAL
1796   kmp_info_t *this_thr = __kmp_threads[global_tid];
1797   kmp_team_t *team = this_thr->th.th_team;
1798   int tid = __kmp_tid_from_gtid(global_tid);
1799 
1800   if (ompt_enabled.ompt_callback_work) {
1801     ompt_callbacks.ompt_callback(ompt_callback_work)(
1802         ompt_work_single_executor, ompt_scope_end,
1803         &(team->t.ompt_team_info.parallel_data),
1804         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1805         OMPT_GET_RETURN_ADDRESS(0));
1806   }
1807 #endif
1808 }
1809 
1810 /*!
1811 @ingroup WORK_SHARING
1812 @param loc Source location
1813 @param global_tid Global thread id
1814 
1815 Mark the end of a statically scheduled loop.
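A hedged sketch of the usual pairing with the static-scheduling init entry
point defined elsewhere in the runtime (loop bounds and variable names are
illustrative):
@code
kmp_int32 lower = 0, upper = N - 1, stride = 1, last = 0;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1, 0);
for (kmp_int32 i = lower; i <= upper; ++i)
  ... loop body ...
__kmpc_for_static_fini(&loc, gtid);
@endcode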
1816 */
1817 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1818   KMP_POP_PARTITIONED_TIMER();
1819   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1820 
1821 #if OMPT_SUPPORT && OMPT_OPTIONAL
1822   if (ompt_enabled.ompt_callback_work) {
1823     ompt_work_t ompt_work_type = ompt_work_loop;
1824     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1825     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1826     // Determine workshare type
1827     if (loc != NULL) {
1828       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1829         ompt_work_type = ompt_work_loop;
1830       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1831         ompt_work_type = ompt_work_sections;
1832       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1833         ompt_work_type = ompt_work_distribute;
1834       } else {
1835         // use default set above.
1836         // a warning about this case is provided in __kmpc_for_static_init
1837       }
1838       KMP_DEBUG_ASSERT(ompt_work_type);
1839     }
1840     ompt_callbacks.ompt_callback(ompt_callback_work)(
1841         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1842         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1843   }
1844 #endif
1845   if (__kmp_env_consistency_check)
1846     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1847 }
1848 
// User routines that take C-style (call-by-value) arguments, as distinct
// from the Fortran-style equivalent routines
1851 
1852 void ompc_set_num_threads(int arg) {
1853   // !!!!! TODO: check the per-task binding
1854   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1855 }
1856 
1857 void ompc_set_dynamic(int flag) {
1858   kmp_info_t *thread;
1859 
1860   /* For the thread-private implementation of the internal controls */
1861   thread = __kmp_entry_thread();
1862 
1863   __kmp_save_internal_controls(thread);
1864 
1865   set__dynamic(thread, flag ? TRUE : FALSE);
1866 }
1867 
1868 void ompc_set_nested(int flag) {
1869   kmp_info_t *thread;
1870 
1871   /* For the thread-private internal controls implementation */
1872   thread = __kmp_entry_thread();
1873 
1874   __kmp_save_internal_controls(thread);
1875 
1876   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1877 }
1878 
1879 void ompc_set_max_active_levels(int max_active_levels) {
  /* TODO: we want a per-task implementation of this internal control */
1882 
1883   /* For the per-thread internal controls implementation */
1884   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1885 }
1886 
1887 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1888   // !!!!! TODO: check the per-task binding
1889   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1890 }
1891 
1892 int ompc_get_ancestor_thread_num(int level) {
1893   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1894 }
1895 
1896 int ompc_get_team_size(int level) {
1897   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1898 }
1899 
1900 /* OpenMP 5.0 Affinity Format API */
1901 
1902 void ompc_set_affinity_format(char const *format) {
1903   if (!__kmp_init_serial) {
1904     __kmp_serial_initialize();
1905   }
1906   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1907                          format, KMP_STRLEN(format) + 1);
1908 }
1909 
1910 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1911   size_t format_size;
1912   if (!__kmp_init_serial) {
1913     __kmp_serial_initialize();
1914   }
1915   format_size = KMP_STRLEN(__kmp_affinity_format);
1916   if (buffer && size) {
1917     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1918                            format_size + 1);
1919   }
1920   return format_size;
1921 }
1922 
1923 void ompc_display_affinity(char const *format) {
1924   int gtid;
1925   if (!TCR_4(__kmp_init_middle)) {
1926     __kmp_middle_initialize();
1927   }
1928   gtid = __kmp_get_gtid();
1929   __kmp_aux_display_affinity(gtid, format);
1930 }
1931 
1932 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1933                              char const *format) {
1934   int gtid;
1935   size_t num_required;
1936   kmp_str_buf_t capture_buf;
1937   if (!TCR_4(__kmp_init_middle)) {
1938     __kmp_middle_initialize();
1939   }
1940   gtid = __kmp_get_gtid();
1941   __kmp_str_buf_init(&capture_buf);
1942   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1943   if (buffer && buf_size) {
1944     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1945                            capture_buf.used + 1);
1946   }
1947   __kmp_str_buf_free(&capture_buf);
1948   return num_required;
1949 }
1950 
1951 void kmpc_set_stacksize(int arg) {
1952   // __kmp_aux_set_stacksize initializes the library if needed
1953   __kmp_aux_set_stacksize(arg);
1954 }
1955 
1956 void kmpc_set_stacksize_s(size_t arg) {
1957   // __kmp_aux_set_stacksize initializes the library if needed
1958   __kmp_aux_set_stacksize(arg);
1959 }
1960 
1961 void kmpc_set_blocktime(int arg) {
1962   int gtid, tid;
1963   kmp_info_t *thread;
1964 
1965   gtid = __kmp_entry_gtid();
1966   tid = __kmp_tid_from_gtid(gtid);
1967   thread = __kmp_thread_from_gtid(gtid);
1968 
1969   __kmp_aux_set_blocktime(arg, thread, tid);
1970 }
1971 
1972 void kmpc_set_library(int arg) {
1973   // __kmp_user_set_library initializes the library if needed
1974   __kmp_user_set_library((enum library_type)arg);
1975 }
1976 
1977 void kmpc_set_defaults(char const *str) {
1978   // __kmp_aux_set_defaults initializes the library if needed
1979   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1980 }
1981 
1982 void kmpc_set_disp_num_buffers(int arg) {
1983   // ignore after initialization because some teams have already
1984   // allocated dispatch buffers
1985   if (__kmp_init_serial == 0 && arg > 0)
1986     __kmp_dispatch_num_buffers = arg;
1987 }
1988 
1989 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1990 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1991   return -1;
1992 #else
1993   if (!TCR_4(__kmp_init_middle)) {
1994     __kmp_middle_initialize();
1995   }
1996   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1997 #endif
1998 }
1999 
2000 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2001 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2002   return -1;
2003 #else
2004   if (!TCR_4(__kmp_init_middle)) {
2005     __kmp_middle_initialize();
2006   }
2007   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2008 #endif
2009 }
2010 
2011 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2012 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2013   return -1;
2014 #else
2015   if (!TCR_4(__kmp_init_middle)) {
2016     __kmp_middle_initialize();
2017   }
2018   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2019 #endif
2020 }
2021 
2022 /* -------------------------------------------------------------------------- */
2023 /*!
2024 @ingroup THREADPRIVATE
2025 @param loc       source location information
2026 @param gtid      global thread number
2027 @param cpy_size  size of the cpy_data buffer
2028 @param cpy_data  pointer to data to be copied
2029 @param cpy_func  helper function to call for copying data
2030 @param didit     flag variable: 1=single thread; 0=not single thread
2031 
2032 __kmpc_copyprivate implements the interface for the private data broadcast
2033 needed for the copyprivate clause associated with a single region in an
2034 OpenMP<sup>*</sup> program (both C and Fortran).
2035 All threads participating in the parallel region call this routine.
2036 One of the threads (called the single thread) should have the <tt>didit</tt>
2037 variable set to 1 and all other threads should have that variable set to 0.
2038 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2039 
2040 The OpenMP specification forbids the use of nowait on the single region when a
2041 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2042 barrier internally to avoid race conditions, so the code generation for the
2043 single region should avoid generating a barrier after the call to @ref
2044 __kmpc_copyprivate.
2045 
2046 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2047 The <tt>loc</tt> parameter is a pointer to source location information.
2048 
Internal implementation: the single thread first copies its descriptor address
(cpy_data) to a team-private location; the other threads then each call the
function pointed to by the parameter cpy_func, which performs the copy using
the source and destination cpy_data buffers.
2053 
2054 The cpy_func routine used for the copy and the contents of the data area defined
2055 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2056 to be done. For instance, the cpy_data buffer can hold the actual data to be
2057 copied or it may hold a list of pointers to the data. The cpy_func routine must
2058 interpret the cpy_data buffer appropriately.
2059 
2060 The interface to cpy_func is as follows:
2061 @code
2062 void cpy_func( void *destination, void *source )
2063 @endcode
2064 where void *destination is the cpy_data pointer for the thread being copied to
2065 and void *source is the cpy_data pointer for the thread being copied from.
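
As a hedged sketch (names are illustrative, and real code generation may pack
several variables behind cpy_data), the broadcast of a single int x for
"single copyprivate(x)" could look like:
@code
static void copy_x(void *dst, void *src) { *(int *)dst = *(int *)src; }
...
kmp_int32 did_it = __kmpc_single(&loc, gtid);
if (did_it) {
  x = ...; // the single thread produces the value
  __kmpc_end_single(&loc, gtid);
}
__kmpc_copyprivate(&loc, gtid, sizeof(x), &x, copy_x, did_it);
@endcode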
2066 */
2067 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2068                         void *cpy_data, void (*cpy_func)(void *, void *),
2069                         kmp_int32 didit) {
2070   void **data_ptr;
2071 
2072   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2073 
2074   KMP_MB();
2075 
2076   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2077 
2078   if (__kmp_env_consistency_check) {
2079     if (loc == 0) {
2080       KMP_WARNING(ConstructIdentInvalid);
2081     }
2082   }
2083 
2084   // ToDo: Optimize the following two barriers into some kind of split barrier
2085 
2086   if (didit)
2087     *data_ptr = cpy_data;
2088 
2089 #if OMPT_SUPPORT
2090   ompt_frame_t *ompt_frame;
2091   if (ompt_enabled.enabled) {
2092     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2093     if (ompt_frame->enter_frame.ptr == NULL)
2094       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2095     OMPT_STORE_RETURN_ADDRESS(gtid);
2096   }
2097 #endif
2098 /* This barrier is not a barrier region boundary */
2099 #if USE_ITT_NOTIFY
2100   __kmp_threads[gtid]->th.th_ident = loc;
2101 #endif
2102   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2103 
2104   if (!didit)
2105     (*cpy_func)(cpy_data, *data_ptr);
2106 
2107 // Consider next barrier a user-visible barrier for barrier region boundaries
2108 // Nesting checks are already handled by the single construct checks
2109 
2110 #if OMPT_SUPPORT
2111   if (ompt_enabled.enabled) {
2112     OMPT_STORE_RETURN_ADDRESS(gtid);
2113   }
2114 #endif
2115 #if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
                                          // tasks can overwrite the location)
2118 #endif
2119   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2120 #if OMPT_SUPPORT && OMPT_OPTIONAL
2121   if (ompt_enabled.enabled) {
2122     ompt_frame->enter_frame = ompt_data_none;
2123   }
2124 #endif
2125 }
2126 
2127 /* -------------------------------------------------------------------------- */
2128 
2129 #define INIT_LOCK __kmp_init_user_lock_with_checks
2130 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2131 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2132 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2133 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2134 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2135   __kmp_acquire_nested_user_lock_with_checks_timed
2136 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2137 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2138 #define TEST_LOCK __kmp_test_user_lock_with_checks
2139 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2140 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2141 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2142 
2143 // TODO: Make check abort messages use location info & pass it into
2144 // with_checks routines
2145 
2146 #if KMP_USE_DYNAMIC_LOCK
2147 
2148 // internal lock initializer
2149 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2150                                                     kmp_dyna_lockseq_t seq) {
2151   if (KMP_IS_D_LOCK(seq)) {
2152     KMP_INIT_D_LOCK(lock, seq);
2153 #if USE_ITT_BUILD
2154     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2155 #endif
2156   } else {
2157     KMP_INIT_I_LOCK(lock, seq);
2158 #if USE_ITT_BUILD
2159     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2160     __kmp_itt_lock_creating(ilk->lock, loc);
2161 #endif
2162   }
2163 }
2164 
2165 // internal nest lock initializer
2166 static __forceinline void
2167 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2168                                kmp_dyna_lockseq_t seq) {
2169 #if KMP_USE_TSX
2170   // Don't have nested lock implementation for speculative locks
2171   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2172     seq = __kmp_user_lock_seq;
2173 #endif
2174   switch (seq) {
2175   case lockseq_tas:
2176     seq = lockseq_nested_tas;
2177     break;
2178 #if KMP_USE_FUTEX
2179   case lockseq_futex:
2180     seq = lockseq_nested_futex;
2181     break;
2182 #endif
2183   case lockseq_ticket:
2184     seq = lockseq_nested_ticket;
2185     break;
2186   case lockseq_queuing:
2187     seq = lockseq_nested_queuing;
2188     break;
2189   case lockseq_drdpa:
2190     seq = lockseq_nested_drdpa;
2191     break;
2192   default:
2193     seq = lockseq_nested_queuing;
2194   }
2195   KMP_INIT_I_LOCK(lock, seq);
2196 #if USE_ITT_BUILD
2197   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2198   __kmp_itt_lock_creating(ilk->lock, loc);
2199 #endif
2200 }
2201 
2202 /* initialize the lock with a hint */
2203 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2204                                 uintptr_t hint) {
2205   KMP_DEBUG_ASSERT(__kmp_init_serial);
2206   if (__kmp_env_consistency_check && user_lock == NULL) {
2207     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2208   }
2209 
2210   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2211 
2212 #if OMPT_SUPPORT && OMPT_OPTIONAL
2213   // This is the case, if called from omp_init_lock_with_hint:
2214   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2215   if (!codeptr)
2216     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2217   if (ompt_enabled.ompt_callback_lock_init) {
2218     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2219         ompt_mutex_lock, (omp_lock_hint_t)hint,
2220         __ompt_get_mutex_impl_type(user_lock),
2221         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2222   }
2223 #endif
2224 }
2225 
2226 /* initialize the lock with a hint */
2227 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2228                                      void **user_lock, uintptr_t hint) {
2229   KMP_DEBUG_ASSERT(__kmp_init_serial);
2230   if (__kmp_env_consistency_check && user_lock == NULL) {
2231     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2232   }
2233 
2234   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2235 
2236 #if OMPT_SUPPORT && OMPT_OPTIONAL
2237   // This is the case, if called from omp_init_lock_with_hint:
2238   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2239   if (!codeptr)
2240     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2241   if (ompt_enabled.ompt_callback_lock_init) {
2242     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2243         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2244         __ompt_get_mutex_impl_type(user_lock),
2245         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2246   }
2247 #endif
2248 }
2249 
2250 #endif // KMP_USE_DYNAMIC_LOCK
2251 
2252 /* initialize the lock */
2253 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2254 #if KMP_USE_DYNAMIC_LOCK
2255 
2256   KMP_DEBUG_ASSERT(__kmp_init_serial);
2257   if (__kmp_env_consistency_check && user_lock == NULL) {
2258     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2259   }
2260   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2261 
2262 #if OMPT_SUPPORT && OMPT_OPTIONAL
2263   // This is the case, if called from omp_init_lock_with_hint:
2264   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2265   if (!codeptr)
2266     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2267   if (ompt_enabled.ompt_callback_lock_init) {
2268     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2269         ompt_mutex_lock, omp_lock_hint_none,
2270         __ompt_get_mutex_impl_type(user_lock),
2271         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2272   }
2273 #endif
2274 
2275 #else // KMP_USE_DYNAMIC_LOCK
2276 
2277   static char const *const func = "omp_init_lock";
2278   kmp_user_lock_p lck;
2279   KMP_DEBUG_ASSERT(__kmp_init_serial);
2280 
2281   if (__kmp_env_consistency_check) {
2282     if (user_lock == NULL) {
2283       KMP_FATAL(LockIsUninitialized, func);
2284     }
2285   }
2286 
2287   KMP_CHECK_USER_LOCK_INIT();
2288 
2289   if ((__kmp_user_lock_kind == lk_tas) &&
2290       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2291     lck = (kmp_user_lock_p)user_lock;
2292   }
2293 #if KMP_USE_FUTEX
2294   else if ((__kmp_user_lock_kind == lk_futex) &&
2295            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2296     lck = (kmp_user_lock_p)user_lock;
2297   }
2298 #endif
2299   else {
2300     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2301   }
2302   INIT_LOCK(lck);
2303   __kmp_set_user_lock_location(lck, loc);
2304 
2305 #if OMPT_SUPPORT && OMPT_OPTIONAL
2306   // This is the case, if called from omp_init_lock_with_hint:
2307   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2308   if (!codeptr)
2309     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2310   if (ompt_enabled.ompt_callback_lock_init) {
2311     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2312         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2313         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2314   }
2315 #endif
2316 
2317 #if USE_ITT_BUILD
2318   __kmp_itt_lock_creating(lck);
2319 #endif /* USE_ITT_BUILD */
2320 
2321 #endif // KMP_USE_DYNAMIC_LOCK
2322 } // __kmpc_init_lock
2323 
2324 /* initialize the lock */
2325 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2326 #if KMP_USE_DYNAMIC_LOCK
2327 
2328   KMP_DEBUG_ASSERT(__kmp_init_serial);
2329   if (__kmp_env_consistency_check && user_lock == NULL) {
2330     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2331   }
2332   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2333 
2334 #if OMPT_SUPPORT && OMPT_OPTIONAL
2335   // This is the case, if called from omp_init_lock_with_hint:
2336   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2337   if (!codeptr)
2338     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2339   if (ompt_enabled.ompt_callback_lock_init) {
2340     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2341         ompt_mutex_nest_lock, omp_lock_hint_none,
2342         __ompt_get_mutex_impl_type(user_lock),
2343         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2344   }
2345 #endif
2346 
2347 #else // KMP_USE_DYNAMIC_LOCK
2348 
2349   static char const *const func = "omp_init_nest_lock";
2350   kmp_user_lock_p lck;
2351   KMP_DEBUG_ASSERT(__kmp_init_serial);
2352 
2353   if (__kmp_env_consistency_check) {
2354     if (user_lock == NULL) {
2355       KMP_FATAL(LockIsUninitialized, func);
2356     }
2357   }
2358 
2359   KMP_CHECK_USER_LOCK_INIT();
2360 
2361   if ((__kmp_user_lock_kind == lk_tas) &&
2362       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2363        OMP_NEST_LOCK_T_SIZE)) {
2364     lck = (kmp_user_lock_p)user_lock;
2365   }
2366 #if KMP_USE_FUTEX
2367   else if ((__kmp_user_lock_kind == lk_futex) &&
2368            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2369             OMP_NEST_LOCK_T_SIZE)) {
2370     lck = (kmp_user_lock_p)user_lock;
2371   }
2372 #endif
2373   else {
2374     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2375   }
2376 
2377   INIT_NESTED_LOCK(lck);
2378   __kmp_set_user_lock_location(lck, loc);
2379 
2380 #if OMPT_SUPPORT && OMPT_OPTIONAL
2381   // This is the case, if called from omp_init_lock_with_hint:
2382   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2383   if (!codeptr)
2384     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2385   if (ompt_enabled.ompt_callback_lock_init) {
2386     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2387         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2388         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2389   }
2390 #endif
2391 
2392 #if USE_ITT_BUILD
2393   __kmp_itt_lock_creating(lck);
2394 #endif /* USE_ITT_BUILD */
2395 
2396 #endif // KMP_USE_DYNAMIC_LOCK
2397 } // __kmpc_init_nest_lock
2398 
2399 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2400 #if KMP_USE_DYNAMIC_LOCK
2401 
2402 #if USE_ITT_BUILD
2403   kmp_user_lock_p lck;
2404   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2405     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2406   } else {
2407     lck = (kmp_user_lock_p)user_lock;
2408   }
2409   __kmp_itt_lock_destroyed(lck);
2410 #endif
2411 #if OMPT_SUPPORT && OMPT_OPTIONAL
2412   // This is the case, if called from omp_init_lock_with_hint:
2413   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2414   if (!codeptr)
2415     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2416   if (ompt_enabled.ompt_callback_lock_destroy) {
2417     kmp_user_lock_p lck;
2418     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2419       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2420     } else {
2421       lck = (kmp_user_lock_p)user_lock;
2422     }
2423     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2424         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2425   }
2426 #endif
2427   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2428 #else
2429   kmp_user_lock_p lck;
2430 
2431   if ((__kmp_user_lock_kind == lk_tas) &&
2432       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2433     lck = (kmp_user_lock_p)user_lock;
2434   }
2435 #if KMP_USE_FUTEX
2436   else if ((__kmp_user_lock_kind == lk_futex) &&
2437            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2438     lck = (kmp_user_lock_p)user_lock;
2439   }
2440 #endif
2441   else {
2442     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2443   }
2444 
2445 #if OMPT_SUPPORT && OMPT_OPTIONAL
2446   // This is the case, if called from omp_init_lock_with_hint:
2447   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2448   if (!codeptr)
2449     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2450   if (ompt_enabled.ompt_callback_lock_destroy) {
2451     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2452         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2453   }
2454 #endif
2455 
2456 #if USE_ITT_BUILD
2457   __kmp_itt_lock_destroyed(lck);
2458 #endif /* USE_ITT_BUILD */
2459   DESTROY_LOCK(lck);
2460 
2461   if ((__kmp_user_lock_kind == lk_tas) &&
2462       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2463     ;
2464   }
2465 #if KMP_USE_FUTEX
2466   else if ((__kmp_user_lock_kind == lk_futex) &&
2467            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2468     ;
2469   }
2470 #endif
2471   else {
2472     __kmp_user_lock_free(user_lock, gtid, lck);
2473   }
2474 #endif // KMP_USE_DYNAMIC_LOCK
2475 } // __kmpc_destroy_lock
2476 
2477 /* destroy the lock */
2478 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2479 #if KMP_USE_DYNAMIC_LOCK
2480 
2481 #if USE_ITT_BUILD
2482   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2483   __kmp_itt_lock_destroyed(ilk->lock);
2484 #endif
2485 #if OMPT_SUPPORT && OMPT_OPTIONAL
2486   // This is the case, if called from omp_init_lock_with_hint:
2487   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2488   if (!codeptr)
2489     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2490   if (ompt_enabled.ompt_callback_lock_destroy) {
2491     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2492         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2493   }
2494 #endif
2495   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2496 
2497 #else // KMP_USE_DYNAMIC_LOCK
2498 
2499   kmp_user_lock_p lck;
2500 
2501   if ((__kmp_user_lock_kind == lk_tas) &&
2502       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2503        OMP_NEST_LOCK_T_SIZE)) {
2504     lck = (kmp_user_lock_p)user_lock;
2505   }
2506 #if KMP_USE_FUTEX
2507   else if ((__kmp_user_lock_kind == lk_futex) &&
2508            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2509             OMP_NEST_LOCK_T_SIZE)) {
2510     lck = (kmp_user_lock_p)user_lock;
2511   }
2512 #endif
2513   else {
2514     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2515   }
2516 
2517 #if OMPT_SUPPORT && OMPT_OPTIONAL
2518   // This is the case, if called from omp_init_lock_with_hint:
2519   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2520   if (!codeptr)
2521     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2522   if (ompt_enabled.ompt_callback_lock_destroy) {
2523     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2524         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2525   }
2526 #endif
2527 
2528 #if USE_ITT_BUILD
2529   __kmp_itt_lock_destroyed(lck);
2530 #endif /* USE_ITT_BUILD */
2531 
2532   DESTROY_NESTED_LOCK(lck);
2533 
2534   if ((__kmp_user_lock_kind == lk_tas) &&
2535       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2536        OMP_NEST_LOCK_T_SIZE)) {
2537     ;
2538   }
2539 #if KMP_USE_FUTEX
2540   else if ((__kmp_user_lock_kind == lk_futex) &&
2541            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2542             OMP_NEST_LOCK_T_SIZE)) {
2543     ;
2544   }
2545 #endif
2546   else {
2547     __kmp_user_lock_free(user_lock, gtid, lck);
2548   }
2549 #endif // KMP_USE_DYNAMIC_LOCK
2550 } // __kmpc_destroy_nest_lock
2551 
2552 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2553   KMP_COUNT_BLOCK(OMP_set_lock);
2554 #if KMP_USE_DYNAMIC_LOCK
2555   int tag = KMP_EXTRACT_D_TAG(user_lock);
2556 #if USE_ITT_BUILD
2557   __kmp_itt_lock_acquiring(
2558       (kmp_user_lock_p)
2559           user_lock); // itt function will get to the right lock object.
2560 #endif
2561 #if OMPT_SUPPORT && OMPT_OPTIONAL
2562   // This is the case, if called from omp_init_lock_with_hint:
2563   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2564   if (!codeptr)
2565     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2566   if (ompt_enabled.ompt_callback_mutex_acquire) {
2567     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2568         ompt_mutex_lock, omp_lock_hint_none,
2569         __ompt_get_mutex_impl_type(user_lock),
2570         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2571   }
2572 #endif
2573 #if KMP_USE_INLINED_TAS
2574   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2575     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2576   } else
2577 #elif KMP_USE_INLINED_FUTEX
2578   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2579     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2580   } else
2581 #endif
2582   {
2583     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2584   }
2585 #if USE_ITT_BUILD
2586   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2587 #endif
2588 #if OMPT_SUPPORT && OMPT_OPTIONAL
2589   if (ompt_enabled.ompt_callback_mutex_acquired) {
2590     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2591         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2592   }
2593 #endif
2594 
2595 #else // KMP_USE_DYNAMIC_LOCK
2596 
2597   kmp_user_lock_p lck;
2598 
2599   if ((__kmp_user_lock_kind == lk_tas) &&
2600       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2601     lck = (kmp_user_lock_p)user_lock;
2602   }
2603 #if KMP_USE_FUTEX
2604   else if ((__kmp_user_lock_kind == lk_futex) &&
2605            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2606     lck = (kmp_user_lock_p)user_lock;
2607   }
2608 #endif
2609   else {
2610     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2611   }
2612 
2613 #if USE_ITT_BUILD
2614   __kmp_itt_lock_acquiring(lck);
2615 #endif /* USE_ITT_BUILD */
2616 #if OMPT_SUPPORT && OMPT_OPTIONAL
2617   // This is the case, if called from omp_init_lock_with_hint:
2618   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2619   if (!codeptr)
2620     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2621   if (ompt_enabled.ompt_callback_mutex_acquire) {
2622     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2623         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2624         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2625   }
2626 #endif
2627 
2628   ACQUIRE_LOCK(lck, gtid);
2629 
2630 #if USE_ITT_BUILD
2631   __kmp_itt_lock_acquired(lck);
2632 #endif /* USE_ITT_BUILD */
2633 
2634 #if OMPT_SUPPORT && OMPT_OPTIONAL
2635   if (ompt_enabled.ompt_callback_mutex_acquired) {
2636     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2637         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2638   }
2639 #endif
2640 
2641 #endif // KMP_USE_DYNAMIC_LOCK
2642 }
2643 
2644 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2645 #if KMP_USE_DYNAMIC_LOCK
2646 
2647 #if USE_ITT_BUILD
2648   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2649 #endif
2650 #if OMPT_SUPPORT && OMPT_OPTIONAL
2651   // This is the case, if called from omp_init_lock_with_hint:
2652   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2653   if (!codeptr)
2654     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2655   if (ompt_enabled.enabled) {
2656     if (ompt_enabled.ompt_callback_mutex_acquire) {
2657       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2658           ompt_mutex_nest_lock, omp_lock_hint_none,
2659           __ompt_get_mutex_impl_type(user_lock),
2660           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2661     }
2662   }
2663 #endif
2664   int acquire_status =
2665       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2666   (void) acquire_status;
2667 #if USE_ITT_BUILD
2668   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2669 #endif
2670 
2671 #if OMPT_SUPPORT && OMPT_OPTIONAL
2672   if (ompt_enabled.enabled) {
2673     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2674       if (ompt_enabled.ompt_callback_mutex_acquired) {
2675         // lock_first
2676         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2677             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2678             codeptr);
2679       }
2680     } else {
2681       if (ompt_enabled.ompt_callback_nest_lock) {
2682         // lock_next
2683         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2684             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2685       }
2686     }
2687   }
2688 #endif
2689 
2690 #else // KMP_USE_DYNAMIC_LOCK
2691   int acquire_status;
2692   kmp_user_lock_p lck;
2693 
2694   if ((__kmp_user_lock_kind == lk_tas) &&
2695       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2696        OMP_NEST_LOCK_T_SIZE)) {
2697     lck = (kmp_user_lock_p)user_lock;
2698   }
2699 #if KMP_USE_FUTEX
2700   else if ((__kmp_user_lock_kind == lk_futex) &&
2701            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2702             OMP_NEST_LOCK_T_SIZE)) {
2703     lck = (kmp_user_lock_p)user_lock;
2704   }
2705 #endif
2706   else {
2707     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2708   }
2709 
2710 #if USE_ITT_BUILD
2711   __kmp_itt_lock_acquiring(lck);
2712 #endif /* USE_ITT_BUILD */
2713 #if OMPT_SUPPORT && OMPT_OPTIONAL
2714   // This is the case, if called from omp_init_lock_with_hint:
2715   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2716   if (!codeptr)
2717     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2718   if (ompt_enabled.enabled) {
2719     if (ompt_enabled.ompt_callback_mutex_acquire) {
2720       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2721           ompt_mutex_nest_lock, omp_lock_hint_none,
2722           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2723           codeptr);
2724     }
2725   }
2726 #endif
2727 
2728   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2729 
2730 #if USE_ITT_BUILD
2731   __kmp_itt_lock_acquired(lck);
2732 #endif /* USE_ITT_BUILD */
2733 
2734 #if OMPT_SUPPORT && OMPT_OPTIONAL
2735   if (ompt_enabled.enabled) {
2736     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2737       if (ompt_enabled.ompt_callback_mutex_acquired) {
2738         // lock_first
2739         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2740             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2741       }
2742     } else {
2743       if (ompt_enabled.ompt_callback_nest_lock) {
2744         // lock_next
2745         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2746             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2747       }
2748     }
2749   }
2750 #endif
2751 
2752 #endif // KMP_USE_DYNAMIC_LOCK
2753 }
2754 
2755 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2756 #if KMP_USE_DYNAMIC_LOCK
2757 
2758   int tag = KMP_EXTRACT_D_TAG(user_lock);
2759 #if USE_ITT_BUILD
2760   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2761 #endif
2762 #if KMP_USE_INLINED_TAS
2763   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2764     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2765   } else
2766 #elif KMP_USE_INLINED_FUTEX
2767   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2768     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2769   } else
2770 #endif
2771   {
2772     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2773   }
2774 
2775 #if OMPT_SUPPORT && OMPT_OPTIONAL
2776   // This is the case, if called from omp_init_lock_with_hint:
2777   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2778   if (!codeptr)
2779     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2780   if (ompt_enabled.ompt_callback_mutex_released) {
2781     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2782         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2783   }
2784 #endif
2785 
2786 #else // KMP_USE_DYNAMIC_LOCK
2787 
2788   kmp_user_lock_p lck;
2789 
2790   /* Can't use serial interval since not block structured */
2791   /* release the lock */
2792 
2793   if ((__kmp_user_lock_kind == lk_tas) &&
2794       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2795 #if KMP_OS_LINUX &&                                                            \
2796     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2797 // "fast" path implemented to fix customer performance issue
2798 #if USE_ITT_BUILD
2799     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2800 #endif /* USE_ITT_BUILD */
2801     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2802     KMP_MB();
2803 
2804 #if OMPT_SUPPORT && OMPT_OPTIONAL
2805     // This is the case, if called from omp_init_lock_with_hint:
2806     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2807     if (!codeptr)
2808       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2809     if (ompt_enabled.ompt_callback_mutex_released) {
2810       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2811           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2812     }
2813 #endif
2814 
2815     return;
2816 #else
2817     lck = (kmp_user_lock_p)user_lock;
2818 #endif
2819   }
2820 #if KMP_USE_FUTEX
2821   else if ((__kmp_user_lock_kind == lk_futex) &&
2822            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2823     lck = (kmp_user_lock_p)user_lock;
2824   }
2825 #endif
2826   else {
2827     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2828   }
2829 
2830 #if USE_ITT_BUILD
2831   __kmp_itt_lock_releasing(lck);
2832 #endif /* USE_ITT_BUILD */
2833 
2834   RELEASE_LOCK(lck, gtid);
2835 
2836 #if OMPT_SUPPORT && OMPT_OPTIONAL
2837   // This is the case, if called from omp_init_lock_with_hint:
2838   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2839   if (!codeptr)
2840     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2841   if (ompt_enabled.ompt_callback_mutex_released) {
2842     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2843         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2844   }
2845 #endif
2846 
2847 #endif // KMP_USE_DYNAMIC_LOCK
2848 }
2849 
2850 /* release the lock */
2851 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2852 #if KMP_USE_DYNAMIC_LOCK
2853 
2854 #if USE_ITT_BUILD
2855   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2856 #endif
2857   int release_status =
2858       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2859   (void) release_status;
2860 
2861 #if OMPT_SUPPORT && OMPT_OPTIONAL
2862   // This is the case, if called from omp_init_lock_with_hint:
2863   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2864   if (!codeptr)
2865     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2866   if (ompt_enabled.enabled) {
2867     if (release_status == KMP_LOCK_RELEASED) {
2868       if (ompt_enabled.ompt_callback_mutex_released) {
2869         // release_lock_last
2870         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2871             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2872             codeptr);
2873       }
2874     } else if (ompt_enabled.ompt_callback_nest_lock) {
2875       // release_lock_prev
2876       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2877           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2878     }
2879   }
2880 #endif
2881 
2882 #else // KMP_USE_DYNAMIC_LOCK
2883 
2884   kmp_user_lock_p lck;
2885 
2886   /* Can't use serial interval since not block structured */
2887 
2888   if ((__kmp_user_lock_kind == lk_tas) &&
2889       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2890        OMP_NEST_LOCK_T_SIZE)) {
2891 #if KMP_OS_LINUX &&                                                            \
2892     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2893     // "fast" path implemented to fix customer performance issue
2894     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2895 #if USE_ITT_BUILD
2896     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2897 #endif /* USE_ITT_BUILD */
2898 
2899 #if OMPT_SUPPORT && OMPT_OPTIONAL
2900     int release_status = KMP_LOCK_STILL_HELD;
2901 #endif
2902 
2903     if (--(tl->lk.depth_locked) == 0) {
2904       TCW_4(tl->lk.poll, 0);
2905 #if OMPT_SUPPORT && OMPT_OPTIONAL
2906       release_status = KMP_LOCK_RELEASED;
2907 #endif
2908     }
2909     KMP_MB();
2910 
2911 #if OMPT_SUPPORT && OMPT_OPTIONAL
2912     // This is the case, if called from omp_init_lock_with_hint:
2913     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2914     if (!codeptr)
2915       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2916     if (ompt_enabled.enabled) {
2917       if (release_status == KMP_LOCK_RELEASED) {
2918         if (ompt_enabled.ompt_callback_mutex_released) {
2919           // release_lock_last
2920           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2921               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2922         }
2923       } else if (ompt_enabled.ompt_callback_nest_lock) {
2924         // release_lock_previous
2925         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2926             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2927       }
2928     }
2929 #endif
2930 
2931     return;
2932 #else
2933     lck = (kmp_user_lock_p)user_lock;
2934 #endif
2935   }
2936 #if KMP_USE_FUTEX
2937   else if ((__kmp_user_lock_kind == lk_futex) &&
2938            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2939             OMP_NEST_LOCK_T_SIZE)) {
2940     lck = (kmp_user_lock_p)user_lock;
2941   }
2942 #endif
2943   else {
2944     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2945   }
2946 
2947 #if USE_ITT_BUILD
2948   __kmp_itt_lock_releasing(lck);
2949 #endif /* USE_ITT_BUILD */
2950 
  int release_status = RELEASE_NESTED_LOCK(lck, gtid);
2953 #if OMPT_SUPPORT && OMPT_OPTIONAL
2954   // This is the case, if called from omp_init_lock_with_hint:
2955   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2956   if (!codeptr)
2957     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2958   if (ompt_enabled.enabled) {
2959     if (release_status == KMP_LOCK_RELEASED) {
2960       if (ompt_enabled.ompt_callback_mutex_released) {
2961         // release_lock_last
2962         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2963             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2964       }
2965     } else if (ompt_enabled.ompt_callback_nest_lock) {
2966       // release_lock_previous
2967       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2968           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2969     }
2970   }
2971 #endif
2972 
2973 #endif // KMP_USE_DYNAMIC_LOCK
2974 }
2975 
2976 /* try to acquire the lock */
2977 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2978   KMP_COUNT_BLOCK(OMP_test_lock);
2979 
2980 #if KMP_USE_DYNAMIC_LOCK
2981   int rc;
2982   int tag = KMP_EXTRACT_D_TAG(user_lock);
2983 #if USE_ITT_BUILD
2984   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2985 #endif
2986 #if OMPT_SUPPORT && OMPT_OPTIONAL
2987   // This is the case, if called from omp_init_lock_with_hint:
2988   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2989   if (!codeptr)
2990     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2991   if (ompt_enabled.ompt_callback_mutex_acquire) {
2992     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2993         ompt_mutex_lock, omp_lock_hint_none,
2994         __ompt_get_mutex_impl_type(user_lock),
2995         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2996   }
2997 #endif
2998 #if KMP_USE_INLINED_TAS
2999   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3000     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3001   } else
3002 #elif KMP_USE_INLINED_FUTEX
3003   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3004     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3005   } else
3006 #endif
3007   {
3008     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3009   }
3010   if (rc) {
3011 #if USE_ITT_BUILD
3012     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3013 #endif
3014 #if OMPT_SUPPORT && OMPT_OPTIONAL
3015     if (ompt_enabled.ompt_callback_mutex_acquired) {
3016       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3017           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3018     }
3019 #endif
3020     return FTN_TRUE;
3021   } else {
3022 #if USE_ITT_BUILD
3023     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3024 #endif
3025     return FTN_FALSE;
3026   }
3027 
3028 #else // KMP_USE_DYNAMIC_LOCK
3029 
3030   kmp_user_lock_p lck;
3031   int rc;
3032 
3033   if ((__kmp_user_lock_kind == lk_tas) &&
3034       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3035     lck = (kmp_user_lock_p)user_lock;
3036   }
3037 #if KMP_USE_FUTEX
3038   else if ((__kmp_user_lock_kind == lk_futex) &&
3039            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3040     lck = (kmp_user_lock_p)user_lock;
3041   }
3042 #endif
3043   else {
3044     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3045   }
3046 
3047 #if USE_ITT_BUILD
3048   __kmp_itt_lock_acquiring(lck);
3049 #endif /* USE_ITT_BUILD */
3050 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3052   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3053   if (!codeptr)
3054     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3055   if (ompt_enabled.ompt_callback_mutex_acquire) {
3056     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3057         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3058         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3059   }
3060 #endif
3061 
3062   rc = TEST_LOCK(lck, gtid);
3063 #if USE_ITT_BUILD
3064   if (rc) {
3065     __kmp_itt_lock_acquired(lck);
3066   } else {
3067     __kmp_itt_lock_cancelled(lck);
3068   }
3069 #endif /* USE_ITT_BUILD */
3070 #if OMPT_SUPPORT && OMPT_OPTIONAL
3071   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3072     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3073         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3074   }
3075 #endif
3076 
3077   return (rc ? FTN_TRUE : FTN_FALSE);
3078 
3079 /* Can't use serial interval since not block structured */
3080 
3081 #endif // KMP_USE_DYNAMIC_LOCK
3082 }
3083 
3084 /* try to acquire the lock */
3085 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3086 #if KMP_USE_DYNAMIC_LOCK
3087   int rc;
3088 #if USE_ITT_BUILD
3089   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3090 #endif
3091 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3093   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3094   if (!codeptr)
3095     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3096   if (ompt_enabled.ompt_callback_mutex_acquire) {
3097     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3098         ompt_mutex_nest_lock, omp_lock_hint_none,
3099         __ompt_get_mutex_impl_type(user_lock),
3100         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3101   }
3102 #endif
3103   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3104 #if USE_ITT_BUILD
3105   if (rc) {
3106     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3107   } else {
3108     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3109   }
3110 #endif
3111 #if OMPT_SUPPORT && OMPT_OPTIONAL
3112   if (ompt_enabled.enabled && rc) {
3113     if (rc == 1) {
3114       if (ompt_enabled.ompt_callback_mutex_acquired) {
3115         // lock_first
3116         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3117             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3118             codeptr);
3119       }
3120     } else {
3121       if (ompt_enabled.ompt_callback_nest_lock) {
3122         // lock_next
3123         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3124             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3125       }
3126     }
3127   }
3128 #endif
3129   return rc;
3130 
3131 #else // KMP_USE_DYNAMIC_LOCK
3132 
3133   kmp_user_lock_p lck;
3134   int rc;
3135 
3136   if ((__kmp_user_lock_kind == lk_tas) &&
3137       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3138        OMP_NEST_LOCK_T_SIZE)) {
3139     lck = (kmp_user_lock_p)user_lock;
3140   }
3141 #if KMP_USE_FUTEX
3142   else if ((__kmp_user_lock_kind == lk_futex) &&
3143            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3144             OMP_NEST_LOCK_T_SIZE)) {
3145     lck = (kmp_user_lock_p)user_lock;
3146   }
3147 #endif
3148   else {
3149     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3150   }
3151 
3152 #if USE_ITT_BUILD
3153   __kmp_itt_lock_acquiring(lck);
3154 #endif /* USE_ITT_BUILD */
3155 
3156 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3158   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3159   if (!codeptr)
3160     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
        codeptr);
  }
3168 #endif
3169 
3170   rc = TEST_NESTED_LOCK(lck, gtid);
3171 #if USE_ITT_BUILD
3172   if (rc) {
3173     __kmp_itt_lock_acquired(lck);
3174   } else {
3175     __kmp_itt_lock_cancelled(lck);
3176   }
3177 #endif /* USE_ITT_BUILD */
3178 #if OMPT_SUPPORT && OMPT_OPTIONAL
3179   if (ompt_enabled.enabled && rc) {
3180     if (rc == 1) {
3181       if (ompt_enabled.ompt_callback_mutex_acquired) {
3182         // lock_first
3183         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3184             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3185       }
3186     } else {
3187       if (ompt_enabled.ompt_callback_nest_lock) {
3188         // lock_next
3189         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3190             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3191       }
3192     }
3193   }
3194 #endif
3195   return rc;
3196 
3197 /* Can't use serial interval since not block structured */
3198 
3199 #endif // KMP_USE_DYNAMIC_LOCK
3200 }
3201 
3202 // Interface to fast scalable reduce methods routines
3203 
// keep the selected method in a thread-local structure for cross-function
// usage: it will be used in the __kmpc_end_reduce* functions;
// another solution: re-determine the method one more time in the
// __kmpc_end_reduce* functions (a new prototype would be required then)
// AT: which solution is better?
3209 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3210   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3211 
3212 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3213   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3214 
3215 // description of the packed_reduction_method variable: look at the macros in
3216 // kmp.h
3217 
3218 // used in a critical section reduce block
3219 static __forceinline void
3220 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3221                                           kmp_critical_name *crit) {
3222 
  // this lock was visible to a customer and to the threading profile tool as a
  // serial overhead span (although it's used for an internal purpose only)
  //            why was it visible in the previous implementation?
  //            should we keep it visible in the new reduce block?
3227   kmp_user_lock_p lck;
3228 
3229 #if KMP_USE_DYNAMIC_LOCK
3230 
3231   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3232   // Check if it is initialized.
3233   if (*lk == 0) {
3234     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3235       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3236                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3237     } else {
3238       __kmp_init_indirect_csptr(crit, loc, global_tid,
3239                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3240     }
3241   }
3242   // Branch for accessing the actual lock object and set operation. This
3243   // branching is inevitable since this lock initialization does not follow the
3244   // normal dispatch path (lock table is not used).
3245   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3246     lck = (kmp_user_lock_p)lk;
3247     KMP_DEBUG_ASSERT(lck != NULL);
3248     if (__kmp_env_consistency_check) {
3249       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3250     }
3251     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3252   } else {
3253     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3254     lck = ilk->lock;
3255     KMP_DEBUG_ASSERT(lck != NULL);
3256     if (__kmp_env_consistency_check) {
3257       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3258     }
3259     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3260   }
3261 
3262 #else // KMP_USE_DYNAMIC_LOCK
3263 
3264   // We know that the fast reduction code is only emitted by Intel compilers
3265   // with 32 byte critical sections. If there isn't enough space, then we
3266   // have to use a pointer.
3267   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3268     lck = (kmp_user_lock_p)crit;
3269   } else {
3270     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3271   }
3272   KMP_DEBUG_ASSERT(lck != NULL);
3273 
3274   if (__kmp_env_consistency_check)
3275     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3276 
3277   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3278 
3279 #endif // KMP_USE_DYNAMIC_LOCK
3280 }
3281 
3282 // used in a critical section reduce block
3283 static __forceinline void
3284 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3285                                         kmp_critical_name *crit) {
3286 
3287   kmp_user_lock_p lck;
3288 
3289 #if KMP_USE_DYNAMIC_LOCK
3290 
3291   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3292     lck = (kmp_user_lock_p)crit;
3293     if (__kmp_env_consistency_check)
3294       __kmp_pop_sync(global_tid, ct_critical, loc);
3295     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3296   } else {
3297     kmp_indirect_lock_t *ilk =
3298         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3299     if (__kmp_env_consistency_check)
3300       __kmp_pop_sync(global_tid, ct_critical, loc);
3301     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3302   }
3303 
3304 #else // KMP_USE_DYNAMIC_LOCK
3305 
3306   // We know that the fast reduction code is only emitted by Intel compilers
3307   // with 32 byte critical sections. If there isn't enough space, then we have
3308   // to use a pointer.
3309   if (__kmp_base_user_lock_size > 32) {
3310     lck = *((kmp_user_lock_p *)crit);
3311     KMP_ASSERT(lck != NULL);
3312   } else {
3313     lck = (kmp_user_lock_p)crit;
3314   }
3315 
3316   if (__kmp_env_consistency_check)
3317     __kmp_pop_sync(global_tid, ct_critical, loc);
3318 
3319   __kmp_release_user_lock_with_checks(lck, global_tid);
3320 
3321 #endif // KMP_USE_DYNAMIC_LOCK
3322 } // __kmp_end_critical_section_reduce_block
3323 
3324 static __forceinline int
3325 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3326                                      int *task_state) {
3327   kmp_team_t *team;
3328 
  // Check whether we are inside a teams construct.
3330   if (th->th.th_teams_microtask) {
3331     *team_p = team = th->th.th_team;
3332     if (team->t.t_level == th->th.th_teams_level) {
3333       // This is reduction at teams construct.
3334       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3335       // Let's swap teams temporarily for the reduction.
3336       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3337       th->th.th_team = team->t.t_parent;
3338       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3339       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3340       *task_state = th->th.th_task_state;
3341       th->th.th_task_state = 0;
3342 
3343       return 1;
3344     }
3345   }
3346   return 0;
3347 }
3348 
3349 static __forceinline void
3350 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3351   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3352   th->th.th_info.ds.ds_tid = 0;
3353   th->th.th_team = team;
3354   th->th.th_team_nproc = team->t.t_nproc;
3355   th->th.th_task_team = team->t.t_task_team[task_state];
3356   th->th.th_task_state = task_state;
3357 }
3358 
3359 /* 2.a.i. Reduce Block without a terminating barrier */
3360 /*!
3361 @ingroup SYNCHRONIZATION
3362 @param loc source location information
3363 @param global_tid global thread number
3364 @param num_vars number of items (variables) to be reduced
3365 @param reduce_size size of data in bytes to be reduced
3366 @param reduce_data pointer to data to be reduced
3367 @param reduce_func callback function providing reduction operation on two
3368 operands and returning result of reduction in lhs_data
3369 @param lck pointer to the unique lock data structure
3370 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3371 threads if atomic reduction needed
3372 
3373 The nowait version is used for a reduce clause with the nowait argument.
3374 */
3375 kmp_int32
3376 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3377                      size_t reduce_size, void *reduce_data,
3378                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3379                      kmp_critical_name *lck) {
3380 
3381   KMP_COUNT_BLOCK(REDUCE_nowait);
3382   int retval = 0;
3383   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3384   kmp_info_t *th;
3385   kmp_team_t *team;
3386   int teams_swapped = 0, task_state;
3387   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3388 
3389   // why do we need this initialization here at all?
  // Reduction clause cannot be used as a stand-alone directive.
3391 
3392   // do not call __kmp_serial_initialize(), it will be called by
3393   // __kmp_parallel_initialize() if needed
3394   // possible detection of false-positive race by the threadchecker ???
3395   if (!TCR_4(__kmp_init_parallel))
3396     __kmp_parallel_initialize();
3397 
3398   __kmp_resume_if_soft_paused();
3399 
3400 // check correctness of reduce block nesting
3401 #if KMP_USE_DYNAMIC_LOCK
3402   if (__kmp_env_consistency_check)
3403     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3404 #else
3405   if (__kmp_env_consistency_check)
3406     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3407 #endif
3408 
3409   th = __kmp_thread_from_gtid(global_tid);
3410   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3411 
  // packed_reduction_method value will be reused by the __kmpc_end_reduce*
  // functions, so the value should be kept in a variable
  // the variable should be either a construct-specific or thread-specific
  // property, not a team-specific property
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on the next construct)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unnecessary extra syncs would be needed)
  // a thread-specific variable is better regarding the two issues above (next
  // construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes the 'determine' and 'set' lines (no need to execute
  // them by a single thread, which avoids unnecessary extra syncs)
3427 
3428   packed_reduction_method = __kmp_determine_reduction_method(
3429       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3430   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3431 
3432   if (packed_reduction_method == critical_reduce_block) {
3433 
3434     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3435     retval = 1;
3436 
3437   } else if (packed_reduction_method == empty_reduce_block) {
3438 
3439     // usage: if team size == 1, no synchronization is required ( Intel
3440     // platforms only )
3441     retval = 1;
3442 
3443   } else if (packed_reduction_method == atomic_reduce_block) {
3444 
3445     retval = 2;
3446 
    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (it's not ideal, because the checking block is closed by this 'pop'
    //      while the atomic operation has not been executed yet; it will be
    //      executed slightly later, literally on the next instruction)
3453     if (__kmp_env_consistency_check)
3454       __kmp_pop_sync(global_tid, ct_reduce, loc);
3455 
3456   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3457                                    tree_reduce_block)) {
3458 
// AT: performance issue: a real barrier here
// AT:     (if the master goes slow, other threads are blocked here waiting for
// the master to come and release them)
// AT:     (it's not what a customer might expect when specifying the NOWAIT
// clause)
// AT:     (specifying NOWAIT won't result in improved performance; it will
// just be confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other
// design) might go faster and be more in line with the sense of NOWAIT
// AT: TO DO: run the EPCC tests and compare times
3468 
3469 // this barrier should be invisible to a customer and to the threading profile
3470 // tool (it's neither a terminating barrier nor customer's code, it's
3471 // used for an internal purpose)
3472 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
3474     // JP: as long as there is a barrier in the implementation, OMPT should and
3475     // will provide the barrier events
3476     //         so we set-up the necessary frame/return addresses.
3477     ompt_frame_t *ompt_frame;
3478     if (ompt_enabled.enabled) {
3479       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3480       if (ompt_frame->enter_frame.ptr == NULL)
3481         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3482       OMPT_STORE_RETURN_ADDRESS(global_tid);
3483     }
3484 #endif
3485 #if USE_ITT_NOTIFY
3486     __kmp_threads[global_tid]->th.th_ident = loc;
3487 #endif
3488     retval =
3489         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3490                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3491     retval = (retval != 0) ? (0) : (1);
3492 #if OMPT_SUPPORT && OMPT_OPTIONAL
3493     if (ompt_enabled.enabled) {
3494       ompt_frame->enter_frame = ompt_data_none;
3495     }
3496 #endif
3497 
    // all other workers except the master should do this pop here
    //     (none of the other workers will get to __kmpc_end_reduce_nowait())
3500     if (__kmp_env_consistency_check) {
3501       if (retval == 0) {
3502         __kmp_pop_sync(global_tid, ct_reduce, loc);
3503       }
3504     }
3505 
3506   } else {
3507 
3508     // should never reach this block
3509     KMP_ASSERT(0); // "unexpected method"
3510   }
3511   if (teams_swapped) {
3512     __kmp_restore_swapped_teams(th, team, task_state);
3513   }
3514   KA_TRACE(
3515       10,
3516       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3517        global_tid, packed_reduction_method, retval));
3518 
3519   return retval;
3520 }
3521 
3522 /*!
3523 @ingroup SYNCHRONIZATION
3524 @param loc source location information
3525 @param global_tid global thread id.
3526 @param lck pointer to the unique lock data structure
3527 
3528 Finish the execution of a reduce nowait.
3529 */
3530 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3531                               kmp_critical_name *lck) {
3532 
3533   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3534 
3535   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3536 
3537   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3538 
3539   if (packed_reduction_method == critical_reduce_block) {
3540 
3541     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3542 
3543   } else if (packed_reduction_method == empty_reduce_block) {
3544 
3545     // usage: if team size == 1, no synchronization is required ( on Intel
3546     // platforms only )
3547 
3548   } else if (packed_reduction_method == atomic_reduce_block) {
3549 
    // neither master nor other workers should get here
    //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it's better to remove this else-if entirely;
    // after removal this value will be checked by the 'else' and will assert
3554 
3555   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3556                                    tree_reduce_block)) {
3557 
3558     // only master gets here
3559 
3560   } else {
3561 
3562     // should never reach this block
3563     KMP_ASSERT(0); // "unexpected method"
3564   }
3565 
3566   if (__kmp_env_consistency_check)
3567     __kmp_pop_sync(global_tid, ct_reduce, loc);
3568 
3569   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3570                 global_tid, packed_reduction_method));
3571 
3572   return;
3573 }
3574 
3575 /* 2.a.ii. Reduce Block with a terminating barrier */
3576 
3577 /*!
3578 @ingroup SYNCHRONIZATION
3579 @param loc source location information
3580 @param global_tid global thread number
3581 @param num_vars number of items (variables) to be reduced
3582 @param reduce_size size of data in bytes to be reduced
3583 @param reduce_data pointer to data to be reduced
3584 @param reduce_func callback function providing reduction operation on two
3585 operands and returning result of reduction in lhs_data
3586 @param lck pointer to the unique lock data structure
3587 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3588 threads if atomic reduction needed
3589 
3590 A blocking reduce that includes an implicit barrier.
3591 */
3592 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3593                         size_t reduce_size, void *reduce_data,
3594                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3595                         kmp_critical_name *lck) {
3596   KMP_COUNT_BLOCK(REDUCE_wait);
3597   int retval = 0;
3598   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3599   kmp_info_t *th;
3600   kmp_team_t *team;
3601   int teams_swapped = 0, task_state;
3602 
3603   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3604 
3605   // why do we need this initialization here at all?
  // Reduction clause cannot be a stand-alone directive.
3607 
3608   // do not call __kmp_serial_initialize(), it will be called by
3609   // __kmp_parallel_initialize() if needed
3610   // possible detection of false-positive race by the threadchecker ???
3611   if (!TCR_4(__kmp_init_parallel))
3612     __kmp_parallel_initialize();
3613 
3614   __kmp_resume_if_soft_paused();
3615 
3616 // check correctness of reduce block nesting
3617 #if KMP_USE_DYNAMIC_LOCK
3618   if (__kmp_env_consistency_check)
3619     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3620 #else
3621   if (__kmp_env_consistency_check)
3622     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3623 #endif
3624 
3625   th = __kmp_thread_from_gtid(global_tid);
3626   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3627 
3628   packed_reduction_method = __kmp_determine_reduction_method(
3629       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3630   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3631 
3632   if (packed_reduction_method == critical_reduce_block) {
3633 
3634     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3635     retval = 1;
3636 
3637   } else if (packed_reduction_method == empty_reduce_block) {
3638 
3639     // usage: if team size == 1, no synchronization is required ( Intel
3640     // platforms only )
3641     retval = 1;
3642 
3643   } else if (packed_reduction_method == atomic_reduce_block) {
3644 
3645     retval = 2;
3646 
3647   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3648                                    tree_reduce_block)) {
3649 
3650 // case tree_reduce_block:
3651 // this barrier should be visible to a customer and to the threading profile
3652 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3653 #if OMPT_SUPPORT
3654     ompt_frame_t *ompt_frame;
3655     if (ompt_enabled.enabled) {
3656       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3657       if (ompt_frame->enter_frame.ptr == NULL)
3658         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3659       OMPT_STORE_RETURN_ADDRESS(global_tid);
3660     }
3661 #endif
3662 #if USE_ITT_NOTIFY
3663     __kmp_threads[global_tid]->th.th_ident =
3664         loc; // needed for correct notification of frames
3665 #endif
3666     retval =
3667         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3668                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3669     retval = (retval != 0) ? (0) : (1);
3670 #if OMPT_SUPPORT && OMPT_OPTIONAL
3671     if (ompt_enabled.enabled) {
3672       ompt_frame->enter_frame = ompt_data_none;
3673     }
3674 #endif
3675 
    // all other workers except the master should do this pop here
    // (none of the other workers except the master will enter
    // __kmpc_end_reduce())
3678     if (__kmp_env_consistency_check) {
3679       if (retval == 0) { // 0: all other workers; 1: master
3680         __kmp_pop_sync(global_tid, ct_reduce, loc);
3681       }
3682     }
3683 
3684   } else {
3685 
3686     // should never reach this block
3687     KMP_ASSERT(0); // "unexpected method"
3688   }
3689   if (teams_swapped) {
3690     __kmp_restore_swapped_teams(th, team, task_state);
3691   }
3692 
3693   KA_TRACE(10,
3694            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3695             global_tid, packed_reduction_method, retval));
3696   return retval;
3697 }
3698 
3699 /*!
3700 @ingroup SYNCHRONIZATION
3701 @param loc source location information
3702 @param global_tid global thread id.
3703 @param lck pointer to the unique lock data structure
3704 
3705 Finish the execution of a blocking reduce.
3706 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3707 start function.
3708 */
3709 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3710                        kmp_critical_name *lck) {
3711 
3712   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3713   kmp_info_t *th;
3714   kmp_team_t *team;
3715   int teams_swapped = 0, task_state;
3716 
3717   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3718 
3719   th = __kmp_thread_from_gtid(global_tid);
3720   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3721 
3722   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3723 
3724   // this barrier should be visible to a customer and to the threading profile
3725   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3726 
3727   if (packed_reduction_method == critical_reduce_block) {
3728     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3729 
3730 // TODO: implicit barrier: should be exposed
3731 #if OMPT_SUPPORT
3732     ompt_frame_t *ompt_frame;
3733     if (ompt_enabled.enabled) {
3734       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3735       if (ompt_frame->enter_frame.ptr == NULL)
3736         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3737       OMPT_STORE_RETURN_ADDRESS(global_tid);
3738     }
3739 #endif
3740 #if USE_ITT_NOTIFY
3741     __kmp_threads[global_tid]->th.th_ident = loc;
3742 #endif
3743     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3744 #if OMPT_SUPPORT && OMPT_OPTIONAL
3745     if (ompt_enabled.enabled) {
3746       ompt_frame->enter_frame = ompt_data_none;
3747     }
3748 #endif
3749 
3750   } else if (packed_reduction_method == empty_reduce_block) {
3751 
3752 // usage: if team size==1, no synchronization is required (Intel platforms only)
3753 
3754 // TODO: implicit barrier: should be exposed
3755 #if OMPT_SUPPORT
3756     ompt_frame_t *ompt_frame;
3757     if (ompt_enabled.enabled) {
3758       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3759       if (ompt_frame->enter_frame.ptr == NULL)
3760         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3761       OMPT_STORE_RETURN_ADDRESS(global_tid);
3762     }
3763 #endif
3764 #if USE_ITT_NOTIFY
3765     __kmp_threads[global_tid]->th.th_ident = loc;
3766 #endif
3767     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3768 #if OMPT_SUPPORT && OMPT_OPTIONAL
3769     if (ompt_enabled.enabled) {
3770       ompt_frame->enter_frame = ompt_data_none;
3771     }
3772 #endif
3773 
3774   } else if (packed_reduction_method == atomic_reduce_block) {
3775 
3776 #if OMPT_SUPPORT
3777     ompt_frame_t *ompt_frame;
3778     if (ompt_enabled.enabled) {
3779       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3780       if (ompt_frame->enter_frame.ptr == NULL)
3781         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3782       OMPT_STORE_RETURN_ADDRESS(global_tid);
3783     }
3784 #endif
3785 // TODO: implicit barrier: should be exposed
3786 #if USE_ITT_NOTIFY
3787     __kmp_threads[global_tid]->th.th_ident = loc;
3788 #endif
3789     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3790 #if OMPT_SUPPORT && OMPT_OPTIONAL
3791     if (ompt_enabled.enabled) {
3792       ompt_frame->enter_frame = ompt_data_none;
3793     }
3794 #endif
3795 
3796   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3797                                    tree_reduce_block)) {
3798 
3799     // only master executes here (master releases all other workers)
3800     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3801                             global_tid);
3802 
3803   } else {
3804 
3805     // should never reach this block
3806     KMP_ASSERT(0); // "unexpected method"
3807   }
3808   if (teams_swapped) {
3809     __kmp_restore_swapped_teams(th, team, task_state);
3810   }
3811 
3812   if (__kmp_env_consistency_check)
3813     __kmp_pop_sync(global_tid, ct_reduce, loc);
3814 
3815   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3816                 global_tid, packed_reduction_method));
3817 
3818   return;
3819 }
3820 
3821 #undef __KMP_GET_REDUCTION_METHOD
3822 #undef __KMP_SET_REDUCTION_METHOD
3823 
3824 /* end of interface to fast scalable reduce routines */
3825 
3826 kmp_uint64 __kmpc_get_taskid() {
3827 
3828   kmp_int32 gtid;
3829   kmp_info_t *thread;
3830 
3831   gtid = __kmp_get_gtid();
3832   if (gtid < 0) {
3833     return 0;
3834   }
3835   thread = __kmp_thread_from_gtid(gtid);
3836   return thread->th.th_current_task->td_task_id;
3837 
3838 } // __kmpc_get_taskid
3839 
3840 kmp_uint64 __kmpc_get_parent_taskid() {
3841 
3842   kmp_int32 gtid;
3843   kmp_info_t *thread;
3844   kmp_taskdata_t *parent_task;
3845 
3846   gtid = __kmp_get_gtid();
3847   if (gtid < 0) {
3848     return 0;
3849   }
3850   thread = __kmp_thread_from_gtid(gtid);
3851   parent_task = thread->th.th_current_task->td_parent;
3852   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3853 
3854 } // __kmpc_get_parent_taskid
3855 
3856 /*!
3857 @ingroup WORK_SHARING
3858 @param loc  source location information.
3859 @param gtid  global thread number.
3860 @param num_dims  number of associated doacross loops.
3861 @param dims  info on loops bounds.
3862 
3863 Initialize doacross loop information.
Expect the compiler to send us inclusive bounds,
3865 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3866 */
3867 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3868                           const struct kmp_dim *dims) {
3869   int j, idx;
3870   kmp_int64 last, trace_count;
3871   kmp_info_t *th = __kmp_threads[gtid];
3872   kmp_team_t *team = th->th.th_team;
3873   kmp_uint32 *flags;
3874   kmp_disp_t *pr_buf = th->th.th_dispatch;
3875   dispatch_shared_info_t *sh_buf;
3876 
3877   KA_TRACE(
3878       20,
3879       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3880        gtid, num_dims, !team->t.t_serialized));
3881   KMP_DEBUG_ASSERT(dims != NULL);
3882   KMP_DEBUG_ASSERT(num_dims > 0);
3883 
3884   if (team->t.t_serialized) {
3885     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3886     return; // no dependencies if team is serialized
3887   }
3888   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3889   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3890   // the next loop
3891   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3892 
3893   // Save bounds info into allocated private buffer
3894   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3895   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3896       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3897   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3898   pr_buf->th_doacross_info[0] =
3899       (kmp_int64)num_dims; // first element is number of dimensions
  // Also save the address of num_done so it can be accessed later without
  // knowing the buffer index
3902   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3903   pr_buf->th_doacross_info[2] = dims[0].lo;
3904   pr_buf->th_doacross_info[3] = dims[0].up;
3905   pr_buf->th_doacross_info[4] = dims[0].st;
3906   last = 5;
3907   for (j = 1; j < num_dims; ++j) {
3908     kmp_int64
3909         range_length; // To keep ranges of all dimensions but the first dims[0]
3910     if (dims[j].st == 1) { // most common case
3911       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3912       range_length = dims[j].up - dims[j].lo + 1;
3913     } else {
3914       if (dims[j].st > 0) {
3915         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3916         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3917       } else { // negative increment
3918         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3919         range_length =
3920             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3921       }
3922     }
3923     pr_buf->th_doacross_info[last++] = range_length;
3924     pr_buf->th_doacross_info[last++] = dims[j].lo;
3925     pr_buf->th_doacross_info[last++] = dims[j].up;
3926     pr_buf->th_doacross_info[last++] = dims[j].st;
3927   }
3928 
3929   // Compute total trip count.
3930   // Start with range of dims[0] which we don't need to keep in the buffer.
3931   if (dims[0].st == 1) { // most common case
3932     trace_count = dims[0].up - dims[0].lo + 1;
3933   } else if (dims[0].st > 0) {
3934     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3935     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3936   } else { // negative increment
3937     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3938     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3939   }
3940   for (j = 1; j < num_dims; ++j) {
3941     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3942   }
3943   KMP_DEBUG_ASSERT(trace_count > 0);
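  // Worked example (using the bounds from the doxygen note above): for the
  // loop for (i = 2; i < 9; i += 2) the compiler passes lo=2, up=8, st=2, so
  // trace_count = (8 - 2) / 2 + 1 = 4; a second dimension with range_length R
  // would make the total 4 * R.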
3944 
3945   // Check if shared buffer is not occupied by other loop (idx -
3946   // __kmp_dispatch_num_buffers)
3947   if (idx != sh_buf->doacross_buf_idx) {
3948     // Shared buffer is occupied, wait for it to be free
3949     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3950                  __kmp_eq_4, NULL);
3951   }
3952 #if KMP_32_BIT_ARCH
3953   // Check if we are the first thread. After the CAS the first thread gets 0,
3954   // others get 1 if initialization is in progress, allocated pointer otherwise.
3955   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3956   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3957       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3958 #else
3959   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3960       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3961 #endif
3962   if (flags == NULL) {
3963     // we are the first thread, allocate the array of flags
3964     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3965     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3966     KMP_MB();
3967     sh_buf->doacross_flags = flags;
3968   } else if (flags == (kmp_uint32 *)1) {
3969 #if KMP_32_BIT_ARCH
3970     // initialization is still in progress, need to wait
3971     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3972 #else
3973     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3974 #endif
3975       KMP_YIELD(TRUE);
3976     KMP_MB();
3977   } else {
3978     KMP_MB();
3979   }
3980   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3981   pr_buf->th_doacross_flags =
3982       sh_buf->doacross_flags; // save private copy in order to not
3983   // touch shared buffer on each iteration
3984   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3985 }
3986 
3987 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3988   kmp_int32 shft, num_dims, i;
3989   kmp_uint32 flag;
3990   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3991   kmp_info_t *th = __kmp_threads[gtid];
3992   kmp_team_t *team = th->th.th_team;
3993   kmp_disp_t *pr_buf;
3994   kmp_int64 lo, up, st;
3995 
3996   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3997   if (team->t.t_serialized) {
3998     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3999     return; // no dependencies if team is serialized
4000   }
4001 
4002   // calculate sequential iteration number and check out-of-bounds condition
4003   pr_buf = th->th.th_dispatch;
4004   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4005   num_dims = pr_buf->th_doacross_info[0];
4006   lo = pr_buf->th_doacross_info[2];
4007   up = pr_buf->th_doacross_info[3];
4008   st = pr_buf->th_doacross_info[4];
4009   if (st == 1) { // most common case
4010     if (vec[0] < lo || vec[0] > up) {
4011       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4012                     "bounds [%lld,%lld]\n",
4013                     gtid, vec[0], lo, up));
4014       return;
4015     }
4016     iter_number = vec[0] - lo;
4017   } else if (st > 0) {
4018     if (vec[0] < lo || vec[0] > up) {
4019       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4020                     "bounds [%lld,%lld]\n",
4021                     gtid, vec[0], lo, up));
4022       return;
4023     }
4024     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4025   } else { // negative increment
4026     if (vec[0] > lo || vec[0] < up) {
4027       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4028                     "bounds [%lld,%lld]\n",
4029                     gtid, vec[0], lo, up));
4030       return;
4031     }
4032     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4033   }
4034   for (i = 1; i < num_dims; ++i) {
4035     kmp_int64 iter, ln;
4036     kmp_int32 j = i * 4;
4037     ln = pr_buf->th_doacross_info[j + 1];
4038     lo = pr_buf->th_doacross_info[j + 2];
4039     up = pr_buf->th_doacross_info[j + 3];
4040     st = pr_buf->th_doacross_info[j + 4];
4041     if (st == 1) {
4042       if (vec[i] < lo || vec[i] > up) {
4043         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4044                       "bounds [%lld,%lld]\n",
4045                       gtid, vec[i], lo, up));
4046         return;
4047       }
4048       iter = vec[i] - lo;
4049     } else if (st > 0) {
4050       if (vec[i] < lo || vec[i] > up) {
4051         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4052                       "bounds [%lld,%lld]\n",
4053                       gtid, vec[i], lo, up));
4054         return;
4055       }
4056       iter = (kmp_uint64)(vec[i] - lo) / st;
4057     } else { // st < 0
4058       if (vec[i] > lo || vec[i] < up) {
4059         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4060                       "bounds [%lld,%lld]\n",
4061                       gtid, vec[i], lo, up));
4062         return;
4063       }
4064       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4065     }
4066     iter_number = iter + ln * iter_number;
4067   }
4068   shft = iter_number % 32; // use 32-bit granularity
4069   iter_number >>= 5; // divided by 32
4070   flag = 1 << shft;
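  // Example: collapsed iteration 70 maps to flags word 70 >> 5 == 2 and bit
  // 70 % 32 == 6, i.e. mask 0x40 within that 32-bit word.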
4071   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4072     KMP_YIELD(TRUE);
4073   }
4074   KMP_MB();
4075   KA_TRACE(20,
4076            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4077             gtid, (iter_number << 5) + shft));
4078 }
4079 
4080 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4081   kmp_int32 shft, num_dims, i;
4082   kmp_uint32 flag;
4083   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4084   kmp_info_t *th = __kmp_threads[gtid];
4085   kmp_team_t *team = th->th.th_team;
4086   kmp_disp_t *pr_buf;
4087   kmp_int64 lo, st;
4088 
4089   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4090   if (team->t.t_serialized) {
4091     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4092     return; // no dependencies if team is serialized
4093   }
4094 
4095   // calculate sequential iteration number (same as in "wait" but no
4096   // out-of-bounds checks)
4097   pr_buf = th->th.th_dispatch;
4098   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4099   num_dims = pr_buf->th_doacross_info[0];
4100   lo = pr_buf->th_doacross_info[2];
4101   st = pr_buf->th_doacross_info[4];
4102   if (st == 1) { // most common case
4103     iter_number = vec[0] - lo;
4104   } else if (st > 0) {
4105     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4106   } else { // negative increment
4107     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4108   }
4109   for (i = 1; i < num_dims; ++i) {
4110     kmp_int64 iter, ln;
4111     kmp_int32 j = i * 4;
4112     ln = pr_buf->th_doacross_info[j + 1];
4113     lo = pr_buf->th_doacross_info[j + 2];
4114     st = pr_buf->th_doacross_info[j + 4];
4115     if (st == 1) {
4116       iter = vec[i] - lo;
4117     } else if (st > 0) {
4118       iter = (kmp_uint64)(vec[i] - lo) / st;
4119     } else { // st < 0
4120       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4121     }
4122     iter_number = iter + ln * iter_number;
4123   }
4124   shft = iter_number % 32; // use 32-bit granularity
4125   iter_number >>= 5; // divided by 32
4126   flag = 1 << shft;
4127   KMP_MB();
4128   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4129     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4130   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4131                 (iter_number << 5) + shft));
4132 }
4133 
4134 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4135   kmp_int32 num_done;
4136   kmp_info_t *th = __kmp_threads[gtid];
4137   kmp_team_t *team = th->th.th_team;
4138   kmp_disp_t *pr_buf = th->th.th_dispatch;
4139 
4140   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4141   if (team->t.t_serialized) {
4142     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4143     return; // nothing to do
4144   }
4145   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4146   if (num_done == th->th.th_team_nproc) {
4147     // we are the last thread, need to free shared resources
4148     int idx = pr_buf->th_doacross_buf_idx - 1;
4149     dispatch_shared_info_t *sh_buf =
4150         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4151     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4152                      (kmp_int64)&sh_buf->doacross_num_done);
4153     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4154     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4155     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4156     sh_buf->doacross_flags = NULL;
4157     sh_buf->doacross_num_done = 0;
4158     sh_buf->doacross_buf_idx +=
4159         __kmp_dispatch_num_buffers; // free buffer for future re-use
4160   }
4161   // free private resources (need to keep buffer index forever)
4162   pr_buf->th_doacross_flags = NULL;
4163   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4164   pr_buf->th_doacross_info = NULL;
4165   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4166 }
4167 
4168 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4169 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4170   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4171 }
4172 
4173 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4174   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4175 }
4176 
4177 int __kmpc_get_target_offload(void) {
4178   if (!__kmp_init_serial) {
4179     __kmp_serial_initialize();
4180   }
4181   return __kmp_target_offload;
4182 }
4183 
4184 int __kmpc_pause_resource(kmp_pause_status_t level) {
4185   if (!__kmp_init_serial) {
4186     return 1; // Can't pause if runtime is not initialized
4187   }
4188   return __kmp_pause_resource(level);
4189 }
4190