1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
// flags will be used in the future, e.g., to implement openmp_strict library
// restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
 * Shut down the runtime library. This is also optional; even if called, it
 * will not do anything unless the `KMP_IGNORE_MPPEND` environment variable is
 * set to zero.
60  */
61 void __kmpc_end(ident_t *loc) {
62   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
63   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
64   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
65   // returns FALSE and __kmpc_end() will unregister this root (it can cause
66   // library shut down).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 #if KMP_OS_WINDOWS && OMPT_SUPPORT
74   // Normal exit process on Windows does not allow worker threads of the final
75   // parallel region to finish reporting their events, so shutting down the
76   // library here fixes the issue at least for the cases where __kmpc_end() is
77   // placed properly.
78   if (ompt_enabled.enabled)
79     __kmp_internal_end_library(__kmp_gtid_get_specific());
80 #endif
81 }
82 
83 /*!
84 @ingroup THREAD_STATES
85 @param loc Source location information.
86 @return The global thread index of the active thread.
87 
88 This function can be called in any context.
89 
If the runtime has only been entered at the outermost level from a
91 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
92 that which would be returned by omp_get_thread_num() in the outermost
93 active parallel construct. (Or zero if there is no active parallel
94 construct, since the master thread is necessarily thread zero).
95 
96 If multiple non-OpenMP threads all enter an OpenMP construct then this
97 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
99 OpenMP thread ids returned by omp_get_thread_num()).
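
A typical use, shown here only as an illustrative sketch (loc stands for a
compiler-generated ident_t), is to capture the index once and pass it to later
runtime entry points:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_barrier(&loc, gtid); // reuse the cached index in subsequent calls
@endcode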
100 */
101 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
102   kmp_int32 gtid = __kmp_entry_gtid();
103 
104   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
105 
106   return gtid;
107 }
108 
109 /*!
110 @ingroup THREAD_STATES
111 @param loc Source location information.
112 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
113 
114 This function can be called in any context.
115 It returns the total number of threads under the control of the OpenMP runtime.
116 That is not a number that can be determined by any OpenMP standard calls, since
117 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, since the runtime maintains
underlying threads even when they are not active (the cost of creating and
destroying OS threads is high), this call counts all such threads even if they
are not waiting for work.
122 */
123 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
124   KC_TRACE(10,
125            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
126 
127   return TCR_4(__kmp_all_nth);
128 }
129 
130 /*!
131 @ingroup THREAD_STATES
132 @param loc Source location information.
133 @return The thread number of the calling thread in the innermost active parallel
134 construct.
135 */
136 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
137   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
138   return __kmp_tid_from_gtid(__kmp_entry_gtid());
139 }
140 
141 /*!
142 @ingroup THREAD_STATES
143 @param loc Source location information.
144 @return The number of threads in the innermost active parallel construct.
145 */
146 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
147   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
148 
149   return __kmp_entry_thread()->th.th_team->t.t_nproc;
150 }
151 
152 /*!
153  * @ingroup DEPRECATED
154  * @param loc location description
155  *
156  * This function need not be called. It always returns TRUE.
157  */
158 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
159 #ifndef KMP_DEBUG
160 
161   return TRUE;
162 
163 #else
164 
165   const char *semi2;
166   const char *semi3;
167   int line_no;
168 
169   if (__kmp_par_range == 0) {
170     return TRUE;
171   }
172   semi2 = loc->psource;
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   semi2 = strchr(semi2, ';');
177   if (semi2 == NULL) {
178     return TRUE;
179   }
180   semi2 = strchr(semi2 + 1, ';');
181   if (semi2 == NULL) {
182     return TRUE;
183   }
184   if (__kmp_par_range_filename[0]) {
185     const char *name = semi2 - 1;
186     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
187       name--;
188     }
189     if ((*name == '/') || (*name == ';')) {
190       name++;
191     }
192     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
193       return __kmp_par_range < 0;
194     }
195   }
196   semi3 = strchr(semi2 + 1, ';');
197   if (__kmp_par_range_routine[0]) {
198     if ((semi3 != NULL) && (semi3 > semi2) &&
199         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
200       return __kmp_par_range < 0;
201     }
202   }
  if ((semi3 != NULL) && (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1)) {
204     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
205       return __kmp_par_range > 0;
206     }
207     return __kmp_par_range < 0;
208   }
209   return TRUE;
210 
211 #endif /* KMP_DEBUG */
212 }
213 
214 /*!
215 @ingroup THREAD_STATES
216 @param loc Source location information.
217 @return 1 if this thread is executing inside an active parallel region, zero if
218 not.
219 */
220 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
221   return __kmp_entry_thread()->th.th_root->r.r_active;
222 }
223 
224 /*!
225 @ingroup PARALLEL
226 @param loc source location information
227 @param global_tid global thread number
228 @param num_threads number of threads requested for this parallel construct
229 
230 Set the number of threads to be used by the next fork spawned by this thread.
231 This call is only required if the parallel construct has a `num_threads` clause.
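
As an illustrative sketch (loc, gtid and outlined_body are assumed,
compiler-generated names), a construct such as
@code
#pragma omp parallel num_threads(4)
@endcode
might be lowered to a push of the requested thread count followed by the fork:
@code
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
@endcode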
232 */
233 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
234                              kmp_int32 num_threads) {
235   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
236                 global_tid, num_threads));
237 
238   __kmp_push_num_threads(loc, global_tid, num_threads);
239 }
240 
241 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
242   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
243 
244   /* the num_threads are automatically popped */
245 }
246 
247 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
248                            kmp_int32 proc_bind) {
249   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
250                 proc_bind));
251 
252   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
253 }
254 
255 /*!
256 @ingroup PARALLEL
257 @param loc  source location information
258 @param argc  total number of arguments in the ellipsis
259 @param microtask  pointer to callback routine consisting of outlined parallel
260 construct
261 @param ...  pointers to shared variables that aren't global
262 
263 Do the actual fork and call the microtask in the relevant number of threads.
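
As an illustrative sketch only (not the exact code any particular compiler
emits), a parallel region with one shared variable could be outlined and
forked roughly like this; loc and the helper names are assumptions:
@code
// Outlined body: the first two parameters receive the global and bound
// thread ids, the remaining parameters are the shared variables.
void outlined_body(kmp_int32 *gtid, kmp_int32 *bound_tid, int *shared_n) {
  int per_thread = *shared_n + *gtid; // placeholder per-thread work
  (void)per_thread;
  (void)bound_tid;
}

// Call site: argc counts only the trailing shared-variable pointers.
int n = 42;
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined_body, &n);
@endcode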
264 */
265 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
266   int gtid = __kmp_entry_gtid();
267 
268 #if (KMP_STATS_ENABLED)
269   // If we were in a serial region, then stop the serial timer, record
270   // the event, and start parallel region timer
271   stats_state_e previous_state = KMP_GET_THREAD_STATE();
272   if (previous_state == stats_state_e::SERIAL_REGION) {
273     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
274   } else {
275     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
276   }
277   int inParallel = __kmpc_in_parallel(loc);
278   if (inParallel) {
279     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
280   } else {
281     KMP_COUNT_BLOCK(OMP_PARALLEL);
282   }
283 #endif
284 
  // maybe saving thr_state would be enough here
286   {
287     va_list ap;
288     va_start(ap, microtask);
289 
290 #if OMPT_SUPPORT
291     ompt_frame_t *ompt_frame;
292     if (ompt_enabled.enabled) {
293       kmp_info_t *master_th = __kmp_threads[gtid];
294       kmp_team_t *parent_team = master_th->th.th_team;
295       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
296       if (lwt)
297         ompt_frame = &(lwt->ompt_task_info.frame);
298       else {
299         int tid = __kmp_tid_from_gtid(gtid);
300         ompt_frame = &(
301             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
302       }
303       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
304       OMPT_STORE_RETURN_ADDRESS(gtid);
305     }
306 #endif
307 
308 #if INCLUDE_SSC_MARKS
309     SSC_MARK_FORKING();
310 #endif
311     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
312                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
313                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
314 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
315 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
316                     &ap
317 #else
318                     ap
319 #endif
320                     );
321 #if INCLUDE_SSC_MARKS
322     SSC_MARK_JOINING();
323 #endif
324     __kmp_join_call(loc, gtid
325 #if OMPT_SUPPORT
326                     ,
327                     fork_context_intel
328 #endif
329                     );
330 
331     va_end(ap);
332   }
333 
334 #if KMP_STATS_ENABLED
335   if (previous_state == stats_state_e::SERIAL_REGION) {
336     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
337   } else {
338     KMP_POP_PARTITIONED_TIMER();
339   }
340 #endif // KMP_STATS_ENABLED
341 }
342 
343 /*!
344 @ingroup PARALLEL
345 @param loc source location information
346 @param global_tid global thread number
347 @param num_teams number of teams requested for the teams construct
348 @param num_threads number of threads per team requested for the teams construct
349 
350 Set the number of teams to be used by the teams construct.
351 This call is only required if the teams construct has a `num_teams` clause
352 or a `thread_limit` clause (or both).
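
As an illustrative sketch (loc, gtid and outlined_teams_body are assumed
names), a construct such as
@code
#pragma omp teams num_teams(2) thread_limit(8)
@endcode
might be lowered to a push of both values followed by the teams fork:
@code
__kmpc_push_num_teams(&loc, gtid, 2, 8);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams_body);
@endcode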
353 */
354 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
355                            kmp_int32 num_teams, kmp_int32 num_threads) {
356   KA_TRACE(20,
357            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
358             global_tid, num_teams, num_threads));
359 
360   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
361 }
362 
363 /*!
364 @ingroup PARALLEL
365 @param loc  source location information
366 @param argc  total number of arguments in the ellipsis
367 @param microtask  pointer to callback routine consisting of outlined teams
368 construct
369 @param ...  pointers to shared variables that aren't global
370 
371 Do the actual fork and call the microtask in the relevant number of threads.
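
The outlined routine has the same shape as the one passed to
__kmpc_fork_call(); a minimal sketch (names are assumptions):
@code
void outlined_teams_body(kmp_int32 *gtid, kmp_int32 *bound_tid) {
  // executed once by the initial thread of every team
  (void)gtid;
  (void)bound_tid;
}
@endcode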
372 */
373 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
374                        ...) {
375   int gtid = __kmp_entry_gtid();
376   kmp_info_t *this_thr = __kmp_threads[gtid];
377   va_list ap;
378   va_start(ap, microtask);
379 
380 #if KMP_STATS_ENABLED
381   KMP_COUNT_BLOCK(OMP_TEAMS);
382   stats_state_e previous_state = KMP_GET_THREAD_STATE();
383   if (previous_state == stats_state_e::SERIAL_REGION) {
384     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
385   } else {
386     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
387   }
388 #endif
389 
390   // remember teams entry point and nesting level
391   this_thr->th.th_teams_microtask = microtask;
392   this_thr->th.th_teams_level =
393       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
394 
395 #if OMPT_SUPPORT
396   kmp_team_t *parent_team = this_thr->th.th_team;
397   int tid = __kmp_tid_from_gtid(gtid);
398   if (ompt_enabled.enabled) {
399     parent_team->t.t_implicit_task_taskdata[tid]
400         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
401   }
402   OMPT_STORE_RETURN_ADDRESS(gtid);
403 #endif
404 
  // Check whether __kmpc_push_num_teams was called; if not, set the default
  // number of teams.
407   if (this_thr->th.th_teams_size.nteams == 0) {
408     __kmp_push_num_teams(loc, gtid, 0, 0);
409   }
410   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
411   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
412   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
413 
414   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
415                   VOLATILE_CAST(microtask_t)
416                       __kmp_teams_master, // "wrapped" task
417                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
418 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
419                   &ap
420 #else
421                   ap
422 #endif
423                   );
424   __kmp_join_call(loc, gtid
425 #if OMPT_SUPPORT
426                   ,
427                   fork_context_intel
428 #endif
429                   );
430 
431   // Pop current CG root off list
432   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
433   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
434   this_thr->th.th_cg_roots = tmp->up;
435   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
436                  " to node %p. cg_nthreads was %d\n",
437                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
438   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
439   int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
441     __kmp_free(tmp);
442   }
443   // Restore current task's thread_limit from CG root
444   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
445   this_thr->th.th_current_task->td_icvs.thread_limit =
446       this_thr->th.th_cg_roots->cg_thread_limit;
447 
448   this_thr->th.th_teams_microtask = NULL;
449   this_thr->th.th_teams_level = 0;
450   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
451   va_end(ap);
452 #if KMP_STATS_ENABLED
453   if (previous_state == stats_state_e::SERIAL_REGION) {
454     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
455   } else {
456     KMP_POP_PARTITIONED_TIMER();
457   }
458 #endif // KMP_STATS_ENABLED
459 }
460 
461 // I don't think this function should ever have been exported.
462 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
463 // openmp code ever called it, but it's been exported from the RTL for so
464 // long that I'm afraid to remove the definition.
465 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
466 
467 /*!
468 @ingroup PARALLEL
469 @param loc  source location information
470 @param global_tid  global thread number
471 
472 Enter a serialized parallel construct. This interface is used to handle a
473 conditional parallel region, like this,
474 @code
475 #pragma omp parallel if (condition)
476 @endcode
477 when the condition is false.
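
A sketch of one possible lowering of the conditional region above
(outlined_body, loc and gtid are assumed names):
@code
if (condition) {
  __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
} else {
  __kmpc_serialized_parallel(&loc, gtid);
  kmp_int32 bound_tid = 0;
  outlined_body(&gtid, &bound_tid);
  __kmpc_end_serialized_parallel(&loc, gtid);
}
@endcode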
478 */
479 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
480 // The implementation is now in kmp_runtime.cpp so that it can share static
481 // functions with kmp_fork_call since the tasks to be done are similar in
482 // each case.
483 #if OMPT_SUPPORT
484   OMPT_STORE_RETURN_ADDRESS(global_tid);
485 #endif
486   __kmp_serialized_parallel(loc, global_tid);
487 }
488 
489 /*!
490 @ingroup PARALLEL
491 @param loc  source location information
492 @param global_tid  global thread number
493 
494 Leave a serialized parallel construct.
495 */
496 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
497   kmp_internal_control_t *top;
498   kmp_info_t *this_thr;
499   kmp_team_t *serial_team;
500 
501   KC_TRACE(10,
502            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
503 
504   /* skip all this code for autopar serialized loops since it results in
505      unacceptable overhead */
506   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
507     return;
508 
509   // Not autopar code
510   if (!TCR_4(__kmp_init_parallel))
511     __kmp_parallel_initialize();
512 
513   __kmp_resume_if_soft_paused();
514 
515   this_thr = __kmp_threads[global_tid];
516   serial_team = this_thr->th.th_serial_team;
517 
518   kmp_task_team_t *task_team = this_thr->th.th_task_team;
519   // we need to wait for the proxy tasks before finishing the thread
520   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
521     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
522 
523   KMP_MB();
524   KMP_DEBUG_ASSERT(serial_team);
525   KMP_ASSERT(serial_team->t.t_serialized);
526   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
527   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
528   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
529   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
530 
531 #if OMPT_SUPPORT
532   if (ompt_enabled.enabled &&
533       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
534     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
535     if (ompt_enabled.ompt_callback_implicit_task) {
536       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
537           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
538           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
539     }
540 
    // reset/clear the task id only after unlinking the task
542     ompt_data_t *parent_task_data;
543     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
544 
545     if (ompt_enabled.ompt_callback_parallel_end) {
546       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
547           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
548           ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
549     }
550     __ompt_lw_taskteam_unlink(this_thr);
551     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
552   }
553 #endif
554 
555   /* If necessary, pop the internal control stack values and replace the team
556    * values */
557   top = serial_team->t.t_control_stack_top;
558   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
559     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
560     serial_team->t.t_control_stack_top = top->next;
561     __kmp_free(top);
562   }
563 
564   // if( serial_team -> t.t_serialized > 1 )
565   serial_team->t.t_level--;
566 
567   /* pop dispatch buffers stack */
568   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
569   {
570     dispatch_private_info_t *disp_buffer =
571         serial_team->t.t_dispatch->th_disp_buffer;
572     serial_team->t.t_dispatch->th_disp_buffer =
573         serial_team->t.t_dispatch->th_disp_buffer->next;
574     __kmp_free(disp_buffer);
575   }
576   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
577 
578   --serial_team->t.t_serialized;
579   if (serial_team->t.t_serialized == 0) {
580 
581 /* return to the parallel section */
582 
583 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
584     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
585       __kmp_clear_x87_fpu_status_word();
586       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
587       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
588     }
589 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
590 
591     this_thr->th.th_team = serial_team->t.t_parent;
592     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
593 
594     /* restore values cached in the thread */
595     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
596     this_thr->th.th_team_master =
597         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
598     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
599 
600     /* TODO the below shouldn't need to be adjusted for serialized teams */
601     this_thr->th.th_dispatch =
602         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
603 
604     __kmp_pop_current_task_from_thread(this_thr);
605 
606     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
607     this_thr->th.th_current_task->td_flags.executing = 1;
608 
609     if (__kmp_tasking_mode != tskm_immediate_exec) {
610       // Copy the task team from the new child / old parent team to the thread.
611       this_thr->th.th_task_team =
612           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
613       KA_TRACE(20,
614                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
615                 "team %p\n",
616                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
617     }
618   } else {
619     if (__kmp_tasking_mode != tskm_immediate_exec) {
620       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
621                     "depth of serial team %p to %d\n",
622                     global_tid, serial_team, serial_team->t.t_serialized));
623     }
624   }
625 
626   if (__kmp_env_consistency_check)
627     __kmp_pop_parallel(global_tid, NULL);
628 #if OMPT_SUPPORT
629   if (ompt_enabled.enabled)
630     this_thr->th.ompt_thread_info.state =
631         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
632                                            : ompt_state_work_parallel);
633 #endif
634 }
635 
636 /*!
637 @ingroup SYNCHRONIZATION
638 @param loc  source location information.
639 
640 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
641 depending on the memory ordering convention obeyed by the compiler
642 even that may not be necessary).
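
A <tt>flush</tt> directive is typically lowered to a single call (loc is an
assumed, compiler-generated ident_t):
@code
#pragma omp flush
@endcode
becomes
@code
__kmpc_flush(&loc);
@endcode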
643 */
644 void __kmpc_flush(ident_t *loc) {
645   KC_TRACE(10, ("__kmpc_flush: called\n"));
646 
  /* need explicit __mf() here since the library uses volatile, not fences */
648   KMP_MB(); /* Flush all pending memory write invalidates.  */
649 
650 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
651 #if KMP_MIC
652 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
653 // We shouldn't need it, though, since the ABI rules require that
654 // * If the compiler generates NGO stores it also generates the fence
655 // * If users hand-code NGO stores they should insert the fence
656 // therefore no incomplete unordered stores should be visible.
657 #else
658   // C74404
659   // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
664   if (!__kmp_cpuinfo.initialized) {
665     __kmp_query_cpuid(&__kmp_cpuinfo);
666   }
667   if (!__kmp_cpuinfo.sse2) {
668     // CPU cannot execute SSE2 instructions.
669   } else {
670 #if KMP_COMPILER_ICC
671     _mm_mfence();
672 #elif KMP_COMPILER_MSVC
673     MemoryBarrier();
674 #else
675     __sync_synchronize();
676 #endif // KMP_COMPILER_ICC
677   }
678 #endif // KMP_MIC
679 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
680        KMP_ARCH_RISCV64)
681 // Nothing to see here move along
682 #elif KMP_ARCH_PPC64
683 // Nothing needed here (we have a real MB above).
684 #if KMP_OS_CNK
685   // The flushing thread needs to yield here; this prevents a
686   // busy-waiting thread from saturating the pipeline. flush is
687   // often used in loops like this:
688   // while (!flag) {
689   //   #pragma omp flush(flag)
690   // }
691   // and adding the yield here is good for at least a 10x speedup
692   // when running >2 threads per core (on the NAS LU benchmark).
693   __kmp_yield();
694 #endif
695 #else
696 #error Unknown or unsupported architecture
697 #endif
698 
699 #if OMPT_SUPPORT && OMPT_OPTIONAL
700   if (ompt_enabled.ompt_callback_flush) {
701     ompt_callbacks.ompt_callback(ompt_callback_flush)(
702         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
703   }
704 #endif
705 }
706 
707 /* -------------------------------------------------------------------------- */
708 /*!
709 @ingroup SYNCHRONIZATION
710 @param loc source location information
711 @param global_tid thread id.
712 
713 Execute a barrier.
714 */
715 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
716   KMP_COUNT_BLOCK(OMP_BARRIER);
717   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
718 
719   if (!TCR_4(__kmp_init_parallel))
720     __kmp_parallel_initialize();
721 
722   __kmp_resume_if_soft_paused();
723 
724   if (__kmp_env_consistency_check) {
725     if (loc == 0) {
726       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
727     }
728     __kmp_check_barrier(global_tid, ct_barrier, loc);
729   }
730 
731 #if OMPT_SUPPORT
732   ompt_frame_t *ompt_frame;
733   if (ompt_enabled.enabled) {
734     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
735     if (ompt_frame->enter_frame.ptr == NULL)
736       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
737     OMPT_STORE_RETURN_ADDRESS(global_tid);
738   }
739 #endif
740   __kmp_threads[global_tid]->th.th_ident = loc;
741   // TODO: explicit barrier_wait_id:
  //   this function is called when the 'barrier' directive is present or at
  //   the implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required
747 
748   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
749 #if OMPT_SUPPORT && OMPT_OPTIONAL
750   if (ompt_enabled.enabled) {
751     ompt_frame->enter_frame = ompt_data_none;
752   }
753 #endif
754 }
755 
756 /* The BARRIER for a MASTER section is always explicit   */
757 /*!
758 @ingroup WORK_SHARING
759 @param loc  source location information.
@param global_tid  global thread number.
761 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
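
A sketch of a possible lowering of a <tt>master</tt> construct (loc, gtid and
body are assumed names); note that only the thread entering the block calls
__kmpc_end_master():
@code
if (__kmpc_master(&loc, gtid)) {
  body();
  __kmpc_end_master(&loc, gtid);
}
@endcode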
762 */
763 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
764   int status = 0;
765 
766   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
767 
768   if (!TCR_4(__kmp_init_parallel))
769     __kmp_parallel_initialize();
770 
771   __kmp_resume_if_soft_paused();
772 
773   if (KMP_MASTER_GTID(global_tid)) {
774     KMP_COUNT_BLOCK(OMP_MASTER);
775     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
776     status = 1;
777   }
778 
779 #if OMPT_SUPPORT && OMPT_OPTIONAL
780   if (status) {
781     if (ompt_enabled.ompt_callback_master) {
782       kmp_info_t *this_thr = __kmp_threads[global_tid];
783       kmp_team_t *team = this_thr->th.th_team;
784 
785       int tid = __kmp_tid_from_gtid(global_tid);
786       ompt_callbacks.ompt_callback(ompt_callback_master)(
787           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
788           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
789           OMPT_GET_RETURN_ADDRESS(0));
790     }
791   }
792 #endif
793 
794   if (__kmp_env_consistency_check) {
795 #if KMP_USE_DYNAMIC_LOCK
796     if (status)
797       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
798     else
799       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
800 #else
801     if (status)
802       __kmp_push_sync(global_tid, ct_master, loc, NULL);
803     else
804       __kmp_check_sync(global_tid, ct_master, loc, NULL);
805 #endif
806   }
807 
808   return status;
809 }
810 
811 /*!
812 @ingroup WORK_SHARING
813 @param loc  source location information.
@param global_tid  global thread number.
815 
816 Mark the end of a <tt>master</tt> region. This should only be called by the
817 thread that executes the <tt>master</tt> region.
818 */
819 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
820   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
821 
822   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
823   KMP_POP_PARTITIONED_TIMER();
824 
825 #if OMPT_SUPPORT && OMPT_OPTIONAL
826   kmp_info_t *this_thr = __kmp_threads[global_tid];
827   kmp_team_t *team = this_thr->th.th_team;
828   if (ompt_enabled.ompt_callback_master) {
829     int tid = __kmp_tid_from_gtid(global_tid);
830     ompt_callbacks.ompt_callback(ompt_callback_master)(
831         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
832         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
833         OMPT_GET_RETURN_ADDRESS(0));
834   }
835 #endif
836 
837   if (__kmp_env_consistency_check) {
838     if (global_tid < 0)
839       KMP_WARNING(ThreadIdentInvalid);
840 
841     if (KMP_MASTER_GTID(global_tid))
842       __kmp_pop_sync(global_tid, ct_master, loc);
843   }
844 }
845 
846 /*!
847 @ingroup WORK_SHARING
848 @param loc  source location information.
849 @param gtid  global thread number.
850 
851 Start execution of an <tt>ordered</tt> construct.
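
A sketch of how an <tt>ordered</tt> region inside a loop chunk is bracketed by
the two runtime calls (loc, gtid and body are assumed names):
@code
__kmpc_ordered(&loc, gtid);
body(); // executed in iteration order
__kmpc_end_ordered(&loc, gtid);
@endcode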
852 */
853 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
854   int cid = 0;
855   kmp_info_t *th;
856   KMP_DEBUG_ASSERT(__kmp_init_serial);
857 
858   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
859 
860   if (!TCR_4(__kmp_init_parallel))
861     __kmp_parallel_initialize();
862 
863   __kmp_resume_if_soft_paused();
864 
865 #if USE_ITT_BUILD
866   __kmp_itt_ordered_prep(gtid);
867 // TODO: ordered_wait_id
868 #endif /* USE_ITT_BUILD */
869 
870   th = __kmp_threads[gtid];
871 
872 #if OMPT_SUPPORT && OMPT_OPTIONAL
873   kmp_team_t *team;
874   ompt_wait_id_t lck;
875   void *codeptr_ra;
876   if (ompt_enabled.enabled) {
877     OMPT_STORE_RETURN_ADDRESS(gtid);
878     team = __kmp_team_from_gtid(gtid);
879     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
880     /* OMPT state update */
881     th->th.ompt_thread_info.wait_id = lck;
882     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
883 
884     /* OMPT event callback */
885     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
886     if (ompt_enabled.ompt_callback_mutex_acquire) {
887       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
888           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
889           codeptr_ra);
890     }
891   }
892 #endif
893 
894   if (th->th.th_dispatch->th_deo_fcn != 0)
895     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
896   else
897     __kmp_parallel_deo(&gtid, &cid, loc);
898 
899 #if OMPT_SUPPORT && OMPT_OPTIONAL
900   if (ompt_enabled.enabled) {
901     /* OMPT state update */
902     th->th.ompt_thread_info.state = ompt_state_work_parallel;
903     th->th.ompt_thread_info.wait_id = 0;
904 
905     /* OMPT event callback */
906     if (ompt_enabled.ompt_callback_mutex_acquired) {
907       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
908           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
909     }
910   }
911 #endif
912 
913 #if USE_ITT_BUILD
914   __kmp_itt_ordered_start(gtid);
915 #endif /* USE_ITT_BUILD */
916 }
917 
918 /*!
919 @ingroup WORK_SHARING
920 @param loc  source location information.
921 @param gtid  global thread number.
922 
923 End execution of an <tt>ordered</tt> construct.
924 */
925 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
926   int cid = 0;
927   kmp_info_t *th;
928 
929   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
930 
931 #if USE_ITT_BUILD
932   __kmp_itt_ordered_end(gtid);
933 // TODO: ordered_wait_id
934 #endif /* USE_ITT_BUILD */
935 
936   th = __kmp_threads[gtid];
937 
938   if (th->th.th_dispatch->th_dxo_fcn != 0)
939     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
940   else
941     __kmp_parallel_dxo(&gtid, &cid, loc);
942 
943 #if OMPT_SUPPORT && OMPT_OPTIONAL
944   OMPT_STORE_RETURN_ADDRESS(gtid);
945   if (ompt_enabled.ompt_callback_mutex_released) {
946     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
947         ompt_mutex_ordered,
948         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
949             ->t.t_ordered.dt.t_value,
950         OMPT_LOAD_RETURN_ADDRESS(gtid));
951   }
952 #endif
953 }
954 
955 #if KMP_USE_DYNAMIC_LOCK
956 
957 static __forceinline void
958 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
959                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
960   // Pointer to the allocated indirect lock is written to crit, while indexing
961   // is ignored.
962   void *idx;
963   kmp_indirect_lock_t **lck;
964   lck = (kmp_indirect_lock_t **)crit;
965   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
966   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
967   KMP_SET_I_LOCK_LOCATION(ilk, loc);
968   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
969   KA_TRACE(20,
970            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
971 #if USE_ITT_BUILD
972   __kmp_itt_critical_creating(ilk->lock, loc);
973 #endif
974   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
975   if (status == 0) {
976 #if USE_ITT_BUILD
977     __kmp_itt_critical_destroyed(ilk->lock);
978 #endif
979     // We don't really need to destroy the unclaimed lock here since it will be
980     // cleaned up at program exit.
981     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
982   }
983   KMP_DEBUG_ASSERT(*lck != NULL);
984 }
985 
986 // Fast-path acquire tas lock
987 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
988   {                                                                            \
989     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
990     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
991     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
992     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
993         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
994       kmp_uint32 spins;                                                        \
995       KMP_FSYNC_PREPARE(l);                                                    \
996       KMP_INIT_YIELD(spins);                                                   \
997       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
998       do {                                                                     \
999         if (TCR_4(__kmp_nth) >                                                 \
1000             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1001           KMP_YIELD(TRUE);                                                     \
1002         } else {                                                               \
1003           KMP_YIELD_SPIN(spins);                                               \
1004         }                                                                      \
1005         __kmp_spin_backoff(&backoff);                                          \
1006       } while (                                                                \
1007           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1008           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1009     }                                                                          \
1010     KMP_FSYNC_ACQUIRED(l);                                                     \
1011   }
1012 
1013 // Fast-path test tas lock
1014 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1015   {                                                                            \
1016     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1017     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1018     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1019     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1020          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1021   }
1022 
1023 // Fast-path release tas lock
1024 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1025   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1026 
1027 #if KMP_USE_FUTEX
1028 
1029 #include <sys/syscall.h>
1030 #include <unistd.h>
1031 #ifndef FUTEX_WAIT
1032 #define FUTEX_WAIT 0
1033 #endif
1034 #ifndef FUTEX_WAKE
1035 #define FUTEX_WAKE 1
1036 #endif
1037 
1038 // Fast-path acquire futex lock
1039 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1040   {                                                                            \
1041     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1042     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1043     KMP_MB();                                                                  \
1044     KMP_FSYNC_PREPARE(ftx);                                                    \
1045     kmp_int32 poll_val;                                                        \
1046     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1047                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1048                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1049       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1050       if (!cond) {                                                             \
1051         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1052                                          poll_val |                            \
1053                                              KMP_LOCK_BUSY(1, futex))) {       \
1054           continue;                                                            \
1055         }                                                                      \
1056         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1057       }                                                                        \
1058       kmp_int32 rc;                                                            \
1059       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1060                         NULL, NULL, 0)) != 0) {                                \
1061         continue;                                                              \
1062       }                                                                        \
1063       gtid_code |= 1;                                                          \
1064     }                                                                          \
1065     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1066   }
1067 
1068 // Fast-path test futex lock
1069 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1070   {                                                                            \
1071     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1072     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1073                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1074       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1075       rc = TRUE;                                                               \
1076     } else {                                                                   \
1077       rc = FALSE;                                                              \
1078     }                                                                          \
1079   }
1080 
1081 // Fast-path release futex lock
1082 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1083   {                                                                            \
1084     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1085     KMP_MB();                                                                  \
1086     KMP_FSYNC_RELEASING(ftx);                                                  \
1087     kmp_int32 poll_val =                                                       \
1088         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1089     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1090       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1091               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1092     }                                                                          \
1093     KMP_MB();                                                                  \
1094     KMP_YIELD_OVERSUB();                                                       \
1095   }
1096 
1097 #endif // KMP_USE_FUTEX
1098 
1099 #else // KMP_USE_DYNAMIC_LOCK
1100 
1101 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1102                                                       ident_t const *loc,
1103                                                       kmp_int32 gtid) {
1104   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1105 
1106   // Because of the double-check, the following load doesn't need to be volatile
1107   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1108 
1109   if (lck == NULL) {
1110     void *idx;
1111 
1112     // Allocate & initialize the lock.
1113     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1114     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1115     __kmp_init_user_lock_with_checks(lck);
1116     __kmp_set_user_lock_location(lck, loc);
1117 #if USE_ITT_BUILD
1118     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed with no usage, but that is
// not a problem, because this is not a real event seen by the user but
// rather setting a name for the object (lock). See more details in kmp_itt.h.
1124 #endif /* USE_ITT_BUILD */
1125 
1126     // Use a cmpxchg instruction to slam the start of the critical section with
1127     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1128     // and use the lock that the other thread allocated.
1129     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1130 
1131     if (status == 0) {
1132 // Deallocate the lock and reload the value.
1133 #if USE_ITT_BUILD
1134       __kmp_itt_critical_destroyed(lck);
1135 // Let ITT know the lock is destroyed and the same memory location may be reused
1136 // for another purpose.
1137 #endif /* USE_ITT_BUILD */
1138       __kmp_destroy_user_lock_with_checks(lck);
1139       __kmp_user_lock_free(&idx, gtid, lck);
1140       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1141       KMP_DEBUG_ASSERT(lck != NULL);
1142     }
1143   }
1144   return lck;
1145 }
1146 
1147 #endif // KMP_USE_DYNAMIC_LOCK
1148 
1149 /*!
1150 @ingroup WORK_SHARING
1151 @param loc  source location information.
@param global_tid  global thread number.
1153 @param crit identity of the critical section. This could be a pointer to a lock
1154 associated with the critical section, or some other suitably unique value.
1155 
1156 Enter code protected by a `critical` construct.
1157 This function blocks until the executing thread can enter the critical section.
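
A sketch of a possible lowering (loc, gtid and body are assumed names); the
runtime expects one zero-initialized kmp_critical_name per critical name:
@code
static kmp_critical_name crit_name = {0};
__kmpc_critical(&loc, gtid, &crit_name);
body();
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode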
1158 */
1159 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1160                      kmp_critical_name *crit) {
1161 #if KMP_USE_DYNAMIC_LOCK
1162 #if OMPT_SUPPORT && OMPT_OPTIONAL
1163   OMPT_STORE_RETURN_ADDRESS(global_tid);
1164 #endif // OMPT_SUPPORT
1165   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1166 #else
1167   KMP_COUNT_BLOCK(OMP_CRITICAL);
1168 #if OMPT_SUPPORT && OMPT_OPTIONAL
1169   ompt_state_t prev_state = ompt_state_undefined;
1170   ompt_thread_info_t ti;
1171 #endif
1172   kmp_user_lock_p lck;
1173 
1174   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1175 
1176   // TODO: add THR_OVHD_STATE
1177 
1178   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1179   KMP_CHECK_USER_LOCK_INIT();
1180 
1181   if ((__kmp_user_lock_kind == lk_tas) &&
1182       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1183     lck = (kmp_user_lock_p)crit;
1184   }
1185 #if KMP_USE_FUTEX
1186   else if ((__kmp_user_lock_kind == lk_futex) &&
1187            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1188     lck = (kmp_user_lock_p)crit;
1189   }
1190 #endif
1191   else { // ticket, queuing or drdpa
1192     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1193   }
1194 
1195   if (__kmp_env_consistency_check)
1196     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1197 
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team.
// Also, even if we are the uber thread, we still have to acquire the lock,
// as we have to contend with sibling threads.
1202 
1203 #if USE_ITT_BUILD
1204   __kmp_itt_critical_acquiring(lck);
1205 #endif /* USE_ITT_BUILD */
1206 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1208   void *codeptr_ra = NULL;
1209   if (ompt_enabled.enabled) {
1210     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1211     /* OMPT state update */
1212     prev_state = ti.state;
1213     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1214     ti.state = ompt_state_wait_critical;
1215 
1216     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1218     if (ompt_enabled.ompt_callback_mutex_acquire) {
1219       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1220           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1221           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1222     }
1223   }
1224 #endif
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1227   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1228 
1229 #if USE_ITT_BUILD
1230   __kmp_itt_critical_acquired(lck);
1231 #endif /* USE_ITT_BUILD */
1232 #if OMPT_SUPPORT && OMPT_OPTIONAL
1233   if (ompt_enabled.enabled) {
1234     /* OMPT state update */
1235     ti.state = prev_state;
1236     ti.wait_id = 0;
1237 
1238     /* OMPT event callback */
1239     if (ompt_enabled.ompt_callback_mutex_acquired) {
1240       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1241           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1242     }
1243   }
1244 #endif
1245   KMP_POP_PARTITIONED_TIMER();
1246 
1247   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1248   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1249 #endif // KMP_USE_DYNAMIC_LOCK
1250 }
1251 
1252 #if KMP_USE_DYNAMIC_LOCK
1253 
1254 // Converts the given hint to an internal lock implementation
1255 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1256 #if KMP_USE_TSX
1257 #define KMP_TSX_LOCK(seq) lockseq_##seq
1258 #else
1259 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1260 #endif
1261 
1262 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1263 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1264 #else
1265 #define KMP_CPUINFO_RTM 0
1266 #endif
1267 
1268   // Hints that do not require further logic
1269   if (hint & kmp_lock_hint_hle)
1270     return KMP_TSX_LOCK(hle);
1271   if (hint & kmp_lock_hint_rtm)
1272     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1273   if (hint & kmp_lock_hint_adaptive)
1274     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1275 
1276   // Rule out conflicting hints first by returning the default lock
1277   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1278     return __kmp_user_lock_seq;
1279   if ((hint & omp_lock_hint_speculative) &&
1280       (hint & omp_lock_hint_nonspeculative))
1281     return __kmp_user_lock_seq;
1282 
1283   // Do not even consider speculation when it appears to be contended
1284   if (hint & omp_lock_hint_contended)
1285     return lockseq_queuing;
1286 
1287   // Uncontended lock without speculation
1288   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1289     return lockseq_tas;
1290 
1291   // HLE lock for speculation
1292   if (hint & omp_lock_hint_speculative)
1293     return KMP_TSX_LOCK(hle);
1294 
1295   return __kmp_user_lock_seq;
1296 }
1297 
1298 #if OMPT_SUPPORT && OMPT_OPTIONAL
1299 #if KMP_USE_DYNAMIC_LOCK
1300 static kmp_mutex_impl_t
1301 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1302   if (user_lock) {
1303     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1304     case 0:
1305       break;
1306 #if KMP_USE_FUTEX
1307     case locktag_futex:
1308       return kmp_mutex_impl_queuing;
1309 #endif
1310     case locktag_tas:
1311       return kmp_mutex_impl_spin;
1312 #if KMP_USE_TSX
1313     case locktag_hle:
1314       return kmp_mutex_impl_speculative;
1315 #endif
1316     default:
1317       return kmp_mutex_impl_none;
1318     }
1319     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1320   }
1321   KMP_ASSERT(ilock);
1322   switch (ilock->type) {
1323 #if KMP_USE_TSX
1324   case locktag_adaptive:
1325   case locktag_rtm:
1326     return kmp_mutex_impl_speculative;
1327 #endif
1328   case locktag_nested_tas:
1329     return kmp_mutex_impl_spin;
1330 #if KMP_USE_FUTEX
1331   case locktag_nested_futex:
1332 #endif
1333   case locktag_ticket:
1334   case locktag_queuing:
1335   case locktag_drdpa:
1336   case locktag_nested_ticket:
1337   case locktag_nested_queuing:
1338   case locktag_nested_drdpa:
1339     return kmp_mutex_impl_queuing;
1340   default:
1341     return kmp_mutex_impl_none;
1342   }
1343 }
1344 #else
1345 // For locks without dynamic binding
1346 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1347   switch (__kmp_user_lock_kind) {
1348   case lk_tas:
1349     return kmp_mutex_impl_spin;
1350 #if KMP_USE_FUTEX
1351   case lk_futex:
1352 #endif
1353   case lk_ticket:
1354   case lk_queuing:
1355   case lk_drdpa:
1356     return kmp_mutex_impl_queuing;
1357 #if KMP_USE_TSX
1358   case lk_hle:
1359   case lk_rtm:
1360   case lk_adaptive:
1361     return kmp_mutex_impl_speculative;
1362 #endif
1363   default:
1364     return kmp_mutex_impl_none;
1365   }
1366 }
1367 #endif // KMP_USE_DYNAMIC_LOCK
1368 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1369 
1370 /*!
1371 @ingroup WORK_SHARING
1372 @param loc  source location information.
1373 @param global_tid  global thread number.
1374 @param crit identity of the critical section. This could be a pointer to a lock
1375 associated with the critical section, or some other suitably unique value.
1376 @param hint the lock hint.
1377 
1378 Enter code protected by a `critical` construct with a hint. The hint value is
1379 used to suggest a lock implementation. This function blocks until the executing
1380 thread can enter the critical section unless the hint suggests use of
1381 speculative execution and the hardware supports it.
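
A sketch of a possible lowering of a critical construct with a hint clause
(loc, gtid and body are assumed names; the hint value is passed through
unchanged):
@code
static kmp_critical_name crit_name = {0};
__kmpc_critical_with_hint(&loc, gtid, &crit_name, omp_lock_hint_contended);
body();
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode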
1382 */
1383 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1384                                kmp_critical_name *crit, uint32_t hint) {
1385   KMP_COUNT_BLOCK(OMP_CRITICAL);
1386   kmp_user_lock_p lck;
1387 #if OMPT_SUPPORT && OMPT_OPTIONAL
1388   ompt_state_t prev_state = ompt_state_undefined;
1389   ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
1391   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1392   if (!codeptr)
1393     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1394 #endif
1395 
1396   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1397 
1398   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1399   // Check if it is initialized.
1400   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1401   if (*lk == 0) {
1402     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1403     if (KMP_IS_D_LOCK(lckseq)) {
1404       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1405                                   KMP_GET_D_TAG(lckseq));
1406     } else {
1407       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1408     }
1409   }
1410   // Branch for accessing the actual lock object and set operation. This
1411   // branching is inevitable since this lock initialization does not follow the
1412   // normal dispatch path (lock table is not used).
1413   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1414     lck = (kmp_user_lock_p)lk;
1415     if (__kmp_env_consistency_check) {
1416       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1417                       __kmp_map_hint_to_lock(hint));
1418     }
1419 #if USE_ITT_BUILD
1420     __kmp_itt_critical_acquiring(lck);
1421 #endif
1422 #if OMPT_SUPPORT && OMPT_OPTIONAL
1423     if (ompt_enabled.enabled) {
1424       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1425       /* OMPT state update */
1426       prev_state = ti.state;
1427       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1428       ti.state = ompt_state_wait_critical;
1429 
1430       /* OMPT event callback */
1431       if (ompt_enabled.ompt_callback_mutex_acquire) {
1432         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1433             ompt_mutex_critical, (unsigned int)hint,
1434             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1435             codeptr);
1436       }
1437     }
1438 #endif
1439 #if KMP_USE_INLINED_TAS
1440     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1441       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1442     } else
1443 #elif KMP_USE_INLINED_FUTEX
1444     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1445       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1446     } else
1447 #endif
1448     {
1449       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1450     }
1451   } else {
1452     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1453     lck = ilk->lock;
1454     if (__kmp_env_consistency_check) {
1455       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1456                       __kmp_map_hint_to_lock(hint));
1457     }
1458 #if USE_ITT_BUILD
1459     __kmp_itt_critical_acquiring(lck);
1460 #endif
1461 #if OMPT_SUPPORT && OMPT_OPTIONAL
1462     if (ompt_enabled.enabled) {
1463       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1464       /* OMPT state update */
1465       prev_state = ti.state;
1466       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1467       ti.state = ompt_state_wait_critical;
1468 
1469       /* OMPT event callback */
1470       if (ompt_enabled.ompt_callback_mutex_acquire) {
1471         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1472             ompt_mutex_critical, (unsigned int)hint,
1473             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1474             codeptr);
1475       }
1476     }
1477 #endif
1478     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1479   }
1480   KMP_POP_PARTITIONED_TIMER();
1481 
1482 #if USE_ITT_BUILD
1483   __kmp_itt_critical_acquired(lck);
1484 #endif /* USE_ITT_BUILD */
1485 #if OMPT_SUPPORT && OMPT_OPTIONAL
1486   if (ompt_enabled.enabled) {
1487     /* OMPT state update */
1488     ti.state = prev_state;
1489     ti.wait_id = 0;
1490 
1491     /* OMPT event callback */
1492     if (ompt_enabled.ompt_callback_mutex_acquired) {
1493       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1494           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1495     }
1496   }
1497 #endif
1498 
1499   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1500   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1501 } // __kmpc_critical_with_hint
1502 
1503 #endif // KMP_USE_DYNAMIC_LOCK
1504 
1505 /*!
1506 @ingroup WORK_SHARING
1507 @param loc  source location information.
@param global_tid  global thread number.
1509 @param crit identity of the critical section. This could be a pointer to a lock
1510 associated with the critical section, or some other suitably unique value.
1511 
1512 Leave a critical section, releasing any lock that was held during its execution.
1513 */
1514 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1515                          kmp_critical_name *crit) {
1516   kmp_user_lock_p lck;
1517 
1518   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1519 
1520 #if KMP_USE_DYNAMIC_LOCK
1521   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1522     lck = (kmp_user_lock_p)crit;
1523     KMP_ASSERT(lck != NULL);
1524     if (__kmp_env_consistency_check) {
1525       __kmp_pop_sync(global_tid, ct_critical, loc);
1526     }
1527 #if USE_ITT_BUILD
1528     __kmp_itt_critical_releasing(lck);
1529 #endif
1530 #if KMP_USE_INLINED_TAS
1531     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1532       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1533     } else
1534 #elif KMP_USE_INLINED_FUTEX
1535     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1536       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1537     } else
1538 #endif
1539     {
1540       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1541     }
1542   } else {
1543     kmp_indirect_lock_t *ilk =
1544         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1545     KMP_ASSERT(ilk != NULL);
1546     lck = ilk->lock;
1547     if (__kmp_env_consistency_check) {
1548       __kmp_pop_sync(global_tid, ct_critical, loc);
1549     }
1550 #if USE_ITT_BUILD
1551     __kmp_itt_critical_releasing(lck);
1552 #endif
1553     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1554   }
1555 
1556 #else // KMP_USE_DYNAMIC_LOCK
1557 
1558   if ((__kmp_user_lock_kind == lk_tas) &&
1559       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1560     lck = (kmp_user_lock_p)crit;
1561   }
1562 #if KMP_USE_FUTEX
1563   else if ((__kmp_user_lock_kind == lk_futex) &&
1564            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1565     lck = (kmp_user_lock_p)crit;
1566   }
1567 #endif
1568   else { // ticket, queuing or drdpa
1569     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1570   }
1571 
1572   KMP_ASSERT(lck != NULL);
1573 
1574   if (__kmp_env_consistency_check)
1575     __kmp_pop_sync(global_tid, ct_critical, loc);
1576 
1577 #if USE_ITT_BUILD
1578   __kmp_itt_critical_releasing(lck);
1579 #endif /* USE_ITT_BUILD */
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1582   __kmp_release_user_lock_with_checks(lck, global_tid);
1583 
1584 #endif // KMP_USE_DYNAMIC_LOCK
1585 
1586 #if OMPT_SUPPORT && OMPT_OPTIONAL
1587   /* OMPT release event triggers after lock is released; place here to trigger
1588    * for all #if branches */
1589   OMPT_STORE_RETURN_ADDRESS(global_tid);
1590   if (ompt_enabled.ompt_callback_mutex_released) {
1591     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1592         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1593         OMPT_LOAD_RETURN_ADDRESS(0));
1594   }
1595 #endif
1596 
1597   KMP_POP_PARTITIONED_TIMER();
1598   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1599 }
1600 
1601 /*!
1602 @ingroup SYNCHRONIZATION
1603 @param loc source location information
1604 @param global_tid thread id.
1605 @return one if the thread should execute the master block, zero otherwise
1606 
1607 Start execution of a combined barrier and master. The barrier is executed inside
1608 this function.
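
A minimal usage sketch (illustrative only; in practice the compiler emits these
calls when lowering the combined construct):
@code
if (__kmpc_barrier_master(loc, gtid)) {
  // ... code executed only by the master thread ...
  __kmpc_end_barrier_master(loc, gtid); // releases the waiting threads
}
@endcode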
1609 */
1610 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1611   int status;
1612 
1613   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1614 
1615   if (!TCR_4(__kmp_init_parallel))
1616     __kmp_parallel_initialize();
1617 
1618   __kmp_resume_if_soft_paused();
1619 
1620   if (__kmp_env_consistency_check)
1621     __kmp_check_barrier(global_tid, ct_barrier, loc);
1622 
1623 #if OMPT_SUPPORT
1624   ompt_frame_t *ompt_frame;
1625   if (ompt_enabled.enabled) {
1626     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1627     if (ompt_frame->enter_frame.ptr == NULL)
1628       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1629     OMPT_STORE_RETURN_ADDRESS(global_tid);
1630   }
1631 #endif
1632 #if USE_ITT_NOTIFY
1633   __kmp_threads[global_tid]->th.th_ident = loc;
1634 #endif
1635   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1636 #if OMPT_SUPPORT && OMPT_OPTIONAL
1637   if (ompt_enabled.enabled) {
1638     ompt_frame->enter_frame = ompt_data_none;
1639   }
1640 #endif
1641 
1642   return (status != 0) ? 0 : 1;
1643 }
1644 
1645 /*!
1646 @ingroup SYNCHRONIZATION
1647 @param loc source location information
1648 @param global_tid thread id.
1649 
1650 Complete the execution of a combined barrier and master. This function should
1651 only be called at the completion of the <tt>master</tt> code. Other threads will
1652 still be waiting at the barrier and this call releases them.
1653 */
1654 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1655   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1656 
1657   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1658 }
1659 
1660 /*!
1661 @ingroup SYNCHRONIZATION
1662 @param loc source location information
1663 @param global_tid thread id.
1664 @return one if the thread should execute the master block, zero otherwise
1665 
1666 Start execution of a combined barrier and master(nowait) construct.
1667 The barrier is executed inside this function.
There is no equivalent "end" function; because this is a nowait construct, the
actions that __kmpc_end_master would otherwise perform are carried out inside
this call.
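
Illustrative usage sketch (compiler-generated in practice):
@code
if (__kmpc_barrier_master_nowait(loc, gtid)) {
  // ... code executed only by the master thread; no "end" call is needed ...
}
@endcode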
1669 */
1670 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1671   kmp_int32 ret;
1672 
1673   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1674 
1675   if (!TCR_4(__kmp_init_parallel))
1676     __kmp_parallel_initialize();
1677 
1678   __kmp_resume_if_soft_paused();
1679 
1680   if (__kmp_env_consistency_check) {
1681     if (loc == 0) {
1682       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1683     }
1684     __kmp_check_barrier(global_tid, ct_barrier, loc);
1685   }
1686 
1687 #if OMPT_SUPPORT
1688   ompt_frame_t *ompt_frame;
1689   if (ompt_enabled.enabled) {
1690     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1691     if (ompt_frame->enter_frame.ptr == NULL)
1692       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1693     OMPT_STORE_RETURN_ADDRESS(global_tid);
1694   }
1695 #endif
1696 #if USE_ITT_NOTIFY
1697   __kmp_threads[global_tid]->th.th_ident = loc;
1698 #endif
1699   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1700 #if OMPT_SUPPORT && OMPT_OPTIONAL
1701   if (ompt_enabled.enabled) {
1702     ompt_frame->enter_frame = ompt_data_none;
1703   }
1704 #endif
1705 
1706   ret = __kmpc_master(loc, global_tid);
1707 
1708   if (__kmp_env_consistency_check) {
    /* __kmpc_end_master is not called for this construct, so the bookkeeping */
    /* actions of __kmpc_end_master are performed here instead.               */
1711 
1712     if (global_tid < 0) {
1713       KMP_WARNING(ThreadIdentInvalid);
1714     }
1715     if (ret) {
1716       /* only one thread should do the pop since only */
1717       /* one did the push (see __kmpc_master())       */
1718 
1719       __kmp_pop_sync(global_tid, ct_master, loc);
1720     }
1721   }
1722 
1723   return (ret);
1724 }
1725 
1726 /* The BARRIER for a SINGLE process section is always explicit   */
1727 /*!
1728 @ingroup WORK_SHARING
1729 @param loc  source location information
1730 @param global_tid  global thread number
1731 @return One if this thread should execute the single construct, zero otherwise.
1732 
1733 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
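
An illustrative lowering sketch for a <tt>single</tt> construct without
<tt>nowait</tt> (actual code generation is compiler-specific):
@code
if (__kmpc_single(loc, gtid)) {
  // ... code executed by exactly one thread ...
  __kmpc_end_single(loc, gtid);
}
__kmpc_barrier(loc, gtid); // explicit barrier; omitted if nowait was specified
@endcode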
1736 */
1737 
1738 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1739   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1740 
1741   if (rc) {
1742     // We are going to execute the single statement, so we should count it.
1743     KMP_COUNT_BLOCK(OMP_SINGLE);
1744     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1745   }
1746 
1747 #if OMPT_SUPPORT && OMPT_OPTIONAL
1748   kmp_info_t *this_thr = __kmp_threads[global_tid];
1749   kmp_team_t *team = this_thr->th.th_team;
1750   int tid = __kmp_tid_from_gtid(global_tid);
1751 
1752   if (ompt_enabled.enabled) {
1753     if (rc) {
1754       if (ompt_enabled.ompt_callback_work) {
1755         ompt_callbacks.ompt_callback(ompt_callback_work)(
1756             ompt_work_single_executor, ompt_scope_begin,
1757             &(team->t.ompt_team_info.parallel_data),
1758             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1759             1, OMPT_GET_RETURN_ADDRESS(0));
1760       }
1761     } else {
1762       if (ompt_enabled.ompt_callback_work) {
1763         ompt_callbacks.ompt_callback(ompt_callback_work)(
1764             ompt_work_single_other, ompt_scope_begin,
1765             &(team->t.ompt_team_info.parallel_data),
1766             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1767             1, OMPT_GET_RETURN_ADDRESS(0));
1768         ompt_callbacks.ompt_callback(ompt_callback_work)(
1769             ompt_work_single_other, ompt_scope_end,
1770             &(team->t.ompt_team_info.parallel_data),
1771             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1772             1, OMPT_GET_RETURN_ADDRESS(0));
1773       }
1774     }
1775   }
1776 #endif
1777 
1778   return rc;
1779 }
1780 
1781 /*!
1782 @ingroup WORK_SHARING
1783 @param loc  source location information
1784 @param global_tid  global thread number
1785 
1786 Mark the end of a <tt>single</tt> construct.  This function should
1787 only be called by the thread that executed the block of code protected
by the <tt>single</tt> construct.
1789 */
1790 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1791   __kmp_exit_single(global_tid);
1792   KMP_POP_PARTITIONED_TIMER();
1793 
1794 #if OMPT_SUPPORT && OMPT_OPTIONAL
1795   kmp_info_t *this_thr = __kmp_threads[global_tid];
1796   kmp_team_t *team = this_thr->th.th_team;
1797   int tid = __kmp_tid_from_gtid(global_tid);
1798 
1799   if (ompt_enabled.ompt_callback_work) {
1800     ompt_callbacks.ompt_callback(ompt_callback_work)(
1801         ompt_work_single_executor, ompt_scope_end,
1802         &(team->t.ompt_team_info.parallel_data),
1803         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1804         OMPT_GET_RETURN_ADDRESS(0));
1805   }
1806 #endif
1807 }
1808 
1809 /*!
1810 @ingroup WORK_SHARING
1811 @param loc Source location
1812 @param global_tid Global thread id
1813 
1814 Mark the end of a statically scheduled loop.
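
Every __kmpc_for_static_init_* call is expected to be matched by a call to this
function. A rough lowering sketch for a 100-iteration <tt>schedule(static)</tt>
loop follows (illustrative only; the argument values shown are plausible rather
than authoritative):
@code
kmp_int32 lower = 0, upper = 99, stride = 1, lastiter = 0;
__kmpc_for_static_init_4(loc, gtid, kmp_sch_static, &lastiter, &lower, &upper,
                         &stride, 1, 0);
for (kmp_int32 i = lower; i <= upper; ++i) {
  // ... loop body ...
}
__kmpc_for_static_fini(loc, gtid);
@endcode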
1815 */
1816 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1817   KMP_POP_PARTITIONED_TIMER();
1818   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1819 
1820 #if OMPT_SUPPORT && OMPT_OPTIONAL
1821   if (ompt_enabled.ompt_callback_work) {
1822     ompt_work_t ompt_work_type = ompt_work_loop;
1823     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1824     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1825     // Determine workshare type
1826     if (loc != NULL) {
1827       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1828         ompt_work_type = ompt_work_loop;
1829       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1830         ompt_work_type = ompt_work_sections;
1831       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1832         ompt_work_type = ompt_work_distribute;
1833       } else {
1834         // use default set above.
1835         // a warning about this case is provided in __kmpc_for_static_init
1836       }
1837       KMP_DEBUG_ASSERT(ompt_work_type);
1838     }
1839     ompt_callbacks.ompt_callback(ompt_callback_work)(
1840         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1841         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1842   }
1843 #endif
1844   if (__kmp_env_consistency_check)
1845     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1846 }
1847 
1848 // User routines which take C-style arguments (call by value)
1849 // different from the Fortran equivalent routines
1850 
1851 void ompc_set_num_threads(int arg) {
1852   // !!!!! TODO: check the per-task binding
1853   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1854 }
1855 
1856 void ompc_set_dynamic(int flag) {
1857   kmp_info_t *thread;
1858 
1859   /* For the thread-private implementation of the internal controls */
1860   thread = __kmp_entry_thread();
1861 
1862   __kmp_save_internal_controls(thread);
1863 
1864   set__dynamic(thread, flag ? TRUE : FALSE);
1865 }
1866 
1867 void ompc_set_nested(int flag) {
1868   kmp_info_t *thread;
1869 
1870   /* For the thread-private internal controls implementation */
1871   thread = __kmp_entry_thread();
1872 
1873   __kmp_save_internal_controls(thread);
1874 
1875   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1876 }
1877 
1878 void ompc_set_max_active_levels(int max_active_levels) {
  /* TODO: we want a per-task implementation of this internal control */
1881 
1882   /* For the per-thread internal controls implementation */
1883   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1884 }
1885 
1886 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1887   // !!!!! TODO: check the per-task binding
1888   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1889 }
1890 
1891 int ompc_get_ancestor_thread_num(int level) {
1892   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1893 }
1894 
1895 int ompc_get_team_size(int level) {
1896   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1897 }
1898 
1899 /* OpenMP 5.0 Affinity Format API */
1900 
1901 void ompc_set_affinity_format(char const *format) {
1902   if (!__kmp_init_serial) {
1903     __kmp_serial_initialize();
1904   }
1905   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1906                          format, KMP_STRLEN(format) + 1);
1907 }
1908 
1909 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1910   size_t format_size;
1911   if (!__kmp_init_serial) {
1912     __kmp_serial_initialize();
1913   }
1914   format_size = KMP_STRLEN(__kmp_affinity_format);
1915   if (buffer && size) {
1916     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1917                            format_size + 1);
1918   }
1919   return format_size;
1920 }
1921 
1922 void ompc_display_affinity(char const *format) {
1923   int gtid;
1924   if (!TCR_4(__kmp_init_middle)) {
1925     __kmp_middle_initialize();
1926   }
1927   gtid = __kmp_get_gtid();
1928   __kmp_aux_display_affinity(gtid, format);
1929 }
1930 
1931 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1932                              char const *format) {
1933   int gtid;
1934   size_t num_required;
1935   kmp_str_buf_t capture_buf;
1936   if (!TCR_4(__kmp_init_middle)) {
1937     __kmp_middle_initialize();
1938   }
1939   gtid = __kmp_get_gtid();
1940   __kmp_str_buf_init(&capture_buf);
1941   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1942   if (buffer && buf_size) {
1943     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1944                            capture_buf.used + 1);
1945   }
1946   __kmp_str_buf_free(&capture_buf);
1947   return num_required;
1948 }
1949 
1950 void kmpc_set_stacksize(int arg) {
1951   // __kmp_aux_set_stacksize initializes the library if needed
1952   __kmp_aux_set_stacksize(arg);
1953 }
1954 
1955 void kmpc_set_stacksize_s(size_t arg) {
1956   // __kmp_aux_set_stacksize initializes the library if needed
1957   __kmp_aux_set_stacksize(arg);
1958 }
1959 
1960 void kmpc_set_blocktime(int arg) {
1961   int gtid, tid;
1962   kmp_info_t *thread;
1963 
1964   gtid = __kmp_entry_gtid();
1965   tid = __kmp_tid_from_gtid(gtid);
1966   thread = __kmp_thread_from_gtid(gtid);
1967 
1968   __kmp_aux_set_blocktime(arg, thread, tid);
1969 }
1970 
1971 void kmpc_set_library(int arg) {
1972   // __kmp_user_set_library initializes the library if needed
1973   __kmp_user_set_library((enum library_type)arg);
1974 }
1975 
1976 void kmpc_set_defaults(char const *str) {
1977   // __kmp_aux_set_defaults initializes the library if needed
1978   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1979 }
1980 
1981 void kmpc_set_disp_num_buffers(int arg) {
1982   // ignore after initialization because some teams have already
1983   // allocated dispatch buffers
1984   if (__kmp_init_serial == 0 && arg > 0)
1985     __kmp_dispatch_num_buffers = arg;
1986 }
1987 
1988 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1989 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1990   return -1;
1991 #else
1992   if (!TCR_4(__kmp_init_middle)) {
1993     __kmp_middle_initialize();
1994   }
1995   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1996 #endif
1997 }
1998 
1999 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2000 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2001   return -1;
2002 #else
2003   if (!TCR_4(__kmp_init_middle)) {
2004     __kmp_middle_initialize();
2005   }
2006   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2007 #endif
2008 }
2009 
2010 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2011 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2012   return -1;
2013 #else
2014   if (!TCR_4(__kmp_init_middle)) {
2015     __kmp_middle_initialize();
2016   }
2017   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2018 #endif
2019 }
2020 
2021 /* -------------------------------------------------------------------------- */
2022 /*!
2023 @ingroup THREADPRIVATE
2024 @param loc       source location information
2025 @param gtid      global thread number
2026 @param cpy_size  size of the cpy_data buffer
2027 @param cpy_data  pointer to data to be copied
2028 @param cpy_func  helper function to call for copying data
2029 @param didit     flag variable: 1=single thread; 0=not single thread
2030 
2031 __kmpc_copyprivate implements the interface for the private data broadcast
2032 needed for the copyprivate clause associated with a single region in an
2033 OpenMP<sup>*</sup> program (both C and Fortran).
2034 All threads participating in the parallel region call this routine.
2035 One of the threads (called the single thread) should have the <tt>didit</tt>
2036 variable set to 1 and all other threads should have that variable set to 0.
2037 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2038 
2039 The OpenMP specification forbids the use of nowait on the single region when a
2040 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2041 barrier internally to avoid race conditions, so the code generation for the
2042 single region should avoid generating a barrier after the call to @ref
2043 __kmpc_copyprivate.
2044 
2045 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2046 The <tt>loc</tt> parameter is a pointer to source location information.
2047 
2048 Internal implementation: The single thread will first copy its descriptor
address (cpy_data) to a team-private location; then the other threads each
call the function pointed to by the cpy_func parameter, which carries out the
copy from the single thread's cpy_data buffer into their own.
2052 
2053 The cpy_func routine used for the copy and the contents of the data area defined
2054 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2055 to be done. For instance, the cpy_data buffer can hold the actual data to be
2056 copied or it may hold a list of pointers to the data. The cpy_func routine must
2057 interpret the cpy_data buffer appropriately.
2058 
2059 The interface to cpy_func is as follows:
2060 @code
2061 void cpy_func( void *destination, void *source )
2062 @endcode
2063 where void *destination is the cpy_data pointer for the thread being copied to
2064 and void *source is the cpy_data pointer for the thread being copied from.
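
For example, a broadcast of a single <tt>int</tt> could be coded as the
following sketch (the names used here are purely illustrative):
@code
static void copy_one_int(void *dst, void *src) { *(int *)dst = *(int *)src; }

void lowered_single_copyprivate(ident_t *loc, kmp_int32 gtid) {
  int value;                                // each thread's private copy
  kmp_int32 didit = __kmpc_single(loc, gtid);
  if (didit) {
    value = 42;                             // produced by the single thread
    __kmpc_end_single(loc, gtid);
  }
  // Broadcast the single thread's value into every thread's 'value'.
  __kmpc_copyprivate(loc, gtid, sizeof(int), &value, copy_one_int, didit);
}
@endcode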
2065 */
2066 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2067                         void *cpy_data, void (*cpy_func)(void *, void *),
2068                         kmp_int32 didit) {
2069   void **data_ptr;
2070 
2071   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2072 
2073   KMP_MB();
2074 
2075   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2076 
2077   if (__kmp_env_consistency_check) {
2078     if (loc == 0) {
2079       KMP_WARNING(ConstructIdentInvalid);
2080     }
2081   }
2082 
2083   // ToDo: Optimize the following two barriers into some kind of split barrier
2084 
2085   if (didit)
2086     *data_ptr = cpy_data;
2087 
2088 #if OMPT_SUPPORT
2089   ompt_frame_t *ompt_frame;
2090   if (ompt_enabled.enabled) {
2091     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2092     if (ompt_frame->enter_frame.ptr == NULL)
2093       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2094     OMPT_STORE_RETURN_ADDRESS(gtid);
2095   }
2096 #endif
2097 /* This barrier is not a barrier region boundary */
2098 #if USE_ITT_NOTIFY
2099   __kmp_threads[gtid]->th.th_ident = loc;
2100 #endif
2101   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2102 
2103   if (!didit)
2104     (*cpy_func)(cpy_data, *data_ptr);
2105 
2106 // Consider next barrier a user-visible barrier for barrier region boundaries
2107 // Nesting checks are already handled by the single construct checks
2108 
2109 #if OMPT_SUPPORT
2110   if (ompt_enabled.enabled) {
2111     OMPT_STORE_RETURN_ADDRESS(gtid);
2112   }
2113 #endif
2114 #if USE_ITT_NOTIFY
2115   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2116 // tasks can overwrite the location)
2117 #endif
2118   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2119 #if OMPT_SUPPORT && OMPT_OPTIONAL
2120   if (ompt_enabled.enabled) {
2121     ompt_frame->enter_frame = ompt_data_none;
2122   }
2123 #endif
2124 }
2125 
2126 /* -------------------------------------------------------------------------- */
2127 
2128 #define INIT_LOCK __kmp_init_user_lock_with_checks
2129 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2130 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2131 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2132 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2133 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2134   __kmp_acquire_nested_user_lock_with_checks_timed
2135 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2136 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2137 #define TEST_LOCK __kmp_test_user_lock_with_checks
2138 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2139 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2140 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2141 
2142 // TODO: Make check abort messages use location info & pass it into
2143 // with_checks routines
2144 
2145 #if KMP_USE_DYNAMIC_LOCK
2146 
2147 // internal lock initializer
2148 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2149                                                     kmp_dyna_lockseq_t seq) {
2150   if (KMP_IS_D_LOCK(seq)) {
2151     KMP_INIT_D_LOCK(lock, seq);
2152 #if USE_ITT_BUILD
2153     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2154 #endif
2155   } else {
2156     KMP_INIT_I_LOCK(lock, seq);
2157 #if USE_ITT_BUILD
2158     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2159     __kmp_itt_lock_creating(ilk->lock, loc);
2160 #endif
2161   }
2162 }
2163 
2164 // internal nest lock initializer
2165 static __forceinline void
2166 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2167                                kmp_dyna_lockseq_t seq) {
2168 #if KMP_USE_TSX
  // There is no nested lock implementation for speculative locks
2170   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2171     seq = __kmp_user_lock_seq;
2172 #endif
2173   switch (seq) {
2174   case lockseq_tas:
2175     seq = lockseq_nested_tas;
2176     break;
2177 #if KMP_USE_FUTEX
2178   case lockseq_futex:
2179     seq = lockseq_nested_futex;
2180     break;
2181 #endif
2182   case lockseq_ticket:
2183     seq = lockseq_nested_ticket;
2184     break;
2185   case lockseq_queuing:
2186     seq = lockseq_nested_queuing;
2187     break;
2188   case lockseq_drdpa:
2189     seq = lockseq_nested_drdpa;
2190     break;
2191   default:
2192     seq = lockseq_nested_queuing;
2193   }
2194   KMP_INIT_I_LOCK(lock, seq);
2195 #if USE_ITT_BUILD
2196   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2197   __kmp_itt_lock_creating(ilk->lock, loc);
2198 #endif
2199 }
2200 
2201 /* initialize the lock with a hint */
2202 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2203                                 uintptr_t hint) {
2204   KMP_DEBUG_ASSERT(__kmp_init_serial);
2205   if (__kmp_env_consistency_check && user_lock == NULL) {
2206     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2207   }
2208 
2209   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2210 
2211 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2213   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2214   if (!codeptr)
2215     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2216   if (ompt_enabled.ompt_callback_lock_init) {
2217     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2218         ompt_mutex_lock, (omp_lock_hint_t)hint,
2219         __ompt_get_mutex_impl_type(user_lock),
2220         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2221   }
2222 #endif
2223 }
2224 
2225 /* initialize the lock with a hint */
2226 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2227                                      void **user_lock, uintptr_t hint) {
2228   KMP_DEBUG_ASSERT(__kmp_init_serial);
2229   if (__kmp_env_consistency_check && user_lock == NULL) {
2230     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2231   }
2232 
2233   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2234 
2235 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_nest_lock_with_hint:
2237   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2238   if (!codeptr)
2239     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2240   if (ompt_enabled.ompt_callback_lock_init) {
2241     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2242         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2243         __ompt_get_mutex_impl_type(user_lock),
2244         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2245   }
2246 #endif
2247 }
2248 
2249 #endif // KMP_USE_DYNAMIC_LOCK
2250 
2251 /* initialize the lock */
2252 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2253 #if KMP_USE_DYNAMIC_LOCK
2254 
2255   KMP_DEBUG_ASSERT(__kmp_init_serial);
2256   if (__kmp_env_consistency_check && user_lock == NULL) {
2257     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2258   }
2259   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2260 
2261 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2263   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2264   if (!codeptr)
2265     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2266   if (ompt_enabled.ompt_callback_lock_init) {
2267     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2268         ompt_mutex_lock, omp_lock_hint_none,
2269         __ompt_get_mutex_impl_type(user_lock),
2270         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2271   }
2272 #endif
2273 
2274 #else // KMP_USE_DYNAMIC_LOCK
2275 
2276   static char const *const func = "omp_init_lock";
2277   kmp_user_lock_p lck;
2278   KMP_DEBUG_ASSERT(__kmp_init_serial);
2279 
2280   if (__kmp_env_consistency_check) {
2281     if (user_lock == NULL) {
2282       KMP_FATAL(LockIsUninitialized, func);
2283     }
2284   }
2285 
2286   KMP_CHECK_USER_LOCK_INIT();
2287 
2288   if ((__kmp_user_lock_kind == lk_tas) &&
2289       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2290     lck = (kmp_user_lock_p)user_lock;
2291   }
2292 #if KMP_USE_FUTEX
2293   else if ((__kmp_user_lock_kind == lk_futex) &&
2294            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2295     lck = (kmp_user_lock_p)user_lock;
2296   }
2297 #endif
2298   else {
2299     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2300   }
2301   INIT_LOCK(lck);
2302   __kmp_set_user_lock_location(lck, loc);
2303 
2304 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2306   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2307   if (!codeptr)
2308     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2309   if (ompt_enabled.ompt_callback_lock_init) {
2310     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2311         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2312         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2313   }
2314 #endif
2315 
2316 #if USE_ITT_BUILD
2317   __kmp_itt_lock_creating(lck);
2318 #endif /* USE_ITT_BUILD */
2319 
2320 #endif // KMP_USE_DYNAMIC_LOCK
2321 } // __kmpc_init_lock
2322 
/* initialize the nested lock */
2324 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2325 #if KMP_USE_DYNAMIC_LOCK
2326 
2327   KMP_DEBUG_ASSERT(__kmp_init_serial);
2328   if (__kmp_env_consistency_check && user_lock == NULL) {
2329     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2330   }
2331   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2332 
2333 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2335   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2336   if (!codeptr)
2337     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2338   if (ompt_enabled.ompt_callback_lock_init) {
2339     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2340         ompt_mutex_nest_lock, omp_lock_hint_none,
2341         __ompt_get_mutex_impl_type(user_lock),
2342         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2343   }
2344 #endif
2345 
2346 #else // KMP_USE_DYNAMIC_LOCK
2347 
2348   static char const *const func = "omp_init_nest_lock";
2349   kmp_user_lock_p lck;
2350   KMP_DEBUG_ASSERT(__kmp_init_serial);
2351 
2352   if (__kmp_env_consistency_check) {
2353     if (user_lock == NULL) {
2354       KMP_FATAL(LockIsUninitialized, func);
2355     }
2356   }
2357 
2358   KMP_CHECK_USER_LOCK_INIT();
2359 
2360   if ((__kmp_user_lock_kind == lk_tas) &&
2361       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2362        OMP_NEST_LOCK_T_SIZE)) {
2363     lck = (kmp_user_lock_p)user_lock;
2364   }
2365 #if KMP_USE_FUTEX
2366   else if ((__kmp_user_lock_kind == lk_futex) &&
2367            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2368             OMP_NEST_LOCK_T_SIZE)) {
2369     lck = (kmp_user_lock_p)user_lock;
2370   }
2371 #endif
2372   else {
2373     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2374   }
2375 
2376   INIT_NESTED_LOCK(lck);
2377   __kmp_set_user_lock_location(lck, loc);
2378 
2379 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2381   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2382   if (!codeptr)
2383     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2384   if (ompt_enabled.ompt_callback_lock_init) {
2385     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2386         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2387         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2388   }
2389 #endif
2390 
2391 #if USE_ITT_BUILD
2392   __kmp_itt_lock_creating(lck);
2393 #endif /* USE_ITT_BUILD */
2394 
2395 #endif // KMP_USE_DYNAMIC_LOCK
2396 } // __kmpc_init_nest_lock
2397 
2398 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2399 #if KMP_USE_DYNAMIC_LOCK
2400 
2401 #if USE_ITT_BUILD
2402   kmp_user_lock_p lck;
2403   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2404     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2405   } else {
2406     lck = (kmp_user_lock_p)user_lock;
2407   }
2408   __kmp_itt_lock_destroyed(lck);
2409 #endif
2410 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2412   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2413   if (!codeptr)
2414     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2415   if (ompt_enabled.ompt_callback_lock_destroy) {
2416     kmp_user_lock_p lck;
2417     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2418       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2419     } else {
2420       lck = (kmp_user_lock_p)user_lock;
2421     }
2422     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2423         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2424   }
2425 #endif
2426   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2427 #else
2428   kmp_user_lock_p lck;
2429 
2430   if ((__kmp_user_lock_kind == lk_tas) &&
2431       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2432     lck = (kmp_user_lock_p)user_lock;
2433   }
2434 #if KMP_USE_FUTEX
2435   else if ((__kmp_user_lock_kind == lk_futex) &&
2436            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2437     lck = (kmp_user_lock_p)user_lock;
2438   }
2439 #endif
2440   else {
2441     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2442   }
2443 
2444 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2446   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2447   if (!codeptr)
2448     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2449   if (ompt_enabled.ompt_callback_lock_destroy) {
2450     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2451         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2452   }
2453 #endif
2454 
2455 #if USE_ITT_BUILD
2456   __kmp_itt_lock_destroyed(lck);
2457 #endif /* USE_ITT_BUILD */
2458   DESTROY_LOCK(lck);
2459 
2460   if ((__kmp_user_lock_kind == lk_tas) &&
2461       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2462     ;
2463   }
2464 #if KMP_USE_FUTEX
2465   else if ((__kmp_user_lock_kind == lk_futex) &&
2466            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2467     ;
2468   }
2469 #endif
2470   else {
2471     __kmp_user_lock_free(user_lock, gtid, lck);
2472   }
2473 #endif // KMP_USE_DYNAMIC_LOCK
2474 } // __kmpc_destroy_lock
2475 
/* destroy the nested lock */
2477 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2478 #if KMP_USE_DYNAMIC_LOCK
2479 
2480 #if USE_ITT_BUILD
2481   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2482   __kmp_itt_lock_destroyed(ilk->lock);
2483 #endif
2484 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2486   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2487   if (!codeptr)
2488     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2489   if (ompt_enabled.ompt_callback_lock_destroy) {
2490     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2491         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2492   }
2493 #endif
2494   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2495 
2496 #else // KMP_USE_DYNAMIC_LOCK
2497 
2498   kmp_user_lock_p lck;
2499 
2500   if ((__kmp_user_lock_kind == lk_tas) &&
2501       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2502        OMP_NEST_LOCK_T_SIZE)) {
2503     lck = (kmp_user_lock_p)user_lock;
2504   }
2505 #if KMP_USE_FUTEX
2506   else if ((__kmp_user_lock_kind == lk_futex) &&
2507            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2508             OMP_NEST_LOCK_T_SIZE)) {
2509     lck = (kmp_user_lock_p)user_lock;
2510   }
2511 #endif
2512   else {
2513     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2514   }
2515 
2516 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2518   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2519   if (!codeptr)
2520     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2521   if (ompt_enabled.ompt_callback_lock_destroy) {
2522     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2523         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2524   }
2525 #endif
2526 
2527 #if USE_ITT_BUILD
2528   __kmp_itt_lock_destroyed(lck);
2529 #endif /* USE_ITT_BUILD */
2530 
2531   DESTROY_NESTED_LOCK(lck);
2532 
2533   if ((__kmp_user_lock_kind == lk_tas) &&
2534       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2535        OMP_NEST_LOCK_T_SIZE)) {
2536     ;
2537   }
2538 #if KMP_USE_FUTEX
2539   else if ((__kmp_user_lock_kind == lk_futex) &&
2540            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2541             OMP_NEST_LOCK_T_SIZE)) {
2542     ;
2543   }
2544 #endif
2545   else {
2546     __kmp_user_lock_free(user_lock, gtid, lck);
2547   }
2548 #endif // KMP_USE_DYNAMIC_LOCK
2549 } // __kmpc_destroy_nest_lock
2550 
2551 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2552   KMP_COUNT_BLOCK(OMP_set_lock);
2553 #if KMP_USE_DYNAMIC_LOCK
2554   int tag = KMP_EXTRACT_D_TAG(user_lock);
2555 #if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(
      (kmp_user_lock_p)
          user_lock); // the ITT function resolves the correct lock object.
2559 #endif
2560 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2562   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2563   if (!codeptr)
2564     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2565   if (ompt_enabled.ompt_callback_mutex_acquire) {
2566     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2567         ompt_mutex_lock, omp_lock_hint_none,
2568         __ompt_get_mutex_impl_type(user_lock),
2569         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2570   }
2571 #endif
2572 #if KMP_USE_INLINED_TAS
2573   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2574     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2575   } else
2576 #elif KMP_USE_INLINED_FUTEX
2577   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2578     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2579   } else
2580 #endif
2581   {
2582     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2583   }
2584 #if USE_ITT_BUILD
2585   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2586 #endif
2587 #if OMPT_SUPPORT && OMPT_OPTIONAL
2588   if (ompt_enabled.ompt_callback_mutex_acquired) {
2589     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2590         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2591   }
2592 #endif
2593 
2594 #else // KMP_USE_DYNAMIC_LOCK
2595 
2596   kmp_user_lock_p lck;
2597 
2598   if ((__kmp_user_lock_kind == lk_tas) &&
2599       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2600     lck = (kmp_user_lock_p)user_lock;
2601   }
2602 #if KMP_USE_FUTEX
2603   else if ((__kmp_user_lock_kind == lk_futex) &&
2604            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2605     lck = (kmp_user_lock_p)user_lock;
2606   }
2607 #endif
2608   else {
2609     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2610   }
2611 
2612 #if USE_ITT_BUILD
2613   __kmp_itt_lock_acquiring(lck);
2614 #endif /* USE_ITT_BUILD */
2615 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2617   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2618   if (!codeptr)
2619     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2620   if (ompt_enabled.ompt_callback_mutex_acquire) {
2621     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2622         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2623         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2624   }
2625 #endif
2626 
2627   ACQUIRE_LOCK(lck, gtid);
2628 
2629 #if USE_ITT_BUILD
2630   __kmp_itt_lock_acquired(lck);
2631 #endif /* USE_ITT_BUILD */
2632 
2633 #if OMPT_SUPPORT && OMPT_OPTIONAL
2634   if (ompt_enabled.ompt_callback_mutex_acquired) {
2635     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2636         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2637   }
2638 #endif
2639 
2640 #endif // KMP_USE_DYNAMIC_LOCK
2641 }
2642 
2643 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2644 #if KMP_USE_DYNAMIC_LOCK
2645 
2646 #if USE_ITT_BUILD
2647   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2648 #endif
2649 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2651   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2652   if (!codeptr)
2653     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2654   if (ompt_enabled.enabled) {
2655     if (ompt_enabled.ompt_callback_mutex_acquire) {
2656       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2657           ompt_mutex_nest_lock, omp_lock_hint_none,
2658           __ompt_get_mutex_impl_type(user_lock),
2659           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2660     }
2661   }
2662 #endif
2663   int acquire_status =
2664       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2665   (void) acquire_status;
2666 #if USE_ITT_BUILD
2667   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2668 #endif
2669 
2670 #if OMPT_SUPPORT && OMPT_OPTIONAL
2671   if (ompt_enabled.enabled) {
2672     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2673       if (ompt_enabled.ompt_callback_mutex_acquired) {
2674         // lock_first
2675         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2676             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2677             codeptr);
2678       }
2679     } else {
2680       if (ompt_enabled.ompt_callback_nest_lock) {
2681         // lock_next
2682         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2683             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2684       }
2685     }
2686   }
2687 #endif
2688 
2689 #else // KMP_USE_DYNAMIC_LOCK
2690   int acquire_status;
2691   kmp_user_lock_p lck;
2692 
2693   if ((__kmp_user_lock_kind == lk_tas) &&
2694       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2695        OMP_NEST_LOCK_T_SIZE)) {
2696     lck = (kmp_user_lock_p)user_lock;
2697   }
2698 #if KMP_USE_FUTEX
2699   else if ((__kmp_user_lock_kind == lk_futex) &&
2700            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2701             OMP_NEST_LOCK_T_SIZE)) {
2702     lck = (kmp_user_lock_p)user_lock;
2703   }
2704 #endif
2705   else {
2706     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2707   }
2708 
2709 #if USE_ITT_BUILD
2710   __kmp_itt_lock_acquiring(lck);
2711 #endif /* USE_ITT_BUILD */
2712 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2714   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2715   if (!codeptr)
2716     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2717   if (ompt_enabled.enabled) {
2718     if (ompt_enabled.ompt_callback_mutex_acquire) {
2719       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2720           ompt_mutex_nest_lock, omp_lock_hint_none,
2721           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2722           codeptr);
2723     }
2724   }
2725 #endif
2726 
2727   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2728 
2729 #if USE_ITT_BUILD
2730   __kmp_itt_lock_acquired(lck);
2731 #endif /* USE_ITT_BUILD */
2732 
2733 #if OMPT_SUPPORT && OMPT_OPTIONAL
2734   if (ompt_enabled.enabled) {
2735     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2736       if (ompt_enabled.ompt_callback_mutex_acquired) {
2737         // lock_first
2738         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2739             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2740       }
2741     } else {
2742       if (ompt_enabled.ompt_callback_nest_lock) {
2743         // lock_next
2744         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2745             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2746       }
2747     }
2748   }
2749 #endif
2750 
2751 #endif // KMP_USE_DYNAMIC_LOCK
2752 }
2753 
2754 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2755 #if KMP_USE_DYNAMIC_LOCK
2756 
2757   int tag = KMP_EXTRACT_D_TAG(user_lock);
2758 #if USE_ITT_BUILD
2759   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2760 #endif
2761 #if KMP_USE_INLINED_TAS
2762   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2763     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2764   } else
2765 #elif KMP_USE_INLINED_FUTEX
2766   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2767     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2768   } else
2769 #endif
2770   {
2771     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2772   }
2773 
2774 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2776   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2777   if (!codeptr)
2778     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2779   if (ompt_enabled.ompt_callback_mutex_released) {
2780     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2781         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2782   }
2783 #endif
2784 
2785 #else // KMP_USE_DYNAMIC_LOCK
2786 
2787   kmp_user_lock_p lck;
2788 
2789   /* Can't use serial interval since not block structured */
2790   /* release the lock */
2791 
2792   if ((__kmp_user_lock_kind == lk_tas) &&
2793       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2794 #if KMP_OS_LINUX &&                                                            \
2795     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2796 // "fast" path implemented to fix customer performance issue
2797 #if USE_ITT_BUILD
2798     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2799 #endif /* USE_ITT_BUILD */
2800     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2801     KMP_MB();
2802 
2803 #if OMPT_SUPPORT && OMPT_OPTIONAL
    // Use the return address stored by the caller, if available:
2805     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2806     if (!codeptr)
2807       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2808     if (ompt_enabled.ompt_callback_mutex_released) {
2809       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2810           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2811     }
2812 #endif
2813 
2814     return;
2815 #else
2816     lck = (kmp_user_lock_p)user_lock;
2817 #endif
2818   }
2819 #if KMP_USE_FUTEX
2820   else if ((__kmp_user_lock_kind == lk_futex) &&
2821            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2822     lck = (kmp_user_lock_p)user_lock;
2823   }
2824 #endif
2825   else {
2826     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2827   }
2828 
2829 #if USE_ITT_BUILD
2830   __kmp_itt_lock_releasing(lck);
2831 #endif /* USE_ITT_BUILD */
2832 
2833   RELEASE_LOCK(lck, gtid);
2834 
2835 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2837   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2838   if (!codeptr)
2839     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2840   if (ompt_enabled.ompt_callback_mutex_released) {
2841     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2842         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2843   }
2844 #endif
2845 
2846 #endif // KMP_USE_DYNAMIC_LOCK
2847 }
2848 
/* release the nested lock */
2850 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2851 #if KMP_USE_DYNAMIC_LOCK
2852 
2853 #if USE_ITT_BUILD
2854   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2855 #endif
2856   int release_status =
2857       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2858   (void) release_status;
2859 
2860 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2862   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2863   if (!codeptr)
2864     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2865   if (ompt_enabled.enabled) {
2866     if (release_status == KMP_LOCK_RELEASED) {
2867       if (ompt_enabled.ompt_callback_mutex_released) {
2868         // release_lock_last
2869         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2870             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2871             codeptr);
2872       }
2873     } else if (ompt_enabled.ompt_callback_nest_lock) {
2874       // release_lock_prev
2875       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2876           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2877     }
2878   }
2879 #endif
2880 
2881 #else // KMP_USE_DYNAMIC_LOCK
2882 
2883   kmp_user_lock_p lck;
2884 
2885   /* Can't use serial interval since not block structured */
2886 
2887   if ((__kmp_user_lock_kind == lk_tas) &&
2888       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2889        OMP_NEST_LOCK_T_SIZE)) {
2890 #if KMP_OS_LINUX &&                                                            \
2891     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2892     // "fast" path implemented to fix customer performance issue
2893     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2894 #if USE_ITT_BUILD
2895     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2896 #endif /* USE_ITT_BUILD */
2897 
2898 #if OMPT_SUPPORT && OMPT_OPTIONAL
2899     int release_status = KMP_LOCK_STILL_HELD;
2900 #endif
2901 
2902     if (--(tl->lk.depth_locked) == 0) {
2903       TCW_4(tl->lk.poll, 0);
2904 #if OMPT_SUPPORT && OMPT_OPTIONAL
2905       release_status = KMP_LOCK_RELEASED;
2906 #endif
2907     }
2908     KMP_MB();
2909 
2910 #if OMPT_SUPPORT && OMPT_OPTIONAL
    // Use the return address stored by the caller, if available:
2912     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2913     if (!codeptr)
2914       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2915     if (ompt_enabled.enabled) {
2916       if (release_status == KMP_LOCK_RELEASED) {
2917         if (ompt_enabled.ompt_callback_mutex_released) {
2918           // release_lock_last
2919           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2920               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2921         }
2922       } else if (ompt_enabled.ompt_callback_nest_lock) {
2923         // release_lock_previous
2924         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2926       }
2927     }
2928 #endif
2929 
2930     return;
2931 #else
2932     lck = (kmp_user_lock_p)user_lock;
2933 #endif
2934   }
2935 #if KMP_USE_FUTEX
2936   else if ((__kmp_user_lock_kind == lk_futex) &&
2937            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2938             OMP_NEST_LOCK_T_SIZE)) {
2939     lck = (kmp_user_lock_p)user_lock;
2940   }
2941 #endif
2942   else {
2943     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2944   }
2945 
2946 #if USE_ITT_BUILD
2947   __kmp_itt_lock_releasing(lck);
2948 #endif /* USE_ITT_BUILD */
2949 
2950   int release_status;
2951   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2952 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2954   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2955   if (!codeptr)
2956     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2957   if (ompt_enabled.enabled) {
2958     if (release_status == KMP_LOCK_RELEASED) {
2959       if (ompt_enabled.ompt_callback_mutex_released) {
2960         // release_lock_last
2961         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2962             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2963       }
2964     } else if (ompt_enabled.ompt_callback_nest_lock) {
2965       // release_lock_previous
2966       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2968     }
2969   }
2970 #endif
2971 
2972 #endif // KMP_USE_DYNAMIC_LOCK
2973 }
2974 
2975 /* try to acquire the lock */
2976 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2977   KMP_COUNT_BLOCK(OMP_test_lock);
2978 
2979 #if KMP_USE_DYNAMIC_LOCK
2980   int rc;
2981   int tag = KMP_EXTRACT_D_TAG(user_lock);
2982 #if USE_ITT_BUILD
2983   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2984 #endif
2985 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // Use the return address stored by the caller, if available:
2987   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2988   if (!codeptr)
2989     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2990   if (ompt_enabled.ompt_callback_mutex_acquire) {
2991     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2992         ompt_mutex_lock, omp_lock_hint_none,
2993         __ompt_get_mutex_impl_type(user_lock),
2994         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2995   }
2996 #endif
2997 #if KMP_USE_INLINED_TAS
2998   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2999     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3000   } else
3001 #elif KMP_USE_INLINED_FUTEX
3002   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3003     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3004   } else
3005 #endif
3006   {
3007     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3008   }
3009   if (rc) {
3010 #if USE_ITT_BUILD
3011     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3012 #endif
3013 #if OMPT_SUPPORT && OMPT_OPTIONAL
3014     if (ompt_enabled.ompt_callback_mutex_acquired) {
3015       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3016           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3017     }
3018 #endif
3019     return FTN_TRUE;
3020   } else {
3021 #if USE_ITT_BUILD
3022     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3023 #endif
3024     return FTN_FALSE;
3025   }
3026 
3027 #else // KMP_USE_DYNAMIC_LOCK
3028 
3029   kmp_user_lock_p lck;
3030   int rc;
3031 
3032   if ((__kmp_user_lock_kind == lk_tas) &&
3033       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3034     lck = (kmp_user_lock_p)user_lock;
3035   }
3036 #if KMP_USE_FUTEX
3037   else if ((__kmp_user_lock_kind == lk_futex) &&
3038            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3039     lck = (kmp_user_lock_p)user_lock;
3040   }
3041 #endif
3042   else {
3043     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3044   }
3045 
3046 #if USE_ITT_BUILD
3047   __kmp_itt_lock_acquiring(lck);
3048 #endif /* USE_ITT_BUILD */
3049 #if OMPT_SUPPORT && OMPT_OPTIONAL
3050   // This is the case, if called from omp_init_lock_with_hint:
3051   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3052   if (!codeptr)
3053     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3054   if (ompt_enabled.ompt_callback_mutex_acquire) {
3055     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3056         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3057         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3058   }
3059 #endif
3060 
3061   rc = TEST_LOCK(lck, gtid);
3062 #if USE_ITT_BUILD
3063   if (rc) {
3064     __kmp_itt_lock_acquired(lck);
3065   } else {
3066     __kmp_itt_lock_cancelled(lck);
3067   }
3068 #endif /* USE_ITT_BUILD */
3069 #if OMPT_SUPPORT && OMPT_OPTIONAL
3070   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3071     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3072         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3073   }
3074 #endif
3075 
3076   return (rc ? FTN_TRUE : FTN_FALSE);
3077 
3078 /* Can't use serial interval since not block structured */
3079 
3080 #endif // KMP_USE_DYNAMIC_LOCK
}
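
/* A minimal usage sketch of the user-level routine that reaches this entry
   point (hedged: the lookup above reports errors as "omp_test_lock", so the
   standard omp_test_lock() API is assumed; hypothetical user code, not part
   of the runtime):

     omp_lock_t l;
     omp_init_lock(&l);
     if (omp_test_lock(&l)) { // corresponds to FTN_TRUE from __kmpc_test_lock()
       // ... critical work ...
       omp_unset_lock(&l);
     } else {
       // lock was busy; do other work instead of blocking
     }
     omp_destroy_lock(&l);
*/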
3082 
3083 /* try to acquire the lock */
3084 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3085 #if KMP_USE_DYNAMIC_LOCK
3086   int rc;
3087 #if USE_ITT_BUILD
3088   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3089 #endif
3090 #if OMPT_SUPPORT && OMPT_OPTIONAL
3091   // This is the case, if called from omp_init_lock_with_hint:
3092   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3093   if (!codeptr)
3094     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3095   if (ompt_enabled.ompt_callback_mutex_acquire) {
3096     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3097         ompt_mutex_nest_lock, omp_lock_hint_none,
3098         __ompt_get_mutex_impl_type(user_lock),
3099         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3100   }
3101 #endif
3102   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3103 #if USE_ITT_BUILD
3104   if (rc) {
3105     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3106   } else {
3107     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3108   }
3109 #endif
3110 #if OMPT_SUPPORT && OMPT_OPTIONAL
3111   if (ompt_enabled.enabled && rc) {
3112     if (rc == 1) {
3113       if (ompt_enabled.ompt_callback_mutex_acquired) {
3114         // lock_first
3115         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3116             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3117             codeptr);
3118       }
3119     } else {
3120       if (ompt_enabled.ompt_callback_nest_lock) {
3121         // lock_next
3122         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3123             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3124       }
3125     }
3126   }
3127 #endif
3128   return rc;
3129 
3130 #else // KMP_USE_DYNAMIC_LOCK
3131 
3132   kmp_user_lock_p lck;
3133   int rc;
3134 
3135   if ((__kmp_user_lock_kind == lk_tas) &&
3136       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3137        OMP_NEST_LOCK_T_SIZE)) {
3138     lck = (kmp_user_lock_p)user_lock;
3139   }
3140 #if KMP_USE_FUTEX
3141   else if ((__kmp_user_lock_kind == lk_futex) &&
3142            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3143             OMP_NEST_LOCK_T_SIZE)) {
3144     lck = (kmp_user_lock_p)user_lock;
3145   }
3146 #endif
3147   else {
3148     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3149   }
3150 
3151 #if USE_ITT_BUILD
3152   __kmp_itt_lock_acquiring(lck);
3153 #endif /* USE_ITT_BUILD */
3154 
3155 #if OMPT_SUPPORT && OMPT_OPTIONAL
3156   // This is the case, if called from omp_init_lock_with_hint:
3157   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3158   if (!codeptr)
3159     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
3167 #endif
3168 
3169   rc = TEST_NESTED_LOCK(lck, gtid);
3170 #if USE_ITT_BUILD
3171   if (rc) {
3172     __kmp_itt_lock_acquired(lck);
3173   } else {
3174     __kmp_itt_lock_cancelled(lck);
3175   }
3176 #endif /* USE_ITT_BUILD */
3177 #if OMPT_SUPPORT && OMPT_OPTIONAL
3178   if (ompt_enabled.enabled && rc) {
3179     if (rc == 1) {
3180       if (ompt_enabled.ompt_callback_mutex_acquired) {
3181         // lock_first
3182         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3183             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3184       }
3185     } else {
3186       if (ompt_enabled.ompt_callback_nest_lock) {
3187         // lock_next
3188         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3189             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3190       }
3191     }
3192   }
3193 #endif
3194   return rc;
3195 
3196 /* Can't use serial interval since not block structured */
3197 
3198 #endif // KMP_USE_DYNAMIC_LOCK
3199 }
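
/* The value returned above is the nesting count after a successful
   acquisition (the OMPT branch treats rc == 1 as the first acquisition and
   rc > 1 as a re-acquisition) and 0 on failure, matching omp_test_nest_lock().
   A minimal usage sketch (hypothetical user code, not part of the runtime):

     omp_nest_lock_t l;
     omp_init_nest_lock(&l);
     omp_set_nest_lock(&l);          // nesting count becomes 1
     int c = omp_test_nest_lock(&l); // owner re-acquires; c == 2
     omp_unset_nest_lock(&l);
     omp_unset_nest_lock(&l);
     omp_destroy_nest_lock(&l);
*/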
3200 
3201 // Interface to fast scalable reduce methods routines
3202 
// Keep the selected method in a thread-local structure for cross-function
// usage: it will be consumed by the __kmpc_end_reduce* functions.
// An alternative would be to re-determine the method once more in the
// __kmpc_end_reduce* functions (a new prototype would be required then).
// AT: which solution is better?
3208 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3209   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3210 
3211 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3212   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3213 
3214 // description of the packed_reduction_method variable: look at the macros in
3215 // kmp.h
3216 
3217 // used in a critical section reduce block
3218 static __forceinline void
3219 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3220                                           kmp_critical_name *crit) {
3221 
  // This lock was visible to a customer and to the threading profile tool as a
  // serial overhead span (although it is used for an internal purpose only).
  //            Why was it visible in the previous implementation?
  //            Should we keep it visible in the new reduce block?
3226   kmp_user_lock_p lck;
3227 
3228 #if KMP_USE_DYNAMIC_LOCK
3229 
3230   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3231   // Check if it is initialized.
3232   if (*lk == 0) {
3233     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3234       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3235                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3236     } else {
3237       __kmp_init_indirect_csptr(crit, loc, global_tid,
3238                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3239     }
3240   }
3241   // Branch for accessing the actual lock object and set operation. This
3242   // branching is inevitable since this lock initialization does not follow the
3243   // normal dispatch path (lock table is not used).
3244   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3245     lck = (kmp_user_lock_p)lk;
3246     KMP_DEBUG_ASSERT(lck != NULL);
3247     if (__kmp_env_consistency_check) {
3248       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3249     }
3250     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3251   } else {
3252     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3253     lck = ilk->lock;
3254     KMP_DEBUG_ASSERT(lck != NULL);
3255     if (__kmp_env_consistency_check) {
3256       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3257     }
3258     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3259   }
3260 
3261 #else // KMP_USE_DYNAMIC_LOCK
3262 
3263   // We know that the fast reduction code is only emitted by Intel compilers
3264   // with 32 byte critical sections. If there isn't enough space, then we
3265   // have to use a pointer.
3266   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3267     lck = (kmp_user_lock_p)crit;
3268   } else {
3269     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3270   }
3271   KMP_DEBUG_ASSERT(lck != NULL);
3272 
3273   if (__kmp_env_consistency_check)
3274     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3275 
3276   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3277 
3278 #endif // KMP_USE_DYNAMIC_LOCK
3279 }
3280 
3281 // used in a critical section reduce block
3282 static __forceinline void
3283 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3284                                         kmp_critical_name *crit) {
3285 
3286   kmp_user_lock_p lck;
3287 
3288 #if KMP_USE_DYNAMIC_LOCK
3289 
3290   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3291     lck = (kmp_user_lock_p)crit;
3292     if (__kmp_env_consistency_check)
3293       __kmp_pop_sync(global_tid, ct_critical, loc);
3294     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3295   } else {
3296     kmp_indirect_lock_t *ilk =
3297         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3298     if (__kmp_env_consistency_check)
3299       __kmp_pop_sync(global_tid, ct_critical, loc);
3300     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3301   }
3302 
3303 #else // KMP_USE_DYNAMIC_LOCK
3304 
3305   // We know that the fast reduction code is only emitted by Intel compilers
3306   // with 32 byte critical sections. If there isn't enough space, then we have
3307   // to use a pointer.
3308   if (__kmp_base_user_lock_size > 32) {
3309     lck = *((kmp_user_lock_p *)crit);
3310     KMP_ASSERT(lck != NULL);
3311   } else {
3312     lck = (kmp_user_lock_p)crit;
3313   }
3314 
3315   if (__kmp_env_consistency_check)
3316     __kmp_pop_sync(global_tid, ct_critical, loc);
3317 
3318   __kmp_release_user_lock_with_checks(lck, global_tid);
3319 
3320 #endif // KMP_USE_DYNAMIC_LOCK
3321 } // __kmp_end_critical_section_reduce_block
3322 
3323 static __forceinline int
3324 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3325                                      int *task_state) {
3326   kmp_team_t *team;
3327 
  // Check whether we are inside a teams construct.
3329   if (th->th.th_teams_microtask) {
3330     *team_p = team = th->th.th_team;
3331     if (team->t.t_level == th->th.th_teams_level) {
3332       // This is reduction at teams construct.
3333       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3334       // Let's swap teams temporarily for the reduction.
3335       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3336       th->th.th_team = team->t.t_parent;
3337       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3338       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3339       *task_state = th->th.th_task_state;
3340       th->th.th_task_state = 0;
3341 
3342       return 1;
3343     }
3344   }
3345   return 0;
3346 }
3347 
3348 static __forceinline void
3349 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3350   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3351   th->th.th_info.ds.ds_tid = 0;
3352   th->th.th_team = team;
3353   th->th.th_team_nproc = team->t.t_nproc;
3354   th->th.th_task_team = team->t.t_task_team[task_state];
3355   th->th.th_task_state = task_state;
3356 }
3357 
3358 /* 2.a.i. Reduce Block without a terminating barrier */
3359 /*!
3360 @ingroup SYNCHRONIZATION
3361 @param loc source location information
3362 @param global_tid global thread number
3363 @param num_vars number of items (variables) to be reduced
3364 @param reduce_size size of data in bytes to be reduced
3365 @param reduce_data pointer to data to be reduced
3366 @param reduce_func callback function providing reduction operation on two
3367 operands and returning result of reduction in lhs_data
3368 @param lck pointer to the unique lock data structure
3369 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3370 threads if atomic reduction needed
3371 
3372 The nowait version is used for a reduce clause with the nowait argument.
3373 */
3374 kmp_int32
3375 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3376                      size_t reduce_size, void *reduce_data,
3377                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3378                      kmp_critical_name *lck) {
3379 
3380   KMP_COUNT_BLOCK(REDUCE_nowait);
3381   int retval = 0;
3382   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3383   kmp_info_t *th;
3384   kmp_team_t *team;
3385   int teams_swapped = 0, task_state;
3386   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3387 
3388   // why do we need this initialization here at all?
3389   // Reduction clause can not be used as a stand-alone directive.
3390 
3391   // do not call __kmp_serial_initialize(), it will be called by
3392   // __kmp_parallel_initialize() if needed
3393   // possible detection of false-positive race by the threadchecker ???
3394   if (!TCR_4(__kmp_init_parallel))
3395     __kmp_parallel_initialize();
3396 
3397   __kmp_resume_if_soft_paused();
3398 
3399 // check correctness of reduce block nesting
3400 #if KMP_USE_DYNAMIC_LOCK
3401   if (__kmp_env_consistency_check)
3402     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3403 #else
3404   if (__kmp_env_consistency_check)
3405     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3406 #endif
3407 
3408   th = __kmp_thread_from_gtid(global_tid);
3409   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3410 
  // The packed_reduction_method value will be reused by the __kmpc_end_reduce*
  // functions, so it must be kept in a variable.
  // The variable should be a construct-specific or thread-specific property,
  // not a team-specific one
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on that construct).
  // The ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?), and sharing construct-specific or team-specific
  // variables would require unnecessary extra synchronization.
  // A thread-specific variable avoids both issues (next construct and extra
  // syncs); the thread-specific "th_local.reduction_method" variable is used
  // currently.
  // Each thread executes the 'determine' and 'set' lines; restricting this to
  // a single thread is not needed and would only add unnecessary syncs.
3426 
3427   packed_reduction_method = __kmp_determine_reduction_method(
3428       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3429   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3430 
3431   if (packed_reduction_method == critical_reduce_block) {
3432 
3433     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3434     retval = 1;
3435 
3436   } else if (packed_reduction_method == empty_reduce_block) {
3437 
3438     // usage: if team size == 1, no synchronization is required ( Intel
3439     // platforms only )
3440     retval = 1;
3441 
3442   } else if (packed_reduction_method == atomic_reduce_block) {
3443 
3444     retval = 2;
3445 
    // All threads should do this pop here, because __kmpc_end_reduce_nowait()
    // won't be called by the code gen for the atomic method.
    //     (This is not quite right, because the checking block is closed by
    //      this 'pop' although the atomic operation has not been executed yet;
    //      it will be executed slightly later, literally on the next
    //      instruction.)
3452     if (__kmp_env_consistency_check)
3453       __kmp_pop_sync(global_tid, ct_reduce, loc);
3454 
3455   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3456                                    tree_reduce_block)) {
3457 
// AT: performance issue: a real barrier here
// AT:     (if the master is slow, other threads are blocked here waiting for
// the master to come and release them)
// AT:     (this is not what a customer might expect when specifying the NOWAIT
// clause)
// AT:     (specifying NOWAIT won't improve performance; it will only be
// confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
// might be faster and more in line with the sense of NOWAIT
// AT: TO DO: run the EPCC benchmark and compare times
3467 
3468 // this barrier should be invisible to a customer and to the threading profile
3469 // tool (it's neither a terminating barrier nor customer's code, it's
3470 // used for an internal purpose)
3471 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events, so we set up the necessary frame/return
    // addresses.
3476     ompt_frame_t *ompt_frame;
3477     if (ompt_enabled.enabled) {
3478       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3479       if (ompt_frame->enter_frame.ptr == NULL)
3480         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3481       OMPT_STORE_RETURN_ADDRESS(global_tid);
3482     }
3483 #endif
3484 #if USE_ITT_NOTIFY
3485     __kmp_threads[global_tid]->th.th_ident = loc;
3486 #endif
3487     retval =
3488         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3489                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3490     retval = (retval != 0) ? (0) : (1);
3491 #if OMPT_SUPPORT && OMPT_OPTIONAL
3492     if (ompt_enabled.enabled) {
3493       ompt_frame->enter_frame = ompt_data_none;
3494     }
3495 #endif
3496 
    // All workers except the master should do this pop here
    //     (none of the workers will get to __kmpc_end_reduce_nowait()).
3499     if (__kmp_env_consistency_check) {
3500       if (retval == 0) {
3501         __kmp_pop_sync(global_tid, ct_reduce, loc);
3502       }
3503     }
3504 
3505   } else {
3506 
3507     // should never reach this block
3508     KMP_ASSERT(0); // "unexpected method"
3509   }
3510   if (teams_swapped) {
3511     __kmp_restore_swapped_teams(th, team, task_state);
3512   }
3513   KA_TRACE(
3514       10,
3515       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3516        global_tid, packed_reduction_method, retval));
3517 
3518   return retval;
3519 }
3520 
3521 /*!
3522 @ingroup SYNCHRONIZATION
3523 @param loc source location information
3524 @param global_tid global thread id.
3525 @param lck pointer to the unique lock data structure
3526 
3527 Finish the execution of a reduce nowait.
3528 */
3529 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3530                               kmp_critical_name *lck) {
3531 
3532   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3533 
3534   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3535 
3536   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3537 
3538   if (packed_reduction_method == critical_reduce_block) {
3539 
3540     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3541 
3542   } else if (packed_reduction_method == empty_reduce_block) {
3543 
3544     // usage: if team size == 1, no synchronization is required ( on Intel
3545     // platforms only )
3546 
3547   } else if (packed_reduction_method == atomic_reduce_block) {
3548 
    // Neither the master nor the other workers should get here
    //     (code gen does not generate this call in case 2: atomic reduce block).
    // Actually it would be better to remove this 'else if' entirely; after
    // removal this value would be caught by the 'else' branch and assert.
3553 
3554   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3555                                    tree_reduce_block)) {
3556 
3557     // only master gets here
3558 
3559   } else {
3560 
3561     // should never reach this block
3562     KMP_ASSERT(0); // "unexpected method"
3563   }
3564 
3565   if (__kmp_env_consistency_check)
3566     __kmp_pop_sync(global_tid, ct_reduce, loc);
3567 
3568   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3569                 global_tid, packed_reduction_method));
3570 
3571   return;
3572 }
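
/* A sketch of the call pattern implied by the return-value protocol of the
   nowait reduce entry points (reconstructed from the @result documentation and
   the comments in __kmpc_reduce_nowait(); the code an actual compiler emits
   may differ in detail):

     kmp_int32 ret = __kmpc_reduce_nowait(loc, gtid, num_vars, reduce_size,
                                          reduce_data, reduce_func, &crit);
     switch (ret) {
     case 1: // this thread combines the partial results directly
       // ... combine reduce_data into the reduction variables ...
       __kmpc_end_reduce_nowait(loc, gtid, &crit);
       break;
     case 2: // every thread contributes via atomic operations
       // ... atomic combine; no __kmpc_end_reduce_nowait() call (see the
       //     atomic_reduce_block comments above) ...
       break;
     default: // 0: a tree reduction already consumed this thread's data
       break;
     }
*/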
3573 
3574 /* 2.a.ii. Reduce Block with a terminating barrier */
3575 
3576 /*!
3577 @ingroup SYNCHRONIZATION
3578 @param loc source location information
3579 @param global_tid global thread number
3580 @param num_vars number of items (variables) to be reduced
3581 @param reduce_size size of data in bytes to be reduced
3582 @param reduce_data pointer to data to be reduced
3583 @param reduce_func callback function providing reduction operation on two
3584 operands and returning result of reduction in lhs_data
3585 @param lck pointer to the unique lock data structure
3586 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3587 threads if atomic reduction needed
3588 
3589 A blocking reduce that includes an implicit barrier.
3590 */
3591 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3592                         size_t reduce_size, void *reduce_data,
3593                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3594                         kmp_critical_name *lck) {
3595   KMP_COUNT_BLOCK(REDUCE_wait);
3596   int retval = 0;
3597   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3598   kmp_info_t *th;
3599   kmp_team_t *team;
3600   int teams_swapped = 0, task_state;
3601 
3602   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3603 
3604   // why do we need this initialization here at all?
3605   // Reduction clause can not be a stand-alone directive.
3606 
3607   // do not call __kmp_serial_initialize(), it will be called by
3608   // __kmp_parallel_initialize() if needed
3609   // possible detection of false-positive race by the threadchecker ???
3610   if (!TCR_4(__kmp_init_parallel))
3611     __kmp_parallel_initialize();
3612 
3613   __kmp_resume_if_soft_paused();
3614 
3615 // check correctness of reduce block nesting
3616 #if KMP_USE_DYNAMIC_LOCK
3617   if (__kmp_env_consistency_check)
3618     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3619 #else
3620   if (__kmp_env_consistency_check)
3621     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3622 #endif
3623 
3624   th = __kmp_thread_from_gtid(global_tid);
3625   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3626 
3627   packed_reduction_method = __kmp_determine_reduction_method(
3628       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3629   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3630 
3631   if (packed_reduction_method == critical_reduce_block) {
3632 
3633     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3634     retval = 1;
3635 
3636   } else if (packed_reduction_method == empty_reduce_block) {
3637 
3638     // usage: if team size == 1, no synchronization is required ( Intel
3639     // platforms only )
3640     retval = 1;
3641 
3642   } else if (packed_reduction_method == atomic_reduce_block) {
3643 
3644     retval = 2;
3645 
3646   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3647                                    tree_reduce_block)) {
3648 
3649 // case tree_reduce_block:
3650 // this barrier should be visible to a customer and to the threading profile
3651 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3652 #if OMPT_SUPPORT
3653     ompt_frame_t *ompt_frame;
3654     if (ompt_enabled.enabled) {
3655       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3656       if (ompt_frame->enter_frame.ptr == NULL)
3657         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3658       OMPT_STORE_RETURN_ADDRESS(global_tid);
3659     }
3660 #endif
3661 #if USE_ITT_NOTIFY
3662     __kmp_threads[global_tid]->th.th_ident =
3663         loc; // needed for correct notification of frames
3664 #endif
3665     retval =
3666         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3667                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3668     retval = (retval != 0) ? (0) : (1);
3669 #if OMPT_SUPPORT && OMPT_OPTIONAL
3670     if (ompt_enabled.enabled) {
3671       ompt_frame->enter_frame = ompt_data_none;
3672     }
3673 #endif
3674 
    // All workers except the master should do this pop here
    // (no worker other than the master will enter __kmpc_end_reduce()).
3677     if (__kmp_env_consistency_check) {
3678       if (retval == 0) { // 0: all other workers; 1: master
3679         __kmp_pop_sync(global_tid, ct_reduce, loc);
3680       }
3681     }
3682 
3683   } else {
3684 
3685     // should never reach this block
3686     KMP_ASSERT(0); // "unexpected method"
3687   }
3688   if (teams_swapped) {
3689     __kmp_restore_swapped_teams(th, team, task_state);
3690   }
3691 
3692   KA_TRACE(10,
3693            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3694             global_tid, packed_reduction_method, retval));
3695   return retval;
3696 }
3697 
3698 /*!
3699 @ingroup SYNCHRONIZATION
3700 @param loc source location information
3701 @param global_tid global thread id.
3702 @param lck pointer to the unique lock data structure
3703 
3704 Finish the execution of a blocking reduce.
3705 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3706 start function.
3707 */
3708 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3709                        kmp_critical_name *lck) {
3710 
3711   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3712   kmp_info_t *th;
3713   kmp_team_t *team;
3714   int teams_swapped = 0, task_state;
3715 
3716   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3717 
3718   th = __kmp_thread_from_gtid(global_tid);
3719   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3720 
3721   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3722 
3723   // this barrier should be visible to a customer and to the threading profile
3724   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3725 
3726   if (packed_reduction_method == critical_reduce_block) {
3727     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3728 
3729 // TODO: implicit barrier: should be exposed
3730 #if OMPT_SUPPORT
3731     ompt_frame_t *ompt_frame;
3732     if (ompt_enabled.enabled) {
3733       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3734       if (ompt_frame->enter_frame.ptr == NULL)
3735         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3736       OMPT_STORE_RETURN_ADDRESS(global_tid);
3737     }
3738 #endif
3739 #if USE_ITT_NOTIFY
3740     __kmp_threads[global_tid]->th.th_ident = loc;
3741 #endif
3742     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3743 #if OMPT_SUPPORT && OMPT_OPTIONAL
3744     if (ompt_enabled.enabled) {
3745       ompt_frame->enter_frame = ompt_data_none;
3746     }
3747 #endif
3748 
3749   } else if (packed_reduction_method == empty_reduce_block) {
3750 
3751 // usage: if team size==1, no synchronization is required (Intel platforms only)
3752 
3753 // TODO: implicit barrier: should be exposed
3754 #if OMPT_SUPPORT
3755     ompt_frame_t *ompt_frame;
3756     if (ompt_enabled.enabled) {
3757       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3758       if (ompt_frame->enter_frame.ptr == NULL)
3759         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3760       OMPT_STORE_RETURN_ADDRESS(global_tid);
3761     }
3762 #endif
3763 #if USE_ITT_NOTIFY
3764     __kmp_threads[global_tid]->th.th_ident = loc;
3765 #endif
3766     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3767 #if OMPT_SUPPORT && OMPT_OPTIONAL
3768     if (ompt_enabled.enabled) {
3769       ompt_frame->enter_frame = ompt_data_none;
3770     }
3771 #endif
3772 
3773   } else if (packed_reduction_method == atomic_reduce_block) {
3774 
3775 #if OMPT_SUPPORT
3776     ompt_frame_t *ompt_frame;
3777     if (ompt_enabled.enabled) {
3778       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3779       if (ompt_frame->enter_frame.ptr == NULL)
3780         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3781       OMPT_STORE_RETURN_ADDRESS(global_tid);
3782     }
3783 #endif
3784 // TODO: implicit barrier: should be exposed
3785 #if USE_ITT_NOTIFY
3786     __kmp_threads[global_tid]->th.th_ident = loc;
3787 #endif
3788     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3789 #if OMPT_SUPPORT && OMPT_OPTIONAL
3790     if (ompt_enabled.enabled) {
3791       ompt_frame->enter_frame = ompt_data_none;
3792     }
3793 #endif
3794 
3795   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3796                                    tree_reduce_block)) {
3797 
3798     // only master executes here (master releases all other workers)
3799     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3800                             global_tid);
3801 
3802   } else {
3803 
3804     // should never reach this block
3805     KMP_ASSERT(0); // "unexpected method"
3806   }
3807   if (teams_swapped) {
3808     __kmp_restore_swapped_teams(th, team, task_state);
3809   }
3810 
3811   if (__kmp_env_consistency_check)
3812     __kmp_pop_sync(global_tid, ct_reduce, loc);
3813 
3814   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3815                 global_tid, packed_reduction_method));
3816 
3817   return;
3818 }
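
/* Pairing note for the blocking variant (reconstructed from the code above;
   hedged): for the critical, empty and atomic methods every team thread must
   call __kmpc_end_reduce(), since the terminating barrier of the construct is
   issued there; for the tree method only the master (return value 1) reaches
   __kmpc_end_reduce() and releases the waiting workers via
   __kmp_end_split_barrier(). */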
3819 
3820 #undef __KMP_GET_REDUCTION_METHOD
3821 #undef __KMP_SET_REDUCTION_METHOD
3822 
3823 /* end of interface to fast scalable reduce routines */
3824 
3825 kmp_uint64 __kmpc_get_taskid() {
3826 
3827   kmp_int32 gtid;
3828   kmp_info_t *thread;
3829 
3830   gtid = __kmp_get_gtid();
3831   if (gtid < 0) {
3832     return 0;
3833   }
3834   thread = __kmp_thread_from_gtid(gtid);
3835   return thread->th.th_current_task->td_task_id;
3836 
3837 } // __kmpc_get_taskid
3838 
3839 kmp_uint64 __kmpc_get_parent_taskid() {
3840 
3841   kmp_int32 gtid;
3842   kmp_info_t *thread;
3843   kmp_taskdata_t *parent_task;
3844 
3845   gtid = __kmp_get_gtid();
3846   if (gtid < 0) {
3847     return 0;
3848   }
3849   thread = __kmp_thread_from_gtid(gtid);
3850   parent_task = thread->th.th_current_task->td_parent;
3851   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3852 
3853 } // __kmpc_get_parent_taskid
3854 
3855 /*!
3856 @ingroup WORK_SHARING
3857 @param loc  source location information.
3858 @param gtid  global thread number.
3859 @param num_dims  number of associated doacross loops.
3860 @param dims  info on loops bounds.
3861 
3862 Initialize doacross loop information.
Expect the compiler to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) gives lo=2, up=8, st=2.
3865 */
3866 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3867                           const struct kmp_dim *dims) {
3868   int j, idx;
3869   kmp_int64 last, trace_count;
3870   kmp_info_t *th = __kmp_threads[gtid];
3871   kmp_team_t *team = th->th.th_team;
3872   kmp_uint32 *flags;
3873   kmp_disp_t *pr_buf = th->th.th_dispatch;
3874   dispatch_shared_info_t *sh_buf;
3875 
3876   KA_TRACE(
3877       20,
3878       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3879        gtid, num_dims, !team->t.t_serialized));
3880   KMP_DEBUG_ASSERT(dims != NULL);
3881   KMP_DEBUG_ASSERT(num_dims > 0);
3882 
3883   if (team->t.t_serialized) {
3884     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3885     return; // no dependencies if team is serialized
3886   }
3887   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3888   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3889   // the next loop
3890   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3891 
3892   // Save bounds info into allocated private buffer
3893   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3894   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3895       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3896   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3897   pr_buf->th_doacross_info[0] =
3898       (kmp_int64)num_dims; // first element is number of dimensions
3899   // Save also address of num_done in order to access it later without knowing
3900   // the buffer index
3901   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3902   pr_buf->th_doacross_info[2] = dims[0].lo;
3903   pr_buf->th_doacross_info[3] = dims[0].up;
3904   pr_buf->th_doacross_info[4] = dims[0].st;
3905   last = 5;
3906   for (j = 1; j < num_dims; ++j) {
3907     kmp_int64
3908         range_length; // To keep ranges of all dimensions but the first dims[0]
3909     if (dims[j].st == 1) { // most common case
      // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
3911       range_length = dims[j].up - dims[j].lo + 1;
3912     } else {
3913       if (dims[j].st > 0) {
3914         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3915         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3916       } else { // negative increment
3917         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3918         range_length =
3919             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3920       }
3921     }
3922     pr_buf->th_doacross_info[last++] = range_length;
3923     pr_buf->th_doacross_info[last++] = dims[j].lo;
3924     pr_buf->th_doacross_info[last++] = dims[j].up;
3925     pr_buf->th_doacross_info[last++] = dims[j].st;
3926   }
3927 
3928   // Compute total trip count.
3929   // Start with range of dims[0] which we don't need to keep in the buffer.
3930   if (dims[0].st == 1) { // most common case
3931     trace_count = dims[0].up - dims[0].lo + 1;
3932   } else if (dims[0].st > 0) {
3933     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3934     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3935   } else { // negative increment
3936     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3937     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3938   }
3939   for (j = 1; j < num_dims; ++j) {
3940     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3941   }
3942   KMP_DEBUG_ASSERT(trace_count > 0);
3943 
3944   // Check if shared buffer is not occupied by other loop (idx -
3945   // __kmp_dispatch_num_buffers)
3946   if (idx != sh_buf->doacross_buf_idx) {
3947     // Shared buffer is occupied, wait for it to be free
3948     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3949                  __kmp_eq_4, NULL);
3950   }
3951 #if KMP_32_BIT_ARCH
3952   // Check if we are the first thread. After the CAS the first thread gets 0,
3953   // others get 1 if initialization is in progress, allocated pointer otherwise.
3954   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3955   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3956       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3957 #else
3958   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3959       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3960 #endif
3961   if (flags == NULL) {
3962     // we are the first thread, allocate the array of flags
3963     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3964     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3965     KMP_MB();
3966     sh_buf->doacross_flags = flags;
3967   } else if (flags == (kmp_uint32 *)1) {
3968 #if KMP_32_BIT_ARCH
3969     // initialization is still in progress, need to wait
3970     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3971 #else
3972     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3973 #endif
3974       KMP_YIELD(TRUE);
3975     KMP_MB();
3976   } else {
3977     KMP_MB();
3978   }
3979   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3980   pr_buf->th_doacross_flags =
3981       sh_buf->doacross_flags; // save private copy in order to not
3982   // touch shared buffer on each iteration
3983   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3984 }
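
/* Flag-array sizing, worked through (follows directly from the allocation
   above): one bit is used per iteration of the collapsed loop nest, so the
   first thread allocates trace_count / 8 + 8 zeroed bytes, the "+ 8" padding
   the last partial word. E.g. a nest with trace_count == 1000 gets
   1000 / 8 + 8 == 133 bytes, i.e. room for 1064 flag bits. */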
3985 
3986 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3987   kmp_int32 shft, num_dims, i;
3988   kmp_uint32 flag;
3989   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3990   kmp_info_t *th = __kmp_threads[gtid];
3991   kmp_team_t *team = th->th.th_team;
3992   kmp_disp_t *pr_buf;
3993   kmp_int64 lo, up, st;
3994 
3995   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3996   if (team->t.t_serialized) {
3997     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3998     return; // no dependencies if team is serialized
3999   }
4000 
4001   // calculate sequential iteration number and check out-of-bounds condition
4002   pr_buf = th->th.th_dispatch;
4003   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4004   num_dims = pr_buf->th_doacross_info[0];
4005   lo = pr_buf->th_doacross_info[2];
4006   up = pr_buf->th_doacross_info[3];
4007   st = pr_buf->th_doacross_info[4];
4008   if (st == 1) { // most common case
4009     if (vec[0] < lo || vec[0] > up) {
4010       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4011                     "bounds [%lld,%lld]\n",
4012                     gtid, vec[0], lo, up));
4013       return;
4014     }
4015     iter_number = vec[0] - lo;
4016   } else if (st > 0) {
4017     if (vec[0] < lo || vec[0] > up) {
4018       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4019                     "bounds [%lld,%lld]\n",
4020                     gtid, vec[0], lo, up));
4021       return;
4022     }
4023     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4024   } else { // negative increment
4025     if (vec[0] > lo || vec[0] < up) {
4026       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4027                     "bounds [%lld,%lld]\n",
4028                     gtid, vec[0], lo, up));
4029       return;
4030     }
4031     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4032   }
4033   for (i = 1; i < num_dims; ++i) {
4034     kmp_int64 iter, ln;
4035     kmp_int32 j = i * 4;
4036     ln = pr_buf->th_doacross_info[j + 1];
4037     lo = pr_buf->th_doacross_info[j + 2];
4038     up = pr_buf->th_doacross_info[j + 3];
4039     st = pr_buf->th_doacross_info[j + 4];
4040     if (st == 1) {
4041       if (vec[i] < lo || vec[i] > up) {
4042         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4043                       "bounds [%lld,%lld]\n",
4044                       gtid, vec[i], lo, up));
4045         return;
4046       }
4047       iter = vec[i] - lo;
4048     } else if (st > 0) {
4049       if (vec[i] < lo || vec[i] > up) {
4050         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4051                       "bounds [%lld,%lld]\n",
4052                       gtid, vec[i], lo, up));
4053         return;
4054       }
4055       iter = (kmp_uint64)(vec[i] - lo) / st;
4056     } else { // st < 0
4057       if (vec[i] > lo || vec[i] < up) {
4058         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4059                       "bounds [%lld,%lld]\n",
4060                       gtid, vec[i], lo, up));
4061         return;
4062       }
4063       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4064     }
4065     iter_number = iter + ln * iter_number;
4066   }
4067   shft = iter_number % 32; // use 32-bit granularity
4068   iter_number >>= 5; // divided by 32
4069   flag = 1 << shft;
4070   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4071     KMP_YIELD(TRUE);
4072   }
4073   KMP_MB();
4074   KA_TRACE(20,
4075            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4076             gtid, (iter_number << 5) + shft));
4077 }
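
/* Bit indexing, worked through: for collapsed iteration number 100 the loop
   above spins on word 100 >> 5 == 3 of th_doacross_flags with mask
   1 << (100 % 32) == 0x10, i.e. exactly the bit that __kmpc_doacross_post()
   sets once iteration 100 has completed. */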
4078 
4079 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4080   kmp_int32 shft, num_dims, i;
4081   kmp_uint32 flag;
4082   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4083   kmp_info_t *th = __kmp_threads[gtid];
4084   kmp_team_t *team = th->th.th_team;
4085   kmp_disp_t *pr_buf;
4086   kmp_int64 lo, st;
4087 
4088   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4089   if (team->t.t_serialized) {
4090     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4091     return; // no dependencies if team is serialized
4092   }
4093 
4094   // calculate sequential iteration number (same as in "wait" but no
4095   // out-of-bounds checks)
4096   pr_buf = th->th.th_dispatch;
4097   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4098   num_dims = pr_buf->th_doacross_info[0];
4099   lo = pr_buf->th_doacross_info[2];
4100   st = pr_buf->th_doacross_info[4];
4101   if (st == 1) { // most common case
4102     iter_number = vec[0] - lo;
4103   } else if (st > 0) {
4104     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4105   } else { // negative increment
4106     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4107   }
4108   for (i = 1; i < num_dims; ++i) {
4109     kmp_int64 iter, ln;
4110     kmp_int32 j = i * 4;
4111     ln = pr_buf->th_doacross_info[j + 1];
4112     lo = pr_buf->th_doacross_info[j + 2];
4113     st = pr_buf->th_doacross_info[j + 4];
4114     if (st == 1) {
4115       iter = vec[i] - lo;
4116     } else if (st > 0) {
4117       iter = (kmp_uint64)(vec[i] - lo) / st;
4118     } else { // st < 0
4119       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4120     }
4121     iter_number = iter + ln * iter_number;
4122   }
4123   shft = iter_number % 32; // use 32-bit granularity
4124   iter_number >>= 5; // divided by 32
4125   flag = 1 << shft;
4126   KMP_MB();
4127   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4128     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4129   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4130                 (iter_number << 5) + shft));
4131 }
4132 
4133 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4134   kmp_int32 num_done;
4135   kmp_info_t *th = __kmp_threads[gtid];
4136   kmp_team_t *team = th->th.th_team;
4137   kmp_disp_t *pr_buf = th->th.th_dispatch;
4138 
4139   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4140   if (team->t.t_serialized) {
4141     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4142     return; // nothing to do
4143   }
4144   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4145   if (num_done == th->th.th_team_nproc) {
4146     // we are the last thread, need to free shared resources
4147     int idx = pr_buf->th_doacross_buf_idx - 1;
4148     dispatch_shared_info_t *sh_buf =
4149         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4150     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4151                      (kmp_int64)&sh_buf->doacross_num_done);
4152     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4153     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4154     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4155     sh_buf->doacross_flags = NULL;
4156     sh_buf->doacross_num_done = 0;
4157     sh_buf->doacross_buf_idx +=
4158         __kmp_dispatch_num_buffers; // free buffer for future re-use
4159   }
4160   // free private resources (need to keep buffer index forever)
4161   pr_buf->th_doacross_flags = NULL;
4162   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4163   pr_buf->th_doacross_info = NULL;
4164   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4165 }
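
/* A hedged sketch of the lifecycle these four entry points implement for a
   doacross loop such as

     #pragma omp for ordered(1)
     for (int i = 2; i < 9; i += 2) {
       #pragma omp ordered depend(sink : i - 2)
       // ... consume the result of iteration i - 2 ...
       #pragma omp ordered depend(source)
       // ... produce the result of iteration i ...
     }

   Per thread, a compiler would typically emit something along these lines
   (illustrative only; dims carries the inclusive bounds lo=2, up=8, st=2 as
   documented for __kmpc_doacross_init()):

     __kmpc_doacross_init(loc, gtid, 1, dims);
     for (each iteration i assigned to this thread) {
       kmp_int64 vec[1];
       vec[0] = i - 2; __kmpc_doacross_wait(loc, gtid, vec);
       // ... loop body ...
       vec[0] = i;     __kmpc_doacross_post(loc, gtid, vec);
     }
     __kmpc_doacross_fini(loc, gtid);
*/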
4166 
4167 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4168 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4169   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4170 }
4171 
4172 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4173   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4174 }
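
/* A minimal usage sketch of these wrappers through the standard OpenMP 5.0
   allocator API (hypothetical user code, not part of the runtime):

     double *buf = (double *)omp_alloc(1024 * sizeof(double),
                                       omp_default_mem_alloc);
     if (buf != NULL) {
       // ... use buf ...
       omp_free(buf, omp_default_mem_alloc);
     }
*/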
4175 
4176 int __kmpc_get_target_offload(void) {
4177   if (!__kmp_init_serial) {
4178     __kmp_serial_initialize();
4179   }
4180   return __kmp_target_offload;
4181 }
4182 
4183 int __kmpc_pause_resource(kmp_pause_status_t level) {
4184   if (!__kmp_init_serial) {
4185     return 1; // Can't pause if runtime is not initialized
4186   }
4187   return __kmp_pause_resource(level);
4188 }
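
/* __kmpc_pause_resource() returns nonzero when the request cannot be honored
   (here, when the runtime has not been serially initialized yet). A hedged
   sketch of the corresponding user-level request, assuming the standard
   OpenMP 5.0 routine is routed to this entry point:

     if (omp_pause_resource_all(omp_pause_soft) != 0) {
       // the runtime declined to release its resources; carry on regardless
     }
*/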
4189