1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
// flags will be used in the future, e.g., to implement openmp_strict library
// restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
 * Initialize the runtime library. This call is optional; if it is not made,
 * the library will be initialized implicitly when other library functions are
 * first used.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is a no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43   } else if (__kmp_ignore_mppbeg() == FALSE) {
44     // By default __kmp_ignore_mppbeg() returns TRUE.
45     __kmp_internal_begin();
46     KC_TRACE(10, ("__kmpc_begin: called\n"));
47   }
48 }
49 
50 /*!
51  * @ingroup STARTUP_SHUTDOWN
52  * @param loc source location information
53  *
 * Shut down the runtime library. This call is also optional, and even if made
 * it will not do anything unless the `KMP_IGNORE_MPPEND` environment variable
 * is set to zero.
57  */
58 void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
  // root (which can cause library shutdown).
64   if (__kmp_ignore_mppend() == FALSE) {
65     KC_TRACE(10, ("__kmpc_end: called\n"));
66     KA_TRACE(30, ("__kmpc_end\n"));
67 
68     __kmp_internal_end_thread(-1);
69   }
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71   // Normal exit process on Windows does not allow worker threads of the final
72   // parallel region to finish reporting their events, so shutting down the
73   // library here fixes the issue at least for the cases where __kmpc_end() is
74   // placed properly.
75   if (ompt_enabled.enabled)
76     __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
78 }
79 
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84 
85 This function can be called in any context.
86 
If the runtime has only been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the master thread is necessarily thread zero).
92 
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
97 */
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99   kmp_int32 gtid = __kmp_entry_gtid();
100 
101   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102 
103   return gtid;
104 }
105 
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110 
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
This is not a number that can be determined by standard OpenMP calls, since
the library may be entered from more than one non-OpenMP thread and this value
reflects the total over all such entries. Similarly, because the runtime keeps
its underlying threads alive even when they are not active (the cost of
creating and destroying OS threads is high), this call counts all such threads
even if they are not currently waiting for work.
119 */
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121   KC_TRACE(10,
122            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123 
124   return TCR_4(__kmp_all_nth);
125 }
126 
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135   return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137 
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145 
146   return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148 
149 /*!
150  * @ingroup DEPRECATED
151  * @param loc location description
152  *
153  * This function need not be called. It always returns TRUE.
154  */
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157 
158   return TRUE;
159 
160 #else
161 
162   const char *semi2;
163   const char *semi3;
164   int line_no;
165 
166   if (__kmp_par_range == 0) {
167     return TRUE;
168   }
169   semi2 = loc->psource;
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2 + 1, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   if (__kmp_par_range_filename[0]) {
182     const char *name = semi2 - 1;
183     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184       name--;
185     }
186     if ((*name == '/') || (*name == ';')) {
187       name++;
188     }
189     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190       return __kmp_par_range < 0;
191     }
192   }
193   semi3 = strchr(semi2 + 1, ';');
194   if (__kmp_par_range_routine[0]) {
195     if ((semi3 != NULL) && (semi3 > semi2) &&
196         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197       return __kmp_par_range < 0;
198     }
199   }
  if ((semi3 != NULL) && (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1)) {
201     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202       return __kmp_par_range > 0;
203     }
204     return __kmp_par_range < 0;
205   }
206   return TRUE;
207 
208 #endif /* KMP_DEBUG */
209 }
210 
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218   return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220 
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226 
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
229 */
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                              kmp_int32 num_threads) {
232   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                 global_tid, num_threads));
234   __kmp_assert_valid_gtid(global_tid);
235   __kmp_push_num_threads(loc, global_tid, num_threads);
236 }
237 
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240   /* the num_threads are automatically popped */
241 }
242 
243 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
244                            kmp_int32 proc_bind) {
245   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
246                 proc_bind));
247   __kmp_assert_valid_gtid(global_tid);
248   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
249 }
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   // If we were in a serial region, then stop the serial timer, record
266   // the event, and start parallel region timer
267   stats_state_e previous_state = KMP_GET_THREAD_STATE();
268   if (previous_state == stats_state_e::SERIAL_REGION) {
269     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
270   } else {
271     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
272   }
273   int inParallel = __kmpc_in_parallel(loc);
274   if (inParallel) {
275     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
276   } else {
277     KMP_COUNT_BLOCK(OMP_PARALLEL);
278   }
279 #endif
280 
  // Maybe saving thr_state would be enough here
282   {
283     va_list ap;
284     va_start(ap, microtask);
285 
286 #if OMPT_SUPPORT
287     ompt_frame_t *ompt_frame;
288     if (ompt_enabled.enabled) {
289       kmp_info_t *master_th = __kmp_threads[gtid];
290       kmp_team_t *parent_team = master_th->th.th_team;
291       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
292       if (lwt)
293         ompt_frame = &(lwt->ompt_task_info.frame);
294       else {
295         int tid = __kmp_tid_from_gtid(gtid);
296         ompt_frame = &(
297             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
298       }
299       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
300       OMPT_STORE_RETURN_ADDRESS(gtid);
301     }
302 #endif
303 
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_FORKING();
306 #endif
307     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
308                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
309                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
310 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
311 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
312                     &ap
313 #else
314                     ap
315 #endif
316                     );
317 #if INCLUDE_SSC_MARKS
318     SSC_MARK_JOINING();
319 #endif
320     __kmp_join_call(loc, gtid
321 #if OMPT_SUPPORT
322                     ,
323                     fork_context_intel
324 #endif
325                     );
326 
327     va_end(ap);
328   }
329 
330 #if KMP_STATS_ENABLED
331   if (previous_state == stats_state_e::SERIAL_REGION) {
332     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
333   } else {
334     KMP_POP_PARTITIONED_TIMER();
335   }
336 #endif // KMP_STATS_ENABLED
337 }
338 
339 /*!
340 @ingroup PARALLEL
341 @param loc source location information
342 @param global_tid global thread number
343 @param num_teams number of teams requested for the teams construct
344 @param num_threads number of threads per team requested for the teams construct
345 
346 Set the number of teams to be used by the teams construct.
347 This call is only required if the teams construct has a `num_teams` clause
348 or a `thread_limit` clause (or both).
349 */
350 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
351                            kmp_int32 num_teams, kmp_int32 num_threads) {
352   KA_TRACE(20,
353            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
354             global_tid, num_teams, num_threads));
355   __kmp_assert_valid_gtid(global_tid);
356   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
357 }
358 
359 /*!
360 @ingroup PARALLEL
361 @param loc  source location information
362 @param argc  total number of arguments in the ellipsis
363 @param microtask  pointer to callback routine consisting of outlined teams
364 construct
365 @param ...  pointers to shared variables that aren't global
366 
367 Do the actual fork and call the microtask in the relevant number of threads.
368 */
369 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
370                        ...) {
371   int gtid = __kmp_entry_gtid();
372   kmp_info_t *this_thr = __kmp_threads[gtid];
373   va_list ap;
374   va_start(ap, microtask);
375 
376 #if KMP_STATS_ENABLED
377   KMP_COUNT_BLOCK(OMP_TEAMS);
378   stats_state_e previous_state = KMP_GET_THREAD_STATE();
379   if (previous_state == stats_state_e::SERIAL_REGION) {
380     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
381   } else {
382     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
383   }
384 #endif
385 
386   // remember teams entry point and nesting level
387   this_thr->th.th_teams_microtask = microtask;
388   this_thr->th.th_teams_level =
389       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
390 
391 #if OMPT_SUPPORT
392   kmp_team_t *parent_team = this_thr->th.th_team;
393   int tid = __kmp_tid_from_gtid(gtid);
394   if (ompt_enabled.enabled) {
395     parent_team->t.t_implicit_task_taskdata[tid]
396         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
397   }
398   OMPT_STORE_RETURN_ADDRESS(gtid);
399 #endif
400 
  // Check whether __kmpc_push_num_teams has been called; if not, set the
  // default number of teams.
403   if (this_thr->th.th_teams_size.nteams == 0) {
404     __kmp_push_num_teams(loc, gtid, 0, 0);
405   }
406   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
407   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
408   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
409 
410   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
411                   VOLATILE_CAST(microtask_t)
412                       __kmp_teams_master, // "wrapped" task
413                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
414 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
415                   &ap
416 #else
417                   ap
418 #endif
419                   );
420   __kmp_join_call(loc, gtid
421 #if OMPT_SUPPORT
422                   ,
423                   fork_context_intel
424 #endif
425                   );
426 
427   // Pop current CG root off list
428   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
429   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
430   this_thr->th.th_cg_roots = tmp->up;
431   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
432                  " to node %p. cg_nthreads was %d\n",
433                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
434   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
435   int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
437     __kmp_free(tmp);
438   }
439   // Restore current task's thread_limit from CG root
440   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
441   this_thr->th.th_current_task->td_icvs.thread_limit =
442       this_thr->th.th_cg_roots->cg_thread_limit;
443 
444   this_thr->th.th_teams_microtask = NULL;
445   this_thr->th.th_teams_level = 0;
446   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
447   va_end(ap);
448 #if KMP_STATS_ENABLED
449   if (previous_state == stats_state_e::SERIAL_REGION) {
450     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
451   } else {
452     KMP_POP_PARTITIONED_TIMER();
453   }
454 #endif // KMP_STATS_ENABLED
455 }
456 
457 // I don't think this function should ever have been exported.
458 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
459 // openmp code ever called it, but it's been exported from the RTL for so
460 // long that I'm afraid to remove the definition.
461 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
462 
463 /*!
464 @ingroup PARALLEL
465 @param loc  source location information
466 @param global_tid  global thread number
467 
468 Enter a serialized parallel construct. This interface is used to handle a
469 conditional parallel region, like this,
470 @code
471 #pragma omp parallel if (condition)
472 @endcode
473 when the condition is false.
474 */
475 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
476   // The implementation is now in kmp_runtime.cpp so that it can share static
477   // functions with kmp_fork_call since the tasks to be done are similar in
478   // each case.
479   __kmp_assert_valid_gtid(global_tid);
480 #if OMPT_SUPPORT
481   OMPT_STORE_RETURN_ADDRESS(global_tid);
482 #endif
483   __kmp_serialized_parallel(loc, global_tid);
484 }
485 
486 /*!
487 @ingroup PARALLEL
488 @param loc  source location information
489 @param global_tid  global thread number
490 
491 Leave a serialized parallel construct.
492 */
493 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
494   kmp_internal_control_t *top;
495   kmp_info_t *this_thr;
496   kmp_team_t *serial_team;
497 
498   KC_TRACE(10,
499            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
500 
501   /* skip all this code for autopar serialized loops since it results in
502      unacceptable overhead */
503   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
504     return;
505 
506   // Not autopar code
507   __kmp_assert_valid_gtid(global_tid);
508   if (!TCR_4(__kmp_init_parallel))
509     __kmp_parallel_initialize();
510 
511   __kmp_resume_if_soft_paused();
512 
513   this_thr = __kmp_threads[global_tid];
514   serial_team = this_thr->th.th_serial_team;
515 
516   kmp_task_team_t *task_team = this_thr->th.th_task_team;
517   // we need to wait for the proxy tasks before finishing the thread
518   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
519     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
520 
521   KMP_MB();
522   KMP_DEBUG_ASSERT(serial_team);
523   KMP_ASSERT(serial_team->t.t_serialized);
524   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
525   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
526   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
527   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
528 
529 #if OMPT_SUPPORT
530   if (ompt_enabled.enabled &&
531       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
532     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
533     if (ompt_enabled.ompt_callback_implicit_task) {
534       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
535           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
536           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
537     }
538 
    // Clear the task id only after unlinking the task
540     ompt_data_t *parent_task_data;
541     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
542 
543     if (ompt_enabled.ompt_callback_parallel_end) {
544       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
545           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
546           ompt_parallel_invoker_program | ompt_parallel_team,
547           OMPT_LOAD_RETURN_ADDRESS(global_tid));
548     }
549     __ompt_lw_taskteam_unlink(this_thr);
550     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
551   }
552 #endif
553 
554   /* If necessary, pop the internal control stack values and replace the team
555    * values */
556   top = serial_team->t.t_control_stack_top;
557   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
558     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
559     serial_team->t.t_control_stack_top = top->next;
560     __kmp_free(top);
561   }
562 
563   // if( serial_team -> t.t_serialized > 1 )
564   serial_team->t.t_level--;
565 
566   /* pop dispatch buffers stack */
567   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
568   {
569     dispatch_private_info_t *disp_buffer =
570         serial_team->t.t_dispatch->th_disp_buffer;
571     serial_team->t.t_dispatch->th_disp_buffer =
572         serial_team->t.t_dispatch->th_disp_buffer->next;
573     __kmp_free(disp_buffer);
574   }
575   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
576 
577   --serial_team->t.t_serialized;
578   if (serial_team->t.t_serialized == 0) {
579 
580 /* return to the parallel section */
581 
582 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
583     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
584       __kmp_clear_x87_fpu_status_word();
585       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
586       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
587     }
588 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
589 
590     this_thr->th.th_team = serial_team->t.t_parent;
591     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
592 
593     /* restore values cached in the thread */
594     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
595     this_thr->th.th_team_master =
596         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
597     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
598 
599     /* TODO the below shouldn't need to be adjusted for serialized teams */
600     this_thr->th.th_dispatch =
601         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
602 
603     __kmp_pop_current_task_from_thread(this_thr);
604 
605     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
606     this_thr->th.th_current_task->td_flags.executing = 1;
607 
608     if (__kmp_tasking_mode != tskm_immediate_exec) {
609       // Copy the task team from the new child / old parent team to the thread.
610       this_thr->th.th_task_team =
611           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
612       KA_TRACE(20,
613                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
614                 "team %p\n",
615                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
616     }
617   } else {
618     if (__kmp_tasking_mode != tskm_immediate_exec) {
619       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
620                     "depth of serial team %p to %d\n",
621                     global_tid, serial_team, serial_team->t.t_serialized));
622     }
623   }
624 
625   if (__kmp_env_consistency_check)
626     __kmp_pop_parallel(global_tid, NULL);
627 #if OMPT_SUPPORT
628   if (ompt_enabled.enabled)
629     this_thr->th.ompt_thread_info.state =
630         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
631                                            : ompt_state_work_parallel);
632 #endif
633 }
634 
635 /*!
636 @ingroup SYNCHRONIZATION
637 @param loc  source location information.
638 
639 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
640 depending on the memory ordering convention obeyed by the compiler
641 even that may not be necessary).
642 */
643 void __kmpc_flush(ident_t *loc) {
644   KC_TRACE(10, ("__kmpc_flush: called\n"));
645 
  /* need explicit __mf() here since we use volatile instead in the library */
647   KMP_MB(); /* Flush all pending memory write invalidates.  */
648 
649 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
650 #if KMP_MIC
651 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
652 // We shouldn't need it, though, since the ABI rules require that
653 // * If the compiler generates NGO stores it also generates the fence
654 // * If users hand-code NGO stores they should insert the fence
655 // therefore no incomplete unordered stores should be visible.
656 #else
657   // C74404
658   // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load instruction movntdqa should also be
  // addressed.
662   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
663   if (!__kmp_cpuinfo.initialized) {
664     __kmp_query_cpuid(&__kmp_cpuinfo);
665   }
666   if (!__kmp_cpuinfo.sse2) {
667     // CPU cannot execute SSE2 instructions.
668   } else {
669 #if KMP_COMPILER_ICC
670     _mm_mfence();
671 #elif KMP_COMPILER_MSVC
672     MemoryBarrier();
673 #else
674     __sync_synchronize();
675 #endif // KMP_COMPILER_ICC
676   }
677 #endif // KMP_MIC
678 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
679        KMP_ARCH_RISCV64)
// Nothing to see here; move along
681 #elif KMP_ARCH_PPC64
682 // Nothing needed here (we have a real MB above).
683 #else
684 #error Unknown or unsupported architecture
685 #endif
686 
687 #if OMPT_SUPPORT && OMPT_OPTIONAL
688   if (ompt_enabled.ompt_callback_flush) {
689     ompt_callbacks.ompt_callback(ompt_callback_flush)(
690         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
691   }
692 #endif
693 }
694 
695 /* -------------------------------------------------------------------------- */
696 /*!
697 @ingroup SYNCHRONIZATION
698 @param loc source location information
699 @param global_tid thread id.
700 
701 Execute a barrier.
702 */
703 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
704   KMP_COUNT_BLOCK(OMP_BARRIER);
705   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
706   __kmp_assert_valid_gtid(global_tid);
707 
708   if (!TCR_4(__kmp_init_parallel))
709     __kmp_parallel_initialize();
710 
711   __kmp_resume_if_soft_paused();
712 
713   if (__kmp_env_consistency_check) {
714     if (loc == 0) {
715       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
716     }
717     __kmp_check_barrier(global_tid, ct_barrier, loc);
718   }
719 
720 #if OMPT_SUPPORT
721   ompt_frame_t *ompt_frame;
722   if (ompt_enabled.enabled) {
723     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
724     if (ompt_frame->enter_frame.ptr == NULL)
725       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
726     OMPT_STORE_RETURN_ADDRESS(global_tid);
727   }
728 #endif
729   __kmp_threads[global_tid]->th.th_ident = loc;
730   // TODO: explicit barrier_wait_id:
  //   this function is called either when the 'barrier' directive is present
  //   or for the implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set it to 0 when a new team is created
  // 3) no sync is required
736 
737   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
738 #if OMPT_SUPPORT && OMPT_OPTIONAL
739   if (ompt_enabled.enabled) {
740     ompt_frame->enter_frame = ompt_data_none;
741   }
742 #endif
743 }
744 
745 /* The BARRIER for a MASTER section is always explicit   */
746 /*!
747 @ingroup WORK_SHARING
748 @param loc  source location information.
@param global_tid  global thread number.
750 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
751 */
752 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
753   int status = 0;
754 
755   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
756   __kmp_assert_valid_gtid(global_tid);
757 
758   if (!TCR_4(__kmp_init_parallel))
759     __kmp_parallel_initialize();
760 
761   __kmp_resume_if_soft_paused();
762 
763   if (KMP_MASTER_GTID(global_tid)) {
764     KMP_COUNT_BLOCK(OMP_MASTER);
765     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
766     status = 1;
767   }
768 
769 #if OMPT_SUPPORT && OMPT_OPTIONAL
770   if (status) {
771     if (ompt_enabled.ompt_callback_master) {
772       kmp_info_t *this_thr = __kmp_threads[global_tid];
773       kmp_team_t *team = this_thr->th.th_team;
774 
775       int tid = __kmp_tid_from_gtid(global_tid);
776       ompt_callbacks.ompt_callback(ompt_callback_master)(
777           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
778           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
779           OMPT_GET_RETURN_ADDRESS(0));
780     }
781   }
782 #endif
783 
784   if (__kmp_env_consistency_check) {
785 #if KMP_USE_DYNAMIC_LOCK
786     if (status)
787       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
788     else
789       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
790 #else
791     if (status)
792       __kmp_push_sync(global_tid, ct_master, loc, NULL);
793     else
794       __kmp_check_sync(global_tid, ct_master, loc, NULL);
795 #endif
796   }
797 
798   return status;
799 }
800 
801 /*!
802 @ingroup WORK_SHARING
803 @param loc  source location information.
@param global_tid  global thread number.
805 
806 Mark the end of a <tt>master</tt> region. This should only be called by the
807 thread that executes the <tt>master</tt> region.
808 */
809 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
810   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
811   __kmp_assert_valid_gtid(global_tid);
812   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
813   KMP_POP_PARTITIONED_TIMER();
814 
815 #if OMPT_SUPPORT && OMPT_OPTIONAL
816   kmp_info_t *this_thr = __kmp_threads[global_tid];
817   kmp_team_t *team = this_thr->th.th_team;
818   if (ompt_enabled.ompt_callback_master) {
819     int tid = __kmp_tid_from_gtid(global_tid);
820     ompt_callbacks.ompt_callback(ompt_callback_master)(
821         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
822         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
823         OMPT_GET_RETURN_ADDRESS(0));
824   }
825 #endif
826 
827   if (__kmp_env_consistency_check) {
828     if (KMP_MASTER_GTID(global_tid))
829       __kmp_pop_sync(global_tid, ct_master, loc);
830   }
831 }
832 
833 /*!
834 @ingroup WORK_SHARING
835 @param loc  source location information.
836 @param gtid  global thread number.
837 
838 Start execution of an <tt>ordered</tt> construct.
839 */
840 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
841   int cid = 0;
842   kmp_info_t *th;
843   KMP_DEBUG_ASSERT(__kmp_init_serial);
844 
845   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
846   __kmp_assert_valid_gtid(gtid);
847 
848   if (!TCR_4(__kmp_init_parallel))
849     __kmp_parallel_initialize();
850 
851   __kmp_resume_if_soft_paused();
852 
853 #if USE_ITT_BUILD
854   __kmp_itt_ordered_prep(gtid);
855 // TODO: ordered_wait_id
856 #endif /* USE_ITT_BUILD */
857 
858   th = __kmp_threads[gtid];
859 
860 #if OMPT_SUPPORT && OMPT_OPTIONAL
861   kmp_team_t *team;
862   ompt_wait_id_t lck;
863   void *codeptr_ra;
864   if (ompt_enabled.enabled) {
865     OMPT_STORE_RETURN_ADDRESS(gtid);
866     team = __kmp_team_from_gtid(gtid);
867     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
868     /* OMPT state update */
869     th->th.ompt_thread_info.wait_id = lck;
870     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
871 
872     /* OMPT event callback */
873     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
874     if (ompt_enabled.ompt_callback_mutex_acquire) {
875       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
876           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
877           codeptr_ra);
878     }
879   }
880 #endif
881 
882   if (th->th.th_dispatch->th_deo_fcn != 0)
883     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
884   else
885     __kmp_parallel_deo(&gtid, &cid, loc);
886 
887 #if OMPT_SUPPORT && OMPT_OPTIONAL
888   if (ompt_enabled.enabled) {
889     /* OMPT state update */
890     th->th.ompt_thread_info.state = ompt_state_work_parallel;
891     th->th.ompt_thread_info.wait_id = 0;
892 
893     /* OMPT event callback */
894     if (ompt_enabled.ompt_callback_mutex_acquired) {
895       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
896           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
897     }
898   }
899 #endif
900 
901 #if USE_ITT_BUILD
902   __kmp_itt_ordered_start(gtid);
903 #endif /* USE_ITT_BUILD */
904 }
905 
906 /*!
907 @ingroup WORK_SHARING
908 @param loc  source location information.
909 @param gtid  global thread number.
910 
911 End execution of an <tt>ordered</tt> construct.
912 */
913 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
914   int cid = 0;
915   kmp_info_t *th;
916 
917   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
918   __kmp_assert_valid_gtid(gtid);
919 
920 #if USE_ITT_BUILD
921   __kmp_itt_ordered_end(gtid);
922 // TODO: ordered_wait_id
923 #endif /* USE_ITT_BUILD */
924 
925   th = __kmp_threads[gtid];
926 
927   if (th->th.th_dispatch->th_dxo_fcn != 0)
928     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
929   else
930     __kmp_parallel_dxo(&gtid, &cid, loc);
931 
932 #if OMPT_SUPPORT && OMPT_OPTIONAL
933   OMPT_STORE_RETURN_ADDRESS(gtid);
934   if (ompt_enabled.ompt_callback_mutex_released) {
935     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
936         ompt_mutex_ordered,
937         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
938             ->t.t_ordered.dt.t_value,
939         OMPT_LOAD_RETURN_ADDRESS(gtid));
940   }
941 #endif
942 }
943 
944 #if KMP_USE_DYNAMIC_LOCK
945 
946 static __forceinline void
947 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
948                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
949   // Pointer to the allocated indirect lock is written to crit, while indexing
950   // is ignored.
951   void *idx;
952   kmp_indirect_lock_t **lck;
953   lck = (kmp_indirect_lock_t **)crit;
954   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
955   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
956   KMP_SET_I_LOCK_LOCATION(ilk, loc);
957   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
958   KA_TRACE(20,
959            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
960 #if USE_ITT_BUILD
961   __kmp_itt_critical_creating(ilk->lock, loc);
962 #endif
963   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
964   if (status == 0) {
965 #if USE_ITT_BUILD
966     __kmp_itt_critical_destroyed(ilk->lock);
967 #endif
968     // We don't really need to destroy the unclaimed lock here since it will be
969     // cleaned up at program exit.
970     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
971   }
972   KMP_DEBUG_ASSERT(*lck != NULL);
973 }
974 
975 // Fast-path acquire tas lock
976 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
977   {                                                                            \
978     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
979     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
980     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
981     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
982         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
983       kmp_uint32 spins;                                                        \
984       KMP_FSYNC_PREPARE(l);                                                    \
985       KMP_INIT_YIELD(spins);                                                   \
986       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
987       do {                                                                     \
988         if (TCR_4(__kmp_nth) >                                                 \
989             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
990           KMP_YIELD(TRUE);                                                     \
991         } else {                                                               \
992           KMP_YIELD_SPIN(spins);                                               \
993         }                                                                      \
994         __kmp_spin_backoff(&backoff);                                          \
995       } while (                                                                \
996           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
997           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
998     }                                                                          \
999     KMP_FSYNC_ACQUIRED(l);                                                     \
1000   }
1001 
1002 // Fast-path test tas lock
1003 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1004   {                                                                            \
1005     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1006     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1007     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1008     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1009          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1010   }
1011 
1012 // Fast-path release tas lock
1013 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1014   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1015 
1016 #if KMP_USE_FUTEX
1017 
1018 #include <sys/syscall.h>
1019 #include <unistd.h>
1020 #ifndef FUTEX_WAIT
1021 #define FUTEX_WAIT 0
1022 #endif
1023 #ifndef FUTEX_WAKE
1024 #define FUTEX_WAKE 1
1025 #endif
1026 
1027 // Fast-path acquire futex lock
1028 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1029   {                                                                            \
1030     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1031     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1032     KMP_MB();                                                                  \
1033     KMP_FSYNC_PREPARE(ftx);                                                    \
1034     kmp_int32 poll_val;                                                        \
1035     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1036                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1037                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1038       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1039       if (!cond) {                                                             \
1040         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1041                                          poll_val |                            \
1042                                              KMP_LOCK_BUSY(1, futex))) {       \
1043           continue;                                                            \
1044         }                                                                      \
1045         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1046       }                                                                        \
1047       kmp_int32 rc;                                                            \
1048       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1049                         NULL, NULL, 0)) != 0) {                                \
1050         continue;                                                              \
1051       }                                                                        \
1052       gtid_code |= 1;                                                          \
1053     }                                                                          \
1054     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1055   }
1056 
1057 // Fast-path test futex lock
1058 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1059   {                                                                            \
1060     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1061     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
1063       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1064       rc = TRUE;                                                               \
1065     } else {                                                                   \
1066       rc = FALSE;                                                              \
1067     }                                                                          \
1068   }
1069 
1070 // Fast-path release futex lock
1071 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1072   {                                                                            \
1073     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1074     KMP_MB();                                                                  \
1075     KMP_FSYNC_RELEASING(ftx);                                                  \
1076     kmp_int32 poll_val =                                                       \
1077         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1078     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1079       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1080               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1081     }                                                                          \
1082     KMP_MB();                                                                  \
1083     KMP_YIELD_OVERSUB();                                                       \
1084   }
1085 
1086 #endif // KMP_USE_FUTEX
1087 
1088 #else // KMP_USE_DYNAMIC_LOCK
1089 
1090 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1091                                                       ident_t const *loc,
1092                                                       kmp_int32 gtid) {
1093   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1094 
1095   // Because of the double-check, the following load doesn't need to be volatile
1096   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1097 
1098   if (lck == NULL) {
1099     void *idx;
1100 
1101     // Allocate & initialize the lock.
1102     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1103     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1104     __kmp_init_user_lock_with_checks(lck);
1105     __kmp_set_user_lock_location(lck, loc);
1106 #if USE_ITT_BUILD
1107     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed without ever being used, but
// that is not a problem, because this is not a real event seen by the user
// but rather the setting of a name for an object (the lock). See more
// details in kmp_itt.h.
1113 #endif /* USE_ITT_BUILD */
1114 
1115     // Use a cmpxchg instruction to slam the start of the critical section with
1116     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1117     // and use the lock that the other thread allocated.
1118     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1119 
1120     if (status == 0) {
1121 // Deallocate the lock and reload the value.
1122 #if USE_ITT_BUILD
1123       __kmp_itt_critical_destroyed(lck);
1124 // Let ITT know the lock is destroyed and the same memory location may be reused
1125 // for another purpose.
1126 #endif /* USE_ITT_BUILD */
1127       __kmp_destroy_user_lock_with_checks(lck);
1128       __kmp_user_lock_free(&idx, gtid, lck);
1129       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1130       KMP_DEBUG_ASSERT(lck != NULL);
1131     }
1132   }
1133   return lck;
1134 }
1135 
1136 #endif // KMP_USE_DYNAMIC_LOCK
1137 
1138 /*!
1139 @ingroup WORK_SHARING
1140 @param loc  source location information.
1141 @param global_tid  global thread number.
1142 @param crit identity of the critical section. This could be a pointer to a lock
1143 associated with the critical section, or some other suitably unique value.
1144 
1145 Enter code protected by a `critical` construct.
1146 This function blocks until the executing thread can enter the critical section.
1147 */
1148 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1149                      kmp_critical_name *crit) {
1150 #if KMP_USE_DYNAMIC_LOCK
1151 #if OMPT_SUPPORT && OMPT_OPTIONAL
1152   OMPT_STORE_RETURN_ADDRESS(global_tid);
1153 #endif // OMPT_SUPPORT
1154   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1155 #else
1156   KMP_COUNT_BLOCK(OMP_CRITICAL);
1157 #if OMPT_SUPPORT && OMPT_OPTIONAL
1158   ompt_state_t prev_state = ompt_state_undefined;
1159   ompt_thread_info_t ti;
1160 #endif
1161   kmp_user_lock_p lck;
1162 
1163   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1164   __kmp_assert_valid_gtid(global_tid);
1165 
1166   // TODO: add THR_OVHD_STATE
1167 
1168   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1169   KMP_CHECK_USER_LOCK_INIT();
1170 
1171   if ((__kmp_user_lock_kind == lk_tas) &&
1172       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1173     lck = (kmp_user_lock_p)crit;
1174   }
1175 #if KMP_USE_FUTEX
1176   else if ((__kmp_user_lock_kind == lk_futex) &&
1177            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1178     lck = (kmp_user_lock_p)crit;
1179   }
1180 #endif
1181   else { // ticket, queuing or drdpa
1182     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1183   }
1184 
1185   if (__kmp_env_consistency_check)
1186     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1187 
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team.
// Also, even if we are the uber thread, we still have to acquire the lock,
// as we have to contend with sibling threads.
1192 
1193 #if USE_ITT_BUILD
1194   __kmp_itt_critical_acquiring(lck);
1195 #endif /* USE_ITT_BUILD */
1196 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1198   void *codeptr_ra = NULL;
1199   if (ompt_enabled.enabled) {
1200     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1201     /* OMPT state update */
1202     prev_state = ti.state;
1203     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1204     ti.state = ompt_state_wait_critical;
1205 
1206     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1208     if (ompt_enabled.ompt_callback_mutex_acquire) {
1209       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1210           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1211           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1212     }
1213   }
1214 #endif
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1217   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1218 
1219 #if USE_ITT_BUILD
1220   __kmp_itt_critical_acquired(lck);
1221 #endif /* USE_ITT_BUILD */
1222 #if OMPT_SUPPORT && OMPT_OPTIONAL
1223   if (ompt_enabled.enabled) {
1224     /* OMPT state update */
1225     ti.state = prev_state;
1226     ti.wait_id = 0;
1227 
1228     /* OMPT event callback */
1229     if (ompt_enabled.ompt_callback_mutex_acquired) {
1230       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1231           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1232     }
1233   }
1234 #endif
1235   KMP_POP_PARTITIONED_TIMER();
1236 
1237   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1238   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1239 #endif // KMP_USE_DYNAMIC_LOCK
1240 }
1241 
1242 #if KMP_USE_DYNAMIC_LOCK
1243 
1244 // Converts the given hint to an internal lock implementation
1245 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1246 #if KMP_USE_TSX
1247 #define KMP_TSX_LOCK(seq) lockseq_##seq
1248 #else
1249 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1250 #endif
1251 
1252 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1253 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1254 #else
1255 #define KMP_CPUINFO_RTM 0
1256 #endif
1257 
1258   // Hints that do not require further logic
1259   if (hint & kmp_lock_hint_hle)
1260     return KMP_TSX_LOCK(hle);
1261   if (hint & kmp_lock_hint_rtm)
1262     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1263   if (hint & kmp_lock_hint_adaptive)
1264     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1265 
1266   // Rule out conflicting hints first by returning the default lock
1267   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1268     return __kmp_user_lock_seq;
1269   if ((hint & omp_lock_hint_speculative) &&
1270       (hint & omp_lock_hint_nonspeculative))
1271     return __kmp_user_lock_seq;
1272 
1273   // Do not even consider speculation when it appears to be contended
1274   if (hint & omp_lock_hint_contended)
1275     return lockseq_queuing;
1276 
1277   // Uncontended lock without speculation
1278   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1279     return lockseq_tas;
1280 
1281   // HLE lock for speculation
1282   if (hint & omp_lock_hint_speculative)
1283     return KMP_TSX_LOCK(hle);
1284 
1285   return __kmp_user_lock_seq;
1286 }
1287 
1288 #if OMPT_SUPPORT && OMPT_OPTIONAL
1289 #if KMP_USE_DYNAMIC_LOCK
1290 static kmp_mutex_impl_t
1291 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1292   if (user_lock) {
1293     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1294     case 0:
1295       break;
1296 #if KMP_USE_FUTEX
1297     case locktag_futex:
1298       return kmp_mutex_impl_queuing;
1299 #endif
1300     case locktag_tas:
1301       return kmp_mutex_impl_spin;
1302 #if KMP_USE_TSX
1303     case locktag_hle:
1304       return kmp_mutex_impl_speculative;
1305 #endif
1306     default:
1307       return kmp_mutex_impl_none;
1308     }
1309     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1310   }
1311   KMP_ASSERT(ilock);
1312   switch (ilock->type) {
1313 #if KMP_USE_TSX
1314   case locktag_adaptive:
1315   case locktag_rtm:
1316     return kmp_mutex_impl_speculative;
1317 #endif
1318   case locktag_nested_tas:
1319     return kmp_mutex_impl_spin;
1320 #if KMP_USE_FUTEX
1321   case locktag_nested_futex:
1322 #endif
1323   case locktag_ticket:
1324   case locktag_queuing:
1325   case locktag_drdpa:
1326   case locktag_nested_ticket:
1327   case locktag_nested_queuing:
1328   case locktag_nested_drdpa:
1329     return kmp_mutex_impl_queuing;
1330   default:
1331     return kmp_mutex_impl_none;
1332   }
1333 }
1334 #else
1335 // For locks without dynamic binding
1336 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1337   switch (__kmp_user_lock_kind) {
1338   case lk_tas:
1339     return kmp_mutex_impl_spin;
1340 #if KMP_USE_FUTEX
1341   case lk_futex:
1342 #endif
1343   case lk_ticket:
1344   case lk_queuing:
1345   case lk_drdpa:
1346     return kmp_mutex_impl_queuing;
1347 #if KMP_USE_TSX
1348   case lk_hle:
1349   case lk_rtm:
1350   case lk_adaptive:
1351     return kmp_mutex_impl_speculative;
1352 #endif
1353   default:
1354     return kmp_mutex_impl_none;
1355   }
1356 }
1357 #endif // KMP_USE_DYNAMIC_LOCK
1358 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1359 
1360 /*!
1361 @ingroup WORK_SHARING
1362 @param loc  source location information.
1363 @param global_tid  global thread number.
1364 @param crit identity of the critical section. This could be a pointer to a lock
1365 associated with the critical section, or some other suitably unique value.
1366 @param hint the lock hint.
1367 
1368 Enter code protected by a `critical` construct with a hint. The hint value is
1369 used to suggest a lock implementation. This function blocks until the executing
1370 thread can enter the critical section unless the hint suggests use of
1371 speculative execution and the hardware supports it.
1372 */
1373 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1374                                kmp_critical_name *crit, uint32_t hint) {
1375   KMP_COUNT_BLOCK(OMP_CRITICAL);
1376   kmp_user_lock_p lck;
1377 #if OMPT_SUPPORT && OMPT_OPTIONAL
1378   ompt_state_t prev_state = ompt_state_undefined;
1379   ompt_thread_info_t ti;
  // This is the case if we were called from __kmpc_critical:
1381   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1382   if (!codeptr)
1383     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1384 #endif
1385 
1386   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1387   __kmp_assert_valid_gtid(global_tid);
1388 
1389   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1390   // Check if it is initialized.
1391   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1392   if (*lk == 0) {
1393     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1394     if (KMP_IS_D_LOCK(lckseq)) {
1395       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1396                                   KMP_GET_D_TAG(lckseq));
1397     } else {
1398       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1399     }
1400   }
1401   // Branch for accessing the actual lock object and set operation. This
1402   // branching is inevitable since this lock initialization does not follow the
1403   // normal dispatch path (lock table is not used).
1404   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1405     lck = (kmp_user_lock_p)lk;
1406     if (__kmp_env_consistency_check) {
1407       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1408                       __kmp_map_hint_to_lock(hint));
1409     }
1410 #if USE_ITT_BUILD
1411     __kmp_itt_critical_acquiring(lck);
1412 #endif
1413 #if OMPT_SUPPORT && OMPT_OPTIONAL
1414     if (ompt_enabled.enabled) {
1415       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1416       /* OMPT state update */
1417       prev_state = ti.state;
1418       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1419       ti.state = ompt_state_wait_critical;
1420 
1421       /* OMPT event callback */
1422       if (ompt_enabled.ompt_callback_mutex_acquire) {
1423         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1424             ompt_mutex_critical, (unsigned int)hint,
1425             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1426             codeptr);
1427       }
1428     }
1429 #endif
1430 #if KMP_USE_INLINED_TAS
1431     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1432       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1433     } else
1434 #elif KMP_USE_INLINED_FUTEX
1435     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1436       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1437     } else
1438 #endif
1439     {
1440       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1441     }
1442   } else {
1443     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1444     lck = ilk->lock;
1445     if (__kmp_env_consistency_check) {
1446       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1447                       __kmp_map_hint_to_lock(hint));
1448     }
1449 #if USE_ITT_BUILD
1450     __kmp_itt_critical_acquiring(lck);
1451 #endif
1452 #if OMPT_SUPPORT && OMPT_OPTIONAL
1453     if (ompt_enabled.enabled) {
1454       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1455       /* OMPT state update */
1456       prev_state = ti.state;
1457       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1458       ti.state = ompt_state_wait_critical;
1459 
1460       /* OMPT event callback */
1461       if (ompt_enabled.ompt_callback_mutex_acquire) {
1462         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1463             ompt_mutex_critical, (unsigned int)hint,
1464             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1465             codeptr);
1466       }
1467     }
1468 #endif
1469     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1470   }
1471   KMP_POP_PARTITIONED_TIMER();
1472 
1473 #if USE_ITT_BUILD
1474   __kmp_itt_critical_acquired(lck);
1475 #endif /* USE_ITT_BUILD */
1476 #if OMPT_SUPPORT && OMPT_OPTIONAL
1477   if (ompt_enabled.enabled) {
1478     /* OMPT state update */
1479     ti.state = prev_state;
1480     ti.wait_id = 0;
1481 
1482     /* OMPT event callback */
1483     if (ompt_enabled.ompt_callback_mutex_acquired) {
1484       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1485           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1486     }
1487   }
1488 #endif
1489 
1490   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1491   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1492 } // __kmpc_critical_with_hint
1493 
1494 #endif // KMP_USE_DYNAMIC_LOCK
1495 
1496 /*!
1497 @ingroup WORK_SHARING
1498 @param loc  source location information.
@param global_tid  global thread number.
1500 @param crit identity of the critical section. This could be a pointer to a lock
1501 associated with the critical section, or some other suitably unique value.
1502 
1503 Leave a critical section, releasing any lock that was held during its execution.
1504 */
1505 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1506                          kmp_critical_name *crit) {
1507   kmp_user_lock_p lck;
1508 
1509   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1510 
1511 #if KMP_USE_DYNAMIC_LOCK
1512   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1513     lck = (kmp_user_lock_p)crit;
1514     KMP_ASSERT(lck != NULL);
1515     if (__kmp_env_consistency_check) {
1516       __kmp_pop_sync(global_tid, ct_critical, loc);
1517     }
1518 #if USE_ITT_BUILD
1519     __kmp_itt_critical_releasing(lck);
1520 #endif
1521 #if KMP_USE_INLINED_TAS
1522     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1523       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1524     } else
1525 #elif KMP_USE_INLINED_FUTEX
1526     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1527       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1528     } else
1529 #endif
1530     {
1531       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1532     }
1533   } else {
1534     kmp_indirect_lock_t *ilk =
1535         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1536     KMP_ASSERT(ilk != NULL);
1537     lck = ilk->lock;
1538     if (__kmp_env_consistency_check) {
1539       __kmp_pop_sync(global_tid, ct_critical, loc);
1540     }
1541 #if USE_ITT_BUILD
1542     __kmp_itt_critical_releasing(lck);
1543 #endif
1544     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1545   }
1546 
1547 #else // KMP_USE_DYNAMIC_LOCK
1548 
1549   if ((__kmp_user_lock_kind == lk_tas) &&
1550       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1551     lck = (kmp_user_lock_p)crit;
1552   }
1553 #if KMP_USE_FUTEX
1554   else if ((__kmp_user_lock_kind == lk_futex) &&
1555            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1556     lck = (kmp_user_lock_p)crit;
1557   }
1558 #endif
1559   else { // ticket, queuing or drdpa
1560     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1561   }
1562 
1563   KMP_ASSERT(lck != NULL);
1564 
1565   if (__kmp_env_consistency_check)
1566     __kmp_pop_sync(global_tid, ct_critical, loc);
1567 
1568 #if USE_ITT_BUILD
1569   __kmp_itt_critical_releasing(lck);
1570 #endif /* USE_ITT_BUILD */
  // The value of 'crit' is suitable for use as the critical_id of the critical
  // section directive.
1573   __kmp_release_user_lock_with_checks(lck, global_tid);
1574 
1575 #endif // KMP_USE_DYNAMIC_LOCK
1576 
1577 #if OMPT_SUPPORT && OMPT_OPTIONAL
1578   /* OMPT release event triggers after lock is released; place here to trigger
1579    * for all #if branches */
1580   OMPT_STORE_RETURN_ADDRESS(global_tid);
1581   if (ompt_enabled.ompt_callback_mutex_released) {
1582     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1583         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1584         OMPT_LOAD_RETURN_ADDRESS(0));
1585   }
1586 #endif
1587 
1588   KMP_POP_PARTITIONED_TIMER();
1589   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1590 }
1591 
1592 /*!
1593 @ingroup SYNCHRONIZATION
1594 @param loc source location information
1595 @param global_tid thread id.
1596 @return one if the thread should execute the master block, zero otherwise
1597 
1598 Start execution of a combined barrier and master. The barrier is executed inside
1599 this function.
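
A minimal usage sketch (illustrative only; <tt>do_master_work()</tt> is a
placeholder, and <tt>loc</tt>/<tt>gtid</tt> are the usual location and global
thread-id arguments):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  do_master_work(); // only the selected thread reaches this block
  __kmpc_end_barrier_master(&loc, gtid); // releases the threads still waiting
}
@endcode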
1600 */
1601 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1602   int status;
1603   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1604   __kmp_assert_valid_gtid(global_tid);
1605 
1606   if (!TCR_4(__kmp_init_parallel))
1607     __kmp_parallel_initialize();
1608 
1609   __kmp_resume_if_soft_paused();
1610 
1611   if (__kmp_env_consistency_check)
1612     __kmp_check_barrier(global_tid, ct_barrier, loc);
1613 
1614 #if OMPT_SUPPORT
1615   ompt_frame_t *ompt_frame;
1616   if (ompt_enabled.enabled) {
1617     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1618     if (ompt_frame->enter_frame.ptr == NULL)
1619       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1620     OMPT_STORE_RETURN_ADDRESS(global_tid);
1621   }
1622 #endif
1623 #if USE_ITT_NOTIFY
1624   __kmp_threads[global_tid]->th.th_ident = loc;
1625 #endif
1626   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1627 #if OMPT_SUPPORT && OMPT_OPTIONAL
1628   if (ompt_enabled.enabled) {
1629     ompt_frame->enter_frame = ompt_data_none;
1630   }
1631 #endif
1632 
1633   return (status != 0) ? 0 : 1;
1634 }
1635 
1636 /*!
1637 @ingroup SYNCHRONIZATION
1638 @param loc source location information
1639 @param global_tid thread id.
1640 
1641 Complete the execution of a combined barrier and master. This function should
1642 only be called at the completion of the <tt>master</tt> code. Other threads will
1643 still be waiting at the barrier and this call releases them.
1644 */
1645 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1646   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1647   __kmp_assert_valid_gtid(global_tid);
1648   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1649 }
1650 
1651 /*!
1652 @ingroup SYNCHRONIZATION
1653 @param loc source location information
1654 @param global_tid thread id.
1655 @return one if the thread should execute the master block, zero otherwise
1656 
1657 Start execution of a combined barrier and master(nowait) construct.
1658 The barrier is executed inside this function.
There is no equivalent "end" function, since the construct needs no closing
synchronization: the barrier runs before the master code, and the bookkeeping
normally done by __kmpc_end_master is performed inside this call.
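
A minimal usage sketch (illustrative only; <tt>do_master_work()</tt> is a
placeholder):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  do_master_work(); // master-only work; no matching "end" call is emitted
}
@endcode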
1660 */
1661 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1662   kmp_int32 ret;
1663   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1664   __kmp_assert_valid_gtid(global_tid);
1665 
1666   if (!TCR_4(__kmp_init_parallel))
1667     __kmp_parallel_initialize();
1668 
1669   __kmp_resume_if_soft_paused();
1670 
1671   if (__kmp_env_consistency_check) {
1672     if (loc == 0) {
1673       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1674     }
1675     __kmp_check_barrier(global_tid, ct_barrier, loc);
1676   }
1677 
1678 #if OMPT_SUPPORT
1679   ompt_frame_t *ompt_frame;
1680   if (ompt_enabled.enabled) {
1681     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1682     if (ompt_frame->enter_frame.ptr == NULL)
1683       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1684     OMPT_STORE_RETURN_ADDRESS(global_tid);
1685   }
1686 #endif
1687 #if USE_ITT_NOTIFY
1688   __kmp_threads[global_tid]->th.th_ident = loc;
1689 #endif
1690   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1691 #if OMPT_SUPPORT && OMPT_OPTIONAL
1692   if (ompt_enabled.enabled) {
1693     ompt_frame->enter_frame = ompt_data_none;
1694   }
1695 #endif
1696 
1697   ret = __kmpc_master(loc, global_tid);
1698 
1699   if (__kmp_env_consistency_check) {
    /* There is no __kmpc_end_master call, so the (stats) actions of
       __kmpc_end_master are done here. */
1702     if (ret) {
      /* Only one thread should do the pop since only one did the push
         (see __kmpc_master()). */
1705       __kmp_pop_sync(global_tid, ct_master, loc);
1706     }
1707   }
1708 
1709   return (ret);
1710 }
1711 
/* The BARRIER for a SINGLE construct is always explicit */
1713 /*!
1714 @ingroup WORK_SHARING
1715 @param loc  source location information
1716 @param global_tid  global thread number
1717 @return One if this thread should execute the single construct, zero otherwise.
1718 
1719 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if it is required.
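
A sketch of the expected lowering (illustrative only; <tt>do_single_work()</tt>
is a placeholder):
@code
if (__kmpc_single(&loc, gtid)) {
  do_single_work();
  __kmpc_end_single(&loc, gtid);
}
// Without a 'nowait' clause the compiler emits the closing barrier itself:
__kmpc_barrier(&loc, gtid);
@endcode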
1722 */
1723 
1724 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1725   __kmp_assert_valid_gtid(global_tid);
1726   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1727 
1728   if (rc) {
1729     // We are going to execute the single statement, so we should count it.
1730     KMP_COUNT_BLOCK(OMP_SINGLE);
1731     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1732   }
1733 
1734 #if OMPT_SUPPORT && OMPT_OPTIONAL
1735   kmp_info_t *this_thr = __kmp_threads[global_tid];
1736   kmp_team_t *team = this_thr->th.th_team;
1737   int tid = __kmp_tid_from_gtid(global_tid);
1738 
1739   if (ompt_enabled.enabled) {
1740     if (rc) {
1741       if (ompt_enabled.ompt_callback_work) {
1742         ompt_callbacks.ompt_callback(ompt_callback_work)(
1743             ompt_work_single_executor, ompt_scope_begin,
1744             &(team->t.ompt_team_info.parallel_data),
1745             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1746             1, OMPT_GET_RETURN_ADDRESS(0));
1747       }
1748     } else {
1749       if (ompt_enabled.ompt_callback_work) {
1750         ompt_callbacks.ompt_callback(ompt_callback_work)(
1751             ompt_work_single_other, ompt_scope_begin,
1752             &(team->t.ompt_team_info.parallel_data),
1753             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1754             1, OMPT_GET_RETURN_ADDRESS(0));
1755         ompt_callbacks.ompt_callback(ompt_callback_work)(
1756             ompt_work_single_other, ompt_scope_end,
1757             &(team->t.ompt_team_info.parallel_data),
1758             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1759             1, OMPT_GET_RETURN_ADDRESS(0));
1760       }
1761     }
1762   }
1763 #endif
1764 
1765   return rc;
1766 }
1767 
1768 /*!
1769 @ingroup WORK_SHARING
1770 @param loc  source location information
1771 @param global_tid  global thread number
1772 
Mark the end of a <tt>single</tt> construct.  This function should
only be called by the thread that executed the block of code protected
by the <tt>single</tt> construct.
1776 */
1777 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1778   __kmp_assert_valid_gtid(global_tid);
1779   __kmp_exit_single(global_tid);
1780   KMP_POP_PARTITIONED_TIMER();
1781 
1782 #if OMPT_SUPPORT && OMPT_OPTIONAL
1783   kmp_info_t *this_thr = __kmp_threads[global_tid];
1784   kmp_team_t *team = this_thr->th.th_team;
1785   int tid = __kmp_tid_from_gtid(global_tid);
1786 
1787   if (ompt_enabled.ompt_callback_work) {
1788     ompt_callbacks.ompt_callback(ompt_callback_work)(
1789         ompt_work_single_executor, ompt_scope_end,
1790         &(team->t.ompt_team_info.parallel_data),
1791         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1792         OMPT_GET_RETURN_ADDRESS(0));
1793   }
1794 #endif
1795 }
1796 
1797 /*!
1798 @ingroup WORK_SHARING
1799 @param loc Source location
1800 @param global_tid Global thread id
1801 
1802 Mark the end of a statically scheduled loop.
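
A schematic of the typical pairing with the static-init entry points
(illustrative only; <tt>N</tt> and <tt>body()</tt> are placeholders, and the
init call shown is __kmpc_for_static_init_4, declared elsewhere in the runtime;
the non-chunked case is assumed):
@code
kmp_int32 lb = 0, ub = N - 1, st = 1, last = 0;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lb, &ub, &st,
                         1, 0);
for (kmp_int32 i = lb; i <= ub; ++i)
  body(i);
__kmpc_for_static_fini(&loc, gtid);
@endcode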
1803 */
1804 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1805   KMP_POP_PARTITIONED_TIMER();
1806   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1807 
1808 #if OMPT_SUPPORT && OMPT_OPTIONAL
1809   if (ompt_enabled.ompt_callback_work) {
1810     ompt_work_t ompt_work_type = ompt_work_loop;
1811     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1812     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1813     // Determine workshare type
1814     if (loc != NULL) {
1815       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1816         ompt_work_type = ompt_work_loop;
1817       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1818         ompt_work_type = ompt_work_sections;
1819       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1820         ompt_work_type = ompt_work_distribute;
1821       } else {
1822         // use default set above.
1823         // a warning about this case is provided in __kmpc_for_static_init
1824       }
1825       KMP_DEBUG_ASSERT(ompt_work_type);
1826     }
1827     ompt_callbacks.ompt_callback(ompt_callback_work)(
1828         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1829         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1830   }
1831 #endif
1832   if (__kmp_env_consistency_check)
1833     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1834 }
1835 
// User routines which take C-style arguments (call by value), as distinct
// from the Fortran equivalent routines
1838 
1839 void ompc_set_num_threads(int arg) {
1840   // !!!!! TODO: check the per-task binding
1841   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1842 }
1843 
1844 void ompc_set_dynamic(int flag) {
1845   kmp_info_t *thread;
1846 
1847   /* For the thread-private implementation of the internal controls */
1848   thread = __kmp_entry_thread();
1849 
1850   __kmp_save_internal_controls(thread);
1851 
1852   set__dynamic(thread, flag ? TRUE : FALSE);
1853 }
1854 
1855 void ompc_set_nested(int flag) {
1856   kmp_info_t *thread;
1857 
1858   /* For the thread-private internal controls implementation */
1859   thread = __kmp_entry_thread();
1860 
1861   __kmp_save_internal_controls(thread);
1862 
1863   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1864 }
1865 
1866 void ompc_set_max_active_levels(int max_active_levels) {
  /* TODO: we want a per-task implementation of this internal control */
1869 
1870   /* For the per-thread internal controls implementation */
1871   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1872 }
1873 
1874 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1875   // !!!!! TODO: check the per-task binding
1876   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1877 }
1878 
1879 int ompc_get_ancestor_thread_num(int level) {
1880   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1881 }
1882 
1883 int ompc_get_team_size(int level) {
1884   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1885 }
1886 
1887 /* OpenMP 5.0 Affinity Format API */
1888 
1889 void ompc_set_affinity_format(char const *format) {
1890   if (!__kmp_init_serial) {
1891     __kmp_serial_initialize();
1892   }
1893   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1894                          format, KMP_STRLEN(format) + 1);
1895 }
1896 
1897 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1898   size_t format_size;
1899   if (!__kmp_init_serial) {
1900     __kmp_serial_initialize();
1901   }
1902   format_size = KMP_STRLEN(__kmp_affinity_format);
1903   if (buffer && size) {
1904     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1905                            format_size + 1);
1906   }
1907   return format_size;
1908 }
1909 
1910 void ompc_display_affinity(char const *format) {
1911   int gtid;
1912   if (!TCR_4(__kmp_init_middle)) {
1913     __kmp_middle_initialize();
1914   }
1915   gtid = __kmp_get_gtid();
1916   __kmp_aux_display_affinity(gtid, format);
1917 }
1918 
1919 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1920                              char const *format) {
1921   int gtid;
1922   size_t num_required;
1923   kmp_str_buf_t capture_buf;
1924   if (!TCR_4(__kmp_init_middle)) {
1925     __kmp_middle_initialize();
1926   }
1927   gtid = __kmp_get_gtid();
1928   __kmp_str_buf_init(&capture_buf);
1929   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1930   if (buffer && buf_size) {
1931     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1932                            capture_buf.used + 1);
1933   }
1934   __kmp_str_buf_free(&capture_buf);
1935   return num_required;
1936 }
1937 
1938 void kmpc_set_stacksize(int arg) {
1939   // __kmp_aux_set_stacksize initializes the library if needed
1940   __kmp_aux_set_stacksize(arg);
1941 }
1942 
1943 void kmpc_set_stacksize_s(size_t arg) {
1944   // __kmp_aux_set_stacksize initializes the library if needed
1945   __kmp_aux_set_stacksize(arg);
1946 }
1947 
1948 void kmpc_set_blocktime(int arg) {
1949   int gtid, tid;
1950   kmp_info_t *thread;
1951 
1952   gtid = __kmp_entry_gtid();
1953   tid = __kmp_tid_from_gtid(gtid);
1954   thread = __kmp_thread_from_gtid(gtid);
1955 
1956   __kmp_aux_set_blocktime(arg, thread, tid);
1957 }
1958 
1959 void kmpc_set_library(int arg) {
1960   // __kmp_user_set_library initializes the library if needed
1961   __kmp_user_set_library((enum library_type)arg);
1962 }
1963 
1964 void kmpc_set_defaults(char const *str) {
1965   // __kmp_aux_set_defaults initializes the library if needed
1966   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1967 }
1968 
1969 void kmpc_set_disp_num_buffers(int arg) {
1970   // ignore after initialization because some teams have already
1971   // allocated dispatch buffers
1972   if (__kmp_init_serial == 0 && arg > 0)
1973     __kmp_dispatch_num_buffers = arg;
1974 }
1975 
1976 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1977 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1978   return -1;
1979 #else
1980   if (!TCR_4(__kmp_init_middle)) {
1981     __kmp_middle_initialize();
1982   }
1983   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1984 #endif
1985 }
1986 
1987 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1988 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1989   return -1;
1990 #else
1991   if (!TCR_4(__kmp_init_middle)) {
1992     __kmp_middle_initialize();
1993   }
1994   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1995 #endif
1996 }
1997 
1998 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1999 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2000   return -1;
2001 #else
2002   if (!TCR_4(__kmp_init_middle)) {
2003     __kmp_middle_initialize();
2004   }
2005   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2006 #endif
2007 }
2008 
2009 /* -------------------------------------------------------------------------- */
2010 /*!
2011 @ingroup THREADPRIVATE
2012 @param loc       source location information
2013 @param gtid      global thread number
2014 @param cpy_size  size of the cpy_data buffer
2015 @param cpy_data  pointer to data to be copied
2016 @param cpy_func  helper function to call for copying data
2017 @param didit     flag variable: 1=single thread; 0=not single thread
2018 
2019 __kmpc_copyprivate implements the interface for the private data broadcast
2020 needed for the copyprivate clause associated with a single region in an
2021 OpenMP<sup>*</sup> program (both C and Fortran).
2022 All threads participating in the parallel region call this routine.
2023 One of the threads (called the single thread) should have the <tt>didit</tt>
2024 variable set to 1 and all other threads should have that variable set to 0.
2025 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2026 
2027 The OpenMP specification forbids the use of nowait on the single region when a
2028 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2029 barrier internally to avoid race conditions, so the code generation for the
2030 single region should avoid generating a barrier after the call to @ref
2031 __kmpc_copyprivate.
2032 
2033 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2034 The <tt>loc</tt> parameter is a pointer to source location information.
2035 
Internal implementation: the single thread first copies its descriptor
address (cpy_data) to a team-private location; each of the other threads then
calls the function pointed to by the parameter cpy_func, which carries out the
copy using the cpy_data buffers.
2040 
2041 The cpy_func routine used for the copy and the contents of the data area defined
2042 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2043 to be done. For instance, the cpy_data buffer can hold the actual data to be
2044 copied or it may hold a list of pointers to the data. The cpy_func routine must
2045 interpret the cpy_data buffer appropriately.
2046 
2047 The interface to cpy_func is as follows:
2048 @code
2049 void cpy_func( void *destination, void *source )
2050 @endcode
2051 where void *destination is the cpy_data pointer for the thread being copied to
2052 and void *source is the cpy_data pointer for the thread being copied from.
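
For illustration only (this helper is not part of the library; the struct and
function names are hypothetical): a lowering of <tt>single copyprivate(x)</tt>
for an <tt>int x</tt> could build cpy_data as a one-slot pointer list and
supply a helper such as
@code
struct cpy_list { int *x; };          // per-thread list of private addresses

static void cpy_func_x(void *dst, void *src) {
  // Copy the value broadcast by the single thread into this thread's x.
  *((struct cpy_list *)dst)->x = *((struct cpy_list *)src)->x;
}
@endcode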
2053 */
2054 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2055                         void *cpy_data, void (*cpy_func)(void *, void *),
2056                         kmp_int32 didit) {
2057   void **data_ptr;
2058   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2059   __kmp_assert_valid_gtid(gtid);
2060 
2061   KMP_MB();
2062 
2063   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2064 
2065   if (__kmp_env_consistency_check) {
2066     if (loc == 0) {
2067       KMP_WARNING(ConstructIdentInvalid);
2068     }
2069   }
2070 
2071   // ToDo: Optimize the following two barriers into some kind of split barrier
2072 
2073   if (didit)
2074     *data_ptr = cpy_data;
2075 
2076 #if OMPT_SUPPORT
2077   ompt_frame_t *ompt_frame;
2078   if (ompt_enabled.enabled) {
2079     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2080     if (ompt_frame->enter_frame.ptr == NULL)
2081       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2082     OMPT_STORE_RETURN_ADDRESS(gtid);
2083   }
2084 #endif
2085 /* This barrier is not a barrier region boundary */
2086 #if USE_ITT_NOTIFY
2087   __kmp_threads[gtid]->th.th_ident = loc;
2088 #endif
2089   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2090 
2091   if (!didit)
2092     (*cpy_func)(cpy_data, *data_ptr);
2093 
2094 // Consider next barrier a user-visible barrier for barrier region boundaries
2095 // Nesting checks are already handled by the single construct checks
2096 
2097 #if OMPT_SUPPORT
2098   if (ompt_enabled.enabled) {
2099     OMPT_STORE_RETURN_ADDRESS(gtid);
2100   }
2101 #endif
2102 #if USE_ITT_NOTIFY
  // TODO: check if it is needed (e.g. tasks can overwrite the location).
  __kmp_threads[gtid]->th.th_ident = loc;
2105 #endif
2106   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2107 #if OMPT_SUPPORT && OMPT_OPTIONAL
2108   if (ompt_enabled.enabled) {
2109     ompt_frame->enter_frame = ompt_data_none;
2110   }
2111 #endif
2112 }
2113 
2114 /* -------------------------------------------------------------------------- */
2115 
2116 #define INIT_LOCK __kmp_init_user_lock_with_checks
2117 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2118 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2119 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2120 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2121 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2122   __kmp_acquire_nested_user_lock_with_checks_timed
2123 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2124 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2125 #define TEST_LOCK __kmp_test_user_lock_with_checks
2126 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2127 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2128 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2129 
2130 // TODO: Make check abort messages use location info & pass it into
2131 // with_checks routines
2132 
2133 #if KMP_USE_DYNAMIC_LOCK
2134 
2135 // internal lock initializer
2136 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2137                                                     kmp_dyna_lockseq_t seq) {
2138   if (KMP_IS_D_LOCK(seq)) {
2139     KMP_INIT_D_LOCK(lock, seq);
2140 #if USE_ITT_BUILD
2141     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2142 #endif
2143   } else {
2144     KMP_INIT_I_LOCK(lock, seq);
2145 #if USE_ITT_BUILD
2146     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2147     __kmp_itt_lock_creating(ilk->lock, loc);
2148 #endif
2149   }
2150 }
2151 
2152 // internal nest lock initializer
2153 static __forceinline void
2154 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2155                                kmp_dyna_lockseq_t seq) {
2156 #if KMP_USE_TSX
2157   // Don't have nested lock implementation for speculative locks
2158   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2159     seq = __kmp_user_lock_seq;
2160 #endif
2161   switch (seq) {
2162   case lockseq_tas:
2163     seq = lockseq_nested_tas;
2164     break;
2165 #if KMP_USE_FUTEX
2166   case lockseq_futex:
2167     seq = lockseq_nested_futex;
2168     break;
2169 #endif
2170   case lockseq_ticket:
2171     seq = lockseq_nested_ticket;
2172     break;
2173   case lockseq_queuing:
2174     seq = lockseq_nested_queuing;
2175     break;
2176   case lockseq_drdpa:
2177     seq = lockseq_nested_drdpa;
2178     break;
2179   default:
2180     seq = lockseq_nested_queuing;
2181   }
2182   KMP_INIT_I_LOCK(lock, seq);
2183 #if USE_ITT_BUILD
2184   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2185   __kmp_itt_lock_creating(ilk->lock, loc);
2186 #endif
2187 }
2188 
2189 /* initialize the lock with a hint */
2190 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2191                                 uintptr_t hint) {
2192   KMP_DEBUG_ASSERT(__kmp_init_serial);
2193   if (__kmp_env_consistency_check && user_lock == NULL) {
2194     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2195   }
2196 
2197   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2198 
2199 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2201   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2202   if (!codeptr)
2203     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2204   if (ompt_enabled.ompt_callback_lock_init) {
2205     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2206         ompt_mutex_lock, (omp_lock_hint_t)hint,
2207         __ompt_get_mutex_impl_type(user_lock),
2208         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2209   }
2210 #endif
2211 }
2212 
/* initialize the nested lock with a hint */
2214 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2215                                      void **user_lock, uintptr_t hint) {
2216   KMP_DEBUG_ASSERT(__kmp_init_serial);
2217   if (__kmp_env_consistency_check && user_lock == NULL) {
2218     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2219   }
2220 
2221   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2222 
2223 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_nest_lock_with_hint:
2225   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2226   if (!codeptr)
2227     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2228   if (ompt_enabled.ompt_callback_lock_init) {
2229     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2230         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2231         __ompt_get_mutex_impl_type(user_lock),
2232         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2233   }
2234 #endif
2235 }
2236 
2237 #endif // KMP_USE_DYNAMIC_LOCK
2238 
2239 /* initialize the lock */
2240 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2241 #if KMP_USE_DYNAMIC_LOCK
2242 
2243   KMP_DEBUG_ASSERT(__kmp_init_serial);
2244   if (__kmp_env_consistency_check && user_lock == NULL) {
2245     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2246   }
2247   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2248 
2249 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2251   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2252   if (!codeptr)
2253     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2254   if (ompt_enabled.ompt_callback_lock_init) {
2255     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2256         ompt_mutex_lock, omp_lock_hint_none,
2257         __ompt_get_mutex_impl_type(user_lock),
2258         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2259   }
2260 #endif
2261 
2262 #else // KMP_USE_DYNAMIC_LOCK
2263 
2264   static char const *const func = "omp_init_lock";
2265   kmp_user_lock_p lck;
2266   KMP_DEBUG_ASSERT(__kmp_init_serial);
2267 
2268   if (__kmp_env_consistency_check) {
2269     if (user_lock == NULL) {
2270       KMP_FATAL(LockIsUninitialized, func);
2271     }
2272   }
2273 
2274   KMP_CHECK_USER_LOCK_INIT();
2275 
2276   if ((__kmp_user_lock_kind == lk_tas) &&
2277       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2278     lck = (kmp_user_lock_p)user_lock;
2279   }
2280 #if KMP_USE_FUTEX
2281   else if ((__kmp_user_lock_kind == lk_futex) &&
2282            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2283     lck = (kmp_user_lock_p)user_lock;
2284   }
2285 #endif
2286   else {
2287     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2288   }
2289   INIT_LOCK(lck);
2290   __kmp_set_user_lock_location(lck, loc);
2291 
2292 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2294   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2295   if (!codeptr)
2296     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2297   if (ompt_enabled.ompt_callback_lock_init) {
2298     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2299         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2300         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2301   }
2302 #endif
2303 
2304 #if USE_ITT_BUILD
2305   __kmp_itt_lock_creating(lck);
2306 #endif /* USE_ITT_BUILD */
2307 
2308 #endif // KMP_USE_DYNAMIC_LOCK
2309 } // __kmpc_init_lock
2310 
/* initialize the nested lock */
2312 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2313 #if KMP_USE_DYNAMIC_LOCK
2314 
2315   KMP_DEBUG_ASSERT(__kmp_init_serial);
2316   if (__kmp_env_consistency_check && user_lock == NULL) {
2317     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2318   }
2319   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2320 
2321 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2323   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2324   if (!codeptr)
2325     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2326   if (ompt_enabled.ompt_callback_lock_init) {
2327     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2328         ompt_mutex_nest_lock, omp_lock_hint_none,
2329         __ompt_get_mutex_impl_type(user_lock),
2330         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2331   }
2332 #endif
2333 
2334 #else // KMP_USE_DYNAMIC_LOCK
2335 
2336   static char const *const func = "omp_init_nest_lock";
2337   kmp_user_lock_p lck;
2338   KMP_DEBUG_ASSERT(__kmp_init_serial);
2339 
2340   if (__kmp_env_consistency_check) {
2341     if (user_lock == NULL) {
2342       KMP_FATAL(LockIsUninitialized, func);
2343     }
2344   }
2345 
2346   KMP_CHECK_USER_LOCK_INIT();
2347 
2348   if ((__kmp_user_lock_kind == lk_tas) &&
2349       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2350        OMP_NEST_LOCK_T_SIZE)) {
2351     lck = (kmp_user_lock_p)user_lock;
2352   }
2353 #if KMP_USE_FUTEX
2354   else if ((__kmp_user_lock_kind == lk_futex) &&
2355            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2356             OMP_NEST_LOCK_T_SIZE)) {
2357     lck = (kmp_user_lock_p)user_lock;
2358   }
2359 #endif
2360   else {
2361     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2362   }
2363 
2364   INIT_NESTED_LOCK(lck);
2365   __kmp_set_user_lock_location(lck, loc);
2366 
2367 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2369   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2370   if (!codeptr)
2371     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2372   if (ompt_enabled.ompt_callback_lock_init) {
2373     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2374         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2375         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2376   }
2377 #endif
2378 
2379 #if USE_ITT_BUILD
2380   __kmp_itt_lock_creating(lck);
2381 #endif /* USE_ITT_BUILD */
2382 
2383 #endif // KMP_USE_DYNAMIC_LOCK
2384 } // __kmpc_init_nest_lock
2385 
2386 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2387 #if KMP_USE_DYNAMIC_LOCK
2388 
2389 #if USE_ITT_BUILD
2390   kmp_user_lock_p lck;
2391   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2392     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2393   } else {
2394     lck = (kmp_user_lock_p)user_lock;
2395   }
2396   __kmp_itt_lock_destroyed(lck);
2397 #endif
2398 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2400   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2401   if (!codeptr)
2402     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2403   if (ompt_enabled.ompt_callback_lock_destroy) {
2404     kmp_user_lock_p lck;
2405     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2406       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2407     } else {
2408       lck = (kmp_user_lock_p)user_lock;
2409     }
2410     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2411         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2412   }
2413 #endif
2414   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2415 #else
2416   kmp_user_lock_p lck;
2417 
2418   if ((__kmp_user_lock_kind == lk_tas) &&
2419       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2420     lck = (kmp_user_lock_p)user_lock;
2421   }
2422 #if KMP_USE_FUTEX
2423   else if ((__kmp_user_lock_kind == lk_futex) &&
2424            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2425     lck = (kmp_user_lock_p)user_lock;
2426   }
2427 #endif
2428   else {
2429     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2430   }
2431 
2432 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2434   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2435   if (!codeptr)
2436     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2437   if (ompt_enabled.ompt_callback_lock_destroy) {
2438     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2439         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2440   }
2441 #endif
2442 
2443 #if USE_ITT_BUILD
2444   __kmp_itt_lock_destroyed(lck);
2445 #endif /* USE_ITT_BUILD */
2446   DESTROY_LOCK(lck);
2447 
2448   if ((__kmp_user_lock_kind == lk_tas) &&
2449       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2450     ;
2451   }
2452 #if KMP_USE_FUTEX
2453   else if ((__kmp_user_lock_kind == lk_futex) &&
2454            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2455     ;
2456   }
2457 #endif
2458   else {
2459     __kmp_user_lock_free(user_lock, gtid, lck);
2460   }
2461 #endif // KMP_USE_DYNAMIC_LOCK
2462 } // __kmpc_destroy_lock
2463 
/* destroy the nested lock */
2465 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2466 #if KMP_USE_DYNAMIC_LOCK
2467 
2468 #if USE_ITT_BUILD
2469   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2470   __kmp_itt_lock_destroyed(ilk->lock);
2471 #endif
2472 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2474   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2475   if (!codeptr)
2476     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2477   if (ompt_enabled.ompt_callback_lock_destroy) {
2478     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2479         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2480   }
2481 #endif
2482   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2483 
2484 #else // KMP_USE_DYNAMIC_LOCK
2485 
2486   kmp_user_lock_p lck;
2487 
2488   if ((__kmp_user_lock_kind == lk_tas) &&
2489       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2490        OMP_NEST_LOCK_T_SIZE)) {
2491     lck = (kmp_user_lock_p)user_lock;
2492   }
2493 #if KMP_USE_FUTEX
2494   else if ((__kmp_user_lock_kind == lk_futex) &&
2495            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2496             OMP_NEST_LOCK_T_SIZE)) {
2497     lck = (kmp_user_lock_p)user_lock;
2498   }
2499 #endif
2500   else {
2501     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2502   }
2503 
2504 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2506   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2507   if (!codeptr)
2508     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2509   if (ompt_enabled.ompt_callback_lock_destroy) {
2510     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2511         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2512   }
2513 #endif
2514 
2515 #if USE_ITT_BUILD
2516   __kmp_itt_lock_destroyed(lck);
2517 #endif /* USE_ITT_BUILD */
2518 
2519   DESTROY_NESTED_LOCK(lck);
2520 
2521   if ((__kmp_user_lock_kind == lk_tas) &&
2522       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2523        OMP_NEST_LOCK_T_SIZE)) {
2524     ;
2525   }
2526 #if KMP_USE_FUTEX
2527   else if ((__kmp_user_lock_kind == lk_futex) &&
2528            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2529             OMP_NEST_LOCK_T_SIZE)) {
2530     ;
2531   }
2532 #endif
2533   else {
2534     __kmp_user_lock_free(user_lock, gtid, lck);
2535   }
2536 #endif // KMP_USE_DYNAMIC_LOCK
2537 } // __kmpc_destroy_nest_lock
2538 
2539 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2540   KMP_COUNT_BLOCK(OMP_set_lock);
2541 #if KMP_USE_DYNAMIC_LOCK
2542   int tag = KMP_EXTRACT_D_TAG(user_lock);
2543 #if USE_ITT_BUILD
  // The ITT routine resolves the right lock object from the user lock pointer.
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2547 #endif
2548 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2550   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2551   if (!codeptr)
2552     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2553   if (ompt_enabled.ompt_callback_mutex_acquire) {
2554     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2555         ompt_mutex_lock, omp_lock_hint_none,
2556         __ompt_get_mutex_impl_type(user_lock),
2557         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2558   }
2559 #endif
2560 #if KMP_USE_INLINED_TAS
2561   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2562     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2563   } else
2564 #elif KMP_USE_INLINED_FUTEX
2565   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2566     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2567   } else
2568 #endif
2569   {
2570     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2571   }
2572 #if USE_ITT_BUILD
2573   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2574 #endif
2575 #if OMPT_SUPPORT && OMPT_OPTIONAL
2576   if (ompt_enabled.ompt_callback_mutex_acquired) {
2577     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2578         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2579   }
2580 #endif
2581 
2582 #else // KMP_USE_DYNAMIC_LOCK
2583 
2584   kmp_user_lock_p lck;
2585 
2586   if ((__kmp_user_lock_kind == lk_tas) &&
2587       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2588     lck = (kmp_user_lock_p)user_lock;
2589   }
2590 #if KMP_USE_FUTEX
2591   else if ((__kmp_user_lock_kind == lk_futex) &&
2592            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2593     lck = (kmp_user_lock_p)user_lock;
2594   }
2595 #endif
2596   else {
2597     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2598   }
2599 
2600 #if USE_ITT_BUILD
2601   __kmp_itt_lock_acquiring(lck);
2602 #endif /* USE_ITT_BUILD */
2603 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2605   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2606   if (!codeptr)
2607     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2608   if (ompt_enabled.ompt_callback_mutex_acquire) {
2609     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2610         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2611         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2612   }
2613 #endif
2614 
2615   ACQUIRE_LOCK(lck, gtid);
2616 
2617 #if USE_ITT_BUILD
2618   __kmp_itt_lock_acquired(lck);
2619 #endif /* USE_ITT_BUILD */
2620 
2621 #if OMPT_SUPPORT && OMPT_OPTIONAL
2622   if (ompt_enabled.ompt_callback_mutex_acquired) {
2623     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2624         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2625   }
2626 #endif
2627 
2628 #endif // KMP_USE_DYNAMIC_LOCK
2629 }
2630 
2631 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2632 #if KMP_USE_DYNAMIC_LOCK
2633 
2634 #if USE_ITT_BUILD
2635   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2636 #endif
2637 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2639   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2640   if (!codeptr)
2641     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2642   if (ompt_enabled.enabled) {
2643     if (ompt_enabled.ompt_callback_mutex_acquire) {
2644       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2645           ompt_mutex_nest_lock, omp_lock_hint_none,
2646           __ompt_get_mutex_impl_type(user_lock),
2647           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2648     }
2649   }
2650 #endif
2651   int acquire_status =
2652       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2653   (void) acquire_status;
2654 #if USE_ITT_BUILD
2655   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2656 #endif
2657 
2658 #if OMPT_SUPPORT && OMPT_OPTIONAL
2659   if (ompt_enabled.enabled) {
2660     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2661       if (ompt_enabled.ompt_callback_mutex_acquired) {
2662         // lock_first
2663         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2664             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2665             codeptr);
2666       }
2667     } else {
2668       if (ompt_enabled.ompt_callback_nest_lock) {
2669         // lock_next
2670         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2671             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2672       }
2673     }
2674   }
2675 #endif
2676 
2677 #else // KMP_USE_DYNAMIC_LOCK
2678   int acquire_status;
2679   kmp_user_lock_p lck;
2680 
2681   if ((__kmp_user_lock_kind == lk_tas) &&
2682       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2683        OMP_NEST_LOCK_T_SIZE)) {
2684     lck = (kmp_user_lock_p)user_lock;
2685   }
2686 #if KMP_USE_FUTEX
2687   else if ((__kmp_user_lock_kind == lk_futex) &&
2688            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2689             OMP_NEST_LOCK_T_SIZE)) {
2690     lck = (kmp_user_lock_p)user_lock;
2691   }
2692 #endif
2693   else {
2694     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2695   }
2696 
2697 #if USE_ITT_BUILD
2698   __kmp_itt_lock_acquiring(lck);
2699 #endif /* USE_ITT_BUILD */
2700 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2702   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2703   if (!codeptr)
2704     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2705   if (ompt_enabled.enabled) {
2706     if (ompt_enabled.ompt_callback_mutex_acquire) {
2707       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2708           ompt_mutex_nest_lock, omp_lock_hint_none,
2709           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2710           codeptr);
2711     }
2712   }
2713 #endif
2714 
2715   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2716 
2717 #if USE_ITT_BUILD
2718   __kmp_itt_lock_acquired(lck);
2719 #endif /* USE_ITT_BUILD */
2720 
2721 #if OMPT_SUPPORT && OMPT_OPTIONAL
2722   if (ompt_enabled.enabled) {
2723     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2724       if (ompt_enabled.ompt_callback_mutex_acquired) {
2725         // lock_first
2726         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2727             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2728       }
2729     } else {
2730       if (ompt_enabled.ompt_callback_nest_lock) {
2731         // lock_next
2732         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2733             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2734       }
2735     }
2736   }
2737 #endif
2738 
2739 #endif // KMP_USE_DYNAMIC_LOCK
2740 }
2741 
2742 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2743 #if KMP_USE_DYNAMIC_LOCK
2744 
2745   int tag = KMP_EXTRACT_D_TAG(user_lock);
2746 #if USE_ITT_BUILD
2747   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2748 #endif
2749 #if KMP_USE_INLINED_TAS
2750   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2751     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2752   } else
2753 #elif KMP_USE_INLINED_FUTEX
2754   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2755     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2756   } else
2757 #endif
2758   {
2759     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2760   }
2761 
2762 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2764   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2765   if (!codeptr)
2766     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2767   if (ompt_enabled.ompt_callback_mutex_released) {
2768     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2769         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2770   }
2771 #endif
2772 
2773 #else // KMP_USE_DYNAMIC_LOCK
2774 
2775   kmp_user_lock_p lck;
2776 
2777   /* Can't use serial interval since not block structured */
2778   /* release the lock */
2779 
2780   if ((__kmp_user_lock_kind == lk_tas) &&
2781       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2782 #if KMP_OS_LINUX &&                                                            \
2783     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2784 // "fast" path implemented to fix customer performance issue
2785 #if USE_ITT_BUILD
2786     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2787 #endif /* USE_ITT_BUILD */
2788     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2789     KMP_MB();
2790 
2791 #if OMPT_SUPPORT && OMPT_OPTIONAL
    // A return address may already have been stored by the user entry point:
2793     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2794     if (!codeptr)
2795       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2796     if (ompt_enabled.ompt_callback_mutex_released) {
2797       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2798           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2799     }
2800 #endif
2801 
2802     return;
2803 #else
2804     lck = (kmp_user_lock_p)user_lock;
2805 #endif
2806   }
2807 #if KMP_USE_FUTEX
2808   else if ((__kmp_user_lock_kind == lk_futex) &&
2809            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2810     lck = (kmp_user_lock_p)user_lock;
2811   }
2812 #endif
2813   else {
2814     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2815   }
2816 
2817 #if USE_ITT_BUILD
2818   __kmp_itt_lock_releasing(lck);
2819 #endif /* USE_ITT_BUILD */
2820 
2821   RELEASE_LOCK(lck, gtid);
2822 
2823 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2825   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2826   if (!codeptr)
2827     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2828   if (ompt_enabled.ompt_callback_mutex_released) {
2829     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2830         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2831   }
2832 #endif
2833 
2834 #endif // KMP_USE_DYNAMIC_LOCK
2835 }
2836 
/* release the nested lock */
2838 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2839 #if KMP_USE_DYNAMIC_LOCK
2840 
2841 #if USE_ITT_BUILD
2842   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2843 #endif
2844   int release_status =
2845       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2846   (void) release_status;
2847 
2848 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2850   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2851   if (!codeptr)
2852     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2853   if (ompt_enabled.enabled) {
2854     if (release_status == KMP_LOCK_RELEASED) {
2855       if (ompt_enabled.ompt_callback_mutex_released) {
2856         // release_lock_last
2857         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2858             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2859             codeptr);
2860       }
2861     } else if (ompt_enabled.ompt_callback_nest_lock) {
2862       // release_lock_prev
2863       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2864           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2865     }
2866   }
2867 #endif
2868 
2869 #else // KMP_USE_DYNAMIC_LOCK
2870 
2871   kmp_user_lock_p lck;
2872 
2873   /* Can't use serial interval since not block structured */
2874 
2875   if ((__kmp_user_lock_kind == lk_tas) &&
2876       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2877        OMP_NEST_LOCK_T_SIZE)) {
2878 #if KMP_OS_LINUX &&                                                            \
2879     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2880     // "fast" path implemented to fix customer performance issue
2881     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2882 #if USE_ITT_BUILD
2883     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2884 #endif /* USE_ITT_BUILD */
2885 
2886 #if OMPT_SUPPORT && OMPT_OPTIONAL
2887     int release_status = KMP_LOCK_STILL_HELD;
2888 #endif
2889 
2890     if (--(tl->lk.depth_locked) == 0) {
2891       TCW_4(tl->lk.poll, 0);
2892 #if OMPT_SUPPORT && OMPT_OPTIONAL
2893       release_status = KMP_LOCK_RELEASED;
2894 #endif
2895     }
2896     KMP_MB();
2897 
2898 #if OMPT_SUPPORT && OMPT_OPTIONAL
    // A return address may already have been stored by the user entry point:
2900     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2901     if (!codeptr)
2902       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2903     if (ompt_enabled.enabled) {
2904       if (release_status == KMP_LOCK_RELEASED) {
2905         if (ompt_enabled.ompt_callback_mutex_released) {
2906           // release_lock_last
2907           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2908               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2909         }
2910       } else if (ompt_enabled.ompt_callback_nest_lock) {
2911         // release_lock_previous
2912         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2914       }
2915     }
2916 #endif
2917 
2918     return;
2919 #else
2920     lck = (kmp_user_lock_p)user_lock;
2921 #endif
2922   }
2923 #if KMP_USE_FUTEX
2924   else if ((__kmp_user_lock_kind == lk_futex) &&
2925            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2926             OMP_NEST_LOCK_T_SIZE)) {
2927     lck = (kmp_user_lock_p)user_lock;
2928   }
2929 #endif
2930   else {
2931     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2932   }
2933 
2934 #if USE_ITT_BUILD
2935   __kmp_itt_lock_releasing(lck);
2936 #endif /* USE_ITT_BUILD */
2937 
2938   int release_status;
2939   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2940 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2942   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2943   if (!codeptr)
2944     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2945   if (ompt_enabled.enabled) {
2946     if (release_status == KMP_LOCK_RELEASED) {
2947       if (ompt_enabled.ompt_callback_mutex_released) {
2948         // release_lock_last
2949         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2950             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2951       }
2952     } else if (ompt_enabled.ompt_callback_nest_lock) {
2953       // release_lock_previous
2954       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2956     }
2957   }
2958 #endif
2959 
2960 #endif // KMP_USE_DYNAMIC_LOCK
2961 }
2962 
2963 /* try to acquire the lock */
2964 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2965   KMP_COUNT_BLOCK(OMP_test_lock);
2966 
2967 #if KMP_USE_DYNAMIC_LOCK
2968   int rc;
2969   int tag = KMP_EXTRACT_D_TAG(user_lock);
2970 #if USE_ITT_BUILD
2971   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2972 #endif
2973 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored by the user entry point:
2975   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2976   if (!codeptr)
2977     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2978   if (ompt_enabled.ompt_callback_mutex_acquire) {
2979     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2980         ompt_mutex_lock, omp_lock_hint_none,
2981         __ompt_get_mutex_impl_type(user_lock),
2982         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2983   }
2984 #endif
2985 #if KMP_USE_INLINED_TAS
2986   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2987     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2988   } else
2989 #elif KMP_USE_INLINED_FUTEX
2990   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2991     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2992   } else
2993 #endif
2994   {
2995     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2996   }
2997   if (rc) {
2998 #if USE_ITT_BUILD
2999     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3000 #endif
3001 #if OMPT_SUPPORT && OMPT_OPTIONAL
3002     if (ompt_enabled.ompt_callback_mutex_acquired) {
3003       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3004           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3005     }
3006 #endif
3007     return FTN_TRUE;
3008   } else {
3009 #if USE_ITT_BUILD
3010     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3011 #endif
3012     return FTN_FALSE;
3013   }
3014 
3015 #else // KMP_USE_DYNAMIC_LOCK
3016 
3017   kmp_user_lock_p lck;
3018   int rc;
3019 
3020   if ((__kmp_user_lock_kind == lk_tas) &&
3021       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3022     lck = (kmp_user_lock_p)user_lock;
3023   }
3024 #if KMP_USE_FUTEX
3025   else if ((__kmp_user_lock_kind == lk_futex) &&
3026            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3027     lck = (kmp_user_lock_p)user_lock;
3028   }
3029 #endif
3030   else {
3031     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3032   }
3033 
3034 #if USE_ITT_BUILD
3035   __kmp_itt_lock_acquiring(lck);
3036 #endif /* USE_ITT_BUILD */
3037 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // The stored return address may be NULL here (this is the case if called
  // from omp_init_lock_with_hint); fall back to the current return address.
3039   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3040   if (!codeptr)
3041     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3042   if (ompt_enabled.ompt_callback_mutex_acquire) {
3043     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3044         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3045         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3046   }
3047 #endif
3048 
3049   rc = TEST_LOCK(lck, gtid);
3050 #if USE_ITT_BUILD
3051   if (rc) {
3052     __kmp_itt_lock_acquired(lck);
3053   } else {
3054     __kmp_itt_lock_cancelled(lck);
3055   }
3056 #endif /* USE_ITT_BUILD */
3057 #if OMPT_SUPPORT && OMPT_OPTIONAL
3058   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3059     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3060         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3061   }
3062 #endif
3063 
3064   return (rc ? FTN_TRUE : FTN_FALSE);
3065 
3066 /* Can't use serial interval since not block structured */
3067 
3068 #endif // KMP_USE_DYNAMIC_LOCK
3069 }
3070 
3071 /* try to acquire the lock */
3072 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3073 #if KMP_USE_DYNAMIC_LOCK
3074   int rc;
3075 #if USE_ITT_BUILD
3076   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3077 #endif
3078 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // The stored return address may be NULL here (this is the case if called
  // from omp_init_lock_with_hint); fall back to the current return address.
3080   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3081   if (!codeptr)
3082     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3083   if (ompt_enabled.ompt_callback_mutex_acquire) {
3084     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3085         ompt_mutex_nest_lock, omp_lock_hint_none,
3086         __ompt_get_mutex_impl_type(user_lock),
3087         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3088   }
3089 #endif
3090   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3091 #if USE_ITT_BUILD
3092   if (rc) {
3093     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3094   } else {
3095     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3096   }
3097 #endif
3098 #if OMPT_SUPPORT && OMPT_OPTIONAL
3099   if (ompt_enabled.enabled && rc) {
3100     if (rc == 1) {
3101       if (ompt_enabled.ompt_callback_mutex_acquired) {
3102         // lock_first
3103         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3104             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3105             codeptr);
3106       }
3107     } else {
3108       if (ompt_enabled.ompt_callback_nest_lock) {
3109         // lock_next
3110         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3111             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3112       }
3113     }
3114   }
3115 #endif
3116   return rc;
3117 
3118 #else // KMP_USE_DYNAMIC_LOCK
3119 
3120   kmp_user_lock_p lck;
3121   int rc;
3122 
3123   if ((__kmp_user_lock_kind == lk_tas) &&
3124       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3125        OMP_NEST_LOCK_T_SIZE)) {
3126     lck = (kmp_user_lock_p)user_lock;
3127   }
3128 #if KMP_USE_FUTEX
3129   else if ((__kmp_user_lock_kind == lk_futex) &&
3130            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3131             OMP_NEST_LOCK_T_SIZE)) {
3132     lck = (kmp_user_lock_p)user_lock;
3133   }
3134 #endif
3135   else {
3136     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3137   }
3138 
3139 #if USE_ITT_BUILD
3140   __kmp_itt_lock_acquiring(lck);
3141 #endif /* USE_ITT_BUILD */
3142 
3143 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // The stored return address may be NULL here (this is the case if called
  // from omp_init_lock_with_hint); fall back to the current return address.
3145   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3146   if (!codeptr)
3147     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
        codeptr);
  }
3155 #endif
3156 
3157   rc = TEST_NESTED_LOCK(lck, gtid);
3158 #if USE_ITT_BUILD
3159   if (rc) {
3160     __kmp_itt_lock_acquired(lck);
3161   } else {
3162     __kmp_itt_lock_cancelled(lck);
3163   }
3164 #endif /* USE_ITT_BUILD */
3165 #if OMPT_SUPPORT && OMPT_OPTIONAL
3166   if (ompt_enabled.enabled && rc) {
3167     if (rc == 1) {
3168       if (ompt_enabled.ompt_callback_mutex_acquired) {
3169         // lock_first
3170         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3171             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3172       }
3173     } else {
3174       if (ompt_enabled.ompt_callback_nest_lock) {
3175         // lock_next
3176         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3177             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3178       }
3179     }
3180   }
3181 #endif
3182   return rc;
3183 
3184 /* Can't use serial interval since not block structured */
3185 
3186 #endif // KMP_USE_DYNAMIC_LOCK
3187 }
3188 
3189 // Interface to fast scalable reduce methods routines
3190 
3191 // keep the selected method in a thread local structure for cross-function
3192 // usage: will be used in __kmpc_end_reduce* functions;
3193 // another solution: to re-determine the method one more time in
3194 // __kmpc_end_reduce* functions (new prototype required then)
3195 // AT: which solution is better?
3196 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3197   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3198 
3199 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3200   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3201 
3202 // description of the packed_reduction_method variable: look at the macros in
3203 // kmp.h
3204 
3205 // used in a critical section reduce block
3206 static __forceinline void
3207 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3208                                           kmp_critical_name *crit) {
3209 
3210   // this lock was visible to a customer and to the threading profile tool as a
3211   // serial overhead span (although it's used for an internal purpose only)
3212   //            why was it visible in previous implementation?
3213   //            should we keep it visible in new reduce block?
3214   kmp_user_lock_p lck;
3215 
3216 #if KMP_USE_DYNAMIC_LOCK
3217 
3218   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3219   // Check if it is initialized.
3220   if (*lk == 0) {
3221     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3222       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3223                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3224     } else {
3225       __kmp_init_indirect_csptr(crit, loc, global_tid,
3226                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3227     }
3228   }
3229   // Branch for accessing the actual lock object and set operation. This
3230   // branching is inevitable since this lock initialization does not follow the
3231   // normal dispatch path (lock table is not used).
3232   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3233     lck = (kmp_user_lock_p)lk;
3234     KMP_DEBUG_ASSERT(lck != NULL);
3235     if (__kmp_env_consistency_check) {
3236       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3237     }
3238     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3239   } else {
3240     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3241     lck = ilk->lock;
3242     KMP_DEBUG_ASSERT(lck != NULL);
3243     if (__kmp_env_consistency_check) {
3244       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3245     }
3246     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3247   }
3248 
3249 #else // KMP_USE_DYNAMIC_LOCK
3250 
3251   // We know that the fast reduction code is only emitted by Intel compilers
3252   // with 32 byte critical sections. If there isn't enough space, then we
3253   // have to use a pointer.
3254   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3255     lck = (kmp_user_lock_p)crit;
3256   } else {
3257     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3258   }
3259   KMP_DEBUG_ASSERT(lck != NULL);
3260 
3261   if (__kmp_env_consistency_check)
3262     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3263 
3264   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3265 
3266 #endif // KMP_USE_DYNAMIC_LOCK
3267 }
3268 
3269 // used in a critical section reduce block
3270 static __forceinline void
3271 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3272                                         kmp_critical_name *crit) {
3273 
3274   kmp_user_lock_p lck;
3275 
3276 #if KMP_USE_DYNAMIC_LOCK
3277 
3278   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3279     lck = (kmp_user_lock_p)crit;
3280     if (__kmp_env_consistency_check)
3281       __kmp_pop_sync(global_tid, ct_critical, loc);
3282     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3283   } else {
3284     kmp_indirect_lock_t *ilk =
3285         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3286     if (__kmp_env_consistency_check)
3287       __kmp_pop_sync(global_tid, ct_critical, loc);
3288     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3289   }
3290 
3291 #else // KMP_USE_DYNAMIC_LOCK
3292 
3293   // We know that the fast reduction code is only emitted by Intel compilers
3294   // with 32 byte critical sections. If there isn't enough space, then we have
3295   // to use a pointer.
3296   if (__kmp_base_user_lock_size > 32) {
3297     lck = *((kmp_user_lock_p *)crit);
3298     KMP_ASSERT(lck != NULL);
3299   } else {
3300     lck = (kmp_user_lock_p)crit;
3301   }
3302 
3303   if (__kmp_env_consistency_check)
3304     __kmp_pop_sync(global_tid, ct_critical, loc);
3305 
3306   __kmp_release_user_lock_with_checks(lck, global_tid);
3307 
3308 #endif // KMP_USE_DYNAMIC_LOCK
3309 } // __kmp_end_critical_section_reduce_block
3310 
3311 static __forceinline int
3312 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3313                                      int *task_state) {
3314   kmp_team_t *team;
3315 
  // Check whether we are inside a teams construct.
3317   if (th->th.th_teams_microtask) {
3318     *team_p = team = th->th.th_team;
3319     if (team->t.t_level == th->th.th_teams_level) {
3320       // This is reduction at teams construct.
3321       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3322       // Let's swap teams temporarily for the reduction.
3323       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3324       th->th.th_team = team->t.t_parent;
3325       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3326       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3327       *task_state = th->th.th_task_state;
3328       th->th.th_task_state = 0;
3329 
3330       return 1;
3331     }
3332   }
3333   return 0;
3334 }
3335 
3336 static __forceinline void
3337 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3338   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3339   th->th.th_info.ds.ds_tid = 0;
3340   th->th.th_team = team;
3341   th->th.th_team_nproc = team->t.t_nproc;
3342   th->th.th_task_team = team->t.t_task_team[task_state];
3343   th->th.th_task_state = task_state;
3344 }
3345 
3346 /* 2.a.i. Reduce Block without a terminating barrier */
3347 /*!
3348 @ingroup SYNCHRONIZATION
3349 @param loc source location information
3350 @param global_tid global thread number
3351 @param num_vars number of items (variables) to be reduced
3352 @param reduce_size size of data in bytes to be reduced
3353 @param reduce_data pointer to data to be reduced
3354 @param reduce_func callback function providing reduction operation on two
3355 operands and returning result of reduction in lhs_data
3356 @param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction is needed
3359 
3360 The nowait version is used for a reduce clause with the nowait argument.
3361 */
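/* Illustrative only: a hypothetical sketch of the code a compiler typically
   emits around this call for something like `reduction(+ : sum)` with nowait.
   The names (sum, sum_private, red_list, reduce_func_0, crit_name) are made up
   for the example:

     switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(sum), red_list,
                                  reduce_func_0, &crit_name)) {
     case 1: // combine the private copies directly (critical/empty/tree path)
       sum += sum_private;
       __kmpc_end_reduce_nowait(loc, gtid, &crit_name);
       break;
     case 2: // combine with an atomic update; no end call in the nowait form
       // e.g. #pragma omp atomic: sum += sum_private;
       break;
     default: // 0: nothing left to do on this thread
       break;
     }
*/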
3362 kmp_int32
3363 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3364                      size_t reduce_size, void *reduce_data,
3365                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3366                      kmp_critical_name *lck) {
3367 
3368   KMP_COUNT_BLOCK(REDUCE_nowait);
3369   int retval = 0;
3370   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3371   kmp_info_t *th;
3372   kmp_team_t *team;
3373   int teams_swapped = 0, task_state;
3374   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3375   __kmp_assert_valid_gtid(global_tid);
3376 
3377   // why do we need this initialization here at all?
  // The reduction clause cannot be used as a stand-alone directive.
3379 
3380   // do not call __kmp_serial_initialize(), it will be called by
3381   // __kmp_parallel_initialize() if needed
3382   // possible detection of false-positive race by the threadchecker ???
3383   if (!TCR_4(__kmp_init_parallel))
3384     __kmp_parallel_initialize();
3385 
3386   __kmp_resume_if_soft_paused();
3387 
3388 // check correctness of reduce block nesting
3389 #if KMP_USE_DYNAMIC_LOCK
3390   if (__kmp_env_consistency_check)
3391     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3392 #else
3393   if (__kmp_env_consistency_check)
3394     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3395 #endif
3396 
3397   th = __kmp_thread_from_gtid(global_tid);
3398   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3399 
  // The packed_reduction_method value will be reused by the __kmpc_end_reduce*
  // functions, so it must be kept in a variable.
  // That variable should be a construct-specific or thread-specific property,
  // not a team-specific one
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on that construct).
  // The ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?)
  //     (and if both construct-specific and team-specific variables were
  //     shared, unnecessary extra syncs would be needed).
  // A thread-specific variable is better with respect to both issues above
  // (next construct and extra syncs);
  // the thread-specific "th_local.packed_reduction_method" variable is used.
  // Each thread executes the 'determine' and 'set' lines (there is no need to
  // restrict this to one thread; doing so would require unnecessary extra
  // syncs).
3415 
3416   packed_reduction_method = __kmp_determine_reduction_method(
3417       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3418   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3419 
3420   OMPT_REDUCTION_DECL(th, global_tid);
3421   if (packed_reduction_method == critical_reduce_block) {
3422 
3423     OMPT_REDUCTION_BEGIN;
3424 
3425     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3426     retval = 1;
3427 
3428   } else if (packed_reduction_method == empty_reduce_block) {
3429 
3430     OMPT_REDUCTION_BEGIN;
3431 
3432     // usage: if team size == 1, no synchronization is required ( Intel
3433     // platforms only )
3434     retval = 1;
3435 
3436   } else if (packed_reduction_method == atomic_reduce_block) {
3437 
3438     retval = 2;
3439 
    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (this is not ideal: the checking block is closed by this 'pop' even
    //      though the atomic operation has not executed yet; it runs slightly
    //      later, essentially on the next instruction)
3446     if (__kmp_env_consistency_check)
3447       __kmp_pop_sync(global_tid, ct_reduce, loc);
3448 
3449   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3450                                    tree_reduce_block)) {
3451 
3452 // AT: performance issue: a real barrier here
3453 // AT:     (if master goes slow, other threads are blocked here waiting for the
3454 // master to come and release them)
3455 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3456 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3457 // be confusing to a customer)
3458 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3459 // might go faster and be more in line with sense of NOWAIT
3460 // AT: TO DO: do epcc test and compare times
3461 
3462 // this barrier should be invisible to a customer and to the threading profile
3463 // tool (it's neither a terminating barrier nor customer's code, it's
3464 // used for an internal purpose)
3465 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
3467     // JP: as long as there is a barrier in the implementation, OMPT should and
3468     // will provide the barrier events
3469     //         so we set-up the necessary frame/return addresses.
3470     ompt_frame_t *ompt_frame;
3471     if (ompt_enabled.enabled) {
3472       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3473       if (ompt_frame->enter_frame.ptr == NULL)
3474         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3475       OMPT_STORE_RETURN_ADDRESS(global_tid);
3476     }
3477 #endif
3478 #if USE_ITT_NOTIFY
3479     __kmp_threads[global_tid]->th.th_ident = loc;
3480 #endif
3481     retval =
3482         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3483                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3484     retval = (retval != 0) ? (0) : (1);
3485 #if OMPT_SUPPORT && OMPT_OPTIONAL
3486     if (ompt_enabled.enabled) {
3487       ompt_frame->enter_frame = ompt_data_none;
3488     }
3489 #endif
3490 
    // all workers except the master should do this pop here
    //     (none of the other workers will reach __kmpc_end_reduce_nowait())
3493     if (__kmp_env_consistency_check) {
3494       if (retval == 0) {
3495         __kmp_pop_sync(global_tid, ct_reduce, loc);
3496       }
3497     }
3498 
3499   } else {
3500 
3501     // should never reach this block
3502     KMP_ASSERT(0); // "unexpected method"
3503   }
3504   if (teams_swapped) {
3505     __kmp_restore_swapped_teams(th, team, task_state);
3506   }
3507   KA_TRACE(
3508       10,
3509       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3510        global_tid, packed_reduction_method, retval));
3511 
3512   return retval;
3513 }
3514 
3515 /*!
3516 @ingroup SYNCHRONIZATION
3517 @param loc source location information
3518 @param global_tid global thread id.
3519 @param lck pointer to the unique lock data structure
3520 
3521 Finish the execution of a reduce nowait.
3522 */
3523 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3524                               kmp_critical_name *lck) {
3525 
3526   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3527 
3528   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3529   __kmp_assert_valid_gtid(global_tid);
3530 
3531   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3532 
3533   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3534 
3535   if (packed_reduction_method == critical_reduce_block) {
3536 
3537     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3538     OMPT_REDUCTION_END;
3539 
3540   } else if (packed_reduction_method == empty_reduce_block) {
3541 
3542     // usage: if team size == 1, no synchronization is required ( on Intel
3543     // platforms only )
3544 
3545     OMPT_REDUCTION_END;
3546 
3547   } else if (packed_reduction_method == atomic_reduce_block) {
3548 
    // neither master nor other workers should get here
    //     (code gen does not generate this call in case 2: atomic reduce block)
    // it would actually be better to remove this else-if entirely;
    // after removal this value would be caught by the 'else' branch and assert
3553 
3554   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3555                                    tree_reduce_block)) {
3556 
3557     // only master gets here
3558     // OMPT: tree reduction is annotated in the barrier code
3559 
3560   } else {
3561 
3562     // should never reach this block
3563     KMP_ASSERT(0); // "unexpected method"
3564   }
3565 
3566   if (__kmp_env_consistency_check)
3567     __kmp_pop_sync(global_tid, ct_reduce, loc);
3568 
3569   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3570                 global_tid, packed_reduction_method));
3571 
3572   return;
3573 }
3574 
3575 /* 2.a.ii. Reduce Block with a terminating barrier */
3576 
3577 /*!
3578 @ingroup SYNCHRONIZATION
3579 @param loc source location information
3580 @param global_tid global thread number
3581 @param num_vars number of items (variables) to be reduced
3582 @param reduce_size size of data in bytes to be reduced
3583 @param reduce_data pointer to data to be reduced
3584 @param reduce_func callback function providing reduction operation on two
3585 operands and returning result of reduction in lhs_data
3586 @param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction is needed
3589 
3590 A blocking reduce that includes an implicit barrier.
3591 */
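/* Illustrative only: the dispatch on the return value has the same shape as in
   the nowait sketch above, but for the blocking form a compiler typically pairs
   both outcome 1 and outcome 2 with __kmpc_end_reduce(), which supplies the
   terminating barrier (names are made up as before):

     switch (__kmpc_reduce(loc, gtid, 1, sizeof(sum), red_list, reduce_func_0,
                           &crit_name)) {
     case 1:
       sum += sum_private;
       __kmpc_end_reduce(loc, gtid, &crit_name);
       break;
     case 2:
       // atomic combination of sum_private into sum
       __kmpc_end_reduce(loc, gtid, &crit_name);
       break;
     default:
       break;
     }
*/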
3592 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3593                         size_t reduce_size, void *reduce_data,
3594                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3595                         kmp_critical_name *lck) {
3596   KMP_COUNT_BLOCK(REDUCE_wait);
3597   int retval = 0;
3598   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3599   kmp_info_t *th;
3600   kmp_team_t *team;
3601   int teams_swapped = 0, task_state;
3602 
3603   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3604   __kmp_assert_valid_gtid(global_tid);
3605 
3606   // why do we need this initialization here at all?
  // The reduction clause cannot be a stand-alone directive.
3608 
3609   // do not call __kmp_serial_initialize(), it will be called by
3610   // __kmp_parallel_initialize() if needed
3611   // possible detection of false-positive race by the threadchecker ???
3612   if (!TCR_4(__kmp_init_parallel))
3613     __kmp_parallel_initialize();
3614 
3615   __kmp_resume_if_soft_paused();
3616 
3617 // check correctness of reduce block nesting
3618 #if KMP_USE_DYNAMIC_LOCK
3619   if (__kmp_env_consistency_check)
3620     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3621 #else
3622   if (__kmp_env_consistency_check)
3623     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3624 #endif
3625 
3626   th = __kmp_thread_from_gtid(global_tid);
3627   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3628 
3629   packed_reduction_method = __kmp_determine_reduction_method(
3630       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3631   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3632 
3633   OMPT_REDUCTION_DECL(th, global_tid);
3634 
3635   if (packed_reduction_method == critical_reduce_block) {
3636 
3637     OMPT_REDUCTION_BEGIN;
3638     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3639     retval = 1;
3640 
3641   } else if (packed_reduction_method == empty_reduce_block) {
3642 
3643     OMPT_REDUCTION_BEGIN;
3644     // usage: if team size == 1, no synchronization is required ( Intel
3645     // platforms only )
3646     retval = 1;
3647 
3648   } else if (packed_reduction_method == atomic_reduce_block) {
3649 
3650     retval = 2;
3651 
3652   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3653                                    tree_reduce_block)) {
3654 
3655 // case tree_reduce_block:
3656 // this barrier should be visible to a customer and to the threading profile
3657 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3658 #if OMPT_SUPPORT
3659     ompt_frame_t *ompt_frame;
3660     if (ompt_enabled.enabled) {
3661       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3662       if (ompt_frame->enter_frame.ptr == NULL)
3663         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3664       OMPT_STORE_RETURN_ADDRESS(global_tid);
3665     }
3666 #endif
3667 #if USE_ITT_NOTIFY
3668     __kmp_threads[global_tid]->th.th_ident =
3669         loc; // needed for correct notification of frames
3670 #endif
3671     retval =
3672         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3673                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3674     retval = (retval != 0) ? (0) : (1);
3675 #if OMPT_SUPPORT && OMPT_OPTIONAL
3676     if (ompt_enabled.enabled) {
3677       ompt_frame->enter_frame = ompt_data_none;
3678     }
3679 #endif
3680 
    // all workers except the master should do this pop here
    // (no worker other than the master will enter __kmpc_end_reduce())
3683     if (__kmp_env_consistency_check) {
3684       if (retval == 0) { // 0: all other workers; 1: master
3685         __kmp_pop_sync(global_tid, ct_reduce, loc);
3686       }
3687     }
3688 
3689   } else {
3690 
3691     // should never reach this block
3692     KMP_ASSERT(0); // "unexpected method"
3693   }
3694   if (teams_swapped) {
3695     __kmp_restore_swapped_teams(th, team, task_state);
3696   }
3697 
3698   KA_TRACE(10,
3699            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3700             global_tid, packed_reduction_method, retval));
3701   return retval;
3702 }
3703 
3704 /*!
3705 @ingroup SYNCHRONIZATION
3706 @param loc source location information
3707 @param global_tid global thread id.
3708 @param lck pointer to the unique lock data structure
3709 
3710 Finish the execution of a blocking reduce.
3711 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3712 start function.
3713 */
3714 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3715                        kmp_critical_name *lck) {
3716 
3717   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3718   kmp_info_t *th;
3719   kmp_team_t *team;
3720   int teams_swapped = 0, task_state;
3721 
3722   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3723   __kmp_assert_valid_gtid(global_tid);
3724 
3725   th = __kmp_thread_from_gtid(global_tid);
3726   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3727 
3728   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3729 
3730   // this barrier should be visible to a customer and to the threading profile
3731   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3732   OMPT_REDUCTION_DECL(th, global_tid);
3733 
3734   if (packed_reduction_method == critical_reduce_block) {
3735     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3736 
3737     OMPT_REDUCTION_END;
3738 
3739 // TODO: implicit barrier: should be exposed
3740 #if OMPT_SUPPORT
3741     ompt_frame_t *ompt_frame;
3742     if (ompt_enabled.enabled) {
3743       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3744       if (ompt_frame->enter_frame.ptr == NULL)
3745         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3746       OMPT_STORE_RETURN_ADDRESS(global_tid);
3747     }
3748 #endif
3749 #if USE_ITT_NOTIFY
3750     __kmp_threads[global_tid]->th.th_ident = loc;
3751 #endif
3752     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3753 #if OMPT_SUPPORT && OMPT_OPTIONAL
3754     if (ompt_enabled.enabled) {
3755       ompt_frame->enter_frame = ompt_data_none;
3756     }
3757 #endif
3758 
3759   } else if (packed_reduction_method == empty_reduce_block) {
3760 
3761     OMPT_REDUCTION_END;
3762 
3763 // usage: if team size==1, no synchronization is required (Intel platforms only)
3764 
3765 // TODO: implicit barrier: should be exposed
3766 #if OMPT_SUPPORT
3767     ompt_frame_t *ompt_frame;
3768     if (ompt_enabled.enabled) {
3769       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3770       if (ompt_frame->enter_frame.ptr == NULL)
3771         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3772       OMPT_STORE_RETURN_ADDRESS(global_tid);
3773     }
3774 #endif
3775 #if USE_ITT_NOTIFY
3776     __kmp_threads[global_tid]->th.th_ident = loc;
3777 #endif
3778     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3779 #if OMPT_SUPPORT && OMPT_OPTIONAL
3780     if (ompt_enabled.enabled) {
3781       ompt_frame->enter_frame = ompt_data_none;
3782     }
3783 #endif
3784 
3785   } else if (packed_reduction_method == atomic_reduce_block) {
3786 
3787 #if OMPT_SUPPORT
3788     ompt_frame_t *ompt_frame;
3789     if (ompt_enabled.enabled) {
3790       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3791       if (ompt_frame->enter_frame.ptr == NULL)
3792         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3793       OMPT_STORE_RETURN_ADDRESS(global_tid);
3794     }
3795 #endif
3796 // TODO: implicit barrier: should be exposed
3797 #if USE_ITT_NOTIFY
3798     __kmp_threads[global_tid]->th.th_ident = loc;
3799 #endif
3800     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3801 #if OMPT_SUPPORT && OMPT_OPTIONAL
3802     if (ompt_enabled.enabled) {
3803       ompt_frame->enter_frame = ompt_data_none;
3804     }
3805 #endif
3806 
3807   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3808                                    tree_reduce_block)) {
3809 
3810     // only master executes here (master releases all other workers)
3811     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3812                             global_tid);
3813 
3814   } else {
3815 
3816     // should never reach this block
3817     KMP_ASSERT(0); // "unexpected method"
3818   }
3819   if (teams_swapped) {
3820     __kmp_restore_swapped_teams(th, team, task_state);
3821   }
3822 
3823   if (__kmp_env_consistency_check)
3824     __kmp_pop_sync(global_tid, ct_reduce, loc);
3825 
3826   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3827                 global_tid, packed_reduction_method));
3828 
3829   return;
3830 }
3831 
3832 #undef __KMP_GET_REDUCTION_METHOD
3833 #undef __KMP_SET_REDUCTION_METHOD
3834 
3835 /* end of interface to fast scalable reduce routines */
3836 
3837 kmp_uint64 __kmpc_get_taskid() {
3838 
3839   kmp_int32 gtid;
3840   kmp_info_t *thread;
3841 
3842   gtid = __kmp_get_gtid();
3843   if (gtid < 0) {
3844     return 0;
3845   }
3846   thread = __kmp_thread_from_gtid(gtid);
3847   return thread->th.th_current_task->td_task_id;
3848 
3849 } // __kmpc_get_taskid
3850 
3851 kmp_uint64 __kmpc_get_parent_taskid() {
3852 
3853   kmp_int32 gtid;
3854   kmp_info_t *thread;
3855   kmp_taskdata_t *parent_task;
3856 
3857   gtid = __kmp_get_gtid();
3858   if (gtid < 0) {
3859     return 0;
3860   }
3861   thread = __kmp_thread_from_gtid(gtid);
3862   parent_task = thread->th.th_current_task->td_parent;
3863   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3864 
3865 } // __kmpc_get_parent_taskid
3866 
3867 /*!
3868 @ingroup WORK_SHARING
3869 @param loc  source location information.
3870 @param gtid  global thread number.
3871 @param num_dims  number of associated doacross loops.
3872 @param dims  info on loops bounds.
3873 
3874 Initialize doacross loop information.
Expect the compiler to send us inclusive bounds,
e.g. for (i = 2; i < 9; i += 2): lo = 2, up = 8, st = 2.
3877 */
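/* Illustrative only: a hypothetical one-dimensional doacross loop and the
   runtime entry points a compiler typically maps it to (exact lowering varies
   by compiler):

     // on entry to the worksharing loop:
     //   __kmpc_doacross_init(loc, gtid, 1, dims) with
     //   dims[0] = {lo = 2, up = 8, st = 2}
     #pragma omp for ordered(1)
     for (i = 2; i < 9; i += 2) {
       #pragma omp ordered depend(sink : i - 2)
       // -> __kmpc_doacross_wait(loc, gtid, vec) with vec[0] = i - 2
       // ... consume data produced by iteration i - 2 ...
       #pragma omp ordered depend(source)
       // -> __kmpc_doacross_post(loc, gtid, vec) with vec[0] = i
     }
     // after the loop: __kmpc_doacross_fini(loc, gtid)
*/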
3878 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3879                           const struct kmp_dim *dims) {
3880   __kmp_assert_valid_gtid(gtid);
3881   int j, idx;
3882   kmp_int64 last, trace_count;
3883   kmp_info_t *th = __kmp_threads[gtid];
3884   kmp_team_t *team = th->th.th_team;
3885   kmp_uint32 *flags;
3886   kmp_disp_t *pr_buf = th->th.th_dispatch;
3887   dispatch_shared_info_t *sh_buf;
3888 
3889   KA_TRACE(
3890       20,
3891       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3892        gtid, num_dims, !team->t.t_serialized));
3893   KMP_DEBUG_ASSERT(dims != NULL);
3894   KMP_DEBUG_ASSERT(num_dims > 0);
3895 
3896   if (team->t.t_serialized) {
3897     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3898     return; // no dependencies if team is serialized
3899   }
3900   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3901   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3902   // the next loop
3903   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3904 
3905   // Save bounds info into allocated private buffer
3906   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3907   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3908       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3909   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
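  // Layout of th_doacross_info (kmp_int64 entries):
  //   [0]                  number of dimensions
  //   [1]                  address of sh_buf->doacross_num_done
  //   [2], [3], [4]        lo, up, st of dimension 0
  //   [4*j+1] .. [4*j+4]   range_length, lo, up, st of dimension j, j >= 1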
3910   pr_buf->th_doacross_info[0] =
3911       (kmp_int64)num_dims; // first element is number of dimensions
3912   // Save also address of num_done in order to access it later without knowing
3913   // the buffer index
3914   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3915   pr_buf->th_doacross_info[2] = dims[0].lo;
3916   pr_buf->th_doacross_info[3] = dims[0].up;
3917   pr_buf->th_doacross_info[4] = dims[0].st;
3918   last = 5;
3919   for (j = 1; j < num_dims; ++j) {
3920     kmp_int64
3921         range_length; // To keep ranges of all dimensions but the first dims[0]
3922     if (dims[j].st == 1) { // most common case
      // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
3924       range_length = dims[j].up - dims[j].lo + 1;
3925     } else {
3926       if (dims[j].st > 0) {
3927         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3928         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3929       } else { // negative increment
3930         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3931         range_length =
3932             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3933       }
3934     }
3935     pr_buf->th_doacross_info[last++] = range_length;
3936     pr_buf->th_doacross_info[last++] = dims[j].lo;
3937     pr_buf->th_doacross_info[last++] = dims[j].up;
3938     pr_buf->th_doacross_info[last++] = dims[j].st;
3939   }
3940 
3941   // Compute total trip count.
3942   // Start with range of dims[0] which we don't need to keep in the buffer.
3943   if (dims[0].st == 1) { // most common case
3944     trace_count = dims[0].up - dims[0].lo + 1;
3945   } else if (dims[0].st > 0) {
3946     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3947     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3948   } else { // negative increment
3949     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3950     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3951   }
3952   for (j = 1; j < num_dims; ++j) {
3953     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3954   }
3955   KMP_DEBUG_ASSERT(trace_count > 0);
3956 
3957   // Check if shared buffer is not occupied by other loop (idx -
3958   // __kmp_dispatch_num_buffers)
3959   if (idx != sh_buf->doacross_buf_idx) {
3960     // Shared buffer is occupied, wait for it to be free
3961     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3962                  __kmp_eq_4, NULL);
3963   }
3964 #if KMP_32_BIT_ARCH
3965   // Check if we are the first thread. After the CAS the first thread gets 0,
3966   // others get 1 if initialization is in progress, allocated pointer otherwise.
3967   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3968   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3969       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3970 #else
3971   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3972       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3973 #endif
3974   if (flags == NULL) {
3975     // we are the first thread, allocate the array of flags
3976     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3977     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3978     KMP_MB();
3979     sh_buf->doacross_flags = flags;
3980   } else if (flags == (kmp_uint32 *)1) {
3981 #if KMP_32_BIT_ARCH
3982     // initialization is still in progress, need to wait
3983     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3984 #else
3985     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3986 #endif
3987       KMP_YIELD(TRUE);
3988     KMP_MB();
3989   } else {
3990     KMP_MB();
3991   }
3992   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3993   pr_buf->th_doacross_flags =
3994       sh_buf->doacross_flags; // save private copy in order to not
3995   // touch shared buffer on each iteration
3996   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3997 }
3998 
3999 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4000   __kmp_assert_valid_gtid(gtid);
4001   kmp_int32 shft, num_dims, i;
4002   kmp_uint32 flag;
4003   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4004   kmp_info_t *th = __kmp_threads[gtid];
4005   kmp_team_t *team = th->th.th_team;
4006   kmp_disp_t *pr_buf;
4007   kmp_int64 lo, up, st;
4008 
4009   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4010   if (team->t.t_serialized) {
4011     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4012     return; // no dependencies if team is serialized
4013   }
4014 
4015   // calculate sequential iteration number and check out-of-bounds condition
4016   pr_buf = th->th.th_dispatch;
4017   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4018   num_dims = pr_buf->th_doacross_info[0];
4019   lo = pr_buf->th_doacross_info[2];
4020   up = pr_buf->th_doacross_info[3];
4021   st = pr_buf->th_doacross_info[4];
4022 #if OMPT_SUPPORT && OMPT_OPTIONAL
4023   ompt_dependence_t deps[num_dims];
4024 #endif
4025   if (st == 1) { // most common case
4026     if (vec[0] < lo || vec[0] > up) {
4027       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4028                     "bounds [%lld,%lld]\n",
4029                     gtid, vec[0], lo, up));
4030       return;
4031     }
4032     iter_number = vec[0] - lo;
4033   } else if (st > 0) {
4034     if (vec[0] < lo || vec[0] > up) {
4035       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4036                     "bounds [%lld,%lld]\n",
4037                     gtid, vec[0], lo, up));
4038       return;
4039     }
4040     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4041   } else { // negative increment
4042     if (vec[0] > lo || vec[0] < up) {
4043       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4044                     "bounds [%lld,%lld]\n",
4045                     gtid, vec[0], lo, up));
4046       return;
4047     }
4048     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4049   }
4050 #if OMPT_SUPPORT && OMPT_OPTIONAL
4051   deps[0].variable.value = iter_number;
4052   deps[0].dependence_type = ompt_dependence_type_sink;
4053 #endif
4054   for (i = 1; i < num_dims; ++i) {
4055     kmp_int64 iter, ln;
4056     kmp_int32 j = i * 4;
4057     ln = pr_buf->th_doacross_info[j + 1];
4058     lo = pr_buf->th_doacross_info[j + 2];
4059     up = pr_buf->th_doacross_info[j + 3];
4060     st = pr_buf->th_doacross_info[j + 4];
4061     if (st == 1) {
4062       if (vec[i] < lo || vec[i] > up) {
4063         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4064                       "bounds [%lld,%lld]\n",
4065                       gtid, vec[i], lo, up));
4066         return;
4067       }
4068       iter = vec[i] - lo;
4069     } else if (st > 0) {
4070       if (vec[i] < lo || vec[i] > up) {
4071         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4072                       "bounds [%lld,%lld]\n",
4073                       gtid, vec[i], lo, up));
4074         return;
4075       }
4076       iter = (kmp_uint64)(vec[i] - lo) / st;
4077     } else { // st < 0
4078       if (vec[i] > lo || vec[i] < up) {
4079         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4080                       "bounds [%lld,%lld]\n",
4081                       gtid, vec[i], lo, up));
4082         return;
4083       }
4084       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4085     }
4086     iter_number = iter + ln * iter_number;
4087 #if OMPT_SUPPORT && OMPT_OPTIONAL
4088     deps[i].variable.value = iter;
4089     deps[i].dependence_type = ompt_dependence_type_sink;
4090 #endif
4091   }
4092   shft = iter_number % 32; // use 32-bit granularity
4093   iter_number >>= 5; // divided by 32
4094   flag = 1 << shft;
4095   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4096     KMP_YIELD(TRUE);
4097   }
4098   KMP_MB();
4099 #if OMPT_SUPPORT && OMPT_OPTIONAL
4100   if (ompt_enabled.ompt_callback_dependences) {
4101     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4102         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
4103   }
4104 #endif
4105   KA_TRACE(20,
4106            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4107             gtid, (iter_number << 5) + shft));
4108 }
4109 
4110 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4111   __kmp_assert_valid_gtid(gtid);
4112   kmp_int32 shft, num_dims, i;
4113   kmp_uint32 flag;
4114   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4115   kmp_info_t *th = __kmp_threads[gtid];
4116   kmp_team_t *team = th->th.th_team;
4117   kmp_disp_t *pr_buf;
4118   kmp_int64 lo, st;
4119 
4120   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4121   if (team->t.t_serialized) {
4122     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4123     return; // no dependencies if team is serialized
4124   }
4125 
4126   // calculate sequential iteration number (same as in "wait" but no
4127   // out-of-bounds checks)
4128   pr_buf = th->th.th_dispatch;
4129   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4130   num_dims = pr_buf->th_doacross_info[0];
4131   lo = pr_buf->th_doacross_info[2];
4132   st = pr_buf->th_doacross_info[4];
4133 #if OMPT_SUPPORT && OMPT_OPTIONAL
4134   ompt_dependence_t deps[num_dims];
4135 #endif
4136   if (st == 1) { // most common case
4137     iter_number = vec[0] - lo;
4138   } else if (st > 0) {
4139     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4140   } else { // negative increment
4141     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4142   }
4143 #if OMPT_SUPPORT && OMPT_OPTIONAL
4144   deps[0].variable.value = iter_number;
4145   deps[0].dependence_type = ompt_dependence_type_source;
4146 #endif
4147   for (i = 1; i < num_dims; ++i) {
4148     kmp_int64 iter, ln;
4149     kmp_int32 j = i * 4;
4150     ln = pr_buf->th_doacross_info[j + 1];
4151     lo = pr_buf->th_doacross_info[j + 2];
4152     st = pr_buf->th_doacross_info[j + 4];
4153     if (st == 1) {
4154       iter = vec[i] - lo;
4155     } else if (st > 0) {
4156       iter = (kmp_uint64)(vec[i] - lo) / st;
4157     } else { // st < 0
4158       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4159     }
4160     iter_number = iter + ln * iter_number;
4161 #if OMPT_SUPPORT && OMPT_OPTIONAL
4162     deps[i].variable.value = iter;
4163     deps[i].dependence_type = ompt_dependence_type_source;
4164 #endif
4165   }
4166 #if OMPT_SUPPORT && OMPT_OPTIONAL
4167   if (ompt_enabled.ompt_callback_dependences) {
4168     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4169         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
4170   }
4171 #endif
4172   shft = iter_number % 32; // use 32-bit granularity
4173   iter_number >>= 5; // divided by 32
4174   flag = 1 << shft;
4175   KMP_MB();
4176   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4177     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4178   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4179                 (iter_number << 5) + shft));
4180 }
4181 
4182 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4183   __kmp_assert_valid_gtid(gtid);
4184   kmp_int32 num_done;
4185   kmp_info_t *th = __kmp_threads[gtid];
4186   kmp_team_t *team = th->th.th_team;
4187   kmp_disp_t *pr_buf = th->th.th_dispatch;
4188 
4189   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4190   if (team->t.t_serialized) {
4191     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4192     return; // nothing to do
4193   }
4194   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4195   if (num_done == th->th.th_team_nproc) {
4196     // we are the last thread, need to free shared resources
4197     int idx = pr_buf->th_doacross_buf_idx - 1;
4198     dispatch_shared_info_t *sh_buf =
4199         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4200     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4201                      (kmp_int64)&sh_buf->doacross_num_done);
4202     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4203     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4204     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4205     sh_buf->doacross_flags = NULL;
4206     sh_buf->doacross_num_done = 0;
4207     sh_buf->doacross_buf_idx +=
4208         __kmp_dispatch_num_buffers; // free buffer for future re-use
4209   }
4210   // free private resources (need to keep buffer index forever)
4211   pr_buf->th_doacross_flags = NULL;
4212   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4213   pr_buf->th_doacross_info = NULL;
4214   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4215 }
4216 
4217 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4218 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4219   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4220 }
4221 
4222 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4223   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4224 }
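
/* Illustrative only: typical C/C++ usage with one of the predefined OpenMP
   memory allocators (N is a placeholder):

     double *buf = (double *)omp_alloc(N * sizeof(double),
                                       omp_default_mem_alloc);
     if (buf != NULL) {
       // ... use buf ...
       omp_free(buf, omp_default_mem_alloc);
     }
*/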
4225 
4226 int __kmpc_get_target_offload(void) {
4227   if (!__kmp_init_serial) {
4228     __kmp_serial_initialize();
4229   }
4230   return __kmp_target_offload;
4231 }
4232 
4233 int __kmpc_pause_resource(kmp_pause_status_t level) {
4234   if (!__kmp_init_serial) {
4235     return 1; // Can't pause if runtime is not initialized
4236   }
4237   return __kmp_pause_resource(level);
4238 }
4239