1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is a no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43   } else if (__kmp_ignore_mppbeg() == FALSE) {
44     // By default __kmp_ignore_mppbeg() returns TRUE.
45     __kmp_internal_begin();
46     KC_TRACE(10, ("__kmpc_begin: called\n"));
47   }
48 }
49 
50 /*!
51  * @ingroup STARTUP_SHUTDOWN
52  * @param loc source location information
53  *
 * Shut down the runtime library. This is also optional, and even if called will
55  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
56  * zero.
57  */
58 void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
  // root (which can cause library shutdown).
64   if (__kmp_ignore_mppend() == FALSE) {
65     KC_TRACE(10, ("__kmpc_end: called\n"));
66     KA_TRACE(30, ("__kmpc_end\n"));
67 
68     __kmp_internal_end_thread(-1);
69   }
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71   // Normal exit process on Windows does not allow worker threads of the final
72   // parallel region to finish reporting their events, so shutting down the
73   // library here fixes the issue at least for the cases where __kmpc_end() is
74   // placed properly.
75   if (ompt_enabled.enabled)
76     __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
78 }
79 
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84 
85 This function can be called in any context.
86 
If the runtime has only been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the master thread is necessarily thread zero).
92 
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
97 */
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99   kmp_int32 gtid = __kmp_entry_gtid();
100 
101   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102 
103   return gtid;
104 }
105 
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110 
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, the runtime maintains
underlying threads even when they are not active (since the cost of creating
and destroying OS threads is high); this call counts all such threads whether
or not they are currently waiting for work.
119 */
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121   KC_TRACE(10,
122            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123 
124   return TCR_4(__kmp_all_nth);
125 }
126 
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135   return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137 
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145 
146   return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148 
149 /*!
150  * @ingroup DEPRECATED
151  * @param loc location description
152  *
153  * This function need not be called. It always returns TRUE.
154  */
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157 
158   return TRUE;
159 
160 #else
161 
162   const char *semi2;
163   const char *semi3;
164   int line_no;
165 
166   if (__kmp_par_range == 0) {
167     return TRUE;
168   }
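  // loc->psource is expected to be a semicolon-separated string, typically of
  // the form ";file;routine;line;line;;" (see ident_t in kmp.h). The code
  // below picks out the file name, routine name and first line number fields.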
169   semi2 = loc->psource;
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2 + 1, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   if (__kmp_par_range_filename[0]) {
182     const char *name = semi2 - 1;
183     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184       name--;
185     }
186     if ((*name == '/') || (*name == ';')) {
187       name++;
188     }
189     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190       return __kmp_par_range < 0;
191     }
192   }
193   semi3 = strchr(semi2 + 1, ';');
194   if (__kmp_par_range_routine[0]) {
195     if ((semi3 != NULL) && (semi3 > semi2) &&
196         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197       return __kmp_par_range < 0;
198     }
199   }
  if ((semi3 != NULL) && (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1)) {
201     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202       return __kmp_par_range > 0;
203     }
204     return __kmp_par_range < 0;
205   }
206   return TRUE;
207 
208 #endif /* KMP_DEBUG */
209 }
210 
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218   return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220 
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226 
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
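
For illustration, a construct such as
@code
#pragma omp parallel num_threads(4)
@endcode
might be lowered by a compiler to a sequence along these lines (a sketch only;
`outlined` is an illustrative name, not the code any particular compiler
emits):
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined);
@endcode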
229 */
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                              kmp_int32 num_threads) {
232   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                 global_tid, num_threads));
234   __kmp_assert_valid_gtid(global_tid);
235   __kmp_push_num_threads(loc, global_tid, num_threads);
236 }
237 
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240   /* the num_threads are automatically popped */
241 }
242 
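/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param proc_bind requested thread affinity policy (a kmp_proc_bind_t value)

Set the thread affinity policy to be used by the next fork spawned by this
thread. This call is typically only emitted when the parallel construct has a
`proc_bind` clause.
*/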
243 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
244                            kmp_int32 proc_bind) {
245   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
246                 proc_bind));
247   __kmp_assert_valid_gtid(global_tid);
248   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
249 }
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
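
As a hedged sketch (the names `outlined` and `do_work` are illustrative, not
what any particular compiler emits), a region such as
@code
int x;
#pragma omp parallel shared(x)
{ do_work(&x); }
@endcode
is typically outlined into a microtask with the kmpc_micro signature and
started through this entry point:
@code
static void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *x) { do_work(x); }
// ... at the parallel directive:
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);
@endcode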
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   // If we were in a serial region, then stop the serial timer, record
266   // the event, and start parallel region timer
267   stats_state_e previous_state = KMP_GET_THREAD_STATE();
268   if (previous_state == stats_state_e::SERIAL_REGION) {
269     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
270   } else {
271     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
272   }
273   int inParallel = __kmpc_in_parallel(loc);
274   if (inParallel) {
275     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
276   } else {
277     KMP_COUNT_BLOCK(OMP_PARALLEL);
278   }
279 #endif
280 
  // maybe saving thr_state is enough here
282   {
283     va_list ap;
284     va_start(ap, microtask);
285 
286 #if OMPT_SUPPORT
287     ompt_frame_t *ompt_frame;
288     if (ompt_enabled.enabled) {
289       kmp_info_t *master_th = __kmp_threads[gtid];
290       kmp_team_t *parent_team = master_th->th.th_team;
291       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
292       if (lwt)
293         ompt_frame = &(lwt->ompt_task_info.frame);
294       else {
295         int tid = __kmp_tid_from_gtid(gtid);
296         ompt_frame = &(
297             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
298       }
299       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
300       OMPT_STORE_RETURN_ADDRESS(gtid);
301     }
302 #endif
303 
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_FORKING();
306 #endif
307     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
308                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
309                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
310                     kmp_va_addr_of(ap));
311 #if INCLUDE_SSC_MARKS
312     SSC_MARK_JOINING();
313 #endif
314     __kmp_join_call(loc, gtid
315 #if OMPT_SUPPORT
316                     ,
317                     fork_context_intel
318 #endif
319                     );
320 
321     va_end(ap);
322   }
323 
324 #if KMP_STATS_ENABLED
325   if (previous_state == stats_state_e::SERIAL_REGION) {
326     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
327   } else {
328     KMP_POP_PARTITIONED_TIMER();
329   }
330 #endif // KMP_STATS_ENABLED
331 }
332 
333 /*!
334 @ingroup PARALLEL
335 @param loc source location information
336 @param global_tid global thread number
337 @param num_teams number of teams requested for the teams construct
338 @param num_threads number of threads per team requested for the teams construct
339 
340 Set the number of teams to be used by the teams construct.
341 This call is only required if the teams construct has a `num_teams` clause
342 or a `thread_limit` clause (or both).
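
For illustration, a directive such as
@code
#pragma omp teams num_teams(4) thread_limit(8)
@endcode
might be lowered to something like the following, immediately before the
matching __kmpc_fork_teams() call (a sketch only):
@code
__kmpc_push_num_teams(&loc, gtid, 4, 8);
@endcode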
343 */
344 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
345                            kmp_int32 num_teams, kmp_int32 num_threads) {
346   KA_TRACE(20,
347            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
348             global_tid, num_teams, num_threads));
349   __kmp_assert_valid_gtid(global_tid);
350   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
351 }
352 
353 /*!
354 @ingroup PARALLEL
355 @param loc  source location information
356 @param argc  total number of arguments in the ellipsis
357 @param microtask  pointer to callback routine consisting of outlined teams
358 construct
359 @param ...  pointers to shared variables that aren't global
360 
361 Do the actual fork and call the microtask in the relevant number of threads.
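
The calling convention mirrors __kmpc_fork_call(). A hedged sketch, where
`outlined_teams` and `team_work` are illustrative names only:
@code
static void outlined_teams(kmp_int32 *gtid, kmp_int32 *btid, int *x) {
  team_work(x);
}
// ... at the teams directive:
__kmpc_fork_teams(&loc, 1, (kmpc_micro)outlined_teams, &x);
@endcode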
362 */
363 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
364                        ...) {
365   int gtid = __kmp_entry_gtid();
366   kmp_info_t *this_thr = __kmp_threads[gtid];
367   va_list ap;
368   va_start(ap, microtask);
369 
370 #if KMP_STATS_ENABLED
371   KMP_COUNT_BLOCK(OMP_TEAMS);
372   stats_state_e previous_state = KMP_GET_THREAD_STATE();
373   if (previous_state == stats_state_e::SERIAL_REGION) {
374     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
375   } else {
376     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
377   }
378 #endif
379 
380   // remember teams entry point and nesting level
381   this_thr->th.th_teams_microtask = microtask;
382   this_thr->th.th_teams_level =
383       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
384 
385 #if OMPT_SUPPORT
386   kmp_team_t *parent_team = this_thr->th.th_team;
387   int tid = __kmp_tid_from_gtid(gtid);
388   if (ompt_enabled.enabled) {
389     parent_team->t.t_implicit_task_taskdata[tid]
390         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
391   }
392   OMPT_STORE_RETURN_ADDRESS(gtid);
393 #endif
394 
  // If __kmpc_push_num_teams was not called, set the default number of teams
  // now.
397   if (this_thr->th.th_teams_size.nteams == 0) {
398     __kmp_push_num_teams(loc, gtid, 0, 0);
399   }
400   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
401   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
402   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
403 
404   __kmp_fork_call(
405       loc, gtid, fork_context_intel, argc,
406       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
407       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
408   __kmp_join_call(loc, gtid
409 #if OMPT_SUPPORT
410                   ,
411                   fork_context_intel
412 #endif
413                   );
414 
415   // Pop current CG root off list
416   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
417   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
418   this_thr->th.th_cg_roots = tmp->up;
419   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
420                  " to node %p. cg_nthreads was %d\n",
421                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
422   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
423   int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
425     __kmp_free(tmp);
426   }
427   // Restore current task's thread_limit from CG root
428   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
429   this_thr->th.th_current_task->td_icvs.thread_limit =
430       this_thr->th.th_cg_roots->cg_thread_limit;
431 
432   this_thr->th.th_teams_microtask = NULL;
433   this_thr->th.th_teams_level = 0;
434   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
435   va_end(ap);
436 #if KMP_STATS_ENABLED
437   if (previous_state == stats_state_e::SERIAL_REGION) {
438     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
439   } else {
440     KMP_POP_PARTITIONED_TIMER();
441   }
442 #endif // KMP_STATS_ENABLED
443 }
444 
445 // I don't think this function should ever have been exported.
446 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
447 // openmp code ever called it, but it's been exported from the RTL for so
448 // long that I'm afraid to remove the definition.
449 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
450 
451 /*!
452 @ingroup PARALLEL
453 @param loc  source location information
454 @param global_tid  global thread number
455 
456 Enter a serialized parallel construct. This interface is used to handle a
457 conditional parallel region, like this,
458 @code
459 #pragma omp parallel if (condition)
460 @endcode
461 when the condition is false.
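
A hedged sketch of the lowering when the condition is false (`outlined`, `zero`
and `x` are illustrative names only):
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
kmp_int32 zero = 0;
__kmpc_serialized_parallel(&loc, gtid);
outlined(&gtid, &zero, &x); // run the region body on the encountering thread
__kmpc_end_serialized_parallel(&loc, gtid);
@endcode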
462 */
463 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
464   // The implementation is now in kmp_runtime.cpp so that it can share static
465   // functions with kmp_fork_call since the tasks to be done are similar in
466   // each case.
467   __kmp_assert_valid_gtid(global_tid);
468 #if OMPT_SUPPORT
469   OMPT_STORE_RETURN_ADDRESS(global_tid);
470 #endif
471   __kmp_serialized_parallel(loc, global_tid);
472 }
473 
474 /*!
475 @ingroup PARALLEL
476 @param loc  source location information
477 @param global_tid  global thread number
478 
479 Leave a serialized parallel construct.
480 */
481 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
482   kmp_internal_control_t *top;
483   kmp_info_t *this_thr;
484   kmp_team_t *serial_team;
485 
486   KC_TRACE(10,
487            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
488 
489   /* skip all this code for autopar serialized loops since it results in
490      unacceptable overhead */
491   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
492     return;
493 
494   // Not autopar code
495   __kmp_assert_valid_gtid(global_tid);
496   if (!TCR_4(__kmp_init_parallel))
497     __kmp_parallel_initialize();
498 
499   __kmp_resume_if_soft_paused();
500 
501   this_thr = __kmp_threads[global_tid];
502   serial_team = this_thr->th.th_serial_team;
503 
504   kmp_task_team_t *task_team = this_thr->th.th_task_team;
505   // we need to wait for the proxy tasks before finishing the thread
506   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
507     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
508 
509   KMP_MB();
510   KMP_DEBUG_ASSERT(serial_team);
511   KMP_ASSERT(serial_team->t.t_serialized);
512   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
513   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
514   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
515   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
516 
517 #if OMPT_SUPPORT
518   if (ompt_enabled.enabled &&
519       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
520     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
521     if (ompt_enabled.ompt_callback_implicit_task) {
522       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
523           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
524           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
525     }
526 
    // clear the task id only after unlinking the task
528     ompt_data_t *parent_task_data;
529     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
530 
531     if (ompt_enabled.ompt_callback_parallel_end) {
532       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
533           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
534           ompt_parallel_invoker_program | ompt_parallel_team,
535           OMPT_LOAD_RETURN_ADDRESS(global_tid));
536     }
537     __ompt_lw_taskteam_unlink(this_thr);
538     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
539   }
540 #endif
541 
542   /* If necessary, pop the internal control stack values and replace the team
543    * values */
544   top = serial_team->t.t_control_stack_top;
545   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
546     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
547     serial_team->t.t_control_stack_top = top->next;
548     __kmp_free(top);
549   }
550 
551   // if( serial_team -> t.t_serialized > 1 )
552   serial_team->t.t_level--;
553 
554   /* pop dispatch buffers stack */
555   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
556   {
557     dispatch_private_info_t *disp_buffer =
558         serial_team->t.t_dispatch->th_disp_buffer;
559     serial_team->t.t_dispatch->th_disp_buffer =
560         serial_team->t.t_dispatch->th_disp_buffer->next;
561     __kmp_free(disp_buffer);
562   }
563   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
564 
565   --serial_team->t.t_serialized;
566   if (serial_team->t.t_serialized == 0) {
567 
568 /* return to the parallel section */
569 
570 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
571     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
572       __kmp_clear_x87_fpu_status_word();
573       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
574       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
575     }
576 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
577 
578     this_thr->th.th_team = serial_team->t.t_parent;
579     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
580 
581     /* restore values cached in the thread */
582     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
583     this_thr->th.th_team_master =
584         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
585     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
586 
587     /* TODO the below shouldn't need to be adjusted for serialized teams */
588     this_thr->th.th_dispatch =
589         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
590 
591     __kmp_pop_current_task_from_thread(this_thr);
592 
593     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
594     this_thr->th.th_current_task->td_flags.executing = 1;
595 
596     if (__kmp_tasking_mode != tskm_immediate_exec) {
597       // Copy the task team from the new child / old parent team to the thread.
598       this_thr->th.th_task_team =
599           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
600       KA_TRACE(20,
601                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
602                 "team %p\n",
603                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
604     }
605   } else {
606     if (__kmp_tasking_mode != tskm_immediate_exec) {
607       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
608                     "depth of serial team %p to %d\n",
609                     global_tid, serial_team, serial_team->t.t_serialized));
610     }
611   }
612 
613   if (__kmp_env_consistency_check)
614     __kmp_pop_parallel(global_tid, NULL);
615 #if OMPT_SUPPORT
616   if (ompt_enabled.enabled)
617     this_thr->th.ompt_thread_info.state =
618         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
619                                            : ompt_state_work_parallel);
620 #endif
621 }
622 
623 /*!
624 @ingroup SYNCHRONIZATION
625 @param loc  source location information.
626 
627 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
628 depending on the memory ordering convention obeyed by the compiler
629 even that may not be necessary).
630 */
631 void __kmpc_flush(ident_t *loc) {
632   KC_TRACE(10, ("__kmpc_flush: called\n"));
633 
  /* need explicit __mf() here since we use volatiles instead in the library */
635   KMP_MB(); /* Flush all pending memory write invalidates.  */
636 
637 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
638 #if KMP_MIC
639 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
640 // We shouldn't need it, though, since the ABI rules require that
641 // * If the compiler generates NGO stores it also generates the fence
642 // * If users hand-code NGO stores they should insert the fence
643 // therefore no incomplete unordered stores should be visible.
644 #else
645   // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
650   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
651   if (!__kmp_cpuinfo.initialized) {
652     __kmp_query_cpuid(&__kmp_cpuinfo);
653   }
654   if (!__kmp_cpuinfo.sse2) {
655     // CPU cannot execute SSE2 instructions.
656   } else {
657 #if KMP_COMPILER_ICC
658     _mm_mfence();
659 #elif KMP_COMPILER_MSVC
660     MemoryBarrier();
661 #else
662     __sync_synchronize();
663 #endif // KMP_COMPILER_ICC
664   }
665 #endif // KMP_MIC
666 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
667        KMP_ARCH_RISCV64)
668 // Nothing to see here move along
669 #elif KMP_ARCH_PPC64
670 // Nothing needed here (we have a real MB above).
671 #else
672 #error Unknown or unsupported architecture
673 #endif
674 
675 #if OMPT_SUPPORT && OMPT_OPTIONAL
676   if (ompt_enabled.ompt_callback_flush) {
677     ompt_callbacks.ompt_callback(ompt_callback_flush)(
678         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
679   }
680 #endif
681 }
682 
683 /* -------------------------------------------------------------------------- */
684 /*!
685 @ingroup SYNCHRONIZATION
686 @param loc source location information
687 @param global_tid thread id.
688 
689 Execute a barrier.
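
For illustration, a standalone
@code
#pragma omp barrier
@endcode
is typically lowered to a single call of the form (sketch only):
@code
__kmpc_barrier(&loc, gtid);
@endcode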
690 */
691 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
692   KMP_COUNT_BLOCK(OMP_BARRIER);
693   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
694   __kmp_assert_valid_gtid(global_tid);
695 
696   if (!TCR_4(__kmp_init_parallel))
697     __kmp_parallel_initialize();
698 
699   __kmp_resume_if_soft_paused();
700 
701   if (__kmp_env_consistency_check) {
702     if (loc == 0) {
703       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
704     }
705     __kmp_check_barrier(global_tid, ct_barrier, loc);
706   }
707 
708 #if OMPT_SUPPORT
709   ompt_frame_t *ompt_frame;
710   if (ompt_enabled.enabled) {
711     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
712     if (ompt_frame->enter_frame.ptr == NULL)
713       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
714     OMPT_STORE_RETURN_ADDRESS(global_tid);
715   }
716 #endif
717   __kmp_threads[global_tid]->th.th_ident = loc;
718   // TODO: explicit barrier_wait_id:
719   //   this function is called when 'barrier' directive is present or
720   //   implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required
724 
725   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
726 #if OMPT_SUPPORT && OMPT_OPTIONAL
727   if (ompt_enabled.enabled) {
728     ompt_frame->enter_frame = ompt_data_none;
729   }
730 #endif
731 }
732 
733 /* The BARRIER for a MASTER section is always explicit   */
734 /*!
735 @ingroup WORK_SHARING
736 @param loc  source location information.
@param global_tid  global thread number.
738 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
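
A hedged sketch of the usual guard pattern a compiler might emit for a
<tt>master</tt> region (illustrative only):
@code
if (__kmpc_master(&loc, gtid)) {
  // ... master block ...
  __kmpc_end_master(&loc, gtid);
}
@endcode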
739 */
740 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
741   int status = 0;
742 
743   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
744   __kmp_assert_valid_gtid(global_tid);
745 
746   if (!TCR_4(__kmp_init_parallel))
747     __kmp_parallel_initialize();
748 
749   __kmp_resume_if_soft_paused();
750 
751   if (KMP_MASTER_GTID(global_tid)) {
752     KMP_COUNT_BLOCK(OMP_MASTER);
753     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
754     status = 1;
755   }
756 
757 #if OMPT_SUPPORT && OMPT_OPTIONAL
758   if (status) {
759     if (ompt_enabled.ompt_callback_master) {
760       kmp_info_t *this_thr = __kmp_threads[global_tid];
761       kmp_team_t *team = this_thr->th.th_team;
762 
763       int tid = __kmp_tid_from_gtid(global_tid);
764       ompt_callbacks.ompt_callback(ompt_callback_master)(
765           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
766           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
767           OMPT_GET_RETURN_ADDRESS(0));
768     }
769   }
770 #endif
771 
772   if (__kmp_env_consistency_check) {
773 #if KMP_USE_DYNAMIC_LOCK
774     if (status)
775       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
776     else
777       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
778 #else
779     if (status)
780       __kmp_push_sync(global_tid, ct_master, loc, NULL);
781     else
782       __kmp_check_sync(global_tid, ct_master, loc, NULL);
783 #endif
784   }
785 
786   return status;
787 }
788 
789 /*!
790 @ingroup WORK_SHARING
791 @param loc  source location information.
@param global_tid  global thread number.
793 
794 Mark the end of a <tt>master</tt> region. This should only be called by the
795 thread that executes the <tt>master</tt> region.
796 */
797 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
798   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
799   __kmp_assert_valid_gtid(global_tid);
800   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
801   KMP_POP_PARTITIONED_TIMER();
802 
803 #if OMPT_SUPPORT && OMPT_OPTIONAL
804   kmp_info_t *this_thr = __kmp_threads[global_tid];
805   kmp_team_t *team = this_thr->th.th_team;
806   if (ompt_enabled.ompt_callback_master) {
807     int tid = __kmp_tid_from_gtid(global_tid);
808     ompt_callbacks.ompt_callback(ompt_callback_master)(
809         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
810         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
811         OMPT_GET_RETURN_ADDRESS(0));
812   }
813 #endif
814 
815   if (__kmp_env_consistency_check) {
816     if (KMP_MASTER_GTID(global_tid))
817       __kmp_pop_sync(global_tid, ct_master, loc);
818   }
819 }
820 
821 /*!
822 @ingroup WORK_SHARING
823 @param loc  source location information.
824 @param gtid  global thread number.
825 
826 Start execution of an <tt>ordered</tt> construct.
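
A hedged sketch of the usual lowering of an <tt>ordered</tt> block inside an
ordered loop (illustrative only):
@code
__kmpc_ordered(&loc, gtid);
// ... ordered block ...
__kmpc_end_ordered(&loc, gtid);
@endcode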
827 */
828 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
829   int cid = 0;
830   kmp_info_t *th;
831   KMP_DEBUG_ASSERT(__kmp_init_serial);
832 
833   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
834   __kmp_assert_valid_gtid(gtid);
835 
836   if (!TCR_4(__kmp_init_parallel))
837     __kmp_parallel_initialize();
838 
839   __kmp_resume_if_soft_paused();
840 
841 #if USE_ITT_BUILD
842   __kmp_itt_ordered_prep(gtid);
843 // TODO: ordered_wait_id
844 #endif /* USE_ITT_BUILD */
845 
846   th = __kmp_threads[gtid];
847 
848 #if OMPT_SUPPORT && OMPT_OPTIONAL
849   kmp_team_t *team;
850   ompt_wait_id_t lck;
851   void *codeptr_ra;
852   if (ompt_enabled.enabled) {
853     OMPT_STORE_RETURN_ADDRESS(gtid);
854     team = __kmp_team_from_gtid(gtid);
855     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
856     /* OMPT state update */
857     th->th.ompt_thread_info.wait_id = lck;
858     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
859 
860     /* OMPT event callback */
861     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
862     if (ompt_enabled.ompt_callback_mutex_acquire) {
863       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
864           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
865           codeptr_ra);
866     }
867   }
868 #endif
869 
870   if (th->th.th_dispatch->th_deo_fcn != 0)
871     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
872   else
873     __kmp_parallel_deo(&gtid, &cid, loc);
874 
875 #if OMPT_SUPPORT && OMPT_OPTIONAL
876   if (ompt_enabled.enabled) {
877     /* OMPT state update */
878     th->th.ompt_thread_info.state = ompt_state_work_parallel;
879     th->th.ompt_thread_info.wait_id = 0;
880 
881     /* OMPT event callback */
882     if (ompt_enabled.ompt_callback_mutex_acquired) {
883       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
884           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
885     }
886   }
887 #endif
888 
889 #if USE_ITT_BUILD
890   __kmp_itt_ordered_start(gtid);
891 #endif /* USE_ITT_BUILD */
892 }
893 
894 /*!
895 @ingroup WORK_SHARING
896 @param loc  source location information.
897 @param gtid  global thread number.
898 
899 End execution of an <tt>ordered</tt> construct.
900 */
901 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
902   int cid = 0;
903   kmp_info_t *th;
904 
905   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
906   __kmp_assert_valid_gtid(gtid);
907 
908 #if USE_ITT_BUILD
909   __kmp_itt_ordered_end(gtid);
910 // TODO: ordered_wait_id
911 #endif /* USE_ITT_BUILD */
912 
913   th = __kmp_threads[gtid];
914 
915   if (th->th.th_dispatch->th_dxo_fcn != 0)
916     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
917   else
918     __kmp_parallel_dxo(&gtid, &cid, loc);
919 
920 #if OMPT_SUPPORT && OMPT_OPTIONAL
921   OMPT_STORE_RETURN_ADDRESS(gtid);
922   if (ompt_enabled.ompt_callback_mutex_released) {
923     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
924         ompt_mutex_ordered,
925         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
926             ->t.t_ordered.dt.t_value,
927         OMPT_LOAD_RETURN_ADDRESS(gtid));
928   }
929 #endif
930 }
931 
932 #if KMP_USE_DYNAMIC_LOCK
933 
934 static __forceinline void
935 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
936                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
937   // Pointer to the allocated indirect lock is written to crit, while indexing
938   // is ignored.
939   void *idx;
940   kmp_indirect_lock_t **lck;
941   lck = (kmp_indirect_lock_t **)crit;
942   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
943   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
944   KMP_SET_I_LOCK_LOCATION(ilk, loc);
945   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
946   KA_TRACE(20,
947            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
948 #if USE_ITT_BUILD
949   __kmp_itt_critical_creating(ilk->lock, loc);
950 #endif
951   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
952   if (status == 0) {
953 #if USE_ITT_BUILD
954     __kmp_itt_critical_destroyed(ilk->lock);
955 #endif
956     // We don't really need to destroy the unclaimed lock here since it will be
957     // cleaned up at program exit.
958     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
959   }
960   KMP_DEBUG_ASSERT(*lck != NULL);
961 }
962 
963 // Fast-path acquire tas lock
964 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
965   {                                                                            \
966     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
967     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
968     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
969     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
970         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
971       kmp_uint32 spins;                                                        \
972       KMP_FSYNC_PREPARE(l);                                                    \
973       KMP_INIT_YIELD(spins);                                                   \
974       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
975       do {                                                                     \
976         if (TCR_4(__kmp_nth) >                                                 \
977             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
978           KMP_YIELD(TRUE);                                                     \
979         } else {                                                               \
980           KMP_YIELD_SPIN(spins);                                               \
981         }                                                                      \
982         __kmp_spin_backoff(&backoff);                                          \
983       } while (                                                                \
984           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
985           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
986     }                                                                          \
987     KMP_FSYNC_ACQUIRED(l);                                                     \
988   }
989 
990 // Fast-path test tas lock
991 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
992   {                                                                            \
993     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
994     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
995     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
996     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
997          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
998   }
999 
1000 // Fast-path release tas lock
1001 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1002   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1003 
1004 #if KMP_USE_FUTEX
1005 
1006 #include <sys/syscall.h>
1007 #include <unistd.h>
1008 #ifndef FUTEX_WAIT
1009 #define FUTEX_WAIT 0
1010 #endif
1011 #ifndef FUTEX_WAKE
1012 #define FUTEX_WAKE 1
1013 #endif
1014 
1015 // Fast-path acquire futex lock
1016 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1017   {                                                                            \
1018     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1019     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1020     KMP_MB();                                                                  \
1021     KMP_FSYNC_PREPARE(ftx);                                                    \
1022     kmp_int32 poll_val;                                                        \
1023     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1024                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1025                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1026       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1027       if (!cond) {                                                             \
1028         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1029                                          poll_val |                            \
1030                                              KMP_LOCK_BUSY(1, futex))) {       \
1031           continue;                                                            \
1032         }                                                                      \
1033         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1034       }                                                                        \
1035       kmp_int32 rc;                                                            \
1036       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1037                         NULL, NULL, 0)) != 0) {                                \
1038         continue;                                                              \
1039       }                                                                        \
1040       gtid_code |= 1;                                                          \
1041     }                                                                          \
1042     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1043   }
1044 
1045 // Fast-path test futex lock
1046 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1047   {                                                                            \
1048     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1049     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
1051       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1052       rc = TRUE;                                                               \
1053     } else {                                                                   \
1054       rc = FALSE;                                                              \
1055     }                                                                          \
1056   }
1057 
1058 // Fast-path release futex lock
1059 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1060   {                                                                            \
1061     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1062     KMP_MB();                                                                  \
1063     KMP_FSYNC_RELEASING(ftx);                                                  \
1064     kmp_int32 poll_val =                                                       \
1065         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1066     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1067       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1068               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1069     }                                                                          \
1070     KMP_MB();                                                                  \
1071     KMP_YIELD_OVERSUB();                                                       \
1072   }
1073 
1074 #endif // KMP_USE_FUTEX
1075 
1076 #else // KMP_USE_DYNAMIC_LOCK
1077 
1078 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1079                                                       ident_t const *loc,
1080                                                       kmp_int32 gtid) {
1081   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1082 
1083   // Because of the double-check, the following load doesn't need to be volatile
1084   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1085 
1086   if (lck == NULL) {
1087     void *idx;
1088 
1089     // Allocate & initialize the lock.
1090     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1091     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1092     __kmp_init_user_lock_with_checks(lck);
1093     __kmp_set_user_lock_location(lck, loc);
1094 #if USE_ITT_BUILD
1095     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed with no usage, but that is not
// a problem, because this is not a real event seen by the user but rather a
// way of setting a name for the object (lock). See more details in kmp_itt.h.
1101 #endif /* USE_ITT_BUILD */
1102 
1103     // Use a cmpxchg instruction to slam the start of the critical section with
1104     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1105     // and use the lock that the other thread allocated.
1106     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1107 
1108     if (status == 0) {
1109 // Deallocate the lock and reload the value.
1110 #if USE_ITT_BUILD
1111       __kmp_itt_critical_destroyed(lck);
1112 // Let ITT know the lock is destroyed and the same memory location may be reused
1113 // for another purpose.
1114 #endif /* USE_ITT_BUILD */
1115       __kmp_destroy_user_lock_with_checks(lck);
1116       __kmp_user_lock_free(&idx, gtid, lck);
1117       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1118       KMP_DEBUG_ASSERT(lck != NULL);
1119     }
1120   }
1121   return lck;
1122 }
1123 
1124 #endif // KMP_USE_DYNAMIC_LOCK
1125 
1126 /*!
1127 @ingroup WORK_SHARING
1128 @param loc  source location information.
1129 @param global_tid  global thread number.
1130 @param crit identity of the critical section. This could be a pointer to a lock
1131 associated with the critical section, or some other suitably unique value.
1132 
1133 Enter code protected by a `critical` construct.
1134 This function blocks until the executing thread can enter the critical section.
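
A hedged sketch of the usual lowering of a named <tt>critical</tt> construct
(`crit_foo` is an illustrative name; compilers emit their own mangled name for
the kmp_critical_name object):
@code
static kmp_critical_name crit_foo; // zero-initialized, shared program-wide
__kmpc_critical(&loc, gtid, &crit_foo);
// ... critical block ...
__kmpc_end_critical(&loc, gtid, &crit_foo);
@endcode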
1135 */
1136 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1137                      kmp_critical_name *crit) {
1138 #if KMP_USE_DYNAMIC_LOCK
1139 #if OMPT_SUPPORT && OMPT_OPTIONAL
1140   OMPT_STORE_RETURN_ADDRESS(global_tid);
1141 #endif // OMPT_SUPPORT
1142   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1143 #else
1144   KMP_COUNT_BLOCK(OMP_CRITICAL);
1145 #if OMPT_SUPPORT && OMPT_OPTIONAL
1146   ompt_state_t prev_state = ompt_state_undefined;
1147   ompt_thread_info_t ti;
1148 #endif
1149   kmp_user_lock_p lck;
1150 
1151   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1152   __kmp_assert_valid_gtid(global_tid);
1153 
1154   // TODO: add THR_OVHD_STATE
1155 
1156   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1157   KMP_CHECK_USER_LOCK_INIT();
1158 
1159   if ((__kmp_user_lock_kind == lk_tas) &&
1160       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1161     lck = (kmp_user_lock_p)crit;
1162   }
1163 #if KMP_USE_FUTEX
1164   else if ((__kmp_user_lock_kind == lk_futex) &&
1165            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1166     lck = (kmp_user_lock_p)crit;
1167   }
1168 #endif
1169   else { // ticket, queuing or drdpa
1170     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1171   }
1172 
1173   if (__kmp_env_consistency_check)
1174     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1175 
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team.
// Also, even if we are the uber thread, we still have to acquire the lock,
// as we have to contend with sibling threads.
1180 
1181 #if USE_ITT_BUILD
1182   __kmp_itt_critical_acquiring(lck);
1183 #endif /* USE_ITT_BUILD */
1184 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1186   void *codeptr_ra = NULL;
1187   if (ompt_enabled.enabled) {
1188     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1189     /* OMPT state update */
1190     prev_state = ti.state;
1191     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1192     ti.state = ompt_state_wait_critical;
1193 
1194     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1196     if (ompt_enabled.ompt_callback_mutex_acquire) {
1197       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1198           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1199           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1200     }
1201   }
1202 #endif
  // The value of 'crit' serves as the critical_id of the critical section
  // directive.
1205   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1206 
1207 #if USE_ITT_BUILD
1208   __kmp_itt_critical_acquired(lck);
1209 #endif /* USE_ITT_BUILD */
1210 #if OMPT_SUPPORT && OMPT_OPTIONAL
1211   if (ompt_enabled.enabled) {
1212     /* OMPT state update */
1213     ti.state = prev_state;
1214     ti.wait_id = 0;
1215 
1216     /* OMPT event callback */
1217     if (ompt_enabled.ompt_callback_mutex_acquired) {
1218       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1219           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1220     }
1221   }
1222 #endif
1223   KMP_POP_PARTITIONED_TIMER();
1224 
1225   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1226   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1227 #endif // KMP_USE_DYNAMIC_LOCK
1228 }
1229 
1230 #if KMP_USE_DYNAMIC_LOCK
1231 
1232 // Converts the given hint to an internal lock implementation
1233 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1234 #if KMP_USE_TSX
1235 #define KMP_TSX_LOCK(seq) lockseq_##seq
1236 #else
1237 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1238 #endif
1239 
1240 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1241 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1242 #else
1243 #define KMP_CPUINFO_RTM 0
1244 #endif
1245 
1246   // Hints that do not require further logic
1247   if (hint & kmp_lock_hint_hle)
1248     return KMP_TSX_LOCK(hle);
1249   if (hint & kmp_lock_hint_rtm)
1250     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1251   if (hint & kmp_lock_hint_adaptive)
1252     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1253 
1254   // Rule out conflicting hints first by returning the default lock
1255   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1256     return __kmp_user_lock_seq;
1257   if ((hint & omp_lock_hint_speculative) &&
1258       (hint & omp_lock_hint_nonspeculative))
1259     return __kmp_user_lock_seq;
1260 
1261   // Do not even consider speculation when it appears to be contended
1262   if (hint & omp_lock_hint_contended)
1263     return lockseq_queuing;
1264 
1265   // Uncontended lock without speculation
1266   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1267     return lockseq_tas;
1268 
1269   // HLE lock for speculation
1270   if (hint & omp_lock_hint_speculative)
1271     return KMP_TSX_LOCK(hle);
1272 
1273   return __kmp_user_lock_seq;
1274 }
1275 
1276 #if OMPT_SUPPORT && OMPT_OPTIONAL
1277 #if KMP_USE_DYNAMIC_LOCK
1278 static kmp_mutex_impl_t
1279 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1280   if (user_lock) {
1281     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1282     case 0:
1283       break;
1284 #if KMP_USE_FUTEX
1285     case locktag_futex:
1286       return kmp_mutex_impl_queuing;
1287 #endif
1288     case locktag_tas:
1289       return kmp_mutex_impl_spin;
1290 #if KMP_USE_TSX
1291     case locktag_hle:
1292       return kmp_mutex_impl_speculative;
1293 #endif
1294     default:
1295       return kmp_mutex_impl_none;
1296     }
1297     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1298   }
1299   KMP_ASSERT(ilock);
1300   switch (ilock->type) {
1301 #if KMP_USE_TSX
1302   case locktag_adaptive:
1303   case locktag_rtm:
1304     return kmp_mutex_impl_speculative;
1305 #endif
1306   case locktag_nested_tas:
1307     return kmp_mutex_impl_spin;
1308 #if KMP_USE_FUTEX
1309   case locktag_nested_futex:
1310 #endif
1311   case locktag_ticket:
1312   case locktag_queuing:
1313   case locktag_drdpa:
1314   case locktag_nested_ticket:
1315   case locktag_nested_queuing:
1316   case locktag_nested_drdpa:
1317     return kmp_mutex_impl_queuing;
1318   default:
1319     return kmp_mutex_impl_none;
1320   }
1321 }
1322 #else
1323 // For locks without dynamic binding
1324 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1325   switch (__kmp_user_lock_kind) {
1326   case lk_tas:
1327     return kmp_mutex_impl_spin;
1328 #if KMP_USE_FUTEX
1329   case lk_futex:
1330 #endif
1331   case lk_ticket:
1332   case lk_queuing:
1333   case lk_drdpa:
1334     return kmp_mutex_impl_queuing;
1335 #if KMP_USE_TSX
1336   case lk_hle:
1337   case lk_rtm:
1338   case lk_adaptive:
1339     return kmp_mutex_impl_speculative;
1340 #endif
1341   default:
1342     return kmp_mutex_impl_none;
1343   }
1344 }
1345 #endif // KMP_USE_DYNAMIC_LOCK
1346 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1347 
1348 /*!
1349 @ingroup WORK_SHARING
1350 @param loc  source location information.
1351 @param global_tid  global thread number.
1352 @param crit identity of the critical section. This could be a pointer to a lock
1353 associated with the critical section, or some other suitably unique value.
1354 @param hint the lock hint.
1355 
1356 Enter code protected by a `critical` construct with a hint. The hint value is
1357 used to suggest a lock implementation. This function blocks until the executing
1358 thread can enter the critical section unless the hint suggests use of
1359 speculative execution and the hardware supports it.
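
A hedged sketch of a possible lowering for
@code
#pragma omp critical (foo) hint(omp_sync_hint_speculative)
@endcode
with `crit_foo` an illustrative name only:
@code
static kmp_critical_name crit_foo;
__kmpc_critical_with_hint(&loc, gtid, &crit_foo, omp_lock_hint_speculative);
// ... critical block ...
__kmpc_end_critical(&loc, gtid, &crit_foo);
@endcode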
1360 */
1361 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1362                                kmp_critical_name *crit, uint32_t hint) {
1363   KMP_COUNT_BLOCK(OMP_CRITICAL);
1364   kmp_user_lock_p lck;
1365 #if OMPT_SUPPORT && OMPT_OPTIONAL
1366   ompt_state_t prev_state = ompt_state_undefined;
1367   ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
1369   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1370   if (!codeptr)
1371     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1372 #endif
1373 
1374   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1375   __kmp_assert_valid_gtid(global_tid);
1376 
1377   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1378   // Check if it is initialized.
1379   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1380   if (*lk == 0) {
1381     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1382     if (KMP_IS_D_LOCK(lckseq)) {
1383       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1384                                   KMP_GET_D_TAG(lckseq));
1385     } else {
1386       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1387     }
1388   }
1389   // Branch for accessing the actual lock object and set operation. This
1390   // branching is inevitable since this lock initialization does not follow the
1391   // normal dispatch path (lock table is not used).
1392   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1393     lck = (kmp_user_lock_p)lk;
1394     if (__kmp_env_consistency_check) {
1395       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1396                       __kmp_map_hint_to_lock(hint));
1397     }
1398 #if USE_ITT_BUILD
1399     __kmp_itt_critical_acquiring(lck);
1400 #endif
1401 #if OMPT_SUPPORT && OMPT_OPTIONAL
1402     if (ompt_enabled.enabled) {
1403       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1404       /* OMPT state update */
1405       prev_state = ti.state;
1406       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1407       ti.state = ompt_state_wait_critical;
1408 
1409       /* OMPT event callback */
1410       if (ompt_enabled.ompt_callback_mutex_acquire) {
1411         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1412             ompt_mutex_critical, (unsigned int)hint,
1413             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1414             codeptr);
1415       }
1416     }
1417 #endif
1418 #if KMP_USE_INLINED_TAS
1419     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1420       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1421     } else
1422 #elif KMP_USE_INLINED_FUTEX
1423     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1424       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1425     } else
1426 #endif
1427     {
1428       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1429     }
1430   } else {
1431     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1432     lck = ilk->lock;
1433     if (__kmp_env_consistency_check) {
1434       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1435                       __kmp_map_hint_to_lock(hint));
1436     }
1437 #if USE_ITT_BUILD
1438     __kmp_itt_critical_acquiring(lck);
1439 #endif
1440 #if OMPT_SUPPORT && OMPT_OPTIONAL
1441     if (ompt_enabled.enabled) {
1442       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1443       /* OMPT state update */
1444       prev_state = ti.state;
1445       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1446       ti.state = ompt_state_wait_critical;
1447 
1448       /* OMPT event callback */
1449       if (ompt_enabled.ompt_callback_mutex_acquire) {
1450         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1451             ompt_mutex_critical, (unsigned int)hint,
1452             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1453             codeptr);
1454       }
1455     }
1456 #endif
1457     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1458   }
1459   KMP_POP_PARTITIONED_TIMER();
1460 
1461 #if USE_ITT_BUILD
1462   __kmp_itt_critical_acquired(lck);
1463 #endif /* USE_ITT_BUILD */
1464 #if OMPT_SUPPORT && OMPT_OPTIONAL
1465   if (ompt_enabled.enabled) {
1466     /* OMPT state update */
    ti->state = prev_state;
    ti->wait_id = 0;
1469 
1470     /* OMPT event callback */
1471     if (ompt_enabled.ompt_callback_mutex_acquired) {
1472       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1473           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1474     }
1475   }
1476 #endif
1477 
1478   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1479   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1480 } // __kmpc_critical_with_hint
1481 
1482 #endif // KMP_USE_DYNAMIC_LOCK
1483 
1484 /*!
1485 @ingroup WORK_SHARING
1486 @param loc  source location information.
@param global_tid  global thread number.
1488 @param crit identity of the critical section. This could be a pointer to a lock
1489 associated with the critical section, or some other suitably unique value.
1490 
1491 Leave a critical section, releasing any lock that was held during its execution.
1492 */
1493 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1494                          kmp_critical_name *crit) {
1495   kmp_user_lock_p lck;
1496 
1497   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1498 
1499 #if KMP_USE_DYNAMIC_LOCK
1500   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1501     lck = (kmp_user_lock_p)crit;
1502     KMP_ASSERT(lck != NULL);
1503     if (__kmp_env_consistency_check) {
1504       __kmp_pop_sync(global_tid, ct_critical, loc);
1505     }
1506 #if USE_ITT_BUILD
1507     __kmp_itt_critical_releasing(lck);
1508 #endif
1509 #if KMP_USE_INLINED_TAS
1510     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1511       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1512     } else
1513 #elif KMP_USE_INLINED_FUTEX
1514     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1515       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1516     } else
1517 #endif
1518     {
1519       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1520     }
1521   } else {
1522     kmp_indirect_lock_t *ilk =
1523         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1524     KMP_ASSERT(ilk != NULL);
1525     lck = ilk->lock;
1526     if (__kmp_env_consistency_check) {
1527       __kmp_pop_sync(global_tid, ct_critical, loc);
1528     }
1529 #if USE_ITT_BUILD
1530     __kmp_itt_critical_releasing(lck);
1531 #endif
1532     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1533   }
1534 
1535 #else // KMP_USE_DYNAMIC_LOCK
1536 
1537   if ((__kmp_user_lock_kind == lk_tas) &&
1538       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1539     lck = (kmp_user_lock_p)crit;
1540   }
1541 #if KMP_USE_FUTEX
1542   else if ((__kmp_user_lock_kind == lk_futex) &&
1543            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1544     lck = (kmp_user_lock_p)crit;
1545   }
1546 #endif
1547   else { // ticket, queuing or drdpa
1548     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1549   }
1550 
1551   KMP_ASSERT(lck != NULL);
1552 
1553   if (__kmp_env_consistency_check)
1554     __kmp_pop_sync(global_tid, ct_critical, loc);
1555 
1556 #if USE_ITT_BUILD
1557   __kmp_itt_critical_releasing(lck);
1558 #endif /* USE_ITT_BUILD */
  // The value of 'crit' can be used directly as the critical_id of the
  // critical section directive.
1561   __kmp_release_user_lock_with_checks(lck, global_tid);
1562 
1563 #endif // KMP_USE_DYNAMIC_LOCK
1564 
1565 #if OMPT_SUPPORT && OMPT_OPTIONAL
1566   /* OMPT release event triggers after lock is released; place here to trigger
1567    * for all #if branches */
1568   OMPT_STORE_RETURN_ADDRESS(global_tid);
1569   if (ompt_enabled.ompt_callback_mutex_released) {
1570     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1571         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1572         OMPT_LOAD_RETURN_ADDRESS(0));
1573   }
1574 #endif
1575 
1576   KMP_POP_PARTITIONED_TIMER();
1577   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1578 }
1579 
1580 /*!
1581 @ingroup SYNCHRONIZATION
1582 @param loc source location information
1583 @param global_tid thread id.
1584 @return one if the thread should execute the master block, zero otherwise
1585 
1586 Start execution of a combined barrier and master. The barrier is executed inside
1587 this function.
1588 */
1589 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1590   int status;
1591   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1592   __kmp_assert_valid_gtid(global_tid);
1593 
1594   if (!TCR_4(__kmp_init_parallel))
1595     __kmp_parallel_initialize();
1596 
1597   __kmp_resume_if_soft_paused();
1598 
1599   if (__kmp_env_consistency_check)
1600     __kmp_check_barrier(global_tid, ct_barrier, loc);
1601 
1602 #if OMPT_SUPPORT
1603   ompt_frame_t *ompt_frame;
1604   if (ompt_enabled.enabled) {
1605     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1606     if (ompt_frame->enter_frame.ptr == NULL)
1607       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1608     OMPT_STORE_RETURN_ADDRESS(global_tid);
1609   }
1610 #endif
1611 #if USE_ITT_NOTIFY
1612   __kmp_threads[global_tid]->th.th_ident = loc;
1613 #endif
1614   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1615 #if OMPT_SUPPORT && OMPT_OPTIONAL
1616   if (ompt_enabled.enabled) {
1617     ompt_frame->enter_frame = ompt_data_none;
1618   }
1619 #endif
1620 
1621   return (status != 0) ? 0 : 1;
1622 }
1623 
1624 /*!
1625 @ingroup SYNCHRONIZATION
1626 @param loc source location information
1627 @param global_tid thread id.
1628 
1629 Complete the execution of a combined barrier and master. This function should
1630 only be called at the completion of the <tt>master</tt> code. Other threads will
1631 still be waiting at the barrier and this call releases them.
1632 */
1633 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1634   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1635   __kmp_assert_valid_gtid(global_tid);
1636   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1637 }
1638 
1639 /*!
1640 @ingroup SYNCHRONIZATION
1641 @param loc source location information
1642 @param global_tid thread id.
1643 @return one if the thread should execute the master block, zero otherwise
1644 
1645 Start execution of a combined barrier and master(nowait) construct.
1646 The barrier is executed inside this function.
There is no equivalent "end" function, since the compiler does not emit a
matching end call for this construct; the bookkeeping that __kmpc_end_master
would otherwise perform is done inside this function instead.
*/
1649 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1650   kmp_int32 ret;
1651   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1652   __kmp_assert_valid_gtid(global_tid);
1653 
1654   if (!TCR_4(__kmp_init_parallel))
1655     __kmp_parallel_initialize();
1656 
1657   __kmp_resume_if_soft_paused();
1658 
1659   if (__kmp_env_consistency_check) {
1660     if (loc == 0) {
1661       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1662     }
1663     __kmp_check_barrier(global_tid, ct_barrier, loc);
1664   }
1665 
1666 #if OMPT_SUPPORT
1667   ompt_frame_t *ompt_frame;
1668   if (ompt_enabled.enabled) {
1669     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1670     if (ompt_frame->enter_frame.ptr == NULL)
1671       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1672     OMPT_STORE_RETURN_ADDRESS(global_tid);
1673   }
1674 #endif
1675 #if USE_ITT_NOTIFY
1676   __kmp_threads[global_tid]->th.th_ident = loc;
1677 #endif
1678   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1679 #if OMPT_SUPPORT && OMPT_OPTIONAL
1680   if (ompt_enabled.enabled) {
1681     ompt_frame->enter_frame = ompt_data_none;
1682   }
1683 #endif
1684 
1685   ret = __kmpc_master(loc, global_tid);
1686 
1687   if (__kmp_env_consistency_check) {
    /* There is no __kmpc_end_master call for this construct, so the
       consistency-check actions of __kmpc_end_master are done here. */
1690     if (ret) {
1691       /* only one thread should do the pop since only */
1692       /* one did the push (see __kmpc_master())       */
1693       __kmp_pop_sync(global_tid, ct_master, loc);
1694     }
1695   }
1696 
1697   return (ret);
1698 }
1699 
1700 /* The BARRIER for a SINGLE process section is always explicit   */
1701 /*!
1702 @ingroup WORK_SHARING
1703 @param loc  source location information
1704 @param global_tid  global thread number
1705 @return One if this thread should execute the single construct, zero otherwise.
1706 
1707 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
1710 */
1711 
1712 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1713   __kmp_assert_valid_gtid(global_tid);
1714   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1715 
1716   if (rc) {
1717     // We are going to execute the single statement, so we should count it.
1718     KMP_COUNT_BLOCK(OMP_SINGLE);
1719     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1720   }
1721 
1722 #if OMPT_SUPPORT && OMPT_OPTIONAL
1723   kmp_info_t *this_thr = __kmp_threads[global_tid];
1724   kmp_team_t *team = this_thr->th.th_team;
1725   int tid = __kmp_tid_from_gtid(global_tid);
1726 
1727   if (ompt_enabled.enabled) {
1728     if (rc) {
1729       if (ompt_enabled.ompt_callback_work) {
1730         ompt_callbacks.ompt_callback(ompt_callback_work)(
1731             ompt_work_single_executor, ompt_scope_begin,
1732             &(team->t.ompt_team_info.parallel_data),
1733             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1734             1, OMPT_GET_RETURN_ADDRESS(0));
1735       }
1736     } else {
1737       if (ompt_enabled.ompt_callback_work) {
1738         ompt_callbacks.ompt_callback(ompt_callback_work)(
1739             ompt_work_single_other, ompt_scope_begin,
1740             &(team->t.ompt_team_info.parallel_data),
1741             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1742             1, OMPT_GET_RETURN_ADDRESS(0));
1743         ompt_callbacks.ompt_callback(ompt_callback_work)(
1744             ompt_work_single_other, ompt_scope_end,
1745             &(team->t.ompt_team_info.parallel_data),
1746             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1747             1, OMPT_GET_RETURN_ADDRESS(0));
1748       }
1749     }
1750   }
1751 #endif
1752 
1753   return rc;
1754 }
1755 
1756 /*!
1757 @ingroup WORK_SHARING
1758 @param loc  source location information
1759 @param global_tid  global thread number
1760 
1761 Mark the end of a <tt>single</tt> construct.  This function should
1762 only be called by the thread that executed the block of code protected
1763 by the `single` construct.
1764 */
1765 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1766   __kmp_assert_valid_gtid(global_tid);
1767   __kmp_exit_single(global_tid);
1768   KMP_POP_PARTITIONED_TIMER();
1769 
1770 #if OMPT_SUPPORT && OMPT_OPTIONAL
1771   kmp_info_t *this_thr = __kmp_threads[global_tid];
1772   kmp_team_t *team = this_thr->th.th_team;
1773   int tid = __kmp_tid_from_gtid(global_tid);
1774 
1775   if (ompt_enabled.ompt_callback_work) {
1776     ompt_callbacks.ompt_callback(ompt_callback_work)(
1777         ompt_work_single_executor, ompt_scope_end,
1778         &(team->t.ompt_team_info.parallel_data),
1779         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1780         OMPT_GET_RETURN_ADDRESS(0));
1781   }
1782 #endif
1783 }
1784 
1785 /*!
1786 @ingroup WORK_SHARING
1787 @param loc Source location
1788 @param global_tid Global thread id
1789 
1790 Mark the end of a statically scheduled loop.
1791 */
1792 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1793   KMP_POP_PARTITIONED_TIMER();
1794   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1795 
1796 #if OMPT_SUPPORT && OMPT_OPTIONAL
1797   if (ompt_enabled.ompt_callback_work) {
1798     ompt_work_t ompt_work_type = ompt_work_loop;
1799     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1800     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1801     // Determine workshare type
1802     if (loc != NULL) {
1803       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1804         ompt_work_type = ompt_work_loop;
1805       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1806         ompt_work_type = ompt_work_sections;
1807       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1808         ompt_work_type = ompt_work_distribute;
1809       } else {
1810         // use default set above.
1811         // a warning about this case is provided in __kmpc_for_static_init
1812       }
1813       KMP_DEBUG_ASSERT(ompt_work_type);
1814     }
1815     ompt_callbacks.ompt_callback(ompt_callback_work)(
1816         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1817         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1818   }
1819 #endif
1820   if (__kmp_env_consistency_check)
1821     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1822 }
1823 
1824 // User routines which take C-style arguments (call by value)
1825 // different from the Fortran equivalent routines
1826 
1827 void ompc_set_num_threads(int arg) {
1828   // !!!!! TODO: check the per-task binding
1829   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1830 }
1831 
1832 void ompc_set_dynamic(int flag) {
1833   kmp_info_t *thread;
1834 
1835   /* For the thread-private implementation of the internal controls */
1836   thread = __kmp_entry_thread();
1837 
1838   __kmp_save_internal_controls(thread);
1839 
1840   set__dynamic(thread, flag ? TRUE : FALSE);
1841 }
1842 
1843 void ompc_set_nested(int flag) {
1844   kmp_info_t *thread;
1845 
1846   /* For the thread-private internal controls implementation */
1847   thread = __kmp_entry_thread();
1848 
1849   __kmp_save_internal_controls(thread);
1850 
1851   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1852 }
1853 
1854 void ompc_set_max_active_levels(int max_active_levels) {
1855   /* TO DO */
1856   /* we want per-task implementation of this internal control */
1857 
1858   /* For the per-thread internal controls implementation */
1859   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1860 }
1861 
1862 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1863   // !!!!! TODO: check the per-task binding
1864   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1865 }
1866 
1867 int ompc_get_ancestor_thread_num(int level) {
1868   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1869 }
1870 
1871 int ompc_get_team_size(int level) {
1872   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1873 }
1874 
1875 /* OpenMP 5.0 Affinity Format API */
1876 
1877 void ompc_set_affinity_format(char const *format) {
1878   if (!__kmp_init_serial) {
1879     __kmp_serial_initialize();
1880   }
1881   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1882                          format, KMP_STRLEN(format) + 1);
1883 }
1884 
1885 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1886   size_t format_size;
1887   if (!__kmp_init_serial) {
1888     __kmp_serial_initialize();
1889   }
1890   format_size = KMP_STRLEN(__kmp_affinity_format);
1891   if (buffer && size) {
1892     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1893                            format_size + 1);
1894   }
1895   return format_size;
1896 }
1897 
1898 void ompc_display_affinity(char const *format) {
1899   int gtid;
1900   if (!TCR_4(__kmp_init_middle)) {
1901     __kmp_middle_initialize();
1902   }
1903   gtid = __kmp_get_gtid();
1904   __kmp_aux_display_affinity(gtid, format);
1905 }
1906 
1907 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1908                              char const *format) {
1909   int gtid;
1910   size_t num_required;
1911   kmp_str_buf_t capture_buf;
1912   if (!TCR_4(__kmp_init_middle)) {
1913     __kmp_middle_initialize();
1914   }
1915   gtid = __kmp_get_gtid();
1916   __kmp_str_buf_init(&capture_buf);
1917   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1918   if (buffer && buf_size) {
1919     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1920                            capture_buf.used + 1);
1921   }
1922   __kmp_str_buf_free(&capture_buf);
1923   return num_required;
1924 }
1925 
1926 void kmpc_set_stacksize(int arg) {
1927   // __kmp_aux_set_stacksize initializes the library if needed
1928   __kmp_aux_set_stacksize(arg);
1929 }
1930 
1931 void kmpc_set_stacksize_s(size_t arg) {
1932   // __kmp_aux_set_stacksize initializes the library if needed
1933   __kmp_aux_set_stacksize(arg);
1934 }
1935 
1936 void kmpc_set_blocktime(int arg) {
1937   int gtid, tid;
1938   kmp_info_t *thread;
1939 
1940   gtid = __kmp_entry_gtid();
1941   tid = __kmp_tid_from_gtid(gtid);
1942   thread = __kmp_thread_from_gtid(gtid);
1943 
1944   __kmp_aux_set_blocktime(arg, thread, tid);
1945 }
1946 
1947 void kmpc_set_library(int arg) {
1948   // __kmp_user_set_library initializes the library if needed
1949   __kmp_user_set_library((enum library_type)arg);
1950 }
1951 
1952 void kmpc_set_defaults(char const *str) {
1953   // __kmp_aux_set_defaults initializes the library if needed
1954   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1955 }
1956 
1957 void kmpc_set_disp_num_buffers(int arg) {
1958   // ignore after initialization because some teams have already
1959   // allocated dispatch buffers
1960   if (__kmp_init_serial == 0 && arg > 0)
1961     __kmp_dispatch_num_buffers = arg;
1962 }
1963 
1964 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1965 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1966   return -1;
1967 #else
1968   if (!TCR_4(__kmp_init_middle)) {
1969     __kmp_middle_initialize();
1970   }
1971   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1972 #endif
1973 }
1974 
1975 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1976 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1977   return -1;
1978 #else
1979   if (!TCR_4(__kmp_init_middle)) {
1980     __kmp_middle_initialize();
1981   }
1982   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1983 #endif
1984 }
1985 
1986 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1987 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1988   return -1;
1989 #else
1990   if (!TCR_4(__kmp_init_middle)) {
1991     __kmp_middle_initialize();
1992   }
1993   return __kmp_aux_get_affinity_mask_proc(proc, mask);
1994 #endif
1995 }
1996 
1997 /* -------------------------------------------------------------------------- */
1998 /*!
1999 @ingroup THREADPRIVATE
2000 @param loc       source location information
2001 @param gtid      global thread number
2002 @param cpy_size  size of the cpy_data buffer
2003 @param cpy_data  pointer to data to be copied
2004 @param cpy_func  helper function to call for copying data
2005 @param didit     flag variable: 1=single thread; 0=not single thread
2006 
2007 __kmpc_copyprivate implements the interface for the private data broadcast
2008 needed for the copyprivate clause associated with a single region in an
2009 OpenMP<sup>*</sup> program (both C and Fortran).
2010 All threads participating in the parallel region call this routine.
2011 One of the threads (called the single thread) should have the <tt>didit</tt>
2012 variable set to 1 and all other threads should have that variable set to 0.
2013 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2014 
2015 The OpenMP specification forbids the use of nowait on the single region when a
2016 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2017 barrier internally to avoid race conditions, so the code generation for the
2018 single region should avoid generating a barrier after the call to @ref
2019 __kmpc_copyprivate.
2020 
2021 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2022 The <tt>loc</tt> parameter is a pointer to source location information.
2023 
Internal implementation: the single thread first copies its descriptor address
(cpy_data) to a team-private location; after a barrier, each of the other
threads calls the function pointed to by the parameter cpy_func, which performs
the copy using the two threads' cpy_data buffers.
2028 
2029 The cpy_func routine used for the copy and the contents of the data area defined
2030 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2031 to be done. For instance, the cpy_data buffer can hold the actual data to be
2032 copied or it may hold a list of pointers to the data. The cpy_func routine must
2033 interpret the cpy_data buffer appropriately.
2034 
2035 The interface to cpy_func is as follows:
2036 @code
2037 void cpy_func( void *destination, void *source )
2038 @endcode
2039 where void *destination is the cpy_data pointer for the thread being copied to
2040 and void *source is the cpy_data pointer for the thread being copied from.
2041 */
2042 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2043                         void *cpy_data, void (*cpy_func)(void *, void *),
2044                         kmp_int32 didit) {
2045   void **data_ptr;
2046   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2047   __kmp_assert_valid_gtid(gtid);
2048 
2049   KMP_MB();
2050 
2051   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2052 
2053   if (__kmp_env_consistency_check) {
2054     if (loc == 0) {
2055       KMP_WARNING(ConstructIdentInvalid);
2056     }
2057   }
2058 
  // TODO: Optimize the following two barriers into some kind of split barrier
2060 
2061   if (didit)
2062     *data_ptr = cpy_data;
2063 
2064 #if OMPT_SUPPORT
2065   ompt_frame_t *ompt_frame;
2066   if (ompt_enabled.enabled) {
2067     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2068     if (ompt_frame->enter_frame.ptr == NULL)
2069       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2070     OMPT_STORE_RETURN_ADDRESS(gtid);
2071   }
2072 #endif
2073 /* This barrier is not a barrier region boundary */
2074 #if USE_ITT_NOTIFY
2075   __kmp_threads[gtid]->th.th_ident = loc;
2076 #endif
2077   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2078 
2079   if (!didit)
2080     (*cpy_func)(cpy_data, *data_ptr);
2081 
2082 // Consider next barrier a user-visible barrier for barrier region boundaries
2083 // Nesting checks are already handled by the single construct checks
2084 
2085 #if OMPT_SUPPORT
2086   if (ompt_enabled.enabled) {
2087     OMPT_STORE_RETURN_ADDRESS(gtid);
2088   }
2089 #endif
2090 #if USE_ITT_NOTIFY
  // TODO: check if this is needed (e.g. tasks can overwrite the location)
  __kmp_threads[gtid]->th.th_ident = loc;
2093 #endif
2094   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2095 #if OMPT_SUPPORT && OMPT_OPTIONAL
2096   if (ompt_enabled.enabled) {
2097     ompt_frame->enter_frame = ompt_data_none;
2098   }
2099 #endif
2100 }
2101 
2102 /* -------------------------------------------------------------------------- */
2103 
2104 #define INIT_LOCK __kmp_init_user_lock_with_checks
2105 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2106 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2107 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2108 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2109 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2110   __kmp_acquire_nested_user_lock_with_checks_timed
2111 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2112 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2113 #define TEST_LOCK __kmp_test_user_lock_with_checks
2114 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2115 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2116 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2117 
2118 // TODO: Make check abort messages use location info & pass it into
2119 // with_checks routines
2120 
2121 #if KMP_USE_DYNAMIC_LOCK
2122 
2123 // internal lock initializer
2124 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2125                                                     kmp_dyna_lockseq_t seq) {
2126   if (KMP_IS_D_LOCK(seq)) {
2127     KMP_INIT_D_LOCK(lock, seq);
2128 #if USE_ITT_BUILD
2129     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2130 #endif
2131   } else {
2132     KMP_INIT_I_LOCK(lock, seq);
2133 #if USE_ITT_BUILD
2134     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2135     __kmp_itt_lock_creating(ilk->lock, loc);
2136 #endif
2137   }
2138 }
2139 
2140 // internal nest lock initializer
2141 static __forceinline void
2142 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2143                                kmp_dyna_lockseq_t seq) {
2144 #if KMP_USE_TSX
2145   // Don't have nested lock implementation for speculative locks
2146   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2147     seq = __kmp_user_lock_seq;
2148 #endif
2149   switch (seq) {
2150   case lockseq_tas:
2151     seq = lockseq_nested_tas;
2152     break;
2153 #if KMP_USE_FUTEX
2154   case lockseq_futex:
2155     seq = lockseq_nested_futex;
2156     break;
2157 #endif
2158   case lockseq_ticket:
2159     seq = lockseq_nested_ticket;
2160     break;
2161   case lockseq_queuing:
2162     seq = lockseq_nested_queuing;
2163     break;
2164   case lockseq_drdpa:
2165     seq = lockseq_nested_drdpa;
2166     break;
2167   default:
2168     seq = lockseq_nested_queuing;
2169   }
2170   KMP_INIT_I_LOCK(lock, seq);
2171 #if USE_ITT_BUILD
2172   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2173   __kmp_itt_lock_creating(ilk->lock, loc);
2174 #endif
2175 }
2176 
2177 /* initialize the lock with a hint */
2178 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2179                                 uintptr_t hint) {
2180   KMP_DEBUG_ASSERT(__kmp_init_serial);
2181   if (__kmp_env_consistency_check && user_lock == NULL) {
2182     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2183   }
2184 
2185   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2186 
2187 #if OMPT_SUPPORT && OMPT_OPTIONAL
2188   // This is the case, if called from omp_init_lock_with_hint:
2189   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2190   if (!codeptr)
2191     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2192   if (ompt_enabled.ompt_callback_lock_init) {
2193     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2194         ompt_mutex_lock, (omp_lock_hint_t)hint,
2195         __ompt_get_mutex_impl_type(user_lock),
2196         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2197   }
2198 #endif
2199 }
2200 
2201 /* initialize the lock with a hint */
2202 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2203                                      void **user_lock, uintptr_t hint) {
2204   KMP_DEBUG_ASSERT(__kmp_init_serial);
2205   if (__kmp_env_consistency_check && user_lock == NULL) {
2206     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2207   }
2208 
2209   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2210 
2211 #if OMPT_SUPPORT && OMPT_OPTIONAL
2212   // This is the case, if called from omp_init_lock_with_hint:
2213   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2214   if (!codeptr)
2215     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2216   if (ompt_enabled.ompt_callback_lock_init) {
2217     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2218         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2219         __ompt_get_mutex_impl_type(user_lock),
2220         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2221   }
2222 #endif
2223 }
2224 
2225 #endif // KMP_USE_DYNAMIC_LOCK
2226 
2227 /* initialize the lock */
2228 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2229 #if KMP_USE_DYNAMIC_LOCK
2230 
2231   KMP_DEBUG_ASSERT(__kmp_init_serial);
2232   if (__kmp_env_consistency_check && user_lock == NULL) {
2233     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2234   }
2235   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2236 
2237 #if OMPT_SUPPORT && OMPT_OPTIONAL
2238   // This is the case, if called from omp_init_lock_with_hint:
2239   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2240   if (!codeptr)
2241     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2242   if (ompt_enabled.ompt_callback_lock_init) {
2243     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2244         ompt_mutex_lock, omp_lock_hint_none,
2245         __ompt_get_mutex_impl_type(user_lock),
2246         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2247   }
2248 #endif
2249 
2250 #else // KMP_USE_DYNAMIC_LOCK
2251 
2252   static char const *const func = "omp_init_lock";
2253   kmp_user_lock_p lck;
2254   KMP_DEBUG_ASSERT(__kmp_init_serial);
2255 
2256   if (__kmp_env_consistency_check) {
2257     if (user_lock == NULL) {
2258       KMP_FATAL(LockIsUninitialized, func);
2259     }
2260   }
2261 
2262   KMP_CHECK_USER_LOCK_INIT();
2263 
2264   if ((__kmp_user_lock_kind == lk_tas) &&
2265       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2266     lck = (kmp_user_lock_p)user_lock;
2267   }
2268 #if KMP_USE_FUTEX
2269   else if ((__kmp_user_lock_kind == lk_futex) &&
2270            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2271     lck = (kmp_user_lock_p)user_lock;
2272   }
2273 #endif
2274   else {
2275     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2276   }
2277   INIT_LOCK(lck);
2278   __kmp_set_user_lock_location(lck, loc);
2279 
2280 #if OMPT_SUPPORT && OMPT_OPTIONAL
2281   // This is the case, if called from omp_init_lock_with_hint:
2282   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2283   if (!codeptr)
2284     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2285   if (ompt_enabled.ompt_callback_lock_init) {
2286     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2287         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2288         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2289   }
2290 #endif
2291 
2292 #if USE_ITT_BUILD
2293   __kmp_itt_lock_creating(lck);
2294 #endif /* USE_ITT_BUILD */
2295 
2296 #endif // KMP_USE_DYNAMIC_LOCK
2297 } // __kmpc_init_lock
2298 
2299 /* initialize the lock */
2300 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2301 #if KMP_USE_DYNAMIC_LOCK
2302 
2303   KMP_DEBUG_ASSERT(__kmp_init_serial);
2304   if (__kmp_env_consistency_check && user_lock == NULL) {
2305     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2306   }
2307   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2308 
2309 #if OMPT_SUPPORT && OMPT_OPTIONAL
2310   // This is the case, if called from omp_init_lock_with_hint:
2311   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2312   if (!codeptr)
2313     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2314   if (ompt_enabled.ompt_callback_lock_init) {
2315     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2316         ompt_mutex_nest_lock, omp_lock_hint_none,
2317         __ompt_get_mutex_impl_type(user_lock),
2318         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2319   }
2320 #endif
2321 
2322 #else // KMP_USE_DYNAMIC_LOCK
2323 
2324   static char const *const func = "omp_init_nest_lock";
2325   kmp_user_lock_p lck;
2326   KMP_DEBUG_ASSERT(__kmp_init_serial);
2327 
2328   if (__kmp_env_consistency_check) {
2329     if (user_lock == NULL) {
2330       KMP_FATAL(LockIsUninitialized, func);
2331     }
2332   }
2333 
2334   KMP_CHECK_USER_LOCK_INIT();
2335 
2336   if ((__kmp_user_lock_kind == lk_tas) &&
2337       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2338        OMP_NEST_LOCK_T_SIZE)) {
2339     lck = (kmp_user_lock_p)user_lock;
2340   }
2341 #if KMP_USE_FUTEX
2342   else if ((__kmp_user_lock_kind == lk_futex) &&
2343            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2344             OMP_NEST_LOCK_T_SIZE)) {
2345     lck = (kmp_user_lock_p)user_lock;
2346   }
2347 #endif
2348   else {
2349     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2350   }
2351 
2352   INIT_NESTED_LOCK(lck);
2353   __kmp_set_user_lock_location(lck, loc);
2354 
2355 #if OMPT_SUPPORT && OMPT_OPTIONAL
2356   // This is the case, if called from omp_init_lock_with_hint:
2357   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2358   if (!codeptr)
2359     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2360   if (ompt_enabled.ompt_callback_lock_init) {
2361     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2362         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2363         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2364   }
2365 #endif
2366 
2367 #if USE_ITT_BUILD
2368   __kmp_itt_lock_creating(lck);
2369 #endif /* USE_ITT_BUILD */
2370 
2371 #endif // KMP_USE_DYNAMIC_LOCK
2372 } // __kmpc_init_nest_lock
2373 
2374 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2375 #if KMP_USE_DYNAMIC_LOCK
2376 
2377 #if USE_ITT_BUILD
2378   kmp_user_lock_p lck;
2379   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2380     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2381   } else {
2382     lck = (kmp_user_lock_p)user_lock;
2383   }
2384   __kmp_itt_lock_destroyed(lck);
2385 #endif
2386 #if OMPT_SUPPORT && OMPT_OPTIONAL
2387   // This is the case, if called from omp_init_lock_with_hint:
2388   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2389   if (!codeptr)
2390     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2391   if (ompt_enabled.ompt_callback_lock_destroy) {
2392     kmp_user_lock_p lck;
2393     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2394       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2395     } else {
2396       lck = (kmp_user_lock_p)user_lock;
2397     }
2398     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2399         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2400   }
2401 #endif
2402   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2403 #else
2404   kmp_user_lock_p lck;
2405 
2406   if ((__kmp_user_lock_kind == lk_tas) &&
2407       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2408     lck = (kmp_user_lock_p)user_lock;
2409   }
2410 #if KMP_USE_FUTEX
2411   else if ((__kmp_user_lock_kind == lk_futex) &&
2412            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2413     lck = (kmp_user_lock_p)user_lock;
2414   }
2415 #endif
2416   else {
2417     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2418   }
2419 
2420 #if OMPT_SUPPORT && OMPT_OPTIONAL
2421   // This is the case, if called from omp_init_lock_with_hint:
2422   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2423   if (!codeptr)
2424     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2425   if (ompt_enabled.ompt_callback_lock_destroy) {
2426     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2427         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2428   }
2429 #endif
2430 
2431 #if USE_ITT_BUILD
2432   __kmp_itt_lock_destroyed(lck);
2433 #endif /* USE_ITT_BUILD */
2434   DESTROY_LOCK(lck);
2435 
2436   if ((__kmp_user_lock_kind == lk_tas) &&
2437       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2438     ;
2439   }
2440 #if KMP_USE_FUTEX
2441   else if ((__kmp_user_lock_kind == lk_futex) &&
2442            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2443     ;
2444   }
2445 #endif
2446   else {
2447     __kmp_user_lock_free(user_lock, gtid, lck);
2448   }
2449 #endif // KMP_USE_DYNAMIC_LOCK
2450 } // __kmpc_destroy_lock
2451 
2452 /* destroy the lock */
2453 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2454 #if KMP_USE_DYNAMIC_LOCK
2455 
2456 #if USE_ITT_BUILD
2457   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2458   __kmp_itt_lock_destroyed(ilk->lock);
2459 #endif
2460 #if OMPT_SUPPORT && OMPT_OPTIONAL
2461   // This is the case, if called from omp_init_lock_with_hint:
2462   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2463   if (!codeptr)
2464     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2465   if (ompt_enabled.ompt_callback_lock_destroy) {
2466     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2467         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2468   }
2469 #endif
2470   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2471 
2472 #else // KMP_USE_DYNAMIC_LOCK
2473 
2474   kmp_user_lock_p lck;
2475 
2476   if ((__kmp_user_lock_kind == lk_tas) &&
2477       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2478        OMP_NEST_LOCK_T_SIZE)) {
2479     lck = (kmp_user_lock_p)user_lock;
2480   }
2481 #if KMP_USE_FUTEX
2482   else if ((__kmp_user_lock_kind == lk_futex) &&
2483            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2484             OMP_NEST_LOCK_T_SIZE)) {
2485     lck = (kmp_user_lock_p)user_lock;
2486   }
2487 #endif
2488   else {
2489     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2490   }
2491 
2492 #if OMPT_SUPPORT && OMPT_OPTIONAL
2493   // This is the case, if called from omp_init_lock_with_hint:
2494   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2495   if (!codeptr)
2496     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2497   if (ompt_enabled.ompt_callback_lock_destroy) {
2498     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2499         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2500   }
2501 #endif
2502 
2503 #if USE_ITT_BUILD
2504   __kmp_itt_lock_destroyed(lck);
2505 #endif /* USE_ITT_BUILD */
2506 
2507   DESTROY_NESTED_LOCK(lck);
2508 
2509   if ((__kmp_user_lock_kind == lk_tas) &&
2510       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2511        OMP_NEST_LOCK_T_SIZE)) {
2512     ;
2513   }
2514 #if KMP_USE_FUTEX
2515   else if ((__kmp_user_lock_kind == lk_futex) &&
2516            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2517             OMP_NEST_LOCK_T_SIZE)) {
2518     ;
2519   }
2520 #endif
2521   else {
2522     __kmp_user_lock_free(user_lock, gtid, lck);
2523   }
2524 #endif // KMP_USE_DYNAMIC_LOCK
2525 } // __kmpc_destroy_nest_lock
2526 
2527 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2528   KMP_COUNT_BLOCK(OMP_set_lock);
2529 #if KMP_USE_DYNAMIC_LOCK
2530   int tag = KMP_EXTRACT_D_TAG(user_lock);
2531 #if USE_ITT_BUILD
2532   __kmp_itt_lock_acquiring(
2533       (kmp_user_lock_p)
2534           user_lock); // itt function will get to the right lock object.
2535 #endif
2536 #if OMPT_SUPPORT && OMPT_OPTIONAL
2537   // This is the case, if called from omp_init_lock_with_hint:
2538   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2539   if (!codeptr)
2540     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2541   if (ompt_enabled.ompt_callback_mutex_acquire) {
2542     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2543         ompt_mutex_lock, omp_lock_hint_none,
2544         __ompt_get_mutex_impl_type(user_lock),
2545         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2546   }
2547 #endif
2548 #if KMP_USE_INLINED_TAS
2549   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2550     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2551   } else
2552 #elif KMP_USE_INLINED_FUTEX
2553   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2554     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2555   } else
2556 #endif
2557   {
2558     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2559   }
2560 #if USE_ITT_BUILD
2561   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2562 #endif
2563 #if OMPT_SUPPORT && OMPT_OPTIONAL
2564   if (ompt_enabled.ompt_callback_mutex_acquired) {
2565     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2566         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2567   }
2568 #endif
2569 
2570 #else // KMP_USE_DYNAMIC_LOCK
2571 
2572   kmp_user_lock_p lck;
2573 
2574   if ((__kmp_user_lock_kind == lk_tas) &&
2575       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2576     lck = (kmp_user_lock_p)user_lock;
2577   }
2578 #if KMP_USE_FUTEX
2579   else if ((__kmp_user_lock_kind == lk_futex) &&
2580            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2581     lck = (kmp_user_lock_p)user_lock;
2582   }
2583 #endif
2584   else {
2585     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2586   }
2587 
2588 #if USE_ITT_BUILD
2589   __kmp_itt_lock_acquiring(lck);
2590 #endif /* USE_ITT_BUILD */
2591 #if OMPT_SUPPORT && OMPT_OPTIONAL
2592   // This is the case, if called from omp_init_lock_with_hint:
2593   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2594   if (!codeptr)
2595     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2596   if (ompt_enabled.ompt_callback_mutex_acquire) {
2597     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2598         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2599         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2600   }
2601 #endif
2602 
2603   ACQUIRE_LOCK(lck, gtid);
2604 
2605 #if USE_ITT_BUILD
2606   __kmp_itt_lock_acquired(lck);
2607 #endif /* USE_ITT_BUILD */
2608 
2609 #if OMPT_SUPPORT && OMPT_OPTIONAL
2610   if (ompt_enabled.ompt_callback_mutex_acquired) {
2611     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2612         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2613   }
2614 #endif
2615 
2616 #endif // KMP_USE_DYNAMIC_LOCK
2617 }
2618 
2619 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2620 #if KMP_USE_DYNAMIC_LOCK
2621 
2622 #if USE_ITT_BUILD
2623   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2624 #endif
2625 #if OMPT_SUPPORT && OMPT_OPTIONAL
2626   // This is the case, if called from omp_init_lock_with_hint:
2627   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2628   if (!codeptr)
2629     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2630   if (ompt_enabled.enabled) {
2631     if (ompt_enabled.ompt_callback_mutex_acquire) {
2632       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2633           ompt_mutex_nest_lock, omp_lock_hint_none,
2634           __ompt_get_mutex_impl_type(user_lock),
2635           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2636     }
2637   }
2638 #endif
2639   int acquire_status =
2640       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2641   (void) acquire_status;
2642 #if USE_ITT_BUILD
2643   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2644 #endif
2645 
2646 #if OMPT_SUPPORT && OMPT_OPTIONAL
2647   if (ompt_enabled.enabled) {
2648     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2649       if (ompt_enabled.ompt_callback_mutex_acquired) {
2650         // lock_first
2651         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2652             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2653             codeptr);
2654       }
2655     } else {
2656       if (ompt_enabled.ompt_callback_nest_lock) {
2657         // lock_next
2658         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2659             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2660       }
2661     }
2662   }
2663 #endif
2664 
2665 #else // KMP_USE_DYNAMIC_LOCK
2666   int acquire_status;
2667   kmp_user_lock_p lck;
2668 
2669   if ((__kmp_user_lock_kind == lk_tas) &&
2670       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2671        OMP_NEST_LOCK_T_SIZE)) {
2672     lck = (kmp_user_lock_p)user_lock;
2673   }
2674 #if KMP_USE_FUTEX
2675   else if ((__kmp_user_lock_kind == lk_futex) &&
2676            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2677             OMP_NEST_LOCK_T_SIZE)) {
2678     lck = (kmp_user_lock_p)user_lock;
2679   }
2680 #endif
2681   else {
2682     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2683   }
2684 
2685 #if USE_ITT_BUILD
2686   __kmp_itt_lock_acquiring(lck);
2687 #endif /* USE_ITT_BUILD */
2688 #if OMPT_SUPPORT && OMPT_OPTIONAL
2689   // This is the case, if called from omp_init_lock_with_hint:
2690   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2691   if (!codeptr)
2692     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2693   if (ompt_enabled.enabled) {
2694     if (ompt_enabled.ompt_callback_mutex_acquire) {
2695       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2696           ompt_mutex_nest_lock, omp_lock_hint_none,
2697           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2698           codeptr);
2699     }
2700   }
2701 #endif
2702 
2703   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2704 
2705 #if USE_ITT_BUILD
2706   __kmp_itt_lock_acquired(lck);
2707 #endif /* USE_ITT_BUILD */
2708 
2709 #if OMPT_SUPPORT && OMPT_OPTIONAL
2710   if (ompt_enabled.enabled) {
2711     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2712       if (ompt_enabled.ompt_callback_mutex_acquired) {
2713         // lock_first
2714         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2715             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2716       }
2717     } else {
2718       if (ompt_enabled.ompt_callback_nest_lock) {
2719         // lock_next
2720         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2721             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2722       }
2723     }
2724   }
2725 #endif
2726 
2727 #endif // KMP_USE_DYNAMIC_LOCK
2728 }
2729 
2730 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2731 #if KMP_USE_DYNAMIC_LOCK
2732 
2733   int tag = KMP_EXTRACT_D_TAG(user_lock);
2734 #if USE_ITT_BUILD
2735   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2736 #endif
2737 #if KMP_USE_INLINED_TAS
2738   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2739     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2740   } else
2741 #elif KMP_USE_INLINED_FUTEX
2742   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2743     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2744   } else
2745 #endif
2746   {
2747     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2748   }
2749 
2750 #if OMPT_SUPPORT && OMPT_OPTIONAL
2751   // This is the case, if called from omp_init_lock_with_hint:
2752   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2753   if (!codeptr)
2754     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2755   if (ompt_enabled.ompt_callback_mutex_released) {
2756     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2757         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2758   }
2759 #endif
2760 
2761 #else // KMP_USE_DYNAMIC_LOCK
2762 
2763   kmp_user_lock_p lck;
2764 
2765   /* Can't use serial interval since not block structured */
2766   /* release the lock */
2767 
2768   if ((__kmp_user_lock_kind == lk_tas) &&
2769       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2770 #if KMP_OS_LINUX &&                                                            \
2771     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2772 // "fast" path implemented to fix customer performance issue
2773 #if USE_ITT_BUILD
2774     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2775 #endif /* USE_ITT_BUILD */
2776     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2777     KMP_MB();
2778 
2779 #if OMPT_SUPPORT && OMPT_OPTIONAL
2780     // This is the case, if called from omp_init_lock_with_hint:
2781     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2782     if (!codeptr)
2783       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2784     if (ompt_enabled.ompt_callback_mutex_released) {
2785       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2786           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2787     }
2788 #endif
2789 
2790     return;
2791 #else
2792     lck = (kmp_user_lock_p)user_lock;
2793 #endif
2794   }
2795 #if KMP_USE_FUTEX
2796   else if ((__kmp_user_lock_kind == lk_futex) &&
2797            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2798     lck = (kmp_user_lock_p)user_lock;
2799   }
2800 #endif
2801   else {
2802     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2803   }
2804 
2805 #if USE_ITT_BUILD
2806   __kmp_itt_lock_releasing(lck);
2807 #endif /* USE_ITT_BUILD */
2808 
2809   RELEASE_LOCK(lck, gtid);
2810 
2811 #if OMPT_SUPPORT && OMPT_OPTIONAL
2812   // This is the case, if called from omp_init_lock_with_hint:
2813   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2814   if (!codeptr)
2815     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2816   if (ompt_enabled.ompt_callback_mutex_released) {
2817     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2818         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2819   }
2820 #endif
2821 
2822 #endif // KMP_USE_DYNAMIC_LOCK
2823 }
2824 
2825 /* release the lock */
2826 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2827 #if KMP_USE_DYNAMIC_LOCK
2828 
2829 #if USE_ITT_BUILD
2830   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2831 #endif
2832   int release_status =
2833       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2834   (void) release_status;
2835 
2836 #if OMPT_SUPPORT && OMPT_OPTIONAL
2837   // This is the case, if called from omp_init_lock_with_hint:
2838   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2839   if (!codeptr)
2840     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2841   if (ompt_enabled.enabled) {
2842     if (release_status == KMP_LOCK_RELEASED) {
2843       if (ompt_enabled.ompt_callback_mutex_released) {
2844         // release_lock_last
2845         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2846             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2847             codeptr);
2848       }
2849     } else if (ompt_enabled.ompt_callback_nest_lock) {
2850       // release_lock_prev
2851       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2852           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2853     }
2854   }
2855 #endif
2856 
2857 #else // KMP_USE_DYNAMIC_LOCK
2858 
2859   kmp_user_lock_p lck;
2860 
2861   /* Can't use serial interval since not block structured */
2862 
2863   if ((__kmp_user_lock_kind == lk_tas) &&
2864       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2865        OMP_NEST_LOCK_T_SIZE)) {
2866 #if KMP_OS_LINUX &&                                                            \
2867     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2868     // "fast" path implemented to fix customer performance issue
2869     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2870 #if USE_ITT_BUILD
2871     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2872 #endif /* USE_ITT_BUILD */
2873 
2874 #if OMPT_SUPPORT && OMPT_OPTIONAL
2875     int release_status = KMP_LOCK_STILL_HELD;
2876 #endif
2877 
2878     if (--(tl->lk.depth_locked) == 0) {
2879       TCW_4(tl->lk.poll, 0);
2880 #if OMPT_SUPPORT && OMPT_OPTIONAL
2881       release_status = KMP_LOCK_RELEASED;
2882 #endif
2883     }
2884     KMP_MB();
2885 
2886 #if OMPT_SUPPORT && OMPT_OPTIONAL
2887     // This is the case, if called from omp_init_lock_with_hint:
2888     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2889     if (!codeptr)
2890       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2891     if (ompt_enabled.enabled) {
2892       if (release_status == KMP_LOCK_RELEASED) {
2893         if (ompt_enabled.ompt_callback_mutex_released) {
2894           // release_lock_last
2895           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2896               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2897         }
2898       } else if (ompt_enabled.ompt_callback_nest_lock) {
2899         // release_lock_previous
2900         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2902       }
2903     }
2904 #endif
2905 
2906     return;
2907 #else
2908     lck = (kmp_user_lock_p)user_lock;
2909 #endif
2910   }
2911 #if KMP_USE_FUTEX
2912   else if ((__kmp_user_lock_kind == lk_futex) &&
2913            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2914             OMP_NEST_LOCK_T_SIZE)) {
2915     lck = (kmp_user_lock_p)user_lock;
2916   }
2917 #endif
2918   else {
2919     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2920   }
2921 
2922 #if USE_ITT_BUILD
2923   __kmp_itt_lock_releasing(lck);
2924 #endif /* USE_ITT_BUILD */
2925 
2926   int release_status;
2927   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2928 #if OMPT_SUPPORT && OMPT_OPTIONAL
2929   // This is the case, if called from omp_init_lock_with_hint:
2930   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2931   if (!codeptr)
2932     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2933   if (ompt_enabled.enabled) {
2934     if (release_status == KMP_LOCK_RELEASED) {
2935       if (ompt_enabled.ompt_callback_mutex_released) {
2936         // release_lock_last
2937         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2938             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2939       }
2940     } else if (ompt_enabled.ompt_callback_nest_lock) {
2941       // release_lock_previous
2942       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2944     }
2945   }
2946 #endif
2947 
2948 #endif // KMP_USE_DYNAMIC_LOCK
2949 }
2950 
2951 /* try to acquire the lock */
2952 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2953   KMP_COUNT_BLOCK(OMP_test_lock);
2954 
2955 #if KMP_USE_DYNAMIC_LOCK
2956   int rc;
2957   int tag = KMP_EXTRACT_D_TAG(user_lock);
2958 #if USE_ITT_BUILD
2959   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2960 #endif
2961 #if OMPT_SUPPORT && OMPT_OPTIONAL
2962   // This is the case, if called from omp_init_lock_with_hint:
2963   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2964   if (!codeptr)
2965     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2966   if (ompt_enabled.ompt_callback_mutex_acquire) {
2967     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2968         ompt_mutex_lock, omp_lock_hint_none,
2969         __ompt_get_mutex_impl_type(user_lock),
2970         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2971   }
2972 #endif
2973 #if KMP_USE_INLINED_TAS
2974   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2975     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2976   } else
2977 #elif KMP_USE_INLINED_FUTEX
2978   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2979     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2980   } else
2981 #endif
2982   {
2983     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2984   }
2985   if (rc) {
2986 #if USE_ITT_BUILD
2987     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2988 #endif
2989 #if OMPT_SUPPORT && OMPT_OPTIONAL
2990     if (ompt_enabled.ompt_callback_mutex_acquired) {
2991       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2992           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2993     }
2994 #endif
2995     return FTN_TRUE;
2996   } else {
2997 #if USE_ITT_BUILD
2998     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2999 #endif
3000     return FTN_FALSE;
3001   }
3002 
3003 #else // KMP_USE_DYNAMIC_LOCK
3004 
3005   kmp_user_lock_p lck;
3006   int rc;
3007 
3008   if ((__kmp_user_lock_kind == lk_tas) &&
3009       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3010     lck = (kmp_user_lock_p)user_lock;
3011   }
3012 #if KMP_USE_FUTEX
3013   else if ((__kmp_user_lock_kind == lk_futex) &&
3014            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3015     lck = (kmp_user_lock_p)user_lock;
3016   }
3017 #endif
3018   else {
3019     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3020   }
3021 
3022 #if USE_ITT_BUILD
3023   __kmp_itt_lock_acquiring(lck);
3024 #endif /* USE_ITT_BUILD */
3025 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // The return address may not have been stored, e.g. when called from
  // omp_init_lock_with_hint:
3027   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3028   if (!codeptr)
3029     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3030   if (ompt_enabled.ompt_callback_mutex_acquire) {
3031     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3032         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3033         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3034   }
3035 #endif
3036 
3037   rc = TEST_LOCK(lck, gtid);
3038 #if USE_ITT_BUILD
3039   if (rc) {
3040     __kmp_itt_lock_acquired(lck);
3041   } else {
3042     __kmp_itt_lock_cancelled(lck);
3043   }
3044 #endif /* USE_ITT_BUILD */
3045 #if OMPT_SUPPORT && OMPT_OPTIONAL
3046   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3047     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3048         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3049   }
3050 #endif
3051 
3052   return (rc ? FTN_TRUE : FTN_FALSE);
3053 
3054 /* Can't use serial interval since not block structured */
3055 
3056 #endif // KMP_USE_DYNAMIC_LOCK
3057 }
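
/* Caller-side sketch (illustrative only; the "omp_test_lock" lookup string
   above suggests this entry point backs omp_test_lock(), but the exact
   lowering is compiler-specific). The companion __kmpc_init_lock() /
   __kmpc_unset_lock() entry points are assumed to manage the same storage:

     void **lock = ...;                       // previously initialized lock
     if (__kmpc_test_lock(loc, gtid, lock)) { // FTN_TRUE if acquired
       ... guarded work ...
       __kmpc_unset_lock(loc, gtid, lock);
     }

   __kmpc_test_nest_lock() below returns the nesting count instead:
   0 on failure, 1 on first acquisition, >1 on a nested re-acquisition. */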
3058 
3059 /* try to acquire the lock */
3060 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3061 #if KMP_USE_DYNAMIC_LOCK
3062   int rc;
3063 #if USE_ITT_BUILD
3064   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3065 #endif
3066 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // The return address may not have been stored, e.g. when called from
  // omp_init_lock_with_hint:
3068   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3069   if (!codeptr)
3070     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3071   if (ompt_enabled.ompt_callback_mutex_acquire) {
3072     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3073         ompt_mutex_nest_lock, omp_lock_hint_none,
3074         __ompt_get_mutex_impl_type(user_lock),
3075         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3076   }
3077 #endif
3078   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3079 #if USE_ITT_BUILD
3080   if (rc) {
3081     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3082   } else {
3083     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3084   }
3085 #endif
3086 #if OMPT_SUPPORT && OMPT_OPTIONAL
3087   if (ompt_enabled.enabled && rc) {
3088     if (rc == 1) {
3089       if (ompt_enabled.ompt_callback_mutex_acquired) {
3090         // lock_first
3091         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3092             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3093             codeptr);
3094       }
3095     } else {
3096       if (ompt_enabled.ompt_callback_nest_lock) {
3097         // lock_next
3098         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3099             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3100       }
3101     }
3102   }
3103 #endif
3104   return rc;
3105 
3106 #else // KMP_USE_DYNAMIC_LOCK
3107 
3108   kmp_user_lock_p lck;
3109   int rc;
3110 
3111   if ((__kmp_user_lock_kind == lk_tas) &&
3112       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3113        OMP_NEST_LOCK_T_SIZE)) {
3114     lck = (kmp_user_lock_p)user_lock;
3115   }
3116 #if KMP_USE_FUTEX
3117   else if ((__kmp_user_lock_kind == lk_futex) &&
3118            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3119             OMP_NEST_LOCK_T_SIZE)) {
3120     lck = (kmp_user_lock_p)user_lock;
3121   }
3122 #endif
3123   else {
3124     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3125   }
3126 
3127 #if USE_ITT_BUILD
3128   __kmp_itt_lock_acquiring(lck);
3129 #endif /* USE_ITT_BUILD */
3130 
3131 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // The return address may not have been stored, e.g. when called from
  // omp_init_lock_with_hint:
3133   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3134   if (!codeptr)
3135     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
        codeptr);
  }
3143 #endif
3144 
3145   rc = TEST_NESTED_LOCK(lck, gtid);
3146 #if USE_ITT_BUILD
3147   if (rc) {
3148     __kmp_itt_lock_acquired(lck);
3149   } else {
3150     __kmp_itt_lock_cancelled(lck);
3151   }
3152 #endif /* USE_ITT_BUILD */
3153 #if OMPT_SUPPORT && OMPT_OPTIONAL
3154   if (ompt_enabled.enabled && rc) {
3155     if (rc == 1) {
3156       if (ompt_enabled.ompt_callback_mutex_acquired) {
3157         // lock_first
3158         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3159             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3160       }
3161     } else {
3162       if (ompt_enabled.ompt_callback_nest_lock) {
3163         // lock_next
3164         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3165             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3166       }
3167     }
3168   }
3169 #endif
3170   return rc;
3171 
3172 /* Can't use serial interval since not block structured */
3173 
3174 #endif // KMP_USE_DYNAMIC_LOCK
3175 }
3176 
3177 // Interface to fast scalable reduce methods routines
3178 
3179 // keep the selected method in a thread local structure for cross-function
3180 // usage: will be used in __kmpc_end_reduce* functions;
3181 // another solution: to re-determine the method one more time in
3182 // __kmpc_end_reduce* functions (new prototype required then)
3183 // AT: which solution is better?
3184 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3185   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3186 
3187 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3188   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3189 
3190 // description of the packed_reduction_method variable: look at the macros in
3191 // kmp.h
3192 
3193 // used in a critical section reduce block
3194 static __forceinline void
3195 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3196                                           kmp_critical_name *crit) {
3197 
3198   // this lock was visible to a customer and to the threading profile tool as a
3199   // serial overhead span (although it's used for an internal purpose only)
3200   //            why was it visible in previous implementation?
3201   //            should we keep it visible in new reduce block?
3202   kmp_user_lock_p lck;
3203 
3204 #if KMP_USE_DYNAMIC_LOCK
3205 
3206   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3207   // Check if it is initialized.
3208   if (*lk == 0) {
3209     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3210       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3211                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3212     } else {
3213       __kmp_init_indirect_csptr(crit, loc, global_tid,
3214                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3215     }
3216   }
3217   // Branch for accessing the actual lock object and set operation. This
3218   // branching is inevitable since this lock initialization does not follow the
3219   // normal dispatch path (lock table is not used).
3220   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3221     lck = (kmp_user_lock_p)lk;
3222     KMP_DEBUG_ASSERT(lck != NULL);
3223     if (__kmp_env_consistency_check) {
3224       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3225     }
3226     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3227   } else {
3228     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3229     lck = ilk->lock;
3230     KMP_DEBUG_ASSERT(lck != NULL);
3231     if (__kmp_env_consistency_check) {
3232       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3233     }
3234     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3235   }
3236 
3237 #else // KMP_USE_DYNAMIC_LOCK
3238 
3239   // We know that the fast reduction code is only emitted by Intel compilers
3240   // with 32 byte critical sections. If there isn't enough space, then we
3241   // have to use a pointer.
3242   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3243     lck = (kmp_user_lock_p)crit;
3244   } else {
3245     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3246   }
3247   KMP_DEBUG_ASSERT(lck != NULL);
3248 
3249   if (__kmp_env_consistency_check)
3250     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3251 
3252   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3253 
3254 #endif // KMP_USE_DYNAMIC_LOCK
3255 }
3256 
3257 // used in a critical section reduce block
3258 static __forceinline void
3259 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3260                                         kmp_critical_name *crit) {
3261 
3262   kmp_user_lock_p lck;
3263 
3264 #if KMP_USE_DYNAMIC_LOCK
3265 
3266   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3267     lck = (kmp_user_lock_p)crit;
3268     if (__kmp_env_consistency_check)
3269       __kmp_pop_sync(global_tid, ct_critical, loc);
3270     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3271   } else {
3272     kmp_indirect_lock_t *ilk =
3273         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3274     if (__kmp_env_consistency_check)
3275       __kmp_pop_sync(global_tid, ct_critical, loc);
3276     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3277   }
3278 
3279 #else // KMP_USE_DYNAMIC_LOCK
3280 
3281   // We know that the fast reduction code is only emitted by Intel compilers
3282   // with 32 byte critical sections. If there isn't enough space, then we have
3283   // to use a pointer.
3284   if (__kmp_base_user_lock_size > 32) {
3285     lck = *((kmp_user_lock_p *)crit);
3286     KMP_ASSERT(lck != NULL);
3287   } else {
3288     lck = (kmp_user_lock_p)crit;
3289   }
3290 
3291   if (__kmp_env_consistency_check)
3292     __kmp_pop_sync(global_tid, ct_critical, loc);
3293 
3294   __kmp_release_user_lock_with_checks(lck, global_tid);
3295 
3296 #endif // KMP_USE_DYNAMIC_LOCK
3297 } // __kmp_end_critical_section_reduce_block
3298 
3299 static __forceinline int
3300 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3301                                      int *task_state) {
3302   kmp_team_t *team;
3303 
  // Check whether we are inside a teams construct.
3305   if (th->th.th_teams_microtask) {
3306     *team_p = team = th->th.th_team;
3307     if (team->t.t_level == th->th.th_teams_level) {
3308       // This is reduction at teams construct.
3309       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3310       // Let's swap teams temporarily for the reduction.
3311       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3312       th->th.th_team = team->t.t_parent;
3313       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3314       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3315       *task_state = th->th.th_task_state;
3316       th->th.th_task_state = 0;
3317 
3318       return 1;
3319     }
3320   }
3321   return 0;
3322 }
3323 
3324 static __forceinline void
3325 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3326   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3327   th->th.th_info.ds.ds_tid = 0;
3328   th->th.th_team = team;
3329   th->th.th_team_nproc = team->t.t_nproc;
3330   th->th.th_task_team = team->t.t_task_team[task_state];
3331   th->th.th_task_state = task_state;
3332 }
3333 
3334 /* 2.a.i. Reduce Block without a terminating barrier */
3335 /*!
3336 @ingroup SYNCHRONIZATION
3337 @param loc source location information
3338 @param global_tid global thread number
3339 @param num_vars number of items (variables) to be reduced
3340 @param reduce_size size of data in bytes to be reduced
3341 @param reduce_data pointer to data to be reduced
3342 @param reduce_func callback function providing reduction operation on two
3343 operands and returning result of reduction in lhs_data
3344 @param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction is needed
3347 
3348 The nowait version is used for a reduce clause with the nowait argument.
3349 */
3350 kmp_int32
3351 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3352                      size_t reduce_size, void *reduce_data,
3353                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3354                      kmp_critical_name *lck) {
3355 
3356   KMP_COUNT_BLOCK(REDUCE_nowait);
3357   int retval = 0;
3358   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3359   kmp_info_t *th;
3360   kmp_team_t *team;
3361   int teams_swapped = 0, task_state;
3362   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3363   __kmp_assert_valid_gtid(global_tid);
3364 
  // why do we need this initialization here at all?
  // Reduction clause cannot be used as a stand-alone directive.
3367 
3368   // do not call __kmp_serial_initialize(), it will be called by
3369   // __kmp_parallel_initialize() if needed
3370   // possible detection of false-positive race by the threadchecker ???
3371   if (!TCR_4(__kmp_init_parallel))
3372     __kmp_parallel_initialize();
3373 
3374   __kmp_resume_if_soft_paused();
3375 
3376 // check correctness of reduce block nesting
3377 #if KMP_USE_DYNAMIC_LOCK
3378   if (__kmp_env_consistency_check)
3379     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3380 #else
3381   if (__kmp_env_consistency_check)
3382     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3383 #endif
3384 
3385   th = __kmp_thread_from_gtid(global_tid);
3386   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3387 
  // The packed_reduction_method value will be reused by the __kmpc_end_reduce*
  // functions, so the value has to be kept in a variable.
  // That variable should be either a construct-specific or a thread-specific
  // property, not a team-specific property
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on that construct)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?)
  //     (if a construct-specific or team-specific variable were shared,
  //     unnecessary extra syncs would be needed)
  // a thread-specific variable is better regarding both issues above (next
  // construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes the 'determine' and 'set' lines (no need to restrict
  // this to one thread; that avoids unnecessary extra syncs)
3403 
3404   packed_reduction_method = __kmp_determine_reduction_method(
3405       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3406   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3407 
3408   OMPT_REDUCTION_DECL(th, global_tid);
3409   if (packed_reduction_method == critical_reduce_block) {
3410 
3411     OMPT_REDUCTION_BEGIN;
3412 
3413     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3414     retval = 1;
3415 
3416   } else if (packed_reduction_method == empty_reduce_block) {
3417 
3418     OMPT_REDUCTION_BEGIN;
3419 
3420     // usage: if team size == 1, no synchronization is required ( Intel
3421     // platforms only )
3422     retval = 1;
3423 
3424   } else if (packed_reduction_method == atomic_reduce_block) {
3425 
3426     retval = 2;
3427 
    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (this is not quite right, because the checking block is closed by
    //     this 'pop' while the atomic operation has not been executed yet;
    //     it will be executed slightly later, literally on the next
    //     instruction)
3434     if (__kmp_env_consistency_check)
3435       __kmp_pop_sync(global_tid, ct_reduce, loc);
3436 
3437   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3438                                    tree_reduce_block)) {
3439 
3440 // AT: performance issue: a real barrier here
3441 // AT:     (if master goes slow, other threads are blocked here waiting for the
3442 // master to come and release them)
3443 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3444 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3445 // be confusing to a customer)
3446 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3447 // might go faster and be more in line with sense of NOWAIT
3448 // AT: TO DO: do epcc test and compare times
3449 
3450 // this barrier should be invisible to a customer and to the threading profile
3451 // tool (it's neither a terminating barrier nor customer's code, it's
3452 // used for an internal purpose)
3453 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
3455     // JP: as long as there is a barrier in the implementation, OMPT should and
3456     // will provide the barrier events
3457     //         so we set-up the necessary frame/return addresses.
3458     ompt_frame_t *ompt_frame;
3459     if (ompt_enabled.enabled) {
3460       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3461       if (ompt_frame->enter_frame.ptr == NULL)
3462         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3463       OMPT_STORE_RETURN_ADDRESS(global_tid);
3464     }
3465 #endif
3466 #if USE_ITT_NOTIFY
3467     __kmp_threads[global_tid]->th.th_ident = loc;
3468 #endif
3469     retval =
3470         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3471                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3472     retval = (retval != 0) ? (0) : (1);
3473 #if OMPT_SUPPORT && OMPT_OPTIONAL
3474     if (ompt_enabled.enabled) {
3475       ompt_frame->enter_frame = ompt_data_none;
3476     }
3477 #endif
3478 
    // all workers except the master should do this pop here
    //     (none of them will get to __kmpc_end_reduce_nowait())
3481     if (__kmp_env_consistency_check) {
3482       if (retval == 0) {
3483         __kmp_pop_sync(global_tid, ct_reduce, loc);
3484       }
3485     }
3486 
3487   } else {
3488 
3489     // should never reach this block
3490     KMP_ASSERT(0); // "unexpected method"
3491   }
3492   if (teams_swapped) {
3493     __kmp_restore_swapped_teams(th, team, task_state);
3494   }
3495   KA_TRACE(
3496       10,
3497       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3498        global_tid, packed_reduction_method, retval));
3499 
3500   return retval;
3501 }
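
/* Caller-side protocol sketch (illustrative; the exact code a compiler emits
   for a reduction clause with nowait differs, but the return values documented
   above imply this shape):

     int ret = __kmpc_reduce_nowait(loc, gtid, n, size, data, reduce_fn, &crit);
     if (ret == 1) {        // this thread combines the partial results
       ... reduce_fn() or inlined reduction code ...
       __kmpc_end_reduce_nowait(loc, gtid, &crit);
     } else if (ret == 2) { // every thread updates the result atomically
       ... atomic update, no end call (see __kmpc_end_reduce_nowait below) ...
     }                      // ret == 0: nothing more to do
*/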
3502 
3503 /*!
3504 @ingroup SYNCHRONIZATION
3505 @param loc source location information
3506 @param global_tid global thread id.
3507 @param lck pointer to the unique lock data structure
3508 
3509 Finish the execution of a reduce nowait.
3510 */
3511 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3512                               kmp_critical_name *lck) {
3513 
3514   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3515 
3516   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3517   __kmp_assert_valid_gtid(global_tid);
3518 
3519   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3520 
3521   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3522 
3523   if (packed_reduction_method == critical_reduce_block) {
3524 
3525     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3526     OMPT_REDUCTION_END;
3527 
3528   } else if (packed_reduction_method == empty_reduce_block) {
3529 
3530     // usage: if team size == 1, no synchronization is required ( on Intel
3531     // platforms only )
3532 
3533     OMPT_REDUCTION_END;
3534 
3535   } else if (packed_reduction_method == atomic_reduce_block) {
3536 
    // neither the master nor the other workers should get here
    //     (code gen does not generate this call in case 2: atomic reduce block)
    // it would actually be better to remove this else-if entirely;
    // after removal this value would be caught by the 'else' branch and assert
3541 
3542   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3543                                    tree_reduce_block)) {
3544 
3545     // only master gets here
3546     // OMPT: tree reduction is annotated in the barrier code
3547 
3548   } else {
3549 
3550     // should never reach this block
3551     KMP_ASSERT(0); // "unexpected method"
3552   }
3553 
3554   if (__kmp_env_consistency_check)
3555     __kmp_pop_sync(global_tid, ct_reduce, loc);
3556 
3557   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3558                 global_tid, packed_reduction_method));
3559 
3560   return;
3561 }
3562 
3563 /* 2.a.ii. Reduce Block with a terminating barrier */
3564 
3565 /*!
3566 @ingroup SYNCHRONIZATION
3567 @param loc source location information
3568 @param global_tid global thread number
3569 @param num_vars number of items (variables) to be reduced
3570 @param reduce_size size of data in bytes to be reduced
3571 @param reduce_data pointer to data to be reduced
3572 @param reduce_func callback function providing reduction operation on two
3573 operands and returning result of reduction in lhs_data
3574 @param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction is needed
3577 
3578 A blocking reduce that includes an implicit barrier.
3579 */
3580 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3581                         size_t reduce_size, void *reduce_data,
3582                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3583                         kmp_critical_name *lck) {
3584   KMP_COUNT_BLOCK(REDUCE_wait);
3585   int retval = 0;
3586   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3587   kmp_info_t *th;
3588   kmp_team_t *team;
3589   int teams_swapped = 0, task_state;
3590 
3591   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3592   __kmp_assert_valid_gtid(global_tid);
3593 
  // why do we need this initialization here at all?
  // Reduction clause cannot be a stand-alone directive.
3596 
3597   // do not call __kmp_serial_initialize(), it will be called by
3598   // __kmp_parallel_initialize() if needed
3599   // possible detection of false-positive race by the threadchecker ???
3600   if (!TCR_4(__kmp_init_parallel))
3601     __kmp_parallel_initialize();
3602 
3603   __kmp_resume_if_soft_paused();
3604 
3605 // check correctness of reduce block nesting
3606 #if KMP_USE_DYNAMIC_LOCK
3607   if (__kmp_env_consistency_check)
3608     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3609 #else
3610   if (__kmp_env_consistency_check)
3611     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3612 #endif
3613 
3614   th = __kmp_thread_from_gtid(global_tid);
3615   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3616 
3617   packed_reduction_method = __kmp_determine_reduction_method(
3618       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3619   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3620 
3621   OMPT_REDUCTION_DECL(th, global_tid);
3622 
3623   if (packed_reduction_method == critical_reduce_block) {
3624 
3625     OMPT_REDUCTION_BEGIN;
3626     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3627     retval = 1;
3628 
3629   } else if (packed_reduction_method == empty_reduce_block) {
3630 
3631     OMPT_REDUCTION_BEGIN;
3632     // usage: if team size == 1, no synchronization is required ( Intel
3633     // platforms only )
3634     retval = 1;
3635 
3636   } else if (packed_reduction_method == atomic_reduce_block) {
3637 
3638     retval = 2;
3639 
3640   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3641                                    tree_reduce_block)) {
3642 
3643 // case tree_reduce_block:
3644 // this barrier should be visible to a customer and to the threading profile
3645 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3646 #if OMPT_SUPPORT
3647     ompt_frame_t *ompt_frame;
3648     if (ompt_enabled.enabled) {
3649       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3650       if (ompt_frame->enter_frame.ptr == NULL)
3651         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3652       OMPT_STORE_RETURN_ADDRESS(global_tid);
3653     }
3654 #endif
3655 #if USE_ITT_NOTIFY
3656     __kmp_threads[global_tid]->th.th_ident =
3657         loc; // needed for correct notification of frames
3658 #endif
3659     retval =
3660         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3661                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3662     retval = (retval != 0) ? (0) : (1);
3663 #if OMPT_SUPPORT && OMPT_OPTIONAL
3664     if (ompt_enabled.enabled) {
3665       ompt_frame->enter_frame = ompt_data_none;
3666     }
3667 #endif
3668 
    // all workers except the master should do this pop here
    // (none of them will enter __kmpc_end_reduce())
3671     if (__kmp_env_consistency_check) {
3672       if (retval == 0) { // 0: all other workers; 1: master
3673         __kmp_pop_sync(global_tid, ct_reduce, loc);
3674       }
3675     }
3676 
3677   } else {
3678 
3679     // should never reach this block
3680     KMP_ASSERT(0); // "unexpected method"
3681   }
3682   if (teams_swapped) {
3683     __kmp_restore_swapped_teams(th, team, task_state);
3684   }
3685 
3686   KA_TRACE(10,
3687            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3688             global_tid, packed_reduction_method, retval));
3689   return retval;
3690 }
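
/* Caller-side protocol sketch for the blocking form (illustrative; inferred
   from the return values above and the handling in __kmpc_end_reduce() below,
   where even the atomic case supplies the terminating barrier):

     int ret = __kmpc_reduce(loc, gtid, n, size, data, reduce_fn, &crit);
     if (ret == 1) {        // this thread combines the partial results
       ... reduce_fn() or inlined reduction code ...
     } else if (ret == 2) { // every thread updates the result atomically
       ... atomic update ...
     }
     if (ret != 0)          // ret == 0: the barrier already happened above
       __kmpc_end_reduce(loc, gtid, &crit);
*/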
3691 
3692 /*!
3693 @ingroup SYNCHRONIZATION
3694 @param loc source location information
3695 @param global_tid global thread id.
3696 @param lck pointer to the unique lock data structure
3697 
3698 Finish the execution of a blocking reduce.
3699 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3700 start function.
3701 */
3702 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3703                        kmp_critical_name *lck) {
3704 
3705   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3706   kmp_info_t *th;
3707   kmp_team_t *team;
3708   int teams_swapped = 0, task_state;
3709 
3710   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3711   __kmp_assert_valid_gtid(global_tid);
3712 
3713   th = __kmp_thread_from_gtid(global_tid);
3714   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3715 
3716   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3717 
3718   // this barrier should be visible to a customer and to the threading profile
3719   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3720   OMPT_REDUCTION_DECL(th, global_tid);
3721 
3722   if (packed_reduction_method == critical_reduce_block) {
3723     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3724 
3725     OMPT_REDUCTION_END;
3726 
3727 // TODO: implicit barrier: should be exposed
3728 #if OMPT_SUPPORT
3729     ompt_frame_t *ompt_frame;
3730     if (ompt_enabled.enabled) {
3731       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3732       if (ompt_frame->enter_frame.ptr == NULL)
3733         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3734       OMPT_STORE_RETURN_ADDRESS(global_tid);
3735     }
3736 #endif
3737 #if USE_ITT_NOTIFY
3738     __kmp_threads[global_tid]->th.th_ident = loc;
3739 #endif
3740     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3741 #if OMPT_SUPPORT && OMPT_OPTIONAL
3742     if (ompt_enabled.enabled) {
3743       ompt_frame->enter_frame = ompt_data_none;
3744     }
3745 #endif
3746 
3747   } else if (packed_reduction_method == empty_reduce_block) {
3748 
3749     OMPT_REDUCTION_END;
3750 
3751 // usage: if team size==1, no synchronization is required (Intel platforms only)
3752 
3753 // TODO: implicit barrier: should be exposed
3754 #if OMPT_SUPPORT
3755     ompt_frame_t *ompt_frame;
3756     if (ompt_enabled.enabled) {
3757       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3758       if (ompt_frame->enter_frame.ptr == NULL)
3759         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3760       OMPT_STORE_RETURN_ADDRESS(global_tid);
3761     }
3762 #endif
3763 #if USE_ITT_NOTIFY
3764     __kmp_threads[global_tid]->th.th_ident = loc;
3765 #endif
3766     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3767 #if OMPT_SUPPORT && OMPT_OPTIONAL
3768     if (ompt_enabled.enabled) {
3769       ompt_frame->enter_frame = ompt_data_none;
3770     }
3771 #endif
3772 
3773   } else if (packed_reduction_method == atomic_reduce_block) {
3774 
3775 #if OMPT_SUPPORT
3776     ompt_frame_t *ompt_frame;
3777     if (ompt_enabled.enabled) {
3778       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3779       if (ompt_frame->enter_frame.ptr == NULL)
3780         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3781       OMPT_STORE_RETURN_ADDRESS(global_tid);
3782     }
3783 #endif
3784 // TODO: implicit barrier: should be exposed
3785 #if USE_ITT_NOTIFY
3786     __kmp_threads[global_tid]->th.th_ident = loc;
3787 #endif
3788     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3789 #if OMPT_SUPPORT && OMPT_OPTIONAL
3790     if (ompt_enabled.enabled) {
3791       ompt_frame->enter_frame = ompt_data_none;
3792     }
3793 #endif
3794 
3795   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3796                                    tree_reduce_block)) {
3797 
3798     // only master executes here (master releases all other workers)
3799     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3800                             global_tid);
3801 
3802   } else {
3803 
3804     // should never reach this block
3805     KMP_ASSERT(0); // "unexpected method"
3806   }
3807   if (teams_swapped) {
3808     __kmp_restore_swapped_teams(th, team, task_state);
3809   }
3810 
3811   if (__kmp_env_consistency_check)
3812     __kmp_pop_sync(global_tid, ct_reduce, loc);
3813 
3814   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3815                 global_tid, packed_reduction_method));
3816 
3817   return;
3818 }
3819 
3820 #undef __KMP_GET_REDUCTION_METHOD
3821 #undef __KMP_SET_REDUCTION_METHOD
3822 
3823 /* end of interface to fast scalable reduce routines */
3824 
3825 kmp_uint64 __kmpc_get_taskid() {
3826 
3827   kmp_int32 gtid;
3828   kmp_info_t *thread;
3829 
3830   gtid = __kmp_get_gtid();
3831   if (gtid < 0) {
3832     return 0;
3833   }
3834   thread = __kmp_thread_from_gtid(gtid);
3835   return thread->th.th_current_task->td_task_id;
3836 
3837 } // __kmpc_get_taskid
3838 
3839 kmp_uint64 __kmpc_get_parent_taskid() {
3840 
3841   kmp_int32 gtid;
3842   kmp_info_t *thread;
3843   kmp_taskdata_t *parent_task;
3844 
3845   gtid = __kmp_get_gtid();
3846   if (gtid < 0) {
3847     return 0;
3848   }
3849   thread = __kmp_thread_from_gtid(gtid);
3850   parent_task = thread->th.th_current_task->td_parent;
3851   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3852 
3853 } // __kmpc_get_parent_taskid
3854 
3855 /*!
3856 @ingroup WORK_SHARING
3857 @param loc  source location information.
3858 @param gtid  global thread number.
3859 @param num_dims  number of associated doacross loops.
3860 @param dims  info on loops bounds.
3861 
3862 Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for (i = 2; i < 9; i += 2) this gives lo = 2, up = 8, st = 2.
3865 */
3866 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3867                           const struct kmp_dim *dims) {
3868   __kmp_assert_valid_gtid(gtid);
3869   int j, idx;
3870   kmp_int64 last, trace_count;
3871   kmp_info_t *th = __kmp_threads[gtid];
3872   kmp_team_t *team = th->th.th_team;
3873   kmp_uint32 *flags;
3874   kmp_disp_t *pr_buf = th->th.th_dispatch;
3875   dispatch_shared_info_t *sh_buf;
3876 
3877   KA_TRACE(
3878       20,
3879       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3880        gtid, num_dims, !team->t.t_serialized));
3881   KMP_DEBUG_ASSERT(dims != NULL);
3882   KMP_DEBUG_ASSERT(num_dims > 0);
3883 
3884   if (team->t.t_serialized) {
3885     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3886     return; // no dependencies if team is serialized
3887   }
3888   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  // Increment the index of the shared buffer for the next loop
  idx = pr_buf->th_doacross_buf_idx++;
3891   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3892 
3893   // Save bounds info into allocated private buffer
3894   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3895   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3896       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3897   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
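  // Layout of th_doacross_info (kmp_int64 entries), as filled in below:
  //   [0]                 number of dimensions
  //   [1]                 address of sh_buf->doacross_num_done
  //   [2], [3], [4]       lo, up, st of dims[0]
  //   [4*j+1 .. 4*j+4]    range_length, lo, up, st of dims[j], for j >= 1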
3898   pr_buf->th_doacross_info[0] =
3899       (kmp_int64)num_dims; // first element is number of dimensions
  // Also save the address of num_done so it can be accessed later without
  // knowing the buffer index
3902   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3903   pr_buf->th_doacross_info[2] = dims[0].lo;
3904   pr_buf->th_doacross_info[3] = dims[0].up;
3905   pr_buf->th_doacross_info[4] = dims[0].st;
3906   last = 5;
3907   for (j = 1; j < num_dims; ++j) {
    kmp_int64 range_length; // range of each dim except the first, dims[0]
3910     if (dims[j].st == 1) { // most common case
3911       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3912       range_length = dims[j].up - dims[j].lo + 1;
3913     } else {
3914       if (dims[j].st > 0) {
3915         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3916         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3917       } else { // negative increment
3918         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3919         range_length =
3920             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3921       }
3922     }
3923     pr_buf->th_doacross_info[last++] = range_length;
3924     pr_buf->th_doacross_info[last++] = dims[j].lo;
3925     pr_buf->th_doacross_info[last++] = dims[j].up;
3926     pr_buf->th_doacross_info[last++] = dims[j].st;
3927   }
3928 
3929   // Compute total trip count.
3930   // Start with range of dims[0] which we don't need to keep in the buffer.
3931   if (dims[0].st == 1) { // most common case
3932     trace_count = dims[0].up - dims[0].lo + 1;
3933   } else if (dims[0].st > 0) {
3934     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3935     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3936   } else { // negative increment
3937     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3938     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3939   }
3940   for (j = 1; j < num_dims; ++j) {
3941     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3942   }
3943   KMP_DEBUG_ASSERT(trace_count > 0);
3944 
  // Check that the shared buffer is not still occupied by another loop
  // (the one that used buffer index idx - __kmp_dispatch_num_buffers)
3947   if (idx != sh_buf->doacross_buf_idx) {
3948     // Shared buffer is occupied, wait for it to be free
3949     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3950                  __kmp_eq_4, NULL);
3951   }
3952 #if KMP_32_BIT_ARCH
3953   // Check if we are the first thread. After the CAS the first thread gets 0,
3954   // others get 1 if initialization is in progress, allocated pointer otherwise.
3955   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3956   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3957       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3958 #else
3959   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3960       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3961 #endif
3962   if (flags == NULL) {
3963     // we are the first thread, allocate the array of flags
3964     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3965     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3966     KMP_MB();
3967     sh_buf->doacross_flags = flags;
3968   } else if (flags == (kmp_uint32 *)1) {
3969 #if KMP_32_BIT_ARCH
3970     // initialization is still in progress, need to wait
3971     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3972 #else
3973     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3974 #endif
3975       KMP_YIELD(TRUE);
3976     KMP_MB();
3977   } else {
3978     KMP_MB();
3979   }
3980   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  // Save a private copy so the shared buffer is not touched on each iteration.
  pr_buf->th_doacross_flags = sh_buf->doacross_flags;
3984   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3985 }
3986 
3987 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3988   __kmp_assert_valid_gtid(gtid);
3989   kmp_int32 shft, num_dims, i;
3990   kmp_uint32 flag;
3991   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3992   kmp_info_t *th = __kmp_threads[gtid];
3993   kmp_team_t *team = th->th.th_team;
3994   kmp_disp_t *pr_buf;
3995   kmp_int64 lo, up, st;
3996 
3997   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3998   if (team->t.t_serialized) {
3999     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4000     return; // no dependencies if team is serialized
4001   }
4002 
4003   // calculate sequential iteration number and check out-of-bounds condition
4004   pr_buf = th->th.th_dispatch;
4005   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4006   num_dims = pr_buf->th_doacross_info[0];
4007   lo = pr_buf->th_doacross_info[2];
4008   up = pr_buf->th_doacross_info[3];
4009   st = pr_buf->th_doacross_info[4];
4010 #if OMPT_SUPPORT && OMPT_OPTIONAL
4011   ompt_dependence_t deps[num_dims];
4012 #endif
4013   if (st == 1) { // most common case
4014     if (vec[0] < lo || vec[0] > up) {
4015       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4016                     "bounds [%lld,%lld]\n",
4017                     gtid, vec[0], lo, up));
4018       return;
4019     }
4020     iter_number = vec[0] - lo;
4021   } else if (st > 0) {
4022     if (vec[0] < lo || vec[0] > up) {
4023       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4024                     "bounds [%lld,%lld]\n",
4025                     gtid, vec[0], lo, up));
4026       return;
4027     }
4028     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4029   } else { // negative increment
4030     if (vec[0] > lo || vec[0] < up) {
4031       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4032                     "bounds [%lld,%lld]\n",
4033                     gtid, vec[0], lo, up));
4034       return;
4035     }
4036     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4037   }
4038 #if OMPT_SUPPORT && OMPT_OPTIONAL
4039   deps[0].variable.value = iter_number;
4040   deps[0].dependence_type = ompt_dependence_type_sink;
4041 #endif
4042   for (i = 1; i < num_dims; ++i) {
4043     kmp_int64 iter, ln;
4044     kmp_int32 j = i * 4;
4045     ln = pr_buf->th_doacross_info[j + 1];
4046     lo = pr_buf->th_doacross_info[j + 2];
4047     up = pr_buf->th_doacross_info[j + 3];
4048     st = pr_buf->th_doacross_info[j + 4];
4049     if (st == 1) {
4050       if (vec[i] < lo || vec[i] > up) {
4051         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4052                       "bounds [%lld,%lld]\n",
4053                       gtid, vec[i], lo, up));
4054         return;
4055       }
4056       iter = vec[i] - lo;
4057     } else if (st > 0) {
4058       if (vec[i] < lo || vec[i] > up) {
4059         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4060                       "bounds [%lld,%lld]\n",
4061                       gtid, vec[i], lo, up));
4062         return;
4063       }
4064       iter = (kmp_uint64)(vec[i] - lo) / st;
4065     } else { // st < 0
4066       if (vec[i] > lo || vec[i] < up) {
4067         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4068                       "bounds [%lld,%lld]\n",
4069                       gtid, vec[i], lo, up));
4070         return;
4071       }
4072       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4073     }
4074     iter_number = iter + ln * iter_number;
4075 #if OMPT_SUPPORT && OMPT_OPTIONAL
4076     deps[i].variable.value = iter;
4077     deps[i].dependence_type = ompt_dependence_type_sink;
4078 #endif
4079   }
4080   shft = iter_number % 32; // use 32-bit granularity
4081   iter_number >>= 5; // divided by 32
4082   flag = 1 << shft;
4083   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4084     KMP_YIELD(TRUE);
4085   }
4086   KMP_MB();
4087 #if OMPT_SUPPORT && OMPT_OPTIONAL
4088   if (ompt_enabled.ompt_callback_dependences) {
4089     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4090         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
4091   }
4092 #endif
4093   KA_TRACE(20,
4094            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4095             gtid, (iter_number << 5) + shft));
4096 }
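
/* Worked example of the encoding above (illustrative numbers): in a 2-D
   doacross nest whose second dimension has range length ln = 8, the iteration
   vector (3, 5) linearizes to iter_number = 5 + 8 * 3 = 29, i.e. bit 29 of
   flag word 0, while (4, 2) gives 2 + 8 * 4 = 34, i.e. bit 2 (34 % 32) of
   flag word 1 (34 >> 5). __kmpc_doacross_post() below sets the same bit for
   the matching source iteration, which releases this wait. */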
4097 
4098 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4099   __kmp_assert_valid_gtid(gtid);
4100   kmp_int32 shft, num_dims, i;
4101   kmp_uint32 flag;
4102   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4103   kmp_info_t *th = __kmp_threads[gtid];
4104   kmp_team_t *team = th->th.th_team;
4105   kmp_disp_t *pr_buf;
4106   kmp_int64 lo, st;
4107 
4108   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4109   if (team->t.t_serialized) {
4110     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4111     return; // no dependencies if team is serialized
4112   }
4113 
4114   // calculate sequential iteration number (same as in "wait" but no
4115   // out-of-bounds checks)
4116   pr_buf = th->th.th_dispatch;
4117   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4118   num_dims = pr_buf->th_doacross_info[0];
4119   lo = pr_buf->th_doacross_info[2];
4120   st = pr_buf->th_doacross_info[4];
4121 #if OMPT_SUPPORT && OMPT_OPTIONAL
4122   ompt_dependence_t deps[num_dims];
4123 #endif
4124   if (st == 1) { // most common case
4125     iter_number = vec[0] - lo;
4126   } else if (st > 0) {
4127     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4128   } else { // negative increment
4129     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4130   }
4131 #if OMPT_SUPPORT && OMPT_OPTIONAL
4132   deps[0].variable.value = iter_number;
4133   deps[0].dependence_type = ompt_dependence_type_source;
4134 #endif
4135   for (i = 1; i < num_dims; ++i) {
4136     kmp_int64 iter, ln;
4137     kmp_int32 j = i * 4;
4138     ln = pr_buf->th_doacross_info[j + 1];
4139     lo = pr_buf->th_doacross_info[j + 2];
4140     st = pr_buf->th_doacross_info[j + 4];
4141     if (st == 1) {
4142       iter = vec[i] - lo;
4143     } else if (st > 0) {
4144       iter = (kmp_uint64)(vec[i] - lo) / st;
4145     } else { // st < 0
4146       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4147     }
4148     iter_number = iter + ln * iter_number;
4149 #if OMPT_SUPPORT && OMPT_OPTIONAL
4150     deps[i].variable.value = iter;
4151     deps[i].dependence_type = ompt_dependence_type_source;
4152 #endif
4153   }
4154 #if OMPT_SUPPORT && OMPT_OPTIONAL
4155   if (ompt_enabled.ompt_callback_dependences) {
4156     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4157         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, num_dims);
4158   }
4159 #endif
4160   shft = iter_number % 32; // use 32-bit granularity
4161   iter_number >>= 5; // divided by 32
4162   flag = 1 << shft;
4163   KMP_MB();
4164   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4165     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4166   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4167                 (iter_number << 5) + shft));
4168 }
4169 
4170 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4171   __kmp_assert_valid_gtid(gtid);
4172   kmp_int32 num_done;
4173   kmp_info_t *th = __kmp_threads[gtid];
4174   kmp_team_t *team = th->th.th_team;
4175   kmp_disp_t *pr_buf = th->th.th_dispatch;
4176 
4177   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4178   if (team->t.t_serialized) {
4179     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4180     return; // nothing to do
4181   }
4182   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4183   if (num_done == th->th.th_team_nproc) {
4184     // we are the last thread, need to free shared resources
4185     int idx = pr_buf->th_doacross_buf_idx - 1;
4186     dispatch_shared_info_t *sh_buf =
4187         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4188     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4189                      (kmp_int64)&sh_buf->doacross_num_done);
4190     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4191     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4192     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4193     sh_buf->doacross_flags = NULL;
4194     sh_buf->doacross_num_done = 0;
4195     sh_buf->doacross_buf_idx +=
4196         __kmp_dispatch_num_buffers; // free buffer for future re-use
4197   }
4198   // free private resources (need to keep buffer index forever)
4199   pr_buf->th_doacross_flags = NULL;
4200   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4201   pr_buf->th_doacross_info = NULL;
4202   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4203 }
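
/* Illustrative call sequence for a doacross loop (a sketch of what a compiler
   might emit for "#pragma omp for ordered(1)" with depend(sink)/depend(source)
   clauses; the exact lowering and the kmp_dim initialization style are
   assumptions, not the output of any particular compiler):

     struct kmp_dim dims[1] = {{.lo = 0, .up = N - 1, .st = 1}};
     __kmpc_doacross_init(loc, gtid, 1, dims);
     for (each iteration i assigned to this thread) {
       kmp_int64 sink_vec[1] = {i - 1};
       __kmpc_doacross_wait(loc, gtid, sink_vec); // ordered depend(sink: i-1)
       ... loop body ...
       kmp_int64 src_vec[1] = {i};
       __kmpc_doacross_post(loc, gtid, src_vec);  // ordered depend(source)
     }
     __kmpc_doacross_fini(loc, gtid);
*/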
4204 
4205 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4206 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4207   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4208 }
4209 
4210 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4211   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4212 }
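
/* Minimal usage sketch for the C/C++ allocator entry points above, using the
   predefined omp_default_mem_alloc allocator (error handling omitted):

     double *buf = (double *)omp_alloc(1024 * sizeof(double),
                                       omp_default_mem_alloc);
     if (buf != NULL) {
       ... use buf ...
       omp_free(buf, omp_default_mem_alloc);
     }
*/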
4213 
4214 int __kmpc_get_target_offload(void) {
4215   if (!__kmp_init_serial) {
4216     __kmp_serial_initialize();
4217   }
4218   return __kmp_target_offload;
4219 }
4220 
4221 int __kmpc_pause_resource(kmp_pause_status_t level) {
4222   if (!__kmp_init_serial) {
4223     return 1; // Can't pause if runtime is not initialized
4224   }
4225   return __kmp_pause_resource(level);
4226 }
4227