1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
28 // flags will be used in future, e.g. to implement openmp_strict library
29 // restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
57  * Shutdown the runtime library. This is also optional, and even if called will
58  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
59  * zero.
60  */
void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
  // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
  // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
  // returns FALSE and __kmpc_end() will unregister this root (it can cause
  // library shut down).
  if (__kmp_ignore_mppend() == FALSE) {
    KC_TRACE(10, ("__kmpc_end: called\n"));
    KA_TRACE(30, ("__kmpc_end\n"));

    // gtid of -1 appears to let the runtime resolve the calling thread
    // itself — NOTE(review): confirm against __kmp_internal_end_thread().
    __kmp_internal_end_thread(-1);
  }
#if KMP_OS_WINDOWS && OMPT_SUPPORT
  // Normal exit process on Windows does not allow worker threads of the final
  // parallel region to finish reporting their events, so shutting down the
  // library here fixes the issue at least for the cases where __kmpc_end() is
  // placed properly.
  if (ompt_enabled.enabled)
    __kmp_internal_end_library(__kmp_gtid_get_specific());
#endif
}
82 
83 /*!
84 @ingroup THREAD_STATES
85 @param loc Source location information.
86 @return The global thread index of the active thread.
87 
88 This function can be called in any context.
89 
If the runtime has only been entered at the outermost level from a
91 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
92 that which would be returned by omp_get_thread_num() in the outermost
93 active parallel construct. (Or zero if there is no active parallel
94 construct, since the master thread is necessarily thread zero).
95 
96 If multiple non-OpenMP threads all enter an OpenMP construct then this
97 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
99 OpenMP thread ids returned by omp_get_thread_num()).
100 */
101 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
102   kmp_int32 gtid = __kmp_entry_gtid();
103 
104   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
105 
106   return gtid;
107 }
108 
109 /*!
110 @ingroup THREAD_STATES
111 @param loc Source location information.
112 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
113 
114 This function can be called in any context.
115 It returns the total number of threads under the control of the OpenMP runtime.
116 That is not a number that can be determined by any OpenMP standard calls, since
117 the library may be called from more than one non-OpenMP thread, and this
118 reflects the total over all such calls. Similarly the runtime maintains
119 underlying threads even when they are not active (since the cost of creating
120 and destroying OS threads is high), this call counts all such threads even if
121 they are not waiting for work.
122 */
123 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
124   KC_TRACE(10,
125            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
126 
127   return TCR_4(__kmp_all_nth);
128 }
129 
130 /*!
131 @ingroup THREAD_STATES
132 @param loc Source location information.
133 @return The thread number of the calling thread in the innermost active parallel
134 construct.
135 */
136 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
137   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
138   return __kmp_tid_from_gtid(__kmp_entry_gtid());
139 }
140 
141 /*!
142 @ingroup THREAD_STATES
143 @param loc Source location information.
144 @return The number of threads in the innermost active parallel construct.
145 */
146 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
147   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
148 
149   return __kmp_entry_thread()->th.th_team->t.t_nproc;
150 }
151 
152 /*!
153  * @ingroup DEPRECATED
154  * @param loc location description
155  *
156  * This function need not be called. It always returns TRUE.
157  */
158 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
159 #ifndef KMP_DEBUG
160 
161   return TRUE;
162 
163 #else
164 
165   const char *semi2;
166   const char *semi3;
167   int line_no;
168 
169   if (__kmp_par_range == 0) {
170     return TRUE;
171   }
172   semi2 = loc->psource;
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   semi2 = strchr(semi2, ';');
177   if (semi2 == NULL) {
178     return TRUE;
179   }
180   semi2 = strchr(semi2 + 1, ';');
181   if (semi2 == NULL) {
182     return TRUE;
183   }
184   if (__kmp_par_range_filename[0]) {
185     const char *name = semi2 - 1;
186     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
187       name--;
188     }
189     if ((*name == '/') || (*name == ';')) {
190       name++;
191     }
192     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
193       return __kmp_par_range < 0;
194     }
195   }
196   semi3 = strchr(semi2 + 1, ';');
197   if (__kmp_par_range_routine[0]) {
198     if ((semi3 != NULL) && (semi3 > semi2) &&
199         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
200       return __kmp_par_range < 0;
201     }
202   }
203   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
204     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
205       return __kmp_par_range > 0;
206     }
207     return __kmp_par_range < 0;
208   }
209   return TRUE;
210 
211 #endif /* KMP_DEBUG */
212 }
213 
214 /*!
215 @ingroup THREAD_STATES
216 @param loc Source location information.
217 @return 1 if this thread is executing inside an active parallel region, zero if
218 not.
219 */
220 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
221   return __kmp_entry_thread()->th.th_root->r.r_active;
222 }
223 
224 /*!
225 @ingroup PARALLEL
226 @param loc source location information
227 @param global_tid global thread number
228 @param num_threads number of threads requested for this parallel construct
229 
230 Set the number of threads to be used by the next fork spawned by this thread.
231 This call is only required if the parallel construct has a `num_threads` clause.
232 */
233 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
234                              kmp_int32 num_threads) {
235   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
236                 global_tid, num_threads));
237 
238   __kmp_push_num_threads(loc, global_tid, num_threads);
239 }
240 
241 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
242   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
243 
244   /* the num_threads are automatically popped */
245 }
246 
247 #if OMP_40_ENABLED
248 
249 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
250                            kmp_int32 proc_bind) {
251   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
252                 proc_bind));
253 
254   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
255 }
256 
257 #endif /* OMP_40_ENABLED */
258 
259 /*!
260 @ingroup PARALLEL
261 @param loc  source location information
262 @param argc  total number of arguments in the ellipsis
263 @param microtask  pointer to callback routine consisting of outlined parallel
264 construct
265 @param ...  pointers to shared variables that aren't global
266 
267 Do the actual fork and call the microtask in the relevant number of threads.
268 */
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  // Count nested and top-level parallel regions separately.
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    // Publish the caller's frame and return address so OMPT tools can
    // attribute the fork. The frame lives either in the serialized team
    // info (nested serialized region) or in the parent team's implicit
    // task data.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    // Join the team; the master blocks here until workers finish.
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  // Restore the timer state saved before the fork.
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
346 
347 #if OMP_40_ENABLED
348 /*!
349 @ingroup PARALLEL
350 @param loc source location information
351 @param global_tid global thread number
352 @param num_teams number of teams requested for the teams construct
353 @param num_threads number of threads per team requested for the teams construct
354 
355 Set the number of teams to be used by the teams construct.
356 This call is only required if the teams construct has a `num_teams` clause
357 or a `thread_limit` clause (or both).
358 */
359 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
360                            kmp_int32 num_teams, kmp_int32 num_threads) {
361   KA_TRACE(20,
362            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
363             global_tid, num_teams, num_threads));
364 
365   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
366 }
367 
368 /*!
369 @ingroup PARALLEL
370 @param loc  source location information
371 @param argc  total number of arguments in the ellipsis
372 @param microtask  pointer to callback routine consisting of outlined teams
373 construct
374 @param ...  pointers to shared variables that aren't global
375 
376 Do the actual fork and call the microtask in the relevant number of threads.
377 */
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

  KMP_COUNT_BLOCK(OMP_TEAMS);

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  // Publish the caller's frame/return address in the parent team's
  // implicit task so OMPT tools can attribute the teams region.
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  // Fork with __kmp_teams_master as the wrapped entry; the user microtask
  // recorded above is invoked from there.
  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  // Pop current CG root off list
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
  this_thr->th.th_cg_roots = tmp->up;
  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
                 " to node %p. cg_nthreads was %d\n",
                 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
  __kmp_free(tmp);
  // Restore current task's thread_limit from CG root
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  this_thr->th.th_current_task->td_icvs.thread_limit =
      this_thr->th.th_cg_roots->cg_thread_limit;

  // Clear the teams bookkeeping set on entry; th_teams_size is reset by
  // storing 0 through its whole 64-bit footprint.
  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
}
446 #endif /* OMP_40_ENABLED */
447 
448 // I don't think this function should ever have been exported.
449 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
450 // openmp code ever called it, but it's been exported from the RTL for so
451 // long that I'm afraid to remove the definition.
452 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
453 
454 /*!
455 @ingroup PARALLEL
456 @param loc  source location information
457 @param global_tid  global thread number
458 
459 Enter a serialized parallel construct. This interface is used to handle a
460 conditional parallel region, like this,
461 @code
462 #pragma omp parallel if (condition)
463 @endcode
464 when the condition is false.
465 */
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
// The implementation is now in kmp_runtime.cpp so that it can share static
// functions with kmp_fork_call since the tasks to be done are similar in
// each case.
#if OMPT_SUPPORT
  // Capture the caller's return address for OMPT tool callbacks before
  // delegating to the shared implementation.
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}
475 
476 /*!
477 @ingroup PARALLEL
478 @param loc  source location information
479 @param global_tid  global thread number
480 
481 Leave a serialized parallel construct.
482 */
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

#if OMP_45_ENABLED
  kmp_task_team_t *task_team = this_thr->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
#endif

  // Sanity-check that this thread really is inside its serialized team.
  KMP_MB();
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  // Report implicit-task end and parallel end to OMPT tools, then unlink
  // the lightweight task team for this serialized region.
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
    }

    // reset clear the task id only after unlinking the task
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }
#if OMP_50_ENABLED
  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
#endif

  // Leave one nesting level of the serialized region; only when the count
  // reaches zero do we restore the thread's pre-serialization team state.
  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

/* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
                                           : ompt_state_work_parallel);
#endif
}
629 
630 /*!
631 @ingroup SYNCHRONIZATION
632 @param loc  source location information.
633 
634 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
635 depending on the memory ordering convention obeyed by the compiler
636 even that may not be necessary).
637 */
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MB(); /* Flush all pending memory write invalidates.  */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is addressed either (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  // while (!flag) {
  //   #pragma omp flush(flag)
  // }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield(TRUE);
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Notify OMPT tools that a flush happened at the caller's address.
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
699 
700 /* -------------------------------------------------------------------------- */
701 /*!
702 @ingroup SYNCHRONIZATION
703 @param loc source location information
704 @param global_tid thread id.
705 
706 Execute a barrier.
707 */
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }

    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  // Publish the caller's frame/return address (if not already set) so OMPT
  // tools can attribute the barrier wait.
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  //   this function is called when 'barrier' directive is present or
  //   implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    // Clear the frame we published above now that the barrier is done.
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}
751 
752 /* The BARRIER for a MASTER section is always explicit   */
753 /*!
754 @ingroup WORK_SHARING
755 @param loc  source location information.
@param global_tid  global thread number.
757 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
758 */
kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
  int status = 0;

  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

  // Only the team's master thread executes the master region.
  if (KMP_MASTER_GTID(global_tid)) {
    KMP_COUNT_BLOCK(OMP_MASTER);
    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Fire the master-begin callback only on the thread that will execute
  // the region.
  if (status) {
    if (ompt_enabled.ompt_callback_master) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;

      int tid = __kmp_tid_from_gtid(global_tid);
      ompt_callbacks.ompt_callback(ompt_callback_master)(
          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  if (__kmp_env_consistency_check) {
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL);
#endif
  }

  return status;
}
808 
809 /*!
810 @ingroup WORK_SHARING
811 @param loc  source location information.
@param global_tid  global thread number.
813 
814 Mark the end of a <tt>master</tt> region. This should only be called by the
815 thread that executes the <tt>master</tt> region.
816 */
void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));

  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Fire the master-end callback; only the master thread reaches here.
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  if (ompt_enabled.ompt_callback_master) {
    int tid = __kmp_tid_from_gtid(global_tid);
    ompt_callbacks.ompt_callback(ompt_callback_master)(
        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (__kmp_env_consistency_check) {
    if (global_tid < 0)
      KMP_WARNING(ThreadIdentInvalid);

    if (KMP_MASTER_GTID(global_tid))
      __kmp_pop_sync(global_tid, ct_master, loc);
  }
}
843 
844 /*!
845 @ingroup WORK_SHARING
846 @param loc  source location information.
847 @param gtid  global thread number.
848 
849 Start execution of an <tt>ordered</tt> construct.
850 */
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // The team's ordered dispatch value doubles as the OMPT wait id for this
  // construct.
  kmp_team_t *team;
  ompt_wait_id_t lck;
  void *codeptr_ra;
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    team = __kmp_team_from_gtid(gtid);
    lck = (ompt_wait_id_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = ompt_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
          (ompt_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

  // Dispatch-specific enter-ordered hook if present, otherwise default.
  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = ompt_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (ompt_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}
917 
918 /*!
919 @ingroup WORK_SHARING
920 @param loc  source location information.
921 @param gtid  global thread number.
922 
923 End execution of an <tt>ordered</tt> construct.
924 */
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0; // construct id slot passed by reference to the dispatch hooks
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  // Release the ordered section: use the schedule-specific "dispatch exit
  // ordered" hook when one is installed, otherwise the default protocol.
  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Report the release to OMPT tools after the turn has been passed on; the
  // wait id matches the one used by __kmpc_ordered.
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_ordered,
        (ompt_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
        OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}
953 
954 #if KMP_USE_DYNAMIC_LOCK
955 
// Lazily create the indirect lock backing a critical section and publish a
// pointer to it in *crit. Several threads may race here: each allocates a
// lock, but only the winner of the compare-and-store below installs its
// pointer; losers leave their lock for program-exit cleanup.
static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
  // Pointer to the allocated indirect lock is written to crit, while indexing
  // is ignored.
  void *idx;
  kmp_indirect_lock_t **lck;
  lck = (kmp_indirect_lock_t **)crit;
  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
  KMP_SET_I_LOCK_LOCATION(ilk, loc);
  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
  KA_TRACE(20,
           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
#if USE_ITT_BUILD
  __kmp_itt_critical_creating(ilk->lock, loc);
#endif
  // Publish the lock pointer; status == 0 means another thread beat us to it.
  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
  if (status == 0) {
#if USE_ITT_BUILD
    __kmp_itt_critical_destroyed(ilk->lock);
#endif
    // We don't really need to destroy the unclaimed lock here since it will be
    // cleaned up at program exit.
    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
  }
  KMP_DEBUG_ASSERT(*lck != NULL);
}
984 
985 // Fast-path acquire tas lock
// Spin until the test-and-set lock is acquired. The poll word holds
// KMP_LOCK_FREE(tas) when free and a tag built from gtid+1 when owned.
// While contended: yield the processor when the system is oversubscribed
// (more live threads than available procs), otherwise do a bounded spin;
// repeated failures back off exponentially via __kmp_spin_backoff.
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
      kmp_uint32 spins;                                                        \
      KMP_FSYNC_PREPARE(l);                                                    \
      KMP_INIT_YIELD(spins);                                                   \
      if (TCR_4(__kmp_nth) >                                                   \
          (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
        KMP_YIELD(TRUE);                                                       \
      } else {                                                                 \
        KMP_YIELD_SPIN(spins);                                                 \
      }                                                                        \
      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
      while (                                                                  \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {  \
        __kmp_spin_backoff(&backoff);                                          \
        if (TCR_4(__kmp_nth) >                                                 \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
          KMP_YIELD(TRUE);                                                     \
        } else {                                                               \
          KMP_YIELD_SPIN(spins);                                               \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(l);                                                     \
  }
1017 
1018 // Fast-path test tas lock
// Non-blocking tryacquire: rc is set TRUE iff the lock was observed free and
// this thread's acquire-CAS successfully claimed it; no spinning.
#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
  }
1027 
1028 // Fast-path release tas lock
// A single release-store of the "free" value suffices; TAS locks keep no
// waiter bookkeeping, so no wakeup is required.
#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1031 
1032 #if KMP_USE_FUTEX
1033 
1034 #include <sys/syscall.h>
1035 #include <unistd.h>
1036 #ifndef FUTEX_WAIT
1037 #define FUTEX_WAIT 0
1038 #endif
1039 #ifndef FUTEX_WAKE
1040 #define FUTEX_WAKE 1
1041 #endif
1042 
1043 // Fast-path acquire futex lock
// Blocking acquire. The futex word encodes the owner as (gtid+1)<<1; bit 0
// set means at least one thread may be parked in FUTEX_WAIT, so the releaser
// must issue a FUTEX_WAKE. Before sleeping, a contender CASes the waiter bit
// into the observed value; if either that CAS or the FUTEX_WAIT syscall
// fails (the word changed underneath us), the outer CAS loop retries.
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
    KMP_MB();                                                                  \
    KMP_FSYNC_PREPARE(ftx);                                                    \
    kmp_int32 poll_val;                                                        \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
      if (!cond) {                                                             \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
                                         poll_val |                            \
                                             KMP_LOCK_BUSY(1, futex))) {       \
          continue;                                                            \
        }                                                                      \
        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
      }                                                                        \
      kmp_int32 rc;                                                            \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
                        NULL, NULL, 0)) != 0) {                                \
        continue;                                                              \
      }                                                                        \
      gtid_code |= 1;                                                          \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(ftx);                                                   \
  }
1072 
1073 // Fast-path test futex lock
// Non-blocking tryacquire: rc is set TRUE iff a single acquire-CAS moves the
// futex word from "free" to this thread's owner code, FALSE otherwise.
// The owner code is (gtid+1)<<1, matching KMP_ACQUIRE_FUTEX_LOCK; the shift
// operand is parenthesized explicitly rather than relying on '+' binding
// tighter than '<<' (avoids -Wshift-op-parentheses and reader confusion).
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
      KMP_FSYNC_ACQUIRED(ftx);                                                 \
      rc = TRUE;                                                               \
    } else {                                                                   \
      rc = FALSE;                                                              \
    }                                                                          \
  }
1085 
1086 // Fast-path release futex lock
// Release: atomically swap the "free" value into the futex word; if the old
// value had the waiter bit set, wake one sleeper with FUTEX_WAKE. Finally
// yield when the system is oversubscribed to let a waiter run.
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    KMP_MB();                                                                  \
    KMP_FSYNC_RELEASING(ftx);                                                  \
    kmp_int32 poll_val =                                                       \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
    }                                                                          \
    KMP_MB();                                                                  \
    KMP_YIELD(TCR_4(__kmp_nth) >                                               \
              (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
  }
1102 
1103 #endif // KMP_USE_FUTEX
1104 
1105 #else // KMP_USE_DYNAMIC_LOCK
1106 
// Return the user lock guarding the critical section named by 'crit',
// allocating and publishing it on first use. Threads may race to allocate;
// the compare-and-store below guarantees exactly one lock is installed, and
// losers free their allocation and adopt the published lock.
static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
                                                      ident_t const *loc,
                                                      kmp_int32 gtid) {
  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;

  // Because of the double-check, the following load doesn't need to be volatile
  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);

  if (lck == NULL) {
    void *idx;

    // Allocate & initialize the lock.
    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
    __kmp_init_user_lock_with_checks(lck);
    __kmp_set_user_lock_location(lck, loc);
#if USE_ITT_BUILD
    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of underlying lock. It is the only place where we can guarantee it. There
// are chances the lock will destroyed with no usage, but it is not a
// problem, because this is not real event seen by user but rather setting
// name for object (lock). See more details in kmp_itt.h.
#endif /* USE_ITT_BUILD */

    // Use a cmpxchg instruction to slam the start of the critical section with
    // the lock pointer.  If another thread beat us to it, deallocate the lock,
    // and use the lock that the other thread allocated.
    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);

    if (status == 0) {
// Deallocate the lock and reload the value.
#if USE_ITT_BUILD
      __kmp_itt_critical_destroyed(lck);
// Let ITT know the lock is destroyed and the same memory location may be reused
// for another purpose.
#endif /* USE_ITT_BUILD */
      __kmp_destroy_user_lock_with_checks(lck);
      __kmp_user_lock_free(&idx, gtid, lck);
      // Adopt the winner's lock; it must be non-NULL by now.
      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
      KMP_DEBUG_ASSERT(lck != NULL);
    }
  }
  return lck;
}
1152 
1153 #endif // KMP_USE_DYNAMIC_LOCK
1154 
1155 /*!
1156 @ingroup WORK_SHARING
1157 @param loc  source location information.
@param global_tid  global thread number.
1159 @param crit identity of the critical section. This could be a pointer to a lock
1160 associated with the critical section, or some other suitably unique value.
1161 
1162 Enter code protected by a `critical` construct.
1163 This function blocks until the executing thread can enter the critical section.
1164 */
1165 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1166                      kmp_critical_name *crit) {
1167 #if KMP_USE_DYNAMIC_LOCK
1168 #if OMPT_SUPPORT && OMPT_OPTIONAL
1169   OMPT_STORE_RETURN_ADDRESS(global_tid);
1170 #endif // OMPT_SUPPORT
1171   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1172 #else
1173   KMP_COUNT_BLOCK(OMP_CRITICAL);
1174 #if OMPT_SUPPORT && OMPT_OPTIONAL
1175   ompt_state_t prev_state = ompt_state_undefined;
1176   ompt_thread_info_t ti;
1177 #endif
1178   kmp_user_lock_p lck;
1179 
1180   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1181 
1182   // TODO: add THR_OVHD_STATE
1183 
1184   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1185   KMP_CHECK_USER_LOCK_INIT();
1186 
1187   if ((__kmp_user_lock_kind == lk_tas) &&
1188       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1189     lck = (kmp_user_lock_p)crit;
1190   }
1191 #if KMP_USE_FUTEX
1192   else if ((__kmp_user_lock_kind == lk_futex) &&
1193            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1194     lck = (kmp_user_lock_p)crit;
1195   }
1196 #endif
1197   else { // ticket, queuing or drdpa
1198     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1199   }
1200 
1201   if (__kmp_env_consistency_check)
1202     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1203 
1204 // since the critical directive binds to all threads, not just the current
1205 // team we have to check this even if we are in a serialized team.
1206 // also, even if we are the uber thread, we still have to conduct the lock,
1207 // as we have to contend with sibling threads.
1208 
1209 #if USE_ITT_BUILD
1210   __kmp_itt_critical_acquiring(lck);
1211 #endif /* USE_ITT_BUILD */
1212 #if OMPT_SUPPORT && OMPT_OPTIONAL
1213   OMPT_STORE_RETURN_ADDRESS(gtid);
1214   void *codeptr_ra = NULL;
1215   if (ompt_enabled.enabled) {
1216     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1217     /* OMPT state update */
1218     prev_state = ti.state;
1219     ti.wait_id = (ompt_wait_id_t)lck;
1220     ti.state = ompt_state_wait_critical;
1221 
1222     /* OMPT event callback */
1223     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1224     if (ompt_enabled.ompt_callback_mutex_acquire) {
1225       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1226           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1227           (ompt_wait_id_t)crit, codeptr_ra);
1228     }
1229   }
1230 #endif
1231   // Value of 'crit' should be good for using as a critical_id of the critical
1232   // section directive.
1233   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1234 
1235 #if USE_ITT_BUILD
1236   __kmp_itt_critical_acquired(lck);
1237 #endif /* USE_ITT_BUILD */
1238 #if OMPT_SUPPORT && OMPT_OPTIONAL
1239   if (ompt_enabled.enabled) {
1240     /* OMPT state update */
1241     ti.state = prev_state;
1242     ti.wait_id = 0;
1243 
1244     /* OMPT event callback */
1245     if (ompt_enabled.ompt_callback_mutex_acquired) {
1246       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1247           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr_ra);
1248     }
1249   }
1250 #endif
1251   KMP_POP_PARTITIONED_TIMER();
1252 
1253   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1254   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1255 #endif // KMP_USE_DYNAMIC_LOCK
1256 }
1257 
1258 #if KMP_USE_DYNAMIC_LOCK
1259 
1260 // Converts the given hint to an internal lock implementation
// Maps an omp_lock_hint_* / kmp_lock_hint_* bitmask to the dynamic lock
// sequence used to implement it. Hints are examined in priority order;
// conflicting hint combinations fall back to the default lock sequence.
static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
// When TSX is not compiled in, any TSX-specific lock request degrades to the
// default user lock sequence.
#if KMP_USE_TSX
#define KMP_TSX_LOCK(seq) lockseq_##seq
#else
#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
#endif

// RTM availability is a runtime CPU feature on x86; elsewhere it is never
// available.
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
#else
#define KMP_CPUINFO_RTM 0
#endif

  // Hints that do not require further logic (implementation-specific hints
  // take precedence over the portable OpenMP hints below)
  if (hint & kmp_lock_hint_hle)
    return KMP_TSX_LOCK(hle);
  if (hint & kmp_lock_hint_rtm)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
  if (hint & kmp_lock_hint_adaptive)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

  // Rule out conflicting hints first by returning the default lock
  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
    return __kmp_user_lock_seq;
  if ((hint & omp_lock_hint_speculative) &&
      (hint & omp_lock_hint_nonspeculative))
    return __kmp_user_lock_seq;

  // Do not even consider speculation when it appears to be contended
  if (hint & omp_lock_hint_contended)
    return lockseq_queuing;

  // Uncontended lock without speculation
  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
    return lockseq_tas;

  // HLE lock for speculation
  if (hint & omp_lock_hint_speculative)
    return KMP_TSX_LOCK(hle);

  return __kmp_user_lock_seq;
}
1303 
1304 #if OMPT_SUPPORT && OMPT_OPTIONAL
1305 #if KMP_USE_DYNAMIC_LOCK
// Maps a dynamically-dispatched lock to the OMPT mutex implementation
// category reported to tools. For a direct lock, the tag embedded in the
// lock word identifies the implementation; a tag of 0 means the word holds a
// pointer to an indirect lock, which is looked up in the lock table unless
// the caller already supplied 'ilock'.
static kmp_mutex_impl_t
__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
  if (user_lock) {
    switch (KMP_EXTRACT_D_TAG(user_lock)) {
    case 0:
      // Not a direct lock; fall through to the indirect-lock lookup below.
      break;
#if KMP_USE_FUTEX
    case locktag_futex:
      return kmp_mutex_impl_queuing;
#endif
    case locktag_tas:
      return kmp_mutex_impl_spin;
#if KMP_USE_TSX
    case locktag_hle:
      return kmp_mutex_impl_speculative;
#endif
    default:
      return kmp_mutex_impl_none;
    }
    ilock = KMP_LOOKUP_I_LOCK(user_lock);
  }
  KMP_ASSERT(ilock);
  switch (ilock->type) {
#if KMP_USE_TSX
  case locktag_adaptive:
  case locktag_rtm:
    return kmp_mutex_impl_speculative;
#endif
  case locktag_nested_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case locktag_nested_futex:
#endif
  case locktag_ticket:
  case locktag_queuing:
  case locktag_drdpa:
  case locktag_nested_ticket:
  case locktag_nested_queuing:
  case locktag_nested_drdpa:
    return kmp_mutex_impl_queuing;
  default:
    return kmp_mutex_impl_none;
  }
}
1350 #else
1351 // For locks without dynamic binding
1352 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1353   switch (__kmp_user_lock_kind) {
1354   case lk_tas:
1355     return kmp_mutex_impl_spin;
1356 #if KMP_USE_FUTEX
1357   case lk_futex:
1358 #endif
1359   case lk_ticket:
1360   case lk_queuing:
1361   case lk_drdpa:
1362     return kmp_mutex_impl_queuing;
1363 #if KMP_USE_TSX
1364   case lk_hle:
1365   case lk_rtm:
1366   case lk_adaptive:
1367     return kmp_mutex_impl_speculative;
1368 #endif
1369   default:
1370     return kmp_mutex_impl_none;
1371   }
1372 }
1373 #endif // KMP_USE_DYNAMIC_LOCK
1374 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1375 
1376 /*!
1377 @ingroup WORK_SHARING
1378 @param loc  source location information.
1379 @param global_tid  global thread number.
1380 @param crit identity of the critical section. This could be a pointer to a lock
1381 associated with the critical section, or some other suitably unique value.
1382 @param hint the lock hint.
1383 
1384 Enter code protected by a `critical` construct with a hint. The hint value is
1385 used to suggest a lock implementation. This function blocks until the executing
1386 thread can enter the critical section unless the hint suggests use of
1387 speculative execution and the hardware supports it.
1388 */
void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
                               kmp_critical_name *crit, uint32_t hint) {
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  kmp_user_lock_p lck;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
  // This is the case, if called from __kmpc_critical:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
#endif

  // NOTE(review): trace tag reads "__kmpc_critical" rather than
  // "__kmpc_critical_with_hint" — presumably intentional since this is the
  // common entry for both; confirm before changing.
  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized: a zero word means no lock has been installed
  // in this critical-section name yet.
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  if (*lk == 0) {
    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
    if (KMP_IS_D_LOCK(lckseq)) {
      // Direct lock: CAS the tag into the word itself.
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(lckseq));
    } else {
      // Indirect lock: allocate one and publish a pointer to it.
      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    // Direct lock: the word itself is the lock object.
    lck = (kmp_user_lock_p)lk;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)crit, codeptr);
      }
    }
#endif
// Inlined fast paths for the default TAS/futex sequences; consistency checks
// require the full (checked) lock routines.
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
    }
  } else {
    // Indirect lock: dereference the published pointer.
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)crit, codeptr);
      }
    }
#endif
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }
  KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update: restore the pre-wait state */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr);
    }
  }
#endif

  // Time inside the critical region is accounted separately from the wait.
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
} // __kmpc_critical_with_hint
1506 
1507 #endif // KMP_USE_DYNAMIC_LOCK
1508 
1509 /*!
1510 @ingroup WORK_SHARING
1511 @param loc  source location information.
@param global_tid  global thread number.
1513 @param crit identity of the critical section. This could be a pointer to a lock
1514 associated with the critical section, or some other suitably unique value.
1515 
1516 Leave a critical section, releasing any lock that was held during its execution.
1517 */
void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
                         kmp_critical_name *crit) {
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));

#if KMP_USE_DYNAMIC_LOCK
  // Mirror the acquire-side dispatch: direct locks live in the critical
  // section name itself, indirect locks are reached through a pointer stored
  // there.
  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    lck = (kmp_user_lock_p)crit;
    KMP_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
// Inlined fast paths for the default TAS/futex sequences; consistency checks
// require the full (checked) lock routines.
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_RELEASE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
    }
  } else {
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    KMP_ASSERT(ilk != NULL);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // Locate the lock the same way __kmpc_critical did: small TAS/futex locks
  // are embedded in 'crit', everything else was published there as a pointer.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
  }

  KMP_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

#if USE_ITT_BUILD
  __kmp_itt_critical_releasing(lck);
#endif /* USE_ITT_BUILD */
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK

#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT release event triggers after lock is released; place here to trigger
   * for all #if branches */
  // NOTE(review): the return address is stored under 'global_tid' but loaded
  // with argument 0 — verify these refer to the same slot before relying on
  // the reported codeptr.
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_critical, (ompt_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
  }
#endif

  // Pops the OMP_critical timer pushed by the matching __kmpc_critical.
  KMP_POP_PARTITIONED_TIMER();
  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
}
1603 
1604 /*!
1605 @ingroup SYNCHRONIZATION
1606 @param loc source location information
1607 @param global_tid thread id.
1608 @return one if the thread should execute the master block, zero otherwise
1609 
1610 Start execution of a combined barrier and master. The barrier is executed inside
1611 this function.
1612 */
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;

  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    // Record the runtime entry frame/return address for tools while the
    // thread is inside the barrier.
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  // Split barrier (gather only, TRUE): the thread for which __kmp_barrier
  // returns 0 goes on to execute the master block; the others keep waiting
  // until __kmpc_end_barrier_master releases them.
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  return (status != 0) ? 0 : 1;
}
1649 
1650 /*!
1651 @ingroup SYNCHRONIZATION
1652 @param loc source location information
1653 @param global_tid thread id.
1654 
1655 Complete the execution of a combined barrier and master. This function should
1656 only be called at the completion of the <tt>master</tt> code. Other threads will
1657 still be waiting at the barrier and this call releases them.
1658 */
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));

  // Complete the release phase of the split barrier started in
  // __kmpc_barrier_master, freeing the threads still waiting there.
  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}
1664 
1665 /*!
1666 @ingroup SYNCHRONIZATION
1667 @param loc source location information
1668 @param global_tid thread id.
1669 @return one if the thread should execute the master block, zero otherwise
1670 
1671 Start execution of a combined barrier and master(nowait) construct.
1672 The barrier is executed inside this function.
There is no equivalent "end" function: the cleanup that __kmpc_end_master
would otherwise perform is carried out at the end of this function itself.
*/
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret; // 1 iff this thread is to execute the master block

  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));

  // Lazily bring up the parallel runtime if no parallel region ran yet.
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  __kmp_resume_if_soft_paused();
#endif

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    // Publish the current frame/return address so OMPT tools can attribute
    // and unwind across the barrier.
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc; // location for ITT tracing
#endif
  // FALSE => a full (non-split) barrier; every thread is released here.
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none; // leaving the runtime frame
  }
#endif

  // Elect the thread that executes the master block (pushes ct_master on the
  // consistency stack for the winner).
  ret = __kmpc_master(loc, global_tid);

  if (__kmp_env_consistency_check) {
    /*  there's no __kmpc_end_master called; so the (stats) */
    /*  actions of __kmpc_end_master are done here          */

    if (global_tid < 0) {
      KMP_WARNING(ThreadIdentInvalid);
    }
    if (ret) {
      /* only one thread should do the pop since only */
      /* one did the push (see __kmpc_master())       */

      __kmp_pop_sync(global_tid, ct_master, loc);
    }
  }

  return (ret);
}
1732 
1733 /* The BARRIER for a SINGLE process section is always explicit   */
1734 /*!
1735 @ingroup WORK_SHARING
1736 @param loc  source location information
1737 @param global_tid  global thread number
1738 @return One if this thread should execute the single construct, zero otherwise.
1739 
1740 Test whether to execute a <tt>single</tt> construct.
1741 There are no implicit barriers in the two "single" calls, rather the compiler
1742 should introduce an explicit barrier if it is required.
1743 */
1744 
kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
  // rc is nonzero only for the one thread chosen to execute the single block.
  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);

  if (rc) {
    // We are going to execute the single statement, so we should count it.
    KMP_COUNT_BLOCK(OMP_SINGLE);
    KMP_PUSH_PARTITIONED_TIMER(OMP_single); // popped in __kmpc_end_single
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.enabled) {
    if (rc) {
      // Executor thread: only scope_begin here; the matching scope_end is
      // dispatched from __kmpc_end_single.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_executor, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    } else {
      // Non-executor threads skip the block entirely, so begin and end are
      // reported back-to-back right here.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_end,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    }
  }
#endif

  return rc;
}
1787 
1788 /*!
1789 @ingroup WORK_SHARING
1790 @param loc  source location information
1791 @param global_tid  global thread number
1792 
1793 Mark the end of a <tt>single</tt> construct.  This function should
1794 only be called by the thread that executed the block of code protected
1795 by the `single` construct.
1796 */
void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
  // Must be called only by the thread for which __kmpc_single returned 1.
  __kmp_exit_single(global_tid);
  KMP_POP_PARTITIONED_TIMER(); // matches the push in __kmpc_single

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  // Report scope_end for the executor (scope_begin was sent in __kmpc_single).
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_single_executor, ompt_scope_end,
        &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
1815 
1816 /*!
1817 @ingroup WORK_SHARING
1818 @param loc Source location
1819 @param global_tid Global thread id
1820 
1821 Mark the end of a statically scheduled loop.
1822 */
void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
  KMP_POP_PARTITIONED_TIMER(); // matches the push in __kmpc_for_static_init
  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_work_t ompt_work_type = ompt_work_loop; // default if loc is absent
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    // Determine workshare type from the ident flags the compiler set.
    if (loc != NULL) {
      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
        ompt_work_type = ompt_work_loop;
      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
        ompt_work_type = ompt_work_sections;
      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
        ompt_work_type = ompt_work_distribute;
      } else {
        // use default set above.
        // a warning about this case is provided in __kmpc_for_static_init
      }
      KMP_DEBUG_ASSERT(ompt_work_type);
    }
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(global_tid, ct_pdo, loc);
}
1854 
1855 // User routines which take C-style arguments (call by value)
1856 // different from the Fortran equivalent routines
1857 
1858 void ompc_set_num_threads(int arg) {
1859   // !!!!! TODO: check the per-task binding
1860   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1861 }
1862 
1863 void ompc_set_dynamic(int flag) {
1864   kmp_info_t *thread;
1865 
1866   /* For the thread-private implementation of the internal controls */
1867   thread = __kmp_entry_thread();
1868 
1869   __kmp_save_internal_controls(thread);
1870 
1871   set__dynamic(thread, flag ? TRUE : FALSE);
1872 }
1873 
1874 void ompc_set_nested(int flag) {
1875   kmp_info_t *thread;
1876 
1877   /* For the thread-private internal controls implementation */
1878   thread = __kmp_entry_thread();
1879 
1880   __kmp_save_internal_controls(thread);
1881 
1882   set__nested(thread, flag ? TRUE : FALSE);
1883 }
1884 
1885 void ompc_set_max_active_levels(int max_active_levels) {
1886   /* TO DO */
1887   /* we want per-task implementation of this internal control */
1888 
1889   /* For the per-thread internal controls implementation */
1890   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1891 }
1892 
1893 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1894   // !!!!! TODO: check the per-task binding
1895   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1896 }
1897 
1898 int ompc_get_ancestor_thread_num(int level) {
1899   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1900 }
1901 
1902 int ompc_get_team_size(int level) {
1903   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1904 }
1905 
1906 #if OMP_50_ENABLED
1907 /* OpenMP 5.0 Affinity Format API */
1908 
1909 void ompc_set_affinity_format(char const *format) {
1910   if (!__kmp_init_serial) {
1911     __kmp_serial_initialize();
1912   }
1913   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1914                          format, KMP_STRLEN(format) + 1);
1915 }
1916 
1917 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1918   size_t format_size;
1919   if (!__kmp_init_serial) {
1920     __kmp_serial_initialize();
1921   }
1922   format_size = KMP_STRLEN(__kmp_affinity_format);
1923   if (buffer && size) {
1924     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1925                            format_size + 1);
1926   }
1927   return format_size;
1928 }
1929 
1930 void ompc_display_affinity(char const *format) {
1931   int gtid;
1932   if (!TCR_4(__kmp_init_middle)) {
1933     __kmp_middle_initialize();
1934   }
1935   gtid = __kmp_get_gtid();
1936   __kmp_aux_display_affinity(gtid, format);
1937 }
1938 
1939 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1940                              char const *format) {
1941   int gtid;
1942   size_t num_required;
1943   kmp_str_buf_t capture_buf;
1944   if (!TCR_4(__kmp_init_middle)) {
1945     __kmp_middle_initialize();
1946   }
1947   gtid = __kmp_get_gtid();
1948   __kmp_str_buf_init(&capture_buf);
1949   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1950   if (buffer && buf_size) {
1951     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1952                            capture_buf.used + 1);
1953   }
1954   __kmp_str_buf_free(&capture_buf);
1955   return num_required;
1956 }
1957 #endif /* OMP_50_ENABLED */
1958 
void kmpc_set_stacksize(int arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  // NOTE(review): arg is a plain int and is forwarded as-is; any range/sign
  // validation presumably happens inside __kmp_aux_set_stacksize — verify.
  __kmp_aux_set_stacksize(arg);
}
1963 
// size_t variant of kmpc_set_stacksize for large stack requests.
void kmpc_set_stacksize_s(size_t arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}
1968 
1969 void kmpc_set_blocktime(int arg) {
1970   int gtid, tid;
1971   kmp_info_t *thread;
1972 
1973   gtid = __kmp_entry_gtid();
1974   tid = __kmp_tid_from_gtid(gtid);
1975   thread = __kmp_thread_from_gtid(gtid);
1976 
1977   __kmp_aux_set_blocktime(arg, thread, tid);
1978 }
1979 
// Select the runtime library mode (serial/turnaround/throughput).
void kmpc_set_library(int arg) {
  // __kmp_user_set_library initializes the library if needed
  __kmp_user_set_library((enum library_type)arg);
}
1984 
// Apply a runtime settings string; its length is passed along so the callee
// need not re-scan it.
void kmpc_set_defaults(char const *str) {
  // __kmp_aux_set_defaults initializes the library if needed
  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
}
1989 
1990 void kmpc_set_disp_num_buffers(int arg) {
1991   // ignore after initialization because some teams have already
1992   // allocated dispatch buffers
1993   if (__kmp_init_serial == 0 && arg > 0)
1994     __kmp_dispatch_num_buffers = arg;
1995 }
1996 
// Add proc to the affinity mask; returns -1 when affinity is unavailable
// (stub library or unsupported platform), otherwise the aux routine's result.
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_set_affinity_mask_proc(proc, mask);
#endif
}
2007 
// Remove proc from the affinity mask; returns -1 when affinity is unavailable
// (stub library or unsupported platform), otherwise the aux routine's result.
int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#endif
}
2018 
// Query whether proc is in the affinity mask; returns -1 when affinity is
// unavailable (stub library or unsupported platform).
int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_get_affinity_mask_proc(proc, mask);
#endif
}
2029 
2030 /* -------------------------------------------------------------------------- */
2031 /*!
2032 @ingroup THREADPRIVATE
2033 @param loc       source location information
2034 @param gtid      global thread number
2035 @param cpy_size  size of the cpy_data buffer
2036 @param cpy_data  pointer to data to be copied
2037 @param cpy_func  helper function to call for copying data
2038 @param didit     flag variable: 1=single thread; 0=not single thread
2039 
2040 __kmpc_copyprivate implements the interface for the private data broadcast
2041 needed for the copyprivate clause associated with a single region in an
2042 OpenMP<sup>*</sup> program (both C and Fortran).
2043 All threads participating in the parallel region call this routine.
2044 One of the threads (called the single thread) should have the <tt>didit</tt>
2045 variable set to 1 and all other threads should have that variable set to 0.
2046 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2047 
2048 The OpenMP specification forbids the use of nowait on the single region when a
2049 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2050 barrier internally to avoid race conditions, so the code generation for the
2051 single region should avoid generating a barrier after the call to @ref
2052 __kmpc_copyprivate.
2053 
2054 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2055 The <tt>loc</tt> parameter is a pointer to source location information.
2056 
2057 Internal implementation: The single thread will first copy its descriptor
2058 address (cpy_data) to a team-private location, then the other threads will each
2059 call the function pointed to by the parameter cpy_func, which carries out the
2060 copy by copying the data using the cpy_data buffer.
2061 
2062 The cpy_func routine used for the copy and the contents of the data area defined
2063 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2064 to be done. For instance, the cpy_data buffer can hold the actual data to be
2065 copied or it may hold a list of pointers to the data. The cpy_func routine must
2066 interpret the cpy_data buffer appropriately.
2067 
2068 The interface to cpy_func is as follows:
2069 @code
2070 void cpy_func( void *destination, void *source )
2071 @endcode
2072 where void *destination is the cpy_data pointer for the thread being copied to
2073 and void *source is the cpy_data pointer for the thread being copied from.
2074 */
void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
                        void *cpy_data, void (*cpy_func)(void *, void *),
                        kmp_int32 didit) {
  // Team-shared slot used to broadcast the single thread's cpy_data pointer.
  // Note: cpy_size is not used in this implementation; the copy size is
  // implicit in the cpy_data/cpy_func pair built by the compiler.
  void **data_ptr;

  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));

  KMP_MB();

  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following two barriers into some kind of split barrier

  // Step 1: the single thread publishes its buffer address for the team.
  if (didit)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  // Step 2: make the published pointer visible to every thread.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Step 3: every other thread copies from the single thread's buffer.
  if (!didit)
    (*cpy_func)(cpy_data, *data_ptr);

// Consider next barrier a user-visible barrier for barrier region boundaries
// Nesting checks are already handled by the single construct checks

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
// tasks can overwrite the location)
#endif
  // Step 4: keep the source buffer alive until all copies have completed.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}
2134 
2135 /* -------------------------------------------------------------------------- */
2136 
// Map the generic lock operations used by the non-dynamic-lock code paths
// below onto the "with checks" variants of the user-lock primitives.
#define INIT_LOCK __kmp_init_user_lock_with_checks
#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK_TIMED                                              \
  __kmp_acquire_nested_user_lock_with_checks_timed
#define RELEASE_LOCK __kmp_release_user_lock_with_checks
#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
#define TEST_LOCK __kmp_test_user_lock_with_checks
#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks

// TODO: Make check abort messages use location info & pass it into
// with_checks routines
2153 
2154 #if KMP_USE_DYNAMIC_LOCK
2155 
// internal lock initializer: dispatch on whether the chosen sequence is a
// "direct" lock (state stored inline in the user's lock word) or an
// "indirect" lock (user's lock word holds a handle to the real object).
static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
                                                    kmp_dyna_lockseq_t seq) {
  if (KMP_IS_D_LOCK(seq)) {
    KMP_INIT_D_LOCK(lock, seq);
#if USE_ITT_BUILD
    __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
#endif
  } else {
    KMP_INIT_I_LOCK(lock, seq);
#if USE_ITT_BUILD
    // Report the underlying lock object (not the handle) to ITT.
    kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
    __kmp_itt_lock_creating(ilk->lock, loc);
#endif
  }
}
2172 
// internal nest lock initializer: translate the flat lock sequence into its
// nested counterpart, then initialize as an indirect lock.
static __forceinline void
__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
                               kmp_dyna_lockseq_t seq) {
#if KMP_USE_TSX
  // Don't have nested lock implementation for speculative locks
  if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
    seq = __kmp_user_lock_seq;
#endif
  switch (seq) {
  case lockseq_tas:
    seq = lockseq_nested_tas;
    break;
#if KMP_USE_FUTEX
  case lockseq_futex:
    seq = lockseq_nested_futex;
    break;
#endif
  case lockseq_ticket:
    seq = lockseq_nested_ticket;
    break;
  case lockseq_queuing:
    seq = lockseq_nested_queuing;
    break;
  case lockseq_drdpa:
    seq = lockseq_nested_drdpa;
    break;
  default:
    // Anything without a dedicated nested variant falls back to queuing.
    seq = lockseq_nested_queuing;
  }
  // This path always creates an indirect lock (the user's lock word holds a
  // handle to the real lock object).
  KMP_INIT_I_LOCK(lock, seq);
#if USE_ITT_BUILD
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
  __kmp_itt_lock_creating(ilk->lock, loc);
#endif
}
2209 
/* initialize the lock with a hint (omp_init_lock_with_hint entry point) */
void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
                                uintptr_t hint) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
  }

  // Pick the lock implementation that best matches the user's hint.
  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, (omp_lock_hint_t)hint,
        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
        codeptr);
  }
#endif
}
2233 
/* initialize the nest lock with a hint (omp_init_nest_lock_with_hint) */
void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
                                     void **user_lock, uintptr_t hint) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
  }

  // Pick the lock implementation that best matches the user's hint.
  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
        codeptr);
  }
#endif
}
2257 
2258 #endif // KMP_USE_DYNAMIC_LOCK
2259 
/* initialize the lock (omp_init_lock entry point) */
void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_lock");
  }
  // No user hint here: use the globally configured lock sequence.
  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
        codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // If the lock object fits inside the user's omp_lock_t, store it inline;
  // otherwise allocate a separate object and keep a reference in user_lock.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }
  INIT_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_lock
2331 
/* initialize the nestable lock (omp_init_nest_lock entry point) */
void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
  }
  // No user hint here: use the globally configured lock sequence.
  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
        codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_nest_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // Nestable locks additionally carry a nesting depth; they are stored
  // inline only if poll + depth_locked fit in the user's omp_nest_lock_t.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }

  INIT_NESTED_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_nest_lock
2406 
/* destroy the lock (omp_destroy_lock entry point) */
void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  kmp_user_lock_p lck;
  // Tag 0 marks an indirect lock: resolve the handle to the real object.
  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
    lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
  } else {
    lck = (kmp_user_lock_p)user_lock;
  }
  __kmp_itt_lock_destroyed(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    kmp_user_lock_p lck;
    if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
      lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
    } else {
      lck = (kmp_user_lock_p)user_lock;
    }
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
  }
#endif
  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
#else
  kmp_user_lock_p lck;

  // Recover the actual lock object: inline for small TAS/futex locks,
  // otherwise via the lock table (with a checked lookup).
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_destroyed(lck);
#endif /* USE_ITT_BUILD */
  DESTROY_LOCK(lck);

  // Inline-stored locks need no separate free; only table-allocated locks do.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    ;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    ;
  }
#endif
  else {
    __kmp_user_lock_free(user_lock, gtid, lck);
  }
#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_destroy_lock
2484 
/* destroy the nestable lock (omp_destroy_nest_lock entry point) */
void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  // Nestable dynamic locks are always indirect; resolve handle for ITT.
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
  __kmp_itt_lock_destroyed(ilk->lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
  }
#endif
  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // Recover the actual lock object: inline only if poll + depth_locked fit
  // in the user's omp_nest_lock_t, otherwise via a checked table lookup.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_destroyed(lck);
#endif /* USE_ITT_BUILD */

  DESTROY_NESTED_LOCK(lck);

  // Inline-stored locks need no separate free; only table-allocated locks do.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#endif
  else {
    __kmp_user_lock_free(user_lock, gtid, lck);
  }
#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_destroy_nest_lock
2559 
/* Acquire a simple OpenMP lock, blocking until it becomes available
   (runtime entry for omp_set_lock).
   @param loc       source location information
   @param gtid      global thread number
   @param user_lock pointer to the user's lock object
   With dynamic locks, small TAS/futex locks are acquired through inlined
   fast paths when consistency checking is off; everything else dispatches
   through the direct-lock function table.  OMPT mutex_acquire/acquired
   events bracket the wait when tool support is compiled in. */
void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
  KMP_COUNT_BLOCK(OMP_set_lock);
#if KMP_USE_DYNAMIC_LOCK
  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(
      (kmp_user_lock_p)
          user_lock); // itt function will get to the right lock object.
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
        codeptr);
  }
#endif
// Fast paths below bypass the dispatch table; only one of the two #if arms
// is ever compiled, and both fall through to the table-driven set otherwise.
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
  } else
#endif
  {
    __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }
#if USE_ITT_BUILD
  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // A TAS/futex lock small enough to fit in the user's omp_lock_t word is
  // used in place; otherwise the lock object lives in the lock table and
  // must be looked up.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)lck, codeptr);
  }
#endif

  ACQUIRE_LOCK(lck, gtid);

#if USE_ITT_BUILD
  __kmp_itt_lock_acquired(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}
2651 
/* Acquire a nestable OpenMP lock, blocking until available (runtime entry
   for omp_set_nest_lock).  Re-acquisition by the owning thread increases
   the nesting depth instead of deadlocking.
   @param loc       source location information
   @param gtid      global thread number
   @param user_lock pointer to the user's lock object
   For OMPT, acquire_status distinguishes a first acquisition (reported as
   mutex_acquired) from a nested re-acquisition (reported as
   nest_lock with scope_begin). */
void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled) {
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_nest_lock, omp_lock_hint_none,
          __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
          codeptr);
    }
  }
#endif
  int acquire_status =
      KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
  (void) acquire_status; // only consumed under OMPT; silence unused warning
#if USE_ITT_BUILD
  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
      }
    } else {
      if (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
      }
    }
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK
  int acquire_status;
  kmp_user_lock_p lck;

  // Nestable locks need room for both the poll word and the nesting depth
  // to be stored in-place; otherwise fall back to the lock table.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled) {
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_nest_lock, omp_lock_hint_none,
          __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
    }
  }
#endif

  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);

#if USE_ITT_BUILD
  __kmp_itt_lock_acquired(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
      }
    } else {
      if (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
      }
    }
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}
2760 
/* Release a simple OpenMP lock owned by the calling thread (runtime entry
   for omp_unset_lock).
   @param loc       source location information
   @param gtid      global thread number
   @param user_lock pointer to the user's lock object
   On Linux x86/ARM a TAS lock is released with a direct store to the poll
   word (fast path added for a customer performance issue); all other
   configurations go through RELEASE_LOCK / the dispatch table. */
void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
#endif
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_RELEASE_TAS_LOCK(user_lock, gtid);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
  } else
#endif
  {
    __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  /* Can't use serial interval since not block structured */
  /* release the lock */

  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
#if KMP_OS_LINUX &&                                                            \
    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
// "fast" path implemented to fix customer performance issue
#if USE_ITT_BUILD
    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
#endif /* USE_ITT_BUILD */
    // Release by storing 0 into the poll word; KMP_MB() orders the store
    // before any subsequent acquires observe it.
    TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
    KMP_MB();

#if OMPT_SUPPORT && OMPT_OPTIONAL
    // This is the case, if called from omp_init_lock_with_hint:
    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
    if (ompt_enabled.ompt_callback_mutex_released) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
          ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
    }
#endif

    return; // fast path complete; skip the generic release below
#else
    lck = (kmp_user_lock_p)user_lock;
#endif
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_releasing(lck);
#endif /* USE_ITT_BUILD */

  RELEASE_LOCK(lck, gtid);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}
2855 
2856 /* release the lock */
2857 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2858 #if KMP_USE_DYNAMIC_LOCK
2859 
2860 #if USE_ITT_BUILD
2861   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2862 #endif
2863   int release_status =
2864       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2865   (void) release_status;
2866 
2867 #if OMPT_SUPPORT && OMPT_OPTIONAL
2868   // This is the case, if called from omp_init_lock_with_hint:
2869   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2870   if (!codeptr)
2871     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2872   if (ompt_enabled.enabled) {
2873     if (release_status == KMP_LOCK_RELEASED) {
2874       if (ompt_enabled.ompt_callback_mutex_released) {
2875         // release_lock_last
2876         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2877             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2878       }
2879     } else if (ompt_enabled.ompt_callback_nest_lock) {
2880       // release_lock_prev
2881       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2882           ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr);
2883     }
2884   }
2885 #endif
2886 
2887 #else // KMP_USE_DYNAMIC_LOCK
2888 
2889   kmp_user_lock_p lck;
2890 
2891   /* Can't use serial interval since not block structured */
2892 
2893   if ((__kmp_user_lock_kind == lk_tas) &&
2894       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2895        OMP_NEST_LOCK_T_SIZE)) {
2896 #if KMP_OS_LINUX &&                                                            \
2897     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2898     // "fast" path implemented to fix customer performance issue
2899     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2900 #if USE_ITT_BUILD
2901     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2902 #endif /* USE_ITT_BUILD */
2903 
2904 #if OMPT_SUPPORT && OMPT_OPTIONAL
2905     int release_status = KMP_LOCK_STILL_HELD;
2906 #endif
2907 
2908     if (--(tl->lk.depth_locked) == 0) {
2909       TCW_4(tl->lk.poll, 0);
2910 #if OMPT_SUPPORT && OMPT_OPTIONAL
2911       release_status = KMP_LOCK_RELEASED;
2912 #endif
2913     }
2914     KMP_MB();
2915 
2916 #if OMPT_SUPPORT && OMPT_OPTIONAL
2917     // This is the case, if called from omp_init_lock_with_hint:
2918     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2919     if (!codeptr)
2920       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2921     if (ompt_enabled.enabled) {
2922       if (release_status == KMP_LOCK_RELEASED) {
2923         if (ompt_enabled.ompt_callback_mutex_released) {
2924           // release_lock_last
2925           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2926               ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2927         }
2928       } else if (ompt_enabled.ompt_callback_nest_lock) {
2929         // release_lock_previous
2930         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2931             ompt_mutex_scope_end, (ompt_wait_id_t)lck, codeptr);
2932       }
2933     }
2934 #endif
2935 
2936     return;
2937 #else
2938     lck = (kmp_user_lock_p)user_lock;
2939 #endif
2940   }
2941 #if KMP_USE_FUTEX
2942   else if ((__kmp_user_lock_kind == lk_futex) &&
2943            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2944             OMP_NEST_LOCK_T_SIZE)) {
2945     lck = (kmp_user_lock_p)user_lock;
2946   }
2947 #endif
2948   else {
2949     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2950   }
2951 
2952 #if USE_ITT_BUILD
2953   __kmp_itt_lock_releasing(lck);
2954 #endif /* USE_ITT_BUILD */
2955 
2956   int release_status;
2957   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2958 #if OMPT_SUPPORT && OMPT_OPTIONAL
2959   // This is the case, if called from omp_init_lock_with_hint:
2960   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2961   if (!codeptr)
2962     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2963   if (ompt_enabled.enabled) {
2964     if (release_status == KMP_LOCK_RELEASED) {
2965       if (ompt_enabled.ompt_callback_mutex_released) {
2966         // release_lock_last
2967         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2968             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2969       }
2970     } else if (ompt_enabled.ompt_callback_nest_lock) {
2971       // release_lock_previous
2972       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2973           ompt_mutex_scope_end, (ompt_wait_id_t)lck, codeptr);
2974     }
2975   }
2976 #endif
2977 
2978 #endif // KMP_USE_DYNAMIC_LOCK
2979 }
2980 
/* Try to acquire a simple OpenMP lock without blocking (runtime entry for
   omp_test_lock).
   @param loc       source location information
   @param gtid      global thread number
   @param user_lock pointer to the user's lock object
   @return FTN_TRUE if the lock was acquired, FTN_FALSE otherwise.
   OMPT mutex_acquired is reported only on success. */
int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
  KMP_COUNT_BLOCK(OMP_test_lock);

#if KMP_USE_DYNAMIC_LOCK
  int rc;
  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
        codeptr);
  }
#endif
// Inlined TAS/futex test fast paths; fall back to the dispatch table.
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
  } else
#endif
  {
    rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }
  if (rc) {
#if USE_ITT_BUILD
    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
    }
#endif
    return FTN_TRUE;
  } else {
#if USE_ITT_BUILD
    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
#endif
    return FTN_FALSE;
  }

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;
  int rc;

  // Small TAS/futex locks are stored in the user's lock word; otherwise the
  // lock lives in the lock table and must be looked up.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)lck, codeptr);
  }
#endif

  rc = TEST_LOCK(lck, gtid);
#if USE_ITT_BUILD
  if (rc) {
    __kmp_itt_lock_acquired(lck);
  } else {
    __kmp_itt_lock_cancelled(lck);
  }
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
  }
#endif

  return (rc ? FTN_TRUE : FTN_FALSE);

/* Can't use serial interval since not block structured */

#endif // KMP_USE_DYNAMIC_LOCK
}
3088 
3089 /* try to acquire the lock */
3090 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3091 #if KMP_USE_DYNAMIC_LOCK
3092   int rc;
3093 #if USE_ITT_BUILD
3094   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3095 #endif
3096 #if OMPT_SUPPORT && OMPT_OPTIONAL
3097   // This is the case, if called from omp_init_lock_with_hint:
3098   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3099   if (!codeptr)
3100     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3101   if (ompt_enabled.ompt_callback_mutex_acquire) {
3102     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3103         ompt_mutex_nest_lock, omp_lock_hint_none,
3104         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
3105         codeptr);
3106   }
3107 #endif
3108   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3109 #if USE_ITT_BUILD
3110   if (rc) {
3111     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3112   } else {
3113     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3114   }
3115 #endif
3116 #if OMPT_SUPPORT && OMPT_OPTIONAL
3117   if (ompt_enabled.enabled && rc) {
3118     if (rc == 1) {
3119       if (ompt_enabled.ompt_callback_mutex_acquired) {
3120         // lock_first
3121         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3122             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
3123       }
3124     } else {
3125       if (ompt_enabled.ompt_callback_nest_lock) {
3126         // lock_next
3127         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3128             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
3129       }
3130     }
3131   }
3132 #endif
3133   return rc;
3134 
3135 #else // KMP_USE_DYNAMIC_LOCK
3136 
3137   kmp_user_lock_p lck;
3138   int rc;
3139 
3140   if ((__kmp_user_lock_kind == lk_tas) &&
3141       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3142        OMP_NEST_LOCK_T_SIZE)) {
3143     lck = (kmp_user_lock_p)user_lock;
3144   }
3145 #if KMP_USE_FUTEX
3146   else if ((__kmp_user_lock_kind == lk_futex) &&
3147            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3148             OMP_NEST_LOCK_T_SIZE)) {
3149     lck = (kmp_user_lock_p)user_lock;
3150   }
3151 #endif
3152   else {
3153     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3154   }
3155 
3156 #if USE_ITT_BUILD
3157   __kmp_itt_lock_acquiring(lck);
3158 #endif /* USE_ITT_BUILD */
3159 
3160 #if OMPT_SUPPORT && OMPT_OPTIONAL
3161   // This is the case, if called from omp_init_lock_with_hint:
3162   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3163   if (!codeptr)
3164     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3165   if (ompt_enabled.enabled) &&
3166         ompt_enabled.ompt_callback_mutex_acquire) {
3167       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3168           ompt_mutex_nest_lock, omp_lock_hint_none,
3169           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
3170     }
3171 #endif
3172 
3173   rc = TEST_NESTED_LOCK(lck, gtid);
3174 #if USE_ITT_BUILD
3175   if (rc) {
3176     __kmp_itt_lock_acquired(lck);
3177   } else {
3178     __kmp_itt_lock_cancelled(lck);
3179   }
3180 #endif /* USE_ITT_BUILD */
3181 #if OMPT_SUPPORT && OMPT_OPTIONAL
3182   if (ompt_enabled.enabled && rc) {
3183     if (rc == 1) {
3184       if (ompt_enabled.ompt_callback_mutex_acquired) {
3185         // lock_first
3186         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3187             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
3188       }
3189     } else {
3190       if (ompt_enabled.ompt_callback_nest_lock) {
3191         // lock_next
3192         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3193             ompt_mutex_scope_begin, (ompt_wait_id_t)lck, codeptr);
3194       }
3195     }
3196   }
3197 #endif
3198   return rc;
3199 
3200 /* Can't use serial interval since not block structured */
3201 
3202 #endif // KMP_USE_DYNAMIC_LOCK
3203 }
3204 
3205 // Interface to fast scalable reduce methods routines
3206 
3207 // keep the selected method in a thread local structure for cross-function
3208 // usage: will be used in __kmpc_end_reduce* functions;
3209 // another solution: to re-determine the method one more time in
3210 // __kmpc_end_reduce* functions (new prototype required then)
3211 // AT: which solution is better?
// Record, in the calling thread's thread-local data, which reduction method
// was selected for the current reduce construct (consumed later by the
// matching __kmpc_end_reduce* call).
#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))

// Fetch the reduction method previously recorded for this thread.
#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3217 
3218 // description of the packed_reduction_method variable: look at the macros in
3219 // kmp.h
3220 
// used in a critical section reduce block
// Acquire the runtime-internal critical-section lock backing a
// critical_reduce_block reduction.  The lock object is lazily created on
// first use; the matching release is __kmp_end_critical_section_reduce_block.
static __forceinline void
__kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
                                          kmp_critical_name *crit) {

  // this lock was visible to a customer and to the threading profile tool as a
  // serial overhead span (although it's used for an internal purpose only)
  //            why was it visible in previous implementation?
  //            should we keep it visible in new reduce block?
  kmp_user_lock_p lck;

#if KMP_USE_DYNAMIC_LOCK

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  // Multiple threads may race here; the CAS ensures only one initializes the
  // in-place (direct) lock, and __kmp_init_indirect_csptr handles the
  // indirect case.
  if (*lk == 0) {
    if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(__kmp_user_lock_seq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid,
                                KMP_GET_I_TAG(__kmp_user_lock_seq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    // Direct lock: the lock state is stored in the critical name itself.
    lck = (kmp_user_lock_p)lk;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
  } else {
    // Indirect lock: the critical name holds a pointer to the lock object.
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // We know that the fast reduction code is only emitted by Intel compilers
  // with 32 byte critical sections. If there isn't enough space, then we
  // have to use a pointer.
  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
    lck = (kmp_user_lock_p)crit;
  } else {
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }
  KMP_DEBUG_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK
}
3284 
3285 // used in a critical section reduce block
3286 static __forceinline void
3287 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3288                                         kmp_critical_name *crit) {
3289 
3290   kmp_user_lock_p lck;
3291 
3292 #if KMP_USE_DYNAMIC_LOCK
3293 
3294   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3295     lck = (kmp_user_lock_p)crit;
3296     if (__kmp_env_consistency_check)
3297       __kmp_pop_sync(global_tid, ct_critical, loc);
3298     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3299   } else {
3300     kmp_indirect_lock_t *ilk =
3301         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3302     if (__kmp_env_consistency_check)
3303       __kmp_pop_sync(global_tid, ct_critical, loc);
3304     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3305   }
3306 
3307 #else // KMP_USE_DYNAMIC_LOCK
3308 
3309   // We know that the fast reduction code is only emitted by Intel compilers
3310   // with 32 byte critical sections. If there isn't enough space, then we have
3311   // to use a pointer.
3312   if (__kmp_base_user_lock_size > 32) {
3313     lck = *((kmp_user_lock_p *)crit);
3314     KMP_ASSERT(lck != NULL);
3315   } else {
3316     lck = (kmp_user_lock_p)crit;
3317   }
3318 
3319   if (__kmp_env_consistency_check)
3320     __kmp_pop_sync(global_tid, ct_critical, loc);
3321 
3322   __kmp_release_user_lock_with_checks(lck, global_tid);
3323 
3324 #endif // KMP_USE_DYNAMIC_LOCK
3325 } // __kmp_end_critical_section_reduce_block
3326 
3327 #if OMP_40_ENABLED
// If the calling thread is performing a reduction at the teams-construct
// level, temporarily make it appear as a member of the parent team so the
// reduction machinery operates across the league of teams.
// On swap, fills *team_p with the original team and *task_state with the
// saved task state (both needed by __kmp_restore_swapped_teams) and
// returns 1; returns 0 when no swap was performed.
static __forceinline int
__kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
                                     int *task_state) {
  kmp_team_t *team;

  // Check if we are inside the teams construct?
  if (th->th.th_teams_microtask) {
    *team_p = team = th->th.th_team;
    if (team->t.t_level == th->th.th_teams_level) {
      // This is reduction at teams construct.
      KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
      // Let's swap teams temporarily for the reduction.
      th->th.th_info.ds.ds_tid = team->t.t_master_tid;
      th->th.th_team = team->t.t_parent;
      // NOTE: order matters -- the next two reads deliberately go through
      // the parent team just installed into th->th.th_team.
      th->th.th_team_nproc = th->th.th_team->t.t_nproc;
      th->th.th_task_team = th->th.th_team->t.t_task_team[0];
      *task_state = th->th.th_task_state;
      th->th.th_task_state = 0;

      return 1;
    }
  }
  return 0;
}
3352 
3353 static __forceinline void
3354 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3355   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3356   th->th.th_info.ds.ds_tid = 0;
3357   th->th.th_team = team;
3358   th->th.th_team_nproc = team->t.t_nproc;
3359   th->th.th_task_team = team->t.t_task_team[task_state];
3360   th->th.th_task_state = task_state;
3361 }
3362 #endif
3363 
3364 /* 2.a.i. Reduce Block without a terminating barrier */
3365 /*!
3366 @ingroup SYNCHRONIZATION
3367 @param loc source location information
3368 @param global_tid global thread number
3369 @param num_vars number of items (variables) to be reduced
3370 @param reduce_size size of data in bytes to be reduced
3371 @param reduce_data pointer to data to be reduced
3372 @param reduce_func callback function providing reduction operation on two
3373 operands and returning result of reduction in lhs_data
3374 @param lck pointer to the unique lock data structure
3375 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3376 threads if atomic reduction needed
3377 
3378 The nowait version is used for a reduce clause with the nowait argument.
3379 */
kmp_int32
__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                     size_t reduce_size, void *reduce_data,
                     void (*reduce_func)(void *lhs_data, void *rhs_data),
                     kmp_critical_name *lck) {

  KMP_COUNT_BLOCK(REDUCE_nowait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif
  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be used as a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  // Wake the runtime back up if it was soft-paused via omp_pause_resource.
  __kmp_resume_if_soft_paused();
#endif

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

#if OMP_40_ENABLED
  // For a reduction inside a teams construct the thread may temporarily swap
  // to the parent team; remember whether that happened so it can be undone.
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  // packed_reduction_method value will be reused by __kmp_end_reduce* function,
  // the value should be kept in a variable
  // the variable should be either a construct-specific or thread-specific
  // property, not a team specific property
  //     (a thread can reach the next reduce block on the next construct, reduce
  //     method may differ on the next construct)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unnecessary extra syncs should be needed)
  // a thread-specific variable is better regarding two issues above (next
  // construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes 'determine' and 'set' lines (no need to execute by one
  // thread, to avoid unnecessary extra syncs)

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    // Serialize the reduction in a critical section; caller (retval == 1)
    // performs the reduction and then calls __kmpc_end_reduce_nowait().
    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    // retval == 2 tells the compiler-generated code to use atomic updates.
    retval = 2;

    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (it's not quite good, because the checking block has been closed by
    //     this 'pop',
    //      but atomic operation has not been executed yet, will be executed
    //      slightly later, literally on next instruction)
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_reduce, loc);

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

// AT: performance issue: a real barrier here
// AT:     (if master goes slow, other threads are blocked here waiting for the
// master to come and release them)
// AT:     (it's not what a customer might expect specifying NOWAIT clause)
// AT:     (specifying NOWAIT won't result in improvement of performance, it'll
// be confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
// might go faster and be more in line with sense of NOWAIT
// AT: TO DO: do epcc test and compare times

// this barrier should be invisible to a customer and to the threading profile
// tool (it's neither a terminating barrier nor customer's code, it's
// used for an internal purpose)
#if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events
    //         so we set-up the necessary frame/return addresses.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    // Gather-only barrier (wait == FALSE): master returns nonzero and performs
    // the final reduction; workers return 0.
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all other workers except master should do this pop here
    //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
    if (__kmp_env_consistency_check) {
      if (retval == 0) {
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif
  KA_TRACE(
      10,
      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
       global_tid, packed_reduction_method, retval));

  return retval;
}
3534 
3535 /*!
3536 @ingroup SYNCHRONIZATION
3537 @param loc source location information
3538 @param global_tid global thread id.
3539 @param lck pointer to the unique lock data structure
3540 
3541 Finish the execution of a reduce nowait.
3542 */
3543 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3544                               kmp_critical_name *lck) {
3545 
3546   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3547 
3548   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3549 
3550   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3551 
3552   if (packed_reduction_method == critical_reduce_block) {
3553 
3554     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3555 
3556   } else if (packed_reduction_method == empty_reduce_block) {
3557 
3558     // usage: if team size == 1, no synchronization is required ( on Intel
3559     // platforms only )
3560 
3561   } else if (packed_reduction_method == atomic_reduce_block) {
3562 
3563     // neither master nor other workers should get here
3564     //     (code gen does not generate this call in case 2: atomic reduce block)
3565     // actually it's better to remove this elseif at all;
3566     // after removal this value will checked by the 'else' and will assert
3567 
3568   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3569                                    tree_reduce_block)) {
3570 
3571     // only master gets here
3572 
3573   } else {
3574 
3575     // should never reach this block
3576     KMP_ASSERT(0); // "unexpected method"
3577   }
3578 
3579   if (__kmp_env_consistency_check)
3580     __kmp_pop_sync(global_tid, ct_reduce, loc);
3581 
3582   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3583                 global_tid, packed_reduction_method));
3584 
3585   return;
3586 }
3587 
3588 /* 2.a.ii. Reduce Block with a terminating barrier */
3589 
3590 /*!
3591 @ingroup SYNCHRONIZATION
3592 @param loc source location information
3593 @param global_tid global thread number
3594 @param num_vars number of items (variables) to be reduced
3595 @param reduce_size size of data in bytes to be reduced
3596 @param reduce_data pointer to data to be reduced
3597 @param reduce_func callback function providing reduction operation on two
3598 operands and returning result of reduction in lhs_data
3599 @param lck pointer to the unique lock data structure
3600 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3601 threads if atomic reduction needed
3602 
3603 A blocking reduce that includes an implicit barrier.
3604 */
kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                        size_t reduce_size, void *reduce_data,
                        void (*reduce_func)(void *lhs_data, void *rhs_data),
                        kmp_critical_name *lck) {
  KMP_COUNT_BLOCK(REDUCE_wait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if OMP_50_ENABLED
  // Wake the runtime back up if it was soft-paused via omp_pause_resource.
  __kmp_resume_if_soft_paused();
#endif

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

#if OMP_40_ENABLED
  // Swap to the parent team for a reduction inside a teams construct; the
  // swap (if any) is undone before returning.
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  // Method is stored thread-locally and reused by __kmpc_end_reduce()
  // (see the detailed rationale in __kmpc_reduce_nowait()).
  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    // retval == 2 tells the compiler-generated code to use atomic updates.
    retval = 2;

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

// case tree_reduce_block:
// this barrier should be visible to a customer and to the threading profile
// tool (it's a terminating barrier on constructs if NOWAIT not specified)
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident =
        loc; // needed for correct notification of frames
#endif
    // Full barrier (wait == TRUE, unlike the nowait variant): master returns
    // nonzero and performs the final reduction; workers return 0.
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all other workers except master should do this pop here
    // ( none of other workers except master will enter __kmpc_end_reduce() )
    if (__kmp_env_consistency_check) {
      if (retval == 0) { // 0: all other workers; 1: master
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  KA_TRACE(10,
           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
            global_tid, packed_reduction_method, retval));

  return retval;
}
3720 
3721 /*!
3722 @ingroup SYNCHRONIZATION
3723 @param loc source location information
3724 @param global_tid global thread id.
3725 @param lck pointer to the unique lock data structure
3726 
3727 Finish the execution of a blocking reduce.
3728 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3729 start function.
3730 */
void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
                       kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));

#if OMP_40_ENABLED
  // Swap to the parent team for a reduction inside a teams construct; the
  // swap (if any) is undone before returning.
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  // Retrieve the method chosen by the matching __kmpc_reduce() call.
  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  // this barrier should be visible to a customer and to the threading profile
  // tool (it's a terminating barrier on constructs if NOWAIT not specified)

  if (packed_reduction_method == critical_reduce_block) {

    // Release the critical section, then join the construct-terminating
    // barrier with the rest of the team.
    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == empty_reduce_block) {

// usage: if team size==1, no synchronization is required (Intel platforms only)
// but the construct-terminating barrier below is still executed

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == atomic_reduce_block) {

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
// TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master executes here (master releases all other workers)
    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                            global_tid);

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}
3849 
3850 #undef __KMP_GET_REDUCTION_METHOD
3851 #undef __KMP_SET_REDUCTION_METHOD
3852 
3853 /* end of interface to fast scalable reduce routines */
3854 
3855 kmp_uint64 __kmpc_get_taskid() {
3856 
3857   kmp_int32 gtid;
3858   kmp_info_t *thread;
3859 
3860   gtid = __kmp_get_gtid();
3861   if (gtid < 0) {
3862     return 0;
3863   }
3864   thread = __kmp_thread_from_gtid(gtid);
3865   return thread->th.th_current_task->td_task_id;
3866 
3867 } // __kmpc_get_taskid
3868 
3869 kmp_uint64 __kmpc_get_parent_taskid() {
3870 
3871   kmp_int32 gtid;
3872   kmp_info_t *thread;
3873   kmp_taskdata_t *parent_task;
3874 
3875   gtid = __kmp_get_gtid();
3876   if (gtid < 0) {
3877     return 0;
3878   }
3879   thread = __kmp_thread_from_gtid(gtid);
3880   parent_task = thread->th.th_current_task->td_parent;
3881   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3882 
3883 } // __kmpc_get_parent_taskid
3884 
3885 #if OMP_45_ENABLED
3886 /*!
3887 @ingroup WORK_SHARING
3888 @param loc  source location information.
3889 @param gtid  global thread number.
3890 @param num_dims  number of associated doacross loops.
3891 @param dims  info on loops bounds.
3892 
3893 Initialize doacross loop information.
Expect the compiler to send us inclusive bounds,
e.g. for (i = 2; i < 9; i += 2) lo = 2, up = 8, st = 2.
3896 */
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
  // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  // Layout: [0] num_dims, [1] address of shared num_done counter,
  // [2..4] lo/up/st of dim 0, then (range_length, lo, up, st) per extra dim.
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check if shared buffer is not occupied by other loop (idx -
  // __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                       __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
  // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
4016 
4017 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4018   kmp_int32 shft, num_dims, i;
4019   kmp_uint32 flag;
4020   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4021   kmp_info_t *th = __kmp_threads[gtid];
4022   kmp_team_t *team = th->th.th_team;
4023   kmp_disp_t *pr_buf;
4024   kmp_int64 lo, up, st;
4025 
4026   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4027   if (team->t.t_serialized) {
4028     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4029     return; // no dependencies if team is serialized
4030   }
4031 
4032   // calculate sequential iteration number and check out-of-bounds condition
4033   pr_buf = th->th.th_dispatch;
4034   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4035   num_dims = pr_buf->th_doacross_info[0];
4036   lo = pr_buf->th_doacross_info[2];
4037   up = pr_buf->th_doacross_info[3];
4038   st = pr_buf->th_doacross_info[4];
4039   if (st == 1) { // most common case
4040     if (vec[0] < lo || vec[0] > up) {
4041       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4042                     "bounds [%lld,%lld]\n",
4043                     gtid, vec[0], lo, up));
4044       return;
4045     }
4046     iter_number = vec[0] - lo;
4047   } else if (st > 0) {
4048     if (vec[0] < lo || vec[0] > up) {
4049       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4050                     "bounds [%lld,%lld]\n",
4051                     gtid, vec[0], lo, up));
4052       return;
4053     }
4054     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4055   } else { // negative increment
4056     if (vec[0] > lo || vec[0] < up) {
4057       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4058                     "bounds [%lld,%lld]\n",
4059                     gtid, vec[0], lo, up));
4060       return;
4061     }
4062     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4063   }
4064   for (i = 1; i < num_dims; ++i) {
4065     kmp_int64 iter, ln;
4066     kmp_int32 j = i * 4;
4067     ln = pr_buf->th_doacross_info[j + 1];
4068     lo = pr_buf->th_doacross_info[j + 2];
4069     up = pr_buf->th_doacross_info[j + 3];
4070     st = pr_buf->th_doacross_info[j + 4];
4071     if (st == 1) {
4072       if (vec[i] < lo || vec[i] > up) {
4073         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4074                       "bounds [%lld,%lld]\n",
4075                       gtid, vec[i], lo, up));
4076         return;
4077       }
4078       iter = vec[i] - lo;
4079     } else if (st > 0) {
4080       if (vec[i] < lo || vec[i] > up) {
4081         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4082                       "bounds [%lld,%lld]\n",
4083                       gtid, vec[i], lo, up));
4084         return;
4085       }
4086       iter = (kmp_uint64)(vec[i] - lo) / st;
4087     } else { // st < 0
4088       if (vec[i] > lo || vec[i] < up) {
4089         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4090                       "bounds [%lld,%lld]\n",
4091                       gtid, vec[i], lo, up));
4092         return;
4093       }
4094       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4095     }
4096     iter_number = iter + ln * iter_number;
4097   }
4098   shft = iter_number % 32; // use 32-bit granularity
4099   iter_number >>= 5; // divided by 32
4100   flag = 1 << shft;
4101   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4102     KMP_YIELD(TRUE);
4103   }
4104   KMP_MB();
4105   KA_TRACE(20,
4106            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4107             gtid, (iter_number << 5) + shft));
4108 }
4109 
4110 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4111   kmp_int32 shft, num_dims, i;
4112   kmp_uint32 flag;
4113   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4114   kmp_info_t *th = __kmp_threads[gtid];
4115   kmp_team_t *team = th->th.th_team;
4116   kmp_disp_t *pr_buf;
4117   kmp_int64 lo, st;
4118 
4119   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4120   if (team->t.t_serialized) {
4121     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4122     return; // no dependencies if team is serialized
4123   }
4124 
4125   // calculate sequential iteration number (same as in "wait" but no
4126   // out-of-bounds checks)
4127   pr_buf = th->th.th_dispatch;
4128   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4129   num_dims = pr_buf->th_doacross_info[0];
4130   lo = pr_buf->th_doacross_info[2];
4131   st = pr_buf->th_doacross_info[4];
4132   if (st == 1) { // most common case
4133     iter_number = vec[0] - lo;
4134   } else if (st > 0) {
4135     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4136   } else { // negative increment
4137     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4138   }
4139   for (i = 1; i < num_dims; ++i) {
4140     kmp_int64 iter, ln;
4141     kmp_int32 j = i * 4;
4142     ln = pr_buf->th_doacross_info[j + 1];
4143     lo = pr_buf->th_doacross_info[j + 2];
4144     st = pr_buf->th_doacross_info[j + 4];
4145     if (st == 1) {
4146       iter = vec[i] - lo;
4147     } else if (st > 0) {
4148       iter = (kmp_uint64)(vec[i] - lo) / st;
4149     } else { // st < 0
4150       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4151     }
4152     iter_number = iter + ln * iter_number;
4153   }
4154   shft = iter_number % 32; // use 32-bit granularity
4155   iter_number >>= 5; // divided by 32
4156   flag = 1 << shft;
4157   KMP_MB();
4158   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4159     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4160   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4161                 (iter_number << 5) + shft));
4162 }
4163 
void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  // th_doacross_info[1] holds the address of the shared num_done counter
  // (stored by __kmpc_doacross_init); count this thread as finished.
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
4197 #endif
4198 
4199 #if OMP_50_ENABLED
4200 int __kmpc_get_target_offload(void) {
4201   if (!__kmp_init_serial) {
4202     __kmp_serial_initialize();
4203   }
4204   return __kmp_target_offload;
4205 }
4206 
4207 int __kmpc_pause_resource(kmp_pause_status_t level) {
4208   if (!__kmp_init_serial) {
4209     return 1; // Can't pause if runtime is not initialized
4210   }
4211   return __kmp_pause_resource(level);
4212 }
4213 #endif // OMP_50_ENABLED
4214 
4215 // end of file //
4216