1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
28 // flags will be used in future, e.g. to implement openmp_strict library
29 // restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
 * Shut down the runtime library. This is also optional, and even if called will
58  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
59  * zero.
60  */
61 void __kmpc_end(ident_t *loc) {
62   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
63   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
64   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
65   // returns FALSE and __kmpc_end() will unregister this root (it can cause
66   // library shut down).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 #if KMP_OS_WINDOWS && OMPT_SUPPORT
74   // Normal exit process on Windows does not allow worker threads of the final
75   // parallel region to finish reporting their events, so shutting down the
76   // library here fixes the issue at least for the cases where __kmpc_end() is
77   // placed properly.
78   if (ompt_enabled.enabled)
79     __kmp_internal_end_library(__kmp_gtid_get_specific());
80 #endif
81 }
82 
83 /*!
84 @ingroup THREAD_STATES
85 @param loc Source location information.
86 @return The global thread index of the active thread.
87 
88 This function can be called in any context.
89 
If the runtime has only been entered at the outermost level from a
91 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
92 that which would be returned by omp_get_thread_num() in the outermost
93 active parallel construct. (Or zero if there is no active parallel
94 construct, since the master thread is necessarily thread zero).
95 
96 If multiple non-OpenMP threads all enter an OpenMP construct then this
97 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
99 OpenMP thread ids returned by omp_get_thread_num()).
100 */
101 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
102   kmp_int32 gtid = __kmp_entry_gtid();
103 
104   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
105 
106   return gtid;
107 }
108 
109 /*!
110 @ingroup THREAD_STATES
111 @param loc Source location information.
112 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
113 
114 This function can be called in any context.
115 It returns the total number of threads under the control of the OpenMP runtime.
116 That is not a number that can be determined by any OpenMP standard calls, since
117 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, the runtime maintains
underlying threads even when they are not active (since the cost of creating
and destroying OS threads is high), so this call counts all such threads even
if they are not waiting for work.
122 */
123 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
124   KC_TRACE(10,
125            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
126 
127   return TCR_4(__kmp_all_nth);
128 }
129 
130 /*!
131 @ingroup THREAD_STATES
132 @param loc Source location information.
133 @return The thread number of the calling thread in the innermost active parallel
134 construct.
135 */
136 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
137   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
138   return __kmp_tid_from_gtid(__kmp_entry_gtid());
139 }
140 
141 /*!
142 @ingroup THREAD_STATES
143 @param loc Source location information.
144 @return The number of threads in the innermost active parallel construct.
145 */
146 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
147   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
148 
149   return __kmp_entry_thread()->th.th_team->t.t_nproc;
150 }
151 
152 /*!
153  * @ingroup DEPRECATED
154  * @param loc location description
155  *
156  * This function need not be called. It always returns TRUE.
157  */
158 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
159 #ifndef KMP_DEBUG
160 
161   return TRUE;
162 
163 #else
164 
165   const char *semi2;
166   const char *semi3;
167   int line_no;
168 
169   if (__kmp_par_range == 0) {
170     return TRUE;
171   }
172   semi2 = loc->psource;
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   semi2 = strchr(semi2, ';');
177   if (semi2 == NULL) {
178     return TRUE;
179   }
180   semi2 = strchr(semi2 + 1, ';');
181   if (semi2 == NULL) {
182     return TRUE;
183   }
184   if (__kmp_par_range_filename[0]) {
185     const char *name = semi2 - 1;
186     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
187       name--;
188     }
189     if ((*name == '/') || (*name == ';')) {
190       name++;
191     }
192     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
193       return __kmp_par_range < 0;
194     }
195   }
196   semi3 = strchr(semi2 + 1, ';');
197   if (__kmp_par_range_routine[0]) {
198     if ((semi3 != NULL) && (semi3 > semi2) &&
199         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
200       return __kmp_par_range < 0;
201     }
202   }
203   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
204     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
205       return __kmp_par_range > 0;
206     }
207     return __kmp_par_range < 0;
208   }
209   return TRUE;
210 
211 #endif /* KMP_DEBUG */
212 }
213 
214 /*!
215 @ingroup THREAD_STATES
216 @param loc Source location information.
217 @return 1 if this thread is executing inside an active parallel region, zero if
218 not.
219 */
220 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
221   return __kmp_entry_thread()->th.th_root->r.r_active;
222 }
223 
224 /*!
225 @ingroup PARALLEL
226 @param loc source location information
227 @param global_tid global thread number
228 @param num_threads number of threads requested for this parallel construct
229 
230 Set the number of threads to be used by the next fork spawned by this thread.
231 This call is only required if the parallel construct has a `num_threads` clause.
232 */
233 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
234                              kmp_int32 num_threads) {
235   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
236                 global_tid, num_threads));
237 
238   __kmp_push_num_threads(loc, global_tid, num_threads);
239 }
240 
241 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
242   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
243 
  /* the num_threads value is automatically popped */
245 }
246 
247 #if OMP_40_ENABLED
248 
249 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
250                            kmp_int32 proc_bind) {
251   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
252                 proc_bind));
253 
254   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
255 }
256 
257 #endif /* OMP_40_ENABLED */
258 
259 /*!
260 @ingroup PARALLEL
261 @param loc  source location information
262 @param argc  total number of arguments in the ellipsis
263 @param microtask  pointer to callback routine consisting of outlined parallel
264 construct
265 @param ...  pointers to shared variables that aren't global
266 
267 Do the actual fork and call the microtask in the relevant number of threads.
268 */
269 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
270   int gtid = __kmp_entry_gtid();
271 
272 #if (KMP_STATS_ENABLED)
273   // If we were in a serial region, then stop the serial timer, record
274   // the event, and start parallel region timer
275   stats_state_e previous_state = KMP_GET_THREAD_STATE();
276   if (previous_state == stats_state_e::SERIAL_REGION) {
277     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
278   } else {
279     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
280   }
281   int inParallel = __kmpc_in_parallel(loc);
282   if (inParallel) {
283     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
284   } else {
285     KMP_COUNT_BLOCK(OMP_PARALLEL);
286   }
287 #endif
288 
  // maybe saving thr_state is enough here
290   {
291     va_list ap;
292     va_start(ap, microtask);
293 
294 #if OMPT_SUPPORT
295     ompt_frame_t *ompt_frame;
296     if (ompt_enabled.enabled) {
297       kmp_info_t *master_th = __kmp_threads[gtid];
298       kmp_team_t *parent_team = master_th->th.th_team;
299       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
300       if (lwt)
301         ompt_frame = &(lwt->ompt_task_info.frame);
302       else {
303         int tid = __kmp_tid_from_gtid(gtid);
304         ompt_frame = &(
305             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
306       }
307       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
308       OMPT_STORE_RETURN_ADDRESS(gtid);
309     }
310 #endif
311 
312 #if INCLUDE_SSC_MARKS
313     SSC_MARK_FORKING();
314 #endif
315     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
316                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
317                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
318 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
319 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
320                     &ap
321 #else
322                     ap
323 #endif
324                     );
325 #if INCLUDE_SSC_MARKS
326     SSC_MARK_JOINING();
327 #endif
328     __kmp_join_call(loc, gtid
329 #if OMPT_SUPPORT
330                     ,
331                     fork_context_intel
332 #endif
333                     );
334 
335     va_end(ap);
336   }
337 
338 #if KMP_STATS_ENABLED
339   if (previous_state == stats_state_e::SERIAL_REGION) {
340     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
341   } else {
342     KMP_POP_PARTITIONED_TIMER();
343   }
344 #endif // KMP_STATS_ENABLED
345 }
346 
347 #if OMP_40_ENABLED
348 /*!
349 @ingroup PARALLEL
350 @param loc source location information
351 @param global_tid global thread number
352 @param num_teams number of teams requested for the teams construct
353 @param num_threads number of threads per team requested for the teams construct
354 
355 Set the number of teams to be used by the teams construct.
356 This call is only required if the teams construct has a `num_teams` clause
357 or a `thread_limit` clause (or both).
358 */
359 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
360                            kmp_int32 num_teams, kmp_int32 num_threads) {
361   KA_TRACE(20,
362            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
363             global_tid, num_teams, num_threads));
364 
365   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
366 }
367 
368 /*!
369 @ingroup PARALLEL
370 @param loc  source location information
371 @param argc  total number of arguments in the ellipsis
372 @param microtask  pointer to callback routine consisting of outlined teams
373 construct
374 @param ...  pointers to shared variables that aren't global
375 
376 Do the actual fork and call the microtask in the relevant number of threads.
377 */
378 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
379                        ...) {
380   int gtid = __kmp_entry_gtid();
381   kmp_info_t *this_thr = __kmp_threads[gtid];
382   va_list ap;
383   va_start(ap, microtask);
384 
385 #if KMP_STATS_ENABLED
386   KMP_COUNT_BLOCK(OMP_TEAMS);
387   stats_state_e previous_state = KMP_GET_THREAD_STATE();
388   if (previous_state == stats_state_e::SERIAL_REGION) {
389     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
390   } else {
391     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
392   }
393 #endif
394 
395   // remember teams entry point and nesting level
396   this_thr->th.th_teams_microtask = microtask;
397   this_thr->th.th_teams_level =
398       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
399 
400 #if OMPT_SUPPORT
401   kmp_team_t *parent_team = this_thr->th.th_team;
402   int tid = __kmp_tid_from_gtid(gtid);
403   if (ompt_enabled.enabled) {
404     parent_team->t.t_implicit_task_taskdata[tid]
405         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
406   }
407   OMPT_STORE_RETURN_ADDRESS(gtid);
408 #endif
409 
  // Check whether __kmpc_push_num_teams was called; if not, set the default
  // number of teams.
412   if (this_thr->th.th_teams_size.nteams == 0) {
413     __kmp_push_num_teams(loc, gtid, 0, 0);
414   }
415   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
416   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
417   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
418 
419   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
420                   VOLATILE_CAST(microtask_t)
421                       __kmp_teams_master, // "wrapped" task
422                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
423 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
424                   &ap
425 #else
426                   ap
427 #endif
428                   );
429   __kmp_join_call(loc, gtid
430 #if OMPT_SUPPORT
431                   ,
432                   fork_context_intel
433 #endif
434                   );
435 
436   // Pop current CG root off list
437   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
438   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
439   this_thr->th.th_cg_roots = tmp->up;
440   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
441                  " to node %p. cg_nthreads was %d\n",
442                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
443   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
444   int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
446     __kmp_free(tmp);
447   }
448   // Restore current task's thread_limit from CG root
449   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
450   this_thr->th.th_current_task->td_icvs.thread_limit =
451       this_thr->th.th_cg_roots->cg_thread_limit;
452 
453   this_thr->th.th_teams_microtask = NULL;
454   this_thr->th.th_teams_level = 0;
455   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
456   va_end(ap);
457 #if KMP_STATS_ENABLED
458   if (previous_state == stats_state_e::SERIAL_REGION) {
459     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
460   } else {
461     KMP_POP_PARTITIONED_TIMER();
462   }
463 #endif // KMP_STATS_ENABLED
464 }
465 #endif /* OMP_40_ENABLED */
466 
467 // I don't think this function should ever have been exported.
468 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
469 // openmp code ever called it, but it's been exported from the RTL for so
470 // long that I'm afraid to remove the definition.
471 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
472 
473 /*!
474 @ingroup PARALLEL
475 @param loc  source location information
476 @param global_tid  global thread number
477 
478 Enter a serialized parallel construct. This interface is used to handle a
479 conditional parallel region, like this,
480 @code
481 #pragma omp parallel if (condition)
482 @endcode
483 when the condition is false.
484 */
485 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
486 // The implementation is now in kmp_runtime.cpp so that it can share static
487 // functions with kmp_fork_call since the tasks to be done are similar in
488 // each case.
489 #if OMPT_SUPPORT
490   OMPT_STORE_RETURN_ADDRESS(global_tid);
491 #endif
492   __kmp_serialized_parallel(loc, global_tid);
493 }
494 
495 /*!
496 @ingroup PARALLEL
497 @param loc  source location information
498 @param global_tid  global thread number
499 
500 Leave a serialized parallel construct.
501 */
502 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
503   kmp_internal_control_t *top;
504   kmp_info_t *this_thr;
505   kmp_team_t *serial_team;
506 
507   KC_TRACE(10,
508            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
509 
510   /* skip all this code for autopar serialized loops since it results in
511      unacceptable overhead */
512   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
513     return;
514 
515   // Not autopar code
516   if (!TCR_4(__kmp_init_parallel))
517     __kmp_parallel_initialize();
518 
519 #if OMP_50_ENABLED
520   __kmp_resume_if_soft_paused();
521 #endif
522 
523   this_thr = __kmp_threads[global_tid];
524   serial_team = this_thr->th.th_serial_team;
525 
526 #if OMP_45_ENABLED
527   kmp_task_team_t *task_team = this_thr->th.th_task_team;
528 
529   // we need to wait for the proxy tasks before finishing the thread
530   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
531     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
532 #endif
533 
534   KMP_MB();
535   KMP_DEBUG_ASSERT(serial_team);
536   KMP_ASSERT(serial_team->t.t_serialized);
537   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
538   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
539   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
540   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
541 
542 #if OMPT_SUPPORT
543   if (ompt_enabled.enabled &&
544       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
545     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
546     if (ompt_enabled.ompt_callback_implicit_task) {
547       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
548           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
549           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
550     }
551 
    // clear the task id only after unlinking the task
553     ompt_data_t *parent_task_data;
554     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
555 
556     if (ompt_enabled.ompt_callback_parallel_end) {
557       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
558           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
559           ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
560     }
561     __ompt_lw_taskteam_unlink(this_thr);
562     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
563   }
564 #endif
565 
566   /* If necessary, pop the internal control stack values and replace the team
567    * values */
568   top = serial_team->t.t_control_stack_top;
569   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
570     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
571     serial_team->t.t_control_stack_top = top->next;
572     __kmp_free(top);
573   }
574 
575   // if( serial_team -> t.t_serialized > 1 )
576   serial_team->t.t_level--;
577 
578   /* pop dispatch buffers stack */
579   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
580   {
581     dispatch_private_info_t *disp_buffer =
582         serial_team->t.t_dispatch->th_disp_buffer;
583     serial_team->t.t_dispatch->th_disp_buffer =
584         serial_team->t.t_dispatch->th_disp_buffer->next;
585     __kmp_free(disp_buffer);
586   }
587 #if OMP_50_ENABLED
588   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
589 #endif
590 
591   --serial_team->t.t_serialized;
592   if (serial_team->t.t_serialized == 0) {
593 
594 /* return to the parallel section */
595 
596 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
597     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
598       __kmp_clear_x87_fpu_status_word();
599       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
600       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
601     }
602 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
603 
604     this_thr->th.th_team = serial_team->t.t_parent;
605     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
606 
607     /* restore values cached in the thread */
608     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
609     this_thr->th.th_team_master =
610         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
611     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
612 
613     /* TODO the below shouldn't need to be adjusted for serialized teams */
614     this_thr->th.th_dispatch =
615         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
616 
617     __kmp_pop_current_task_from_thread(this_thr);
618 
619     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
620     this_thr->th.th_current_task->td_flags.executing = 1;
621 
622     if (__kmp_tasking_mode != tskm_immediate_exec) {
623       // Copy the task team from the new child / old parent team to the thread.
624       this_thr->th.th_task_team =
625           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
626       KA_TRACE(20,
627                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
628                 "team %p\n",
629                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
630     }
631   } else {
632     if (__kmp_tasking_mode != tskm_immediate_exec) {
633       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
634                     "depth of serial team %p to %d\n",
635                     global_tid, serial_team, serial_team->t.t_serialized));
636     }
637   }
638 
639   if (__kmp_env_consistency_check)
640     __kmp_pop_parallel(global_tid, NULL);
641 #if OMPT_SUPPORT
642   if (ompt_enabled.enabled)
643     this_thr->th.ompt_thread_info.state =
644         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
645                                            : ompt_state_work_parallel);
646 #endif
647 }
648 
649 /*!
650 @ingroup SYNCHRONIZATION
651 @param loc  source location information.
652 
653 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
654 depending on the memory ordering convention obeyed by the compiler
655 even that may not be necessary).
656 */
657 void __kmpc_flush(ident_t *loc) {
658   KC_TRACE(10, ("__kmpc_flush: called\n"));
659 
  /* need an explicit __mf() here since the library uses volatile instead */
661   KMP_MB(); /* Flush all pending memory write invalidates.  */
662 
663 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
664 #if KMP_MIC
665 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
666 // We shouldn't need it, though, since the ABI rules require that
667 // * If the compiler generates NGO stores it also generates the fence
668 // * If users hand-code NGO stores they should insert the fence
669 // therefore no incomplete unordered stores should be visible.
670 #else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
677   if (!__kmp_cpuinfo.initialized) {
678     __kmp_query_cpuid(&__kmp_cpuinfo);
679   }
680   if (!__kmp_cpuinfo.sse2) {
681     // CPU cannot execute SSE2 instructions.
682   } else {
683 #if KMP_COMPILER_ICC
684     _mm_mfence();
685 #elif KMP_COMPILER_MSVC
686     MemoryBarrier();
687 #else
688     __sync_synchronize();
689 #endif // KMP_COMPILER_ICC
690   }
691 #endif // KMP_MIC
692 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
693 // Nothing to see here move along
694 #elif KMP_ARCH_PPC64
695 // Nothing needed here (we have a real MB above).
696 #if KMP_OS_CNK
697   // The flushing thread needs to yield here; this prevents a
698   // busy-waiting thread from saturating the pipeline. flush is
699   // often used in loops like this:
700   // while (!flag) {
701   //   #pragma omp flush(flag)
702   // }
703   // and adding the yield here is good for at least a 10x speedup
704   // when running >2 threads per core (on the NAS LU benchmark).
705   __kmp_yield();
706 #endif
707 #else
708 #error Unknown or unsupported architecture
709 #endif
710 
711 #if OMPT_SUPPORT && OMPT_OPTIONAL
712   if (ompt_enabled.ompt_callback_flush) {
713     ompt_callbacks.ompt_callback(ompt_callback_flush)(
714         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
715   }
716 #endif
717 }
718 
719 /* -------------------------------------------------------------------------- */
720 /*!
721 @ingroup SYNCHRONIZATION
722 @param loc source location information
723 @param global_tid thread id.
724 
725 Execute a barrier.
726 */
727 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
728   KMP_COUNT_BLOCK(OMP_BARRIER);
729   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
730 
731   if (!TCR_4(__kmp_init_parallel))
732     __kmp_parallel_initialize();
733 
734 #if OMP_50_ENABLED
735   __kmp_resume_if_soft_paused();
736 #endif
737 
738   if (__kmp_env_consistency_check) {
739     if (loc == 0) {
740       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
741     }
742 
743     __kmp_check_barrier(global_tid, ct_barrier, loc);
744   }
745 
746 #if OMPT_SUPPORT
747   ompt_frame_t *ompt_frame;
748   if (ompt_enabled.enabled) {
749     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
750     if (ompt_frame->enter_frame.ptr == NULL)
751       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
752     OMPT_STORE_RETURN_ADDRESS(global_tid);
753   }
754 #endif
755   __kmp_threads[global_tid]->th.th_ident = loc;
756   // TODO: explicit barrier_wait_id:
  //   this function is called when the 'barrier' directive is present or at
  //   the implicit barrier at the end of a worksharing construct.
759   // 1) better to add a per-thread barrier counter to a thread data structure
760   // 2) set to 0 when a new team is created
761   // 4) no sync is required
762 
763   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
764 #if OMPT_SUPPORT && OMPT_OPTIONAL
765   if (ompt_enabled.enabled) {
766     ompt_frame->enter_frame = ompt_data_none;
767   }
768 #endif
769 }
770 
771 /* The BARRIER for a MASTER section is always explicit   */
772 /*!
773 @ingroup WORK_SHARING
774 @param loc  source location information.
@param global_tid  global thread number.
776 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
777 */
778 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
779   int status = 0;
780 
781   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
782 
783   if (!TCR_4(__kmp_init_parallel))
784     __kmp_parallel_initialize();
785 
786 #if OMP_50_ENABLED
787   __kmp_resume_if_soft_paused();
788 #endif
789 
790   if (KMP_MASTER_GTID(global_tid)) {
791     KMP_COUNT_BLOCK(OMP_MASTER);
792     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
793     status = 1;
794   }
795 
796 #if OMPT_SUPPORT && OMPT_OPTIONAL
797   if (status) {
798     if (ompt_enabled.ompt_callback_master) {
799       kmp_info_t *this_thr = __kmp_threads[global_tid];
800       kmp_team_t *team = this_thr->th.th_team;
801 
802       int tid = __kmp_tid_from_gtid(global_tid);
803       ompt_callbacks.ompt_callback(ompt_callback_master)(
804           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
805           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
806           OMPT_GET_RETURN_ADDRESS(0));
807     }
808   }
809 #endif
810 
811   if (__kmp_env_consistency_check) {
812 #if KMP_USE_DYNAMIC_LOCK
813     if (status)
814       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
815     else
816       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
817 #else
818     if (status)
819       __kmp_push_sync(global_tid, ct_master, loc, NULL);
820     else
821       __kmp_check_sync(global_tid, ct_master, loc, NULL);
822 #endif
823   }
824 
825   return status;
826 }
827 
828 /*!
829 @ingroup WORK_SHARING
830 @param loc  source location information.
@param global_tid  global thread number.
832 
833 Mark the end of a <tt>master</tt> region. This should only be called by the
834 thread that executes the <tt>master</tt> region.
835 */
836 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
837   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
838 
839   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
840   KMP_POP_PARTITIONED_TIMER();
841 
842 #if OMPT_SUPPORT && OMPT_OPTIONAL
843   kmp_info_t *this_thr = __kmp_threads[global_tid];
844   kmp_team_t *team = this_thr->th.th_team;
845   if (ompt_enabled.ompt_callback_master) {
846     int tid = __kmp_tid_from_gtid(global_tid);
847     ompt_callbacks.ompt_callback(ompt_callback_master)(
848         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
849         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
850         OMPT_GET_RETURN_ADDRESS(0));
851   }
852 #endif
853 
854   if (__kmp_env_consistency_check) {
855     if (global_tid < 0)
856       KMP_WARNING(ThreadIdentInvalid);
857 
858     if (KMP_MASTER_GTID(global_tid))
859       __kmp_pop_sync(global_tid, ct_master, loc);
860   }
861 }
862 
863 /*!
864 @ingroup WORK_SHARING
865 @param loc  source location information.
866 @param gtid  global thread number.
867 
868 Start execution of an <tt>ordered</tt> construct.
869 */
870 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
871   int cid = 0;
872   kmp_info_t *th;
873   KMP_DEBUG_ASSERT(__kmp_init_serial);
874 
875   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
876 
877   if (!TCR_4(__kmp_init_parallel))
878     __kmp_parallel_initialize();
879 
880 #if OMP_50_ENABLED
881   __kmp_resume_if_soft_paused();
882 #endif
883 
884 #if USE_ITT_BUILD
885   __kmp_itt_ordered_prep(gtid);
886 // TODO: ordered_wait_id
887 #endif /* USE_ITT_BUILD */
888 
889   th = __kmp_threads[gtid];
890 
891 #if OMPT_SUPPORT && OMPT_OPTIONAL
892   kmp_team_t *team;
893   ompt_wait_id_t lck;
894   void *codeptr_ra;
895   if (ompt_enabled.enabled) {
896     OMPT_STORE_RETURN_ADDRESS(gtid);
897     team = __kmp_team_from_gtid(gtid);
898     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
899     /* OMPT state update */
900     th->th.ompt_thread_info.wait_id = lck;
901     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
902 
903     /* OMPT event callback */
904     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
905     if (ompt_enabled.ompt_callback_mutex_acquire) {
906       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
907           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
908           codeptr_ra);
909     }
910   }
911 #endif
912 
913   if (th->th.th_dispatch->th_deo_fcn != 0)
914     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
915   else
916     __kmp_parallel_deo(&gtid, &cid, loc);
917 
918 #if OMPT_SUPPORT && OMPT_OPTIONAL
919   if (ompt_enabled.enabled) {
920     /* OMPT state update */
921     th->th.ompt_thread_info.state = ompt_state_work_parallel;
922     th->th.ompt_thread_info.wait_id = 0;
923 
924     /* OMPT event callback */
925     if (ompt_enabled.ompt_callback_mutex_acquired) {
926       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
927           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
928     }
929   }
930 #endif
931 
932 #if USE_ITT_BUILD
933   __kmp_itt_ordered_start(gtid);
934 #endif /* USE_ITT_BUILD */
935 }
936 
937 /*!
938 @ingroup WORK_SHARING
939 @param loc  source location information.
940 @param gtid  global thread number.
941 
942 End execution of an <tt>ordered</tt> construct.
943 */
944 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
945   int cid = 0;
946   kmp_info_t *th;
947 
948   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
949 
950 #if USE_ITT_BUILD
951   __kmp_itt_ordered_end(gtid);
952 // TODO: ordered_wait_id
953 #endif /* USE_ITT_BUILD */
954 
955   th = __kmp_threads[gtid];
956 
957   if (th->th.th_dispatch->th_dxo_fcn != 0)
958     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
959   else
960     __kmp_parallel_dxo(&gtid, &cid, loc);
961 
962 #if OMPT_SUPPORT && OMPT_OPTIONAL
963   OMPT_STORE_RETURN_ADDRESS(gtid);
964   if (ompt_enabled.ompt_callback_mutex_released) {
965     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
966         ompt_mutex_ordered,
967         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
968             ->t.t_ordered.dt.t_value,
969         OMPT_LOAD_RETURN_ADDRESS(gtid));
970   }
971 #endif
972 }
973 
974 #if KMP_USE_DYNAMIC_LOCK
975 
976 static __forceinline void
977 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
978                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
979   // Pointer to the allocated indirect lock is written to crit, while indexing
980   // is ignored.
981   void *idx;
982   kmp_indirect_lock_t **lck;
983   lck = (kmp_indirect_lock_t **)crit;
984   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
985   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
986   KMP_SET_I_LOCK_LOCATION(ilk, loc);
987   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
988   KA_TRACE(20,
989            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
990 #if USE_ITT_BUILD
991   __kmp_itt_critical_creating(ilk->lock, loc);
992 #endif
993   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
994   if (status == 0) {
995 #if USE_ITT_BUILD
996     __kmp_itt_critical_destroyed(ilk->lock);
997 #endif
998     // We don't really need to destroy the unclaimed lock here since it will be
999     // cleaned up at program exit.
1000     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1001   }
1002   KMP_DEBUG_ASSERT(*lck != NULL);
1003 }
1004 
1005 // Fast-path acquire tas lock
1006 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1007   {                                                                            \
1008     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1009     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1010     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1011     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1012         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1013       kmp_uint32 spins;                                                        \
1014       KMP_FSYNC_PREPARE(l);                                                    \
1015       KMP_INIT_YIELD(spins);                                                   \
1016       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1017       do {                                                                     \
1018         if (TCR_4(__kmp_nth) >                                                 \
1019             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1020           KMP_YIELD(TRUE);                                                     \
1021         } else {                                                               \
1022           KMP_YIELD_SPIN(spins);                                               \
1023         }                                                                      \
1024         __kmp_spin_backoff(&backoff);                                          \
1025       } while (                                                                \
1026           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1027           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1028     }                                                                          \
1029     KMP_FSYNC_ACQUIRED(l);                                                     \
1030   }
1031 
1032 // Fast-path test tas lock
1033 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1034   {                                                                            \
1035     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1036     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1037     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1038     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1039          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1040   }
1041 
1042 // Fast-path release tas lock
1043 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1044   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
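
/* Conceptual sketch (not used by the runtime): the TAS fast path above is a
   test-and-test-and-set spin lock with acquire semantics and backoff. In
   std::atomic terms it is roughly:

     std::atomic<kmp_int32> poll;               // holds tas_free when unlocked
     void acquire(kmp_int32 tas_free, kmp_int32 tas_busy) {
       kmp_int32 expected = tas_free;
       while (poll.load(std::memory_order_relaxed) != tas_free ||
              !poll.compare_exchange_strong(expected, tas_busy,
                                            std::memory_order_acquire)) {
         expected = tas_free; // compare_exchange overwrites it on failure
         // yield or back off here, as the macro does via KMP_YIELD/backoff
       }
     }
*/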
1045 
1046 #if KMP_USE_FUTEX
1047 
1048 #include <sys/syscall.h>
1049 #include <unistd.h>
1050 #ifndef FUTEX_WAIT
1051 #define FUTEX_WAIT 0
1052 #endif
1053 #ifndef FUTEX_WAKE
1054 #define FUTEX_WAKE 1
1055 #endif
1056 
1057 // Fast-path acquire futex lock
1058 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1059   {                                                                            \
1060     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1061     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1062     KMP_MB();                                                                  \
1063     KMP_FSYNC_PREPARE(ftx);                                                    \
1064     kmp_int32 poll_val;                                                        \
1065     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1066                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1067                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1068       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1069       if (!cond) {                                                             \
1070         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1071                                          poll_val |                            \
1072                                              KMP_LOCK_BUSY(1, futex))) {       \
1073           continue;                                                            \
1074         }                                                                      \
1075         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1076       }                                                                        \
1077       kmp_int32 rc;                                                            \
1078       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1079                         NULL, NULL, 0)) != 0) {                                \
1080         continue;                                                              \
1081       }                                                                        \
1082       gtid_code |= 1;                                                          \
1083     }                                                                          \
1084     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1085   }
1086 
1087 // Fast-path test futex lock
1088 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1089   {                                                                            \
1090     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1091     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1092                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1093       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1094       rc = TRUE;                                                               \
1095     } else {                                                                   \
1096       rc = FALSE;                                                              \
1097     }                                                                          \
1098   }
1099 
1100 // Fast-path release futex lock
1101 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1102   {                                                                            \
1103     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1104     KMP_MB();                                                                  \
1105     KMP_FSYNC_RELEASING(ftx);                                                  \
1106     kmp_int32 poll_val =                                                       \
1107         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1108     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1109       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1110               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1111     }                                                                          \
1112     KMP_MB();                                                                  \
1113     KMP_YIELD_OVERSUB();                                                       \
1114   }
1115 
1116 #endif // KMP_USE_FUTEX
1117 
1118 #else // KMP_USE_DYNAMIC_LOCK
1119 
1120 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1121                                                       ident_t const *loc,
1122                                                       kmp_int32 gtid) {
1123   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1124 
1125   // Because of the double-check, the following load doesn't need to be volatile
1126   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1127 
1128   if (lck == NULL) {
1129     void *idx;
1130 
1131     // Allocate & initialize the lock.
1132     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1133     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1134     __kmp_init_user_lock_with_checks(lck);
1135     __kmp_set_user_lock_location(lck, loc);
1136 #if USE_ITT_BUILD
1137     __kmp_itt_critical_creating(lck);
1138 // __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed with no usage, but that is not
// a problem, because this is not a real event seen by the user but rather the
// setting of a name for an object (the lock). See more details in kmp_itt.h.
1143 #endif /* USE_ITT_BUILD */
1144 
1145     // Use a cmpxchg instruction to slam the start of the critical section with
1146     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1147     // and use the lock that the other thread allocated.
1148     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1149 
1150     if (status == 0) {
1151 // Deallocate the lock and reload the value.
1152 #if USE_ITT_BUILD
1153       __kmp_itt_critical_destroyed(lck);
1154 // Let ITT know the lock is destroyed and the same memory location may be reused
1155 // for another purpose.
1156 #endif /* USE_ITT_BUILD */
1157       __kmp_destroy_user_lock_with_checks(lck);
1158       __kmp_user_lock_free(&idx, gtid, lck);
1159       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1160       KMP_DEBUG_ASSERT(lck != NULL);
1161     }
1162   }
1163   return lck;
1164 }
1165 
1166 #endif // KMP_USE_DYNAMIC_LOCK
1167 
1168 /*!
1169 @ingroup WORK_SHARING
1170 @param loc  source location information.
@param global_tid  global thread number.
1172 @param crit identity of the critical section. This could be a pointer to a lock
1173 associated with the critical section, or some other suitably unique value.
1174 
1175 Enter code protected by a `critical` construct.
1176 This function blocks until the executing thread can enter the critical section.
1177 */
1178 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1179                      kmp_critical_name *crit) {
1180 #if KMP_USE_DYNAMIC_LOCK
1181 #if OMPT_SUPPORT && OMPT_OPTIONAL
1182   OMPT_STORE_RETURN_ADDRESS(global_tid);
1183 #endif // OMPT_SUPPORT
1184   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1185 #else
1186   KMP_COUNT_BLOCK(OMP_CRITICAL);
1187 #if OMPT_SUPPORT && OMPT_OPTIONAL
1188   ompt_state_t prev_state = ompt_state_undefined;
1189   ompt_thread_info_t ti;
1190 #endif
1191   kmp_user_lock_p lck;
1192 
1193   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1194 
1195   // TODO: add THR_OVHD_STATE
1196 
1197   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1198   KMP_CHECK_USER_LOCK_INIT();
1199 
1200   if ((__kmp_user_lock_kind == lk_tas) &&
1201       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1202     lck = (kmp_user_lock_p)crit;
1203   }
1204 #if KMP_USE_FUTEX
1205   else if ((__kmp_user_lock_kind == lk_futex) &&
1206            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1207     lck = (kmp_user_lock_p)crit;
1208   }
1209 #endif
1210   else { // ticket, queuing or drdpa
1211     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1212   }
1213 
1214   if (__kmp_env_consistency_check)
1215     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1216 
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team.
// Also, even if we are the uber thread, we still have to acquire the lock,
// as we have to contend with sibling threads.
1221 
1222 #if USE_ITT_BUILD
1223   __kmp_itt_critical_acquiring(lck);
1224 #endif /* USE_ITT_BUILD */
1225 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1227   void *codeptr_ra = NULL;
1228   if (ompt_enabled.enabled) {
1229     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1230     /* OMPT state update */
1231     prev_state = ti.state;
1232     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1233     ti.state = ompt_state_wait_critical;
1234 
1235     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1237     if (ompt_enabled.ompt_callback_mutex_acquire) {
1238       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1239           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1240           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1241     }
1242   }
1243 #endif
  // The value of 'crit' should be usable as a critical_id for the critical
  // section directive.
1246   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1247 
1248 #if USE_ITT_BUILD
1249   __kmp_itt_critical_acquired(lck);
1250 #endif /* USE_ITT_BUILD */
1251 #if OMPT_SUPPORT && OMPT_OPTIONAL
1252   if (ompt_enabled.enabled) {
1253     /* OMPT state update */
1254     ti.state = prev_state;
1255     ti.wait_id = 0;
1256 
1257     /* OMPT event callback */
1258     if (ompt_enabled.ompt_callback_mutex_acquired) {
1259       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1260           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1261     }
1262   }
1263 #endif
1264   KMP_POP_PARTITIONED_TIMER();
1265 
1266   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1267   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1268 #endif // KMP_USE_DYNAMIC_LOCK
1269 }
1270 
1271 #if KMP_USE_DYNAMIC_LOCK
1272 
1273 // Converts the given hint to an internal lock implementation
1274 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1275 #if KMP_USE_TSX
1276 #define KMP_TSX_LOCK(seq) lockseq_##seq
1277 #else
1278 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1279 #endif
1280 
1281 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1282 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1283 #else
1284 #define KMP_CPUINFO_RTM 0
1285 #endif
1286 
1287   // Hints that do not require further logic
1288   if (hint & kmp_lock_hint_hle)
1289     return KMP_TSX_LOCK(hle);
1290   if (hint & kmp_lock_hint_rtm)
1291     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1292   if (hint & kmp_lock_hint_adaptive)
1293     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1294 
1295   // Rule out conflicting hints first by returning the default lock
1296   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1297     return __kmp_user_lock_seq;
1298   if ((hint & omp_lock_hint_speculative) &&
1299       (hint & omp_lock_hint_nonspeculative))
1300     return __kmp_user_lock_seq;
1301 
1302   // Do not even consider speculation when it appears to be contended
1303   if (hint & omp_lock_hint_contended)
1304     return lockseq_queuing;
1305 
1306   // Uncontended lock without speculation
1307   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1308     return lockseq_tas;
1309 
1310   // HLE lock for speculation
1311   if (hint & omp_lock_hint_speculative)
1312     return KMP_TSX_LOCK(hle);
1313 
1314   return __kmp_user_lock_seq;
1315 }
1316 
1317 #if OMPT_SUPPORT && OMPT_OPTIONAL
1318 #if KMP_USE_DYNAMIC_LOCK
1319 static kmp_mutex_impl_t
1320 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1321   if (user_lock) {
1322     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1323     case 0:
1324       break;
1325 #if KMP_USE_FUTEX
1326     case locktag_futex:
1327       return kmp_mutex_impl_queuing;
1328 #endif
1329     case locktag_tas:
1330       return kmp_mutex_impl_spin;
1331 #if KMP_USE_TSX
1332     case locktag_hle:
1333       return kmp_mutex_impl_speculative;
1334 #endif
1335     default:
1336       return kmp_mutex_impl_none;
1337     }
1338     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1339   }
1340   KMP_ASSERT(ilock);
1341   switch (ilock->type) {
1342 #if KMP_USE_TSX
1343   case locktag_adaptive:
1344   case locktag_rtm:
1345     return kmp_mutex_impl_speculative;
1346 #endif
1347   case locktag_nested_tas:
1348     return kmp_mutex_impl_spin;
1349 #if KMP_USE_FUTEX
1350   case locktag_nested_futex:
1351 #endif
1352   case locktag_ticket:
1353   case locktag_queuing:
1354   case locktag_drdpa:
1355   case locktag_nested_ticket:
1356   case locktag_nested_queuing:
1357   case locktag_nested_drdpa:
1358     return kmp_mutex_impl_queuing;
1359   default:
1360     return kmp_mutex_impl_none;
1361   }
1362 }
1363 #else
1364 // For locks without dynamic binding
1365 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1366   switch (__kmp_user_lock_kind) {
1367   case lk_tas:
1368     return kmp_mutex_impl_spin;
1369 #if KMP_USE_FUTEX
1370   case lk_futex:
1371 #endif
1372   case lk_ticket:
1373   case lk_queuing:
1374   case lk_drdpa:
1375     return kmp_mutex_impl_queuing;
1376 #if KMP_USE_TSX
1377   case lk_hle:
1378   case lk_rtm:
1379   case lk_adaptive:
1380     return kmp_mutex_impl_speculative;
1381 #endif
1382   default:
1383     return kmp_mutex_impl_none;
1384   }
1385 }
1386 #endif // KMP_USE_DYNAMIC_LOCK
1387 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1388 
1389 /*!
1390 @ingroup WORK_SHARING
1391 @param loc  source location information.
1392 @param global_tid  global thread number.
1393 @param crit identity of the critical section. This could be a pointer to a lock
1394 associated with the critical section, or some other suitably unique value.
1395 @param hint the lock hint.
1396 
1397 Enter code protected by a `critical` construct with a hint. The hint value is
1398 used to suggest a lock implementation. This function blocks until the executing
1399 thread can enter the critical section unless the hint suggests use of
1400 speculative execution and the hardware supports it.
1401 */
1402 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1403                                kmp_critical_name *crit, uint32_t hint) {
1404   KMP_COUNT_BLOCK(OMP_CRITICAL);
1405   kmp_user_lock_p lck;
1406 #if OMPT_SUPPORT && OMPT_OPTIONAL
1407   ompt_state_t prev_state = ompt_state_undefined;
1408   ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
1410   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1411   if (!codeptr)
1412     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1413 #endif
1414 
1415   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1416 
1417   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1418   // Check if it is initialized.
1419   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1420   if (*lk == 0) {
1421     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1422     if (KMP_IS_D_LOCK(lckseq)) {
1423       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1424                                   KMP_GET_D_TAG(lckseq));
1425     } else {
1426       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1427     }
1428   }
1429   // Branch for accessing the actual lock object and set operation. This
1430   // branching is inevitable since this lock initialization does not follow the
1431   // normal dispatch path (lock table is not used).
1432   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1433     lck = (kmp_user_lock_p)lk;
1434     if (__kmp_env_consistency_check) {
1435       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1436                       __kmp_map_hint_to_lock(hint));
1437     }
1438 #if USE_ITT_BUILD
1439     __kmp_itt_critical_acquiring(lck);
1440 #endif
1441 #if OMPT_SUPPORT && OMPT_OPTIONAL
1442     if (ompt_enabled.enabled) {
1443       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1444       /* OMPT state update */
1445       prev_state = ti.state;
1446       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1447       ti.state = ompt_state_wait_critical;
1448 
1449       /* OMPT event callback */
1450       if (ompt_enabled.ompt_callback_mutex_acquire) {
1451         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1452             ompt_mutex_critical, (unsigned int)hint,
1453             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1454             codeptr);
1455       }
1456     }
1457 #endif
1458 #if KMP_USE_INLINED_TAS
1459     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1460       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1461     } else
1462 #elif KMP_USE_INLINED_FUTEX
1463     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1464       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1465     } else
1466 #endif
1467     {
1468       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1469     }
1470   } else {
1471     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1472     lck = ilk->lock;
1473     if (__kmp_env_consistency_check) {
1474       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1475                       __kmp_map_hint_to_lock(hint));
1476     }
1477 #if USE_ITT_BUILD
1478     __kmp_itt_critical_acquiring(lck);
1479 #endif
1480 #if OMPT_SUPPORT && OMPT_OPTIONAL
1481     if (ompt_enabled.enabled) {
1482       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1483       /* OMPT state update */
1484       prev_state = ti.state;
1485       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1486       ti.state = ompt_state_wait_critical;
1487 
1488       /* OMPT event callback */
1489       if (ompt_enabled.ompt_callback_mutex_acquire) {
1490         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1491             ompt_mutex_critical, (unsigned int)hint,
1492             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1493             codeptr);
1494       }
1495     }
1496 #endif
1497     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1498   }
1499   KMP_POP_PARTITIONED_TIMER();
1500 
1501 #if USE_ITT_BUILD
1502   __kmp_itt_critical_acquired(lck);
1503 #endif /* USE_ITT_BUILD */
1504 #if OMPT_SUPPORT && OMPT_OPTIONAL
1505   if (ompt_enabled.enabled) {
1506     /* OMPT state update */
1507     ti.state = prev_state;
1508     ti.wait_id = 0;
1509 
1510     /* OMPT event callback */
1511     if (ompt_enabled.ompt_callback_mutex_acquired) {
1512       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1513           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1514     }
1515   }
1516 #endif
1517 
1518   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1519   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1520 } // __kmpc_critical_with_hint
1521 
1522 #endif // KMP_USE_DYNAMIC_LOCK
1523 
1524 /*!
1525 @ingroup WORK_SHARING
1526 @param loc  source location information.
@param global_tid  global thread number.
1528 @param crit identity of the critical section. This could be a pointer to a lock
1529 associated with the critical section, or some other suitably unique value.
1530 
1531 Leave a critical section, releasing any lock that was held during its execution.
1532 */
1533 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1534                          kmp_critical_name *crit) {
1535   kmp_user_lock_p lck;
1536 
1537   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1538 
1539 #if KMP_USE_DYNAMIC_LOCK
1540   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1541     lck = (kmp_user_lock_p)crit;
1542     KMP_ASSERT(lck != NULL);
1543     if (__kmp_env_consistency_check) {
1544       __kmp_pop_sync(global_tid, ct_critical, loc);
1545     }
1546 #if USE_ITT_BUILD
1547     __kmp_itt_critical_releasing(lck);
1548 #endif
1549 #if KMP_USE_INLINED_TAS
1550     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1551       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1552     } else
1553 #elif KMP_USE_INLINED_FUTEX
1554     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1555       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1556     } else
1557 #endif
1558     {
1559       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1560     }
1561   } else {
1562     kmp_indirect_lock_t *ilk =
1563         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1564     KMP_ASSERT(ilk != NULL);
1565     lck = ilk->lock;
1566     if (__kmp_env_consistency_check) {
1567       __kmp_pop_sync(global_tid, ct_critical, loc);
1568     }
1569 #if USE_ITT_BUILD
1570     __kmp_itt_critical_releasing(lck);
1571 #endif
1572     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1573   }
1574 
1575 #else // KMP_USE_DYNAMIC_LOCK
1576 
1577   if ((__kmp_user_lock_kind == lk_tas) &&
1578       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1579     lck = (kmp_user_lock_p)crit;
1580   }
1581 #if KMP_USE_FUTEX
1582   else if ((__kmp_user_lock_kind == lk_futex) &&
1583            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1584     lck = (kmp_user_lock_p)crit;
1585   }
1586 #endif
1587   else { // ticket, queuing or drdpa
1588     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1589   }
1590 
1591   KMP_ASSERT(lck != NULL);
1592 
1593   if (__kmp_env_consistency_check)
1594     __kmp_pop_sync(global_tid, ct_critical, loc);
1595 
1596 #if USE_ITT_BUILD
1597   __kmp_itt_critical_releasing(lck);
1598 #endif /* USE_ITT_BUILD */
  // The value of 'crit' is suitable for use as the critical_id of the critical
  // section directive.
1601   __kmp_release_user_lock_with_checks(lck, global_tid);
1602 
1603 #endif // KMP_USE_DYNAMIC_LOCK
1604 
1605 #if OMPT_SUPPORT && OMPT_OPTIONAL
1606   /* OMPT release event triggers after lock is released; place here to trigger
1607    * for all #if branches */
1608   OMPT_STORE_RETURN_ADDRESS(global_tid);
1609   if (ompt_enabled.ompt_callback_mutex_released) {
1610     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1611         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1612         OMPT_LOAD_RETURN_ADDRESS(0));
1613   }
1614 #endif
1615 
1616   KMP_POP_PARTITIONED_TIMER();
1617   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1618 }
1619 
1620 /*!
1621 @ingroup SYNCHRONIZATION
1622 @param loc source location information
1623 @param global_tid thread id.
1624 @return one if the thread should execute the master block, zero otherwise
1625 
1626 Start execution of a combined barrier and master. The barrier is executed inside
1627 this function.
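
A minimal usage sketch (the loc and gtid arguments are illustrative assumptions
about what the compiler would pass):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // executed only by the thread selected to run the master block
  __kmpc_end_barrier_master(&loc, gtid); // releases the threads still waiting
}
@endcode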
1628 */
1629 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1630   int status;
1631 
1632   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1633 
1634   if (!TCR_4(__kmp_init_parallel))
1635     __kmp_parallel_initialize();
1636 
1637 #if OMP_50_ENABLED
1638   __kmp_resume_if_soft_paused();
1639 #endif
1640 
1641   if (__kmp_env_consistency_check)
1642     __kmp_check_barrier(global_tid, ct_barrier, loc);
1643 
1644 #if OMPT_SUPPORT
1645   ompt_frame_t *ompt_frame;
1646   if (ompt_enabled.enabled) {
1647     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1648     if (ompt_frame->enter_frame.ptr == NULL)
1649       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1650     OMPT_STORE_RETURN_ADDRESS(global_tid);
1651   }
1652 #endif
1653 #if USE_ITT_NOTIFY
1654   __kmp_threads[global_tid]->th.th_ident = loc;
1655 #endif
1656   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1657 #if OMPT_SUPPORT && OMPT_OPTIONAL
1658   if (ompt_enabled.enabled) {
1659     ompt_frame->enter_frame = ompt_data_none;
1660   }
1661 #endif
1662 
1663   return (status != 0) ? 0 : 1;
1664 }
1665 
1666 /*!
1667 @ingroup SYNCHRONIZATION
1668 @param loc source location information
1669 @param global_tid thread id.
1670 
1671 Complete the execution of a combined barrier and master. This function should
1672 only be called at the completion of the <tt>master</tt> code. Other threads will
1673 still be waiting at the barrier and this call releases them.
1674 */
1675 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1676   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1677 
1678   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1679 }
1680 
1681 /*!
1682 @ingroup SYNCHRONIZATION
1683 @param loc source location information
1684 @param global_tid thread id.
1685 @return one if the thread should execute the master block, zero otherwise
1686 
1687 Start execution of a combined barrier and master(nowait) construct.
1688 The barrier is executed inside this function.
There is no equivalent "end" function, since the construct has no closing
barrier: the other threads do not wait for the master block to finish, so there
is nothing for an "end" call to release.
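
A minimal usage sketch (again with illustrative loc and gtid arguments):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // executed only by the thread selected to run the master block
}
// no matching "end" call; the other threads have already continued past here
@endcode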
1690 */
1691 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1692   kmp_int32 ret;
1693 
1694   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1695 
1696   if (!TCR_4(__kmp_init_parallel))
1697     __kmp_parallel_initialize();
1698 
1699 #if OMP_50_ENABLED
1700   __kmp_resume_if_soft_paused();
1701 #endif
1702 
1703   if (__kmp_env_consistency_check) {
1704     if (loc == 0) {
1705       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1706     }
1707     __kmp_check_barrier(global_tid, ct_barrier, loc);
1708   }
1709 
1710 #if OMPT_SUPPORT
1711   ompt_frame_t *ompt_frame;
1712   if (ompt_enabled.enabled) {
1713     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1714     if (ompt_frame->enter_frame.ptr == NULL)
1715       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1716     OMPT_STORE_RETURN_ADDRESS(global_tid);
1717   }
1718 #endif
1719 #if USE_ITT_NOTIFY
1720   __kmp_threads[global_tid]->th.th_ident = loc;
1721 #endif
1722   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1723 #if OMPT_SUPPORT && OMPT_OPTIONAL
1724   if (ompt_enabled.enabled) {
1725     ompt_frame->enter_frame = ompt_data_none;
1726   }
1727 #endif
1728 
1729   ret = __kmpc_master(loc, global_tid);
1730 
1731   if (__kmp_env_consistency_check) {
    /* there is no __kmpc_end_master call, so the consistency-check */
    /* actions of __kmpc_end_master are performed here instead      */
1734 
1735     if (global_tid < 0) {
1736       KMP_WARNING(ThreadIdentInvalid);
1737     }
1738     if (ret) {
1739       /* only one thread should do the pop since only */
1740       /* one did the push (see __kmpc_master())       */
1741 
1742       __kmp_pop_sync(global_tid, ct_master, loc);
1743     }
1744   }
1745 
1746   return (ret);
1747 }
1748 
/* The BARRIER for a SINGLE construct is always explicit   */
1750 /*!
1751 @ingroup WORK_SHARING
1752 @param loc  source location information
1753 @param global_tid  global thread number
1754 @return One if this thread should execute the single construct, zero otherwise.
1755 
1756 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if it is required.
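
A sketch of the calls a compiler might emit for a <tt>single</tt> construct
without a nowait clause (the trailing __kmpc_barrier call and the loc and gtid
arguments are illustrative assumptions):
@code
if (__kmpc_single(&loc, gtid)) {
  // body executed by the one thread that enters the single region
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // explicit barrier, omitted if nowait was specified
@endcode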
1759 */
1760 
1761 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1762   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1763 
1764   if (rc) {
1765     // We are going to execute the single statement, so we should count it.
1766     KMP_COUNT_BLOCK(OMP_SINGLE);
1767     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1768   }
1769 
1770 #if OMPT_SUPPORT && OMPT_OPTIONAL
1771   kmp_info_t *this_thr = __kmp_threads[global_tid];
1772   kmp_team_t *team = this_thr->th.th_team;
1773   int tid = __kmp_tid_from_gtid(global_tid);
1774 
1775   if (ompt_enabled.enabled) {
1776     if (rc) {
1777       if (ompt_enabled.ompt_callback_work) {
1778         ompt_callbacks.ompt_callback(ompt_callback_work)(
1779             ompt_work_single_executor, ompt_scope_begin,
1780             &(team->t.ompt_team_info.parallel_data),
1781             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1782             1, OMPT_GET_RETURN_ADDRESS(0));
1783       }
1784     } else {
1785       if (ompt_enabled.ompt_callback_work) {
1786         ompt_callbacks.ompt_callback(ompt_callback_work)(
1787             ompt_work_single_other, ompt_scope_begin,
1788             &(team->t.ompt_team_info.parallel_data),
1789             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1790             1, OMPT_GET_RETURN_ADDRESS(0));
1791         ompt_callbacks.ompt_callback(ompt_callback_work)(
1792             ompt_work_single_other, ompt_scope_end,
1793             &(team->t.ompt_team_info.parallel_data),
1794             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1795             1, OMPT_GET_RETURN_ADDRESS(0));
1796       }
1797     }
1798   }
1799 #endif
1800 
1801   return rc;
1802 }
1803 
1804 /*!
1805 @ingroup WORK_SHARING
1806 @param loc  source location information
1807 @param global_tid  global thread number
1808 
Mark the end of a <tt>single</tt> construct.  This function should
only be called by the thread that executed the block of code protected
by the <tt>single</tt> construct.
1812 */
1813 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1814   __kmp_exit_single(global_tid);
1815   KMP_POP_PARTITIONED_TIMER();
1816 
1817 #if OMPT_SUPPORT && OMPT_OPTIONAL
1818   kmp_info_t *this_thr = __kmp_threads[global_tid];
1819   kmp_team_t *team = this_thr->th.th_team;
1820   int tid = __kmp_tid_from_gtid(global_tid);
1821 
1822   if (ompt_enabled.ompt_callback_work) {
1823     ompt_callbacks.ompt_callback(ompt_callback_work)(
1824         ompt_work_single_executor, ompt_scope_end,
1825         &(team->t.ompt_team_info.parallel_data),
1826         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1827         OMPT_GET_RETURN_ADDRESS(0));
1828   }
1829 #endif
1830 }
1831 
1832 /*!
1833 @ingroup WORK_SHARING
1834 @param loc Source location
1835 @param global_tid Global thread id
1836 
1837 Mark the end of a statically scheduled loop.
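
A rough lowering sketch, assuming the companion entry point
__kmpc_for_static_init_4 with its usual argument order (schedule kind,
last-iteration flag, lower/upper bounds, stride, increment, chunk); the bounds
and schedule shown here are illustrative:
@code
kmp_int32 lower = 0, upper = N - 1, stride = 1, last = 0;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1 /* incr */, 1 /* chunk */);
for (kmp_int32 i = lower; i <= upper; ++i)
  body(i); // each thread handles only its own [lower, upper] slice
__kmpc_for_static_fini(&loc, gtid);
@endcode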
1838 */
1839 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1840   KMP_POP_PARTITIONED_TIMER();
1841   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1842 
1843 #if OMPT_SUPPORT && OMPT_OPTIONAL
1844   if (ompt_enabled.ompt_callback_work) {
1845     ompt_work_t ompt_work_type = ompt_work_loop;
1846     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1847     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1848     // Determine workshare type
1849     if (loc != NULL) {
1850       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1851         ompt_work_type = ompt_work_loop;
1852       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1853         ompt_work_type = ompt_work_sections;
1854       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1855         ompt_work_type = ompt_work_distribute;
1856       } else {
1857         // use default set above.
1858         // a warning about this case is provided in __kmpc_for_static_init
1859       }
1860       KMP_DEBUG_ASSERT(ompt_work_type);
1861     }
1862     ompt_callbacks.ompt_callback(ompt_callback_work)(
1863         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1864         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1865   }
1866 #endif
1867   if (__kmp_env_consistency_check)
1868     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1869 }
1870 
// User routines which take C-style arguments (call by value), unlike
// the equivalent Fortran routines
1873 
1874 void ompc_set_num_threads(int arg) {
1875   // !!!!! TODO: check the per-task binding
1876   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1877 }
1878 
1879 void ompc_set_dynamic(int flag) {
1880   kmp_info_t *thread;
1881 
1882   /* For the thread-private implementation of the internal controls */
1883   thread = __kmp_entry_thread();
1884 
1885   __kmp_save_internal_controls(thread);
1886 
1887   set__dynamic(thread, flag ? TRUE : FALSE);
1888 }
1889 
1890 void ompc_set_nested(int flag) {
1891   kmp_info_t *thread;
1892 
1893   /* For the thread-private internal controls implementation */
1894   thread = __kmp_entry_thread();
1895 
1896   __kmp_save_internal_controls(thread);
1897 
1898   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1899 }
1900 
1901 void ompc_set_max_active_levels(int max_active_levels) {
1902   /* TO DO */
1903   /* we want per-task implementation of this internal control */
1904 
1905   /* For the per-thread internal controls implementation */
1906   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1907 }
1908 
1909 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1910   // !!!!! TODO: check the per-task binding
1911   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1912 }
1913 
1914 int ompc_get_ancestor_thread_num(int level) {
1915   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1916 }
1917 
1918 int ompc_get_team_size(int level) {
1919   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1920 }
1921 
1922 #if OMP_50_ENABLED
1923 /* OpenMP 5.0 Affinity Format API */
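/* A minimal user-level sketch (an assumption about typical use, not taken from
   this file): the standard OpenMP 5.0 routines omp_set_affinity_format() and
   omp_capture_affinity() declared in <omp.h> are the user-facing counterparts
   of the ompc_* routines below.

     omp_set_affinity_format("pid=%P tid=%i thread=%n bound to %A");
     #pragma omp parallel
     {
       char buf[256];
       // passing a NULL format selects the stored affinity-format-var value
       omp_capture_affinity(buf, sizeof(buf), NULL);
       printf("%s\n", buf); // requires <stdio.h>
     }
*/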
1924 
1925 void ompc_set_affinity_format(char const *format) {
1926   if (!__kmp_init_serial) {
1927     __kmp_serial_initialize();
1928   }
1929   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1930                          format, KMP_STRLEN(format) + 1);
1931 }
1932 
1933 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1934   size_t format_size;
1935   if (!__kmp_init_serial) {
1936     __kmp_serial_initialize();
1937   }
1938   format_size = KMP_STRLEN(__kmp_affinity_format);
1939   if (buffer && size) {
1940     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1941                            format_size + 1);
1942   }
1943   return format_size;
1944 }
1945 
1946 void ompc_display_affinity(char const *format) {
1947   int gtid;
1948   if (!TCR_4(__kmp_init_middle)) {
1949     __kmp_middle_initialize();
1950   }
1951   gtid = __kmp_get_gtid();
1952   __kmp_aux_display_affinity(gtid, format);
1953 }
1954 
1955 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1956                              char const *format) {
1957   int gtid;
1958   size_t num_required;
1959   kmp_str_buf_t capture_buf;
1960   if (!TCR_4(__kmp_init_middle)) {
1961     __kmp_middle_initialize();
1962   }
1963   gtid = __kmp_get_gtid();
1964   __kmp_str_buf_init(&capture_buf);
1965   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1966   if (buffer && buf_size) {
1967     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1968                            capture_buf.used + 1);
1969   }
1970   __kmp_str_buf_free(&capture_buf);
1971   return num_required;
1972 }
1973 #endif /* OMP_50_ENABLED */
1974 
1975 void kmpc_set_stacksize(int arg) {
1976   // __kmp_aux_set_stacksize initializes the library if needed
1977   __kmp_aux_set_stacksize(arg);
1978 }
1979 
1980 void kmpc_set_stacksize_s(size_t arg) {
1981   // __kmp_aux_set_stacksize initializes the library if needed
1982   __kmp_aux_set_stacksize(arg);
1983 }
1984 
1985 void kmpc_set_blocktime(int arg) {
1986   int gtid, tid;
1987   kmp_info_t *thread;
1988 
1989   gtid = __kmp_entry_gtid();
1990   tid = __kmp_tid_from_gtid(gtid);
1991   thread = __kmp_thread_from_gtid(gtid);
1992 
1993   __kmp_aux_set_blocktime(arg, thread, tid);
1994 }
1995 
1996 void kmpc_set_library(int arg) {
1997   // __kmp_user_set_library initializes the library if needed
1998   __kmp_user_set_library((enum library_type)arg);
1999 }
2000 
2001 void kmpc_set_defaults(char const *str) {
2002   // __kmp_aux_set_defaults initializes the library if needed
2003   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2004 }
2005 
2006 void kmpc_set_disp_num_buffers(int arg) {
2007   // ignore after initialization because some teams have already
2008   // allocated dispatch buffers
2009   if (__kmp_init_serial == 0 && arg > 0)
2010     __kmp_dispatch_num_buffers = arg;
2011 }
2012 
2013 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2014 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2015   return -1;
2016 #else
2017   if (!TCR_4(__kmp_init_middle)) {
2018     __kmp_middle_initialize();
2019   }
2020   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2021 #endif
2022 }
2023 
2024 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2025 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2026   return -1;
2027 #else
2028   if (!TCR_4(__kmp_init_middle)) {
2029     __kmp_middle_initialize();
2030   }
2031   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2032 #endif
2033 }
2034 
2035 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2036 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2037   return -1;
2038 #else
2039   if (!TCR_4(__kmp_init_middle)) {
2040     __kmp_middle_initialize();
2041   }
2042   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2043 #endif
2044 }
2045 
2046 /* -------------------------------------------------------------------------- */
2047 /*!
2048 @ingroup THREADPRIVATE
2049 @param loc       source location information
2050 @param gtid      global thread number
2051 @param cpy_size  size of the cpy_data buffer
2052 @param cpy_data  pointer to data to be copied
2053 @param cpy_func  helper function to call for copying data
2054 @param didit     flag variable: 1=single thread; 0=not single thread
2055 
2056 __kmpc_copyprivate implements the interface for the private data broadcast
2057 needed for the copyprivate clause associated with a single region in an
2058 OpenMP<sup>*</sup> program (both C and Fortran).
2059 All threads participating in the parallel region call this routine.
2060 One of the threads (called the single thread) should have the <tt>didit</tt>
2061 variable set to 1 and all other threads should have that variable set to 0.
2062 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2063 
2064 The OpenMP specification forbids the use of nowait on the single region when a
2065 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2066 barrier internally to avoid race conditions, so the code generation for the
2067 single region should avoid generating a barrier after the call to @ref
2068 __kmpc_copyprivate.
2069 
2070 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2071 The <tt>loc</tt> parameter is a pointer to source location information.
2072 
Internal implementation: The single thread will first copy its descriptor
address (cpy_data) to a team-private location, then each of the other threads
will call the function pointed to by the parameter cpy_func, passing its own
cpy_data buffer as the destination and the single thread's buffer as the
source.
2077 
2078 The cpy_func routine used for the copy and the contents of the data area defined
2079 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2080 to be done. For instance, the cpy_data buffer can hold the actual data to be
2081 copied or it may hold a list of pointers to the data. The cpy_func routine must
2082 interpret the cpy_data buffer appropriately.
2083 
2084 The interface to cpy_func is as follows:
2085 @code
2086 void cpy_func( void *destination, void *source )
2087 @endcode
2088 where void *destination is the cpy_data pointer for the thread being copied to
2089 and void *source is the cpy_data pointer for the thread being copied from.
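
For example, a hypothetical compiler-generated helper for
<tt>copyprivate(x)</tt>, where <tt>x</tt> is an <tt>int</tt>, might look like
this (the helper name is illustrative):
@code
static void __copy_x(void *destination, void *source) {
  // destination is this thread's private x, source is the single thread's x
  *(int *)destination = *(int *)source;
}
@endcode
Each thread would then call __kmpc_copyprivate with cpy_data set to the address
of its private <tt>x</tt>, cpy_size set to sizeof(int), cpy_func set to
__copy_x, and didit set to 1 only on the thread that executed the single block.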
2090 */
2091 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2092                         void *cpy_data, void (*cpy_func)(void *, void *),
2093                         kmp_int32 didit) {
2094   void **data_ptr;
2095 
2096   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2097 
2098   KMP_MB();
2099 
2100   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2101 
2102   if (__kmp_env_consistency_check) {
2103     if (loc == 0) {
2104       KMP_WARNING(ConstructIdentInvalid);
2105     }
2106   }
2107 
2108   // ToDo: Optimize the following two barriers into some kind of split barrier
2109 
2110   if (didit)
2111     *data_ptr = cpy_data;
2112 
2113 #if OMPT_SUPPORT
2114   ompt_frame_t *ompt_frame;
2115   if (ompt_enabled.enabled) {
2116     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2117     if (ompt_frame->enter_frame.ptr == NULL)
2118       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2119     OMPT_STORE_RETURN_ADDRESS(gtid);
2120   }
2121 #endif
2122 /* This barrier is not a barrier region boundary */
2123 #if USE_ITT_NOTIFY
2124   __kmp_threads[gtid]->th.th_ident = loc;
2125 #endif
2126   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2127 
2128   if (!didit)
2129     (*cpy_func)(cpy_data, *data_ptr);
2130 
2131 // Consider next barrier a user-visible barrier for barrier region boundaries
2132 // Nesting checks are already handled by the single construct checks
2133 
2134 #if OMPT_SUPPORT
2135   if (ompt_enabled.enabled) {
2136     OMPT_STORE_RETURN_ADDRESS(gtid);
2137   }
2138 #endif
2139 #if USE_ITT_NOTIFY
2140   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2141 // tasks can overwrite the location)
2142 #endif
2143   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2144 #if OMPT_SUPPORT && OMPT_OPTIONAL
2145   if (ompt_enabled.enabled) {
2146     ompt_frame->enter_frame = ompt_data_none;
2147   }
2148 #endif
2149 }
2150 
2151 /* -------------------------------------------------------------------------- */
2152 
2153 #define INIT_LOCK __kmp_init_user_lock_with_checks
2154 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2155 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2156 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2157 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2158 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2159   __kmp_acquire_nested_user_lock_with_checks_timed
2160 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2161 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2162 #define TEST_LOCK __kmp_test_user_lock_with_checks
2163 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2164 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2165 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2166 
2167 // TODO: Make check abort messages use location info & pass it into
2168 // with_checks routines
2169 
2170 #if KMP_USE_DYNAMIC_LOCK
2171 
2172 // internal lock initializer
2173 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2174                                                     kmp_dyna_lockseq_t seq) {
2175   if (KMP_IS_D_LOCK(seq)) {
2176     KMP_INIT_D_LOCK(lock, seq);
2177 #if USE_ITT_BUILD
2178     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2179 #endif
2180   } else {
2181     KMP_INIT_I_LOCK(lock, seq);
2182 #if USE_ITT_BUILD
2183     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2184     __kmp_itt_lock_creating(ilk->lock, loc);
2185 #endif
2186   }
2187 }
2188 
2189 // internal nest lock initializer
2190 static __forceinline void
2191 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2192                                kmp_dyna_lockseq_t seq) {
2193 #if KMP_USE_TSX
2194   // Don't have nested lock implementation for speculative locks
2195   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2196     seq = __kmp_user_lock_seq;
2197 #endif
2198   switch (seq) {
2199   case lockseq_tas:
2200     seq = lockseq_nested_tas;
2201     break;
2202 #if KMP_USE_FUTEX
2203   case lockseq_futex:
2204     seq = lockseq_nested_futex;
2205     break;
2206 #endif
2207   case lockseq_ticket:
2208     seq = lockseq_nested_ticket;
2209     break;
2210   case lockseq_queuing:
2211     seq = lockseq_nested_queuing;
2212     break;
2213   case lockseq_drdpa:
2214     seq = lockseq_nested_drdpa;
2215     break;
2216   default:
2217     seq = lockseq_nested_queuing;
2218   }
2219   KMP_INIT_I_LOCK(lock, seq);
2220 #if USE_ITT_BUILD
2221   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2222   __kmp_itt_lock_creating(ilk->lock, loc);
2223 #endif
2224 }
2225 
2226 /* initialize the lock with a hint */
2227 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2228                                 uintptr_t hint) {
2229   KMP_DEBUG_ASSERT(__kmp_init_serial);
2230   if (__kmp_env_consistency_check && user_lock == NULL) {
2231     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2232   }
2233 
2234   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2235 
2236 #if OMPT_SUPPORT && OMPT_OPTIONAL
2237   // This is the case, if called from omp_init_lock_with_hint:
2238   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2239   if (!codeptr)
2240     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2241   if (ompt_enabled.ompt_callback_lock_init) {
2242     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2243         ompt_mutex_lock, (omp_lock_hint_t)hint,
2244         __ompt_get_mutex_impl_type(user_lock),
2245         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2246   }
2247 #endif
2248 }
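
/* A minimal user-level sketch (assumed typical usage, not taken from this
   file): a call to the standard API routine omp_init_lock_with_hint() is
   expected to reach __kmpc_init_lock_with_hint() above in this runtime.

     omp_lock_t lock;
     omp_init_lock_with_hint(&lock, omp_lock_hint_speculative);
     omp_set_lock(&lock);
     // ... work protected by the lock ...
     omp_unset_lock(&lock);
     omp_destroy_lock(&lock);
*/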
2249 
/* initialize the nested lock with a hint */
2251 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2252                                      void **user_lock, uintptr_t hint) {
2253   KMP_DEBUG_ASSERT(__kmp_init_serial);
2254   if (__kmp_env_consistency_check && user_lock == NULL) {
2255     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2256   }
2257 
2258   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2259 
2260 #if OMPT_SUPPORT && OMPT_OPTIONAL
2261   // This is the case, if called from omp_init_lock_with_hint:
2262   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2263   if (!codeptr)
2264     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2265   if (ompt_enabled.ompt_callback_lock_init) {
2266     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2267         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2268         __ompt_get_mutex_impl_type(user_lock),
2269         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2270   }
2271 #endif
2272 }
2273 
2274 #endif // KMP_USE_DYNAMIC_LOCK
2275 
2276 /* initialize the lock */
2277 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2278 #if KMP_USE_DYNAMIC_LOCK
2279 
2280   KMP_DEBUG_ASSERT(__kmp_init_serial);
2281   if (__kmp_env_consistency_check && user_lock == NULL) {
2282     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2283   }
2284   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2285 
2286 #if OMPT_SUPPORT && OMPT_OPTIONAL
2287   // This is the case, if called from omp_init_lock_with_hint:
2288   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2289   if (!codeptr)
2290     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2291   if (ompt_enabled.ompt_callback_lock_init) {
2292     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2293         ompt_mutex_lock, omp_lock_hint_none,
2294         __ompt_get_mutex_impl_type(user_lock),
2295         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2296   }
2297 #endif
2298 
2299 #else // KMP_USE_DYNAMIC_LOCK
2300 
2301   static char const *const func = "omp_init_lock";
2302   kmp_user_lock_p lck;
2303   KMP_DEBUG_ASSERT(__kmp_init_serial);
2304 
2305   if (__kmp_env_consistency_check) {
2306     if (user_lock == NULL) {
2307       KMP_FATAL(LockIsUninitialized, func);
2308     }
2309   }
2310 
2311   KMP_CHECK_USER_LOCK_INIT();
2312 
2313   if ((__kmp_user_lock_kind == lk_tas) &&
2314       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2315     lck = (kmp_user_lock_p)user_lock;
2316   }
2317 #if KMP_USE_FUTEX
2318   else if ((__kmp_user_lock_kind == lk_futex) &&
2319            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2320     lck = (kmp_user_lock_p)user_lock;
2321   }
2322 #endif
2323   else {
2324     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2325   }
2326   INIT_LOCK(lck);
2327   __kmp_set_user_lock_location(lck, loc);
2328 
2329 #if OMPT_SUPPORT && OMPT_OPTIONAL
2330   // This is the case, if called from omp_init_lock_with_hint:
2331   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2332   if (!codeptr)
2333     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2334   if (ompt_enabled.ompt_callback_lock_init) {
2335     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2336         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2337         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2338   }
2339 #endif
2340 
2341 #if USE_ITT_BUILD
2342   __kmp_itt_lock_creating(lck);
2343 #endif /* USE_ITT_BUILD */
2344 
2345 #endif // KMP_USE_DYNAMIC_LOCK
2346 } // __kmpc_init_lock
2347 
/* initialize the nested lock */
2349 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2350 #if KMP_USE_DYNAMIC_LOCK
2351 
2352   KMP_DEBUG_ASSERT(__kmp_init_serial);
2353   if (__kmp_env_consistency_check && user_lock == NULL) {
2354     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2355   }
2356   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2357 
2358 #if OMPT_SUPPORT && OMPT_OPTIONAL
2359   // This is the case, if called from omp_init_lock_with_hint:
2360   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2361   if (!codeptr)
2362     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2363   if (ompt_enabled.ompt_callback_lock_init) {
2364     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2365         ompt_mutex_nest_lock, omp_lock_hint_none,
2366         __ompt_get_mutex_impl_type(user_lock),
2367         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2368   }
2369 #endif
2370 
2371 #else // KMP_USE_DYNAMIC_LOCK
2372 
2373   static char const *const func = "omp_init_nest_lock";
2374   kmp_user_lock_p lck;
2375   KMP_DEBUG_ASSERT(__kmp_init_serial);
2376 
2377   if (__kmp_env_consistency_check) {
2378     if (user_lock == NULL) {
2379       KMP_FATAL(LockIsUninitialized, func);
2380     }
2381   }
2382 
2383   KMP_CHECK_USER_LOCK_INIT();
2384 
2385   if ((__kmp_user_lock_kind == lk_tas) &&
2386       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2387        OMP_NEST_LOCK_T_SIZE)) {
2388     lck = (kmp_user_lock_p)user_lock;
2389   }
2390 #if KMP_USE_FUTEX
2391   else if ((__kmp_user_lock_kind == lk_futex) &&
2392            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2393             OMP_NEST_LOCK_T_SIZE)) {
2394     lck = (kmp_user_lock_p)user_lock;
2395   }
2396 #endif
2397   else {
2398     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2399   }
2400 
2401   INIT_NESTED_LOCK(lck);
2402   __kmp_set_user_lock_location(lck, loc);
2403 
2404 #if OMPT_SUPPORT && OMPT_OPTIONAL
2405   // This is the case, if called from omp_init_lock_with_hint:
2406   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2407   if (!codeptr)
2408     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2409   if (ompt_enabled.ompt_callback_lock_init) {
2410     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2411         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2412         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2413   }
2414 #endif
2415 
2416 #if USE_ITT_BUILD
2417   __kmp_itt_lock_creating(lck);
2418 #endif /* USE_ITT_BUILD */
2419 
2420 #endif // KMP_USE_DYNAMIC_LOCK
2421 } // __kmpc_init_nest_lock
2422 
2423 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2424 #if KMP_USE_DYNAMIC_LOCK
2425 
2426 #if USE_ITT_BUILD
2427   kmp_user_lock_p lck;
2428   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2429     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2430   } else {
2431     lck = (kmp_user_lock_p)user_lock;
2432   }
2433   __kmp_itt_lock_destroyed(lck);
2434 #endif
2435 #if OMPT_SUPPORT && OMPT_OPTIONAL
2436   // This is the case, if called from omp_init_lock_with_hint:
2437   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2438   if (!codeptr)
2439     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2440   if (ompt_enabled.ompt_callback_lock_destroy) {
2441     kmp_user_lock_p lck;
2442     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2443       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2444     } else {
2445       lck = (kmp_user_lock_p)user_lock;
2446     }
2447     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2448         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2449   }
2450 #endif
2451   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2452 #else
2453   kmp_user_lock_p lck;
2454 
2455   if ((__kmp_user_lock_kind == lk_tas) &&
2456       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2457     lck = (kmp_user_lock_p)user_lock;
2458   }
2459 #if KMP_USE_FUTEX
2460   else if ((__kmp_user_lock_kind == lk_futex) &&
2461            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2462     lck = (kmp_user_lock_p)user_lock;
2463   }
2464 #endif
2465   else {
2466     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2467   }
2468 
2469 #if OMPT_SUPPORT && OMPT_OPTIONAL
2470   // This is the case, if called from omp_init_lock_with_hint:
2471   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2472   if (!codeptr)
2473     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2474   if (ompt_enabled.ompt_callback_lock_destroy) {
2475     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2476         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2477   }
2478 #endif
2479 
2480 #if USE_ITT_BUILD
2481   __kmp_itt_lock_destroyed(lck);
2482 #endif /* USE_ITT_BUILD */
2483   DESTROY_LOCK(lck);
2484 
2485   if ((__kmp_user_lock_kind == lk_tas) &&
2486       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2487     ;
2488   }
2489 #if KMP_USE_FUTEX
2490   else if ((__kmp_user_lock_kind == lk_futex) &&
2491            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2492     ;
2493   }
2494 #endif
2495   else {
2496     __kmp_user_lock_free(user_lock, gtid, lck);
2497   }
2498 #endif // KMP_USE_DYNAMIC_LOCK
2499 } // __kmpc_destroy_lock
2500 
/* destroy the nested lock */
2502 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2503 #if KMP_USE_DYNAMIC_LOCK
2504 
2505 #if USE_ITT_BUILD
2506   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2507   __kmp_itt_lock_destroyed(ilk->lock);
2508 #endif
2509 #if OMPT_SUPPORT && OMPT_OPTIONAL
2510   // This is the case, if called from omp_init_lock_with_hint:
2511   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2512   if (!codeptr)
2513     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2514   if (ompt_enabled.ompt_callback_lock_destroy) {
2515     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2516         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2517   }
2518 #endif
2519   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2520 
2521 #else // KMP_USE_DYNAMIC_LOCK
2522 
2523   kmp_user_lock_p lck;
2524 
2525   if ((__kmp_user_lock_kind == lk_tas) &&
2526       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2527        OMP_NEST_LOCK_T_SIZE)) {
2528     lck = (kmp_user_lock_p)user_lock;
2529   }
2530 #if KMP_USE_FUTEX
2531   else if ((__kmp_user_lock_kind == lk_futex) &&
2532            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2533             OMP_NEST_LOCK_T_SIZE)) {
2534     lck = (kmp_user_lock_p)user_lock;
2535   }
2536 #endif
2537   else {
2538     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2539   }
2540 
2541 #if OMPT_SUPPORT && OMPT_OPTIONAL
2542   // This is the case, if called from omp_init_lock_with_hint:
2543   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2544   if (!codeptr)
2545     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2546   if (ompt_enabled.ompt_callback_lock_destroy) {
2547     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2548         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2549   }
2550 #endif
2551 
2552 #if USE_ITT_BUILD
2553   __kmp_itt_lock_destroyed(lck);
2554 #endif /* USE_ITT_BUILD */
2555 
2556   DESTROY_NESTED_LOCK(lck);
2557 
2558   if ((__kmp_user_lock_kind == lk_tas) &&
2559       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2560        OMP_NEST_LOCK_T_SIZE)) {
2561     ;
2562   }
2563 #if KMP_USE_FUTEX
2564   else if ((__kmp_user_lock_kind == lk_futex) &&
2565            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2566             OMP_NEST_LOCK_T_SIZE)) {
2567     ;
2568   }
2569 #endif
2570   else {
2571     __kmp_user_lock_free(user_lock, gtid, lck);
2572   }
2573 #endif // KMP_USE_DYNAMIC_LOCK
2574 } // __kmpc_destroy_nest_lock
2575 
2576 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2577   KMP_COUNT_BLOCK(OMP_set_lock);
2578 #if KMP_USE_DYNAMIC_LOCK
2579   int tag = KMP_EXTRACT_D_TAG(user_lock);
2580 #if USE_ITT_BUILD
2581   __kmp_itt_lock_acquiring(
2582       (kmp_user_lock_p)
2583           user_lock); // itt function will get to the right lock object.
2584 #endif
2585 #if OMPT_SUPPORT && OMPT_OPTIONAL
2586   // This is the case, if called from omp_init_lock_with_hint:
2587   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2588   if (!codeptr)
2589     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2590   if (ompt_enabled.ompt_callback_mutex_acquire) {
2591     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2592         ompt_mutex_lock, omp_lock_hint_none,
2593         __ompt_get_mutex_impl_type(user_lock),
2594         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2595   }
2596 #endif
2597 #if KMP_USE_INLINED_TAS
2598   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2599     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2600   } else
2601 #elif KMP_USE_INLINED_FUTEX
2602   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2603     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2604   } else
2605 #endif
2606   {
2607     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2608   }
2609 #if USE_ITT_BUILD
2610   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2611 #endif
2612 #if OMPT_SUPPORT && OMPT_OPTIONAL
2613   if (ompt_enabled.ompt_callback_mutex_acquired) {
2614     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2615         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2616   }
2617 #endif
2618 
2619 #else // KMP_USE_DYNAMIC_LOCK
2620 
2621   kmp_user_lock_p lck;
2622 
2623   if ((__kmp_user_lock_kind == lk_tas) &&
2624       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2625     lck = (kmp_user_lock_p)user_lock;
2626   }
2627 #if KMP_USE_FUTEX
2628   else if ((__kmp_user_lock_kind == lk_futex) &&
2629            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2630     lck = (kmp_user_lock_p)user_lock;
2631   }
2632 #endif
2633   else {
2634     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2635   }
2636 
2637 #if USE_ITT_BUILD
2638   __kmp_itt_lock_acquiring(lck);
2639 #endif /* USE_ITT_BUILD */
2640 #if OMPT_SUPPORT && OMPT_OPTIONAL
2641   // This is the case, if called from omp_init_lock_with_hint:
2642   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2643   if (!codeptr)
2644     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2645   if (ompt_enabled.ompt_callback_mutex_acquire) {
2646     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2647         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2648         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2649   }
2650 #endif
2651 
2652   ACQUIRE_LOCK(lck, gtid);
2653 
2654 #if USE_ITT_BUILD
2655   __kmp_itt_lock_acquired(lck);
2656 #endif /* USE_ITT_BUILD */
2657 
2658 #if OMPT_SUPPORT && OMPT_OPTIONAL
2659   if (ompt_enabled.ompt_callback_mutex_acquired) {
2660     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2661         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2662   }
2663 #endif
2664 
2665 #endif // KMP_USE_DYNAMIC_LOCK
2666 }
2667 
2668 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2669 #if KMP_USE_DYNAMIC_LOCK
2670 
2671 #if USE_ITT_BUILD
2672   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2673 #endif
2674 #if OMPT_SUPPORT && OMPT_OPTIONAL
2675   // This is the case, if called from omp_init_lock_with_hint:
2676   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2677   if (!codeptr)
2678     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2679   if (ompt_enabled.enabled) {
2680     if (ompt_enabled.ompt_callback_mutex_acquire) {
2681       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2682           ompt_mutex_nest_lock, omp_lock_hint_none,
2683           __ompt_get_mutex_impl_type(user_lock),
2684           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2685     }
2686   }
2687 #endif
2688   int acquire_status =
2689       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2690   (void) acquire_status;
2691 #if USE_ITT_BUILD
2692   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2693 #endif
2694 
2695 #if OMPT_SUPPORT && OMPT_OPTIONAL
2696   if (ompt_enabled.enabled) {
2697     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2698       if (ompt_enabled.ompt_callback_mutex_acquired) {
2699         // lock_first
2700         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2701             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2702             codeptr);
2703       }
2704     } else {
2705       if (ompt_enabled.ompt_callback_nest_lock) {
2706         // lock_next
2707         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2708             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2709       }
2710     }
2711   }
2712 #endif
2713 
2714 #else // KMP_USE_DYNAMIC_LOCK
2715   int acquire_status;
2716   kmp_user_lock_p lck;
2717 
2718   if ((__kmp_user_lock_kind == lk_tas) &&
2719       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2720        OMP_NEST_LOCK_T_SIZE)) {
2721     lck = (kmp_user_lock_p)user_lock;
2722   }
2723 #if KMP_USE_FUTEX
2724   else if ((__kmp_user_lock_kind == lk_futex) &&
2725            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2726             OMP_NEST_LOCK_T_SIZE)) {
2727     lck = (kmp_user_lock_p)user_lock;
2728   }
2729 #endif
2730   else {
2731     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2732   }
2733 
2734 #if USE_ITT_BUILD
2735   __kmp_itt_lock_acquiring(lck);
2736 #endif /* USE_ITT_BUILD */
2737 #if OMPT_SUPPORT && OMPT_OPTIONAL
2738   // This is the case, if called from omp_init_lock_with_hint:
2739   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2740   if (!codeptr)
2741     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2742   if (ompt_enabled.enabled) {
2743     if (ompt_enabled.ompt_callback_mutex_acquire) {
2744       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2745           ompt_mutex_nest_lock, omp_lock_hint_none,
2746           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2747           codeptr);
2748     }
2749   }
2750 #endif
2751 
2752   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2753 
2754 #if USE_ITT_BUILD
2755   __kmp_itt_lock_acquired(lck);
2756 #endif /* USE_ITT_BUILD */
2757 
2758 #if OMPT_SUPPORT && OMPT_OPTIONAL
2759   if (ompt_enabled.enabled) {
2760     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2761       if (ompt_enabled.ompt_callback_mutex_acquired) {
2762         // lock_first
2763         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2764             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2765       }
2766     } else {
2767       if (ompt_enabled.ompt_callback_nest_lock) {
2768         // lock_next
2769         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2770             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2771       }
2772     }
2773   }
2774 #endif
2775 
2776 #endif // KMP_USE_DYNAMIC_LOCK
2777 }
2778 
2779 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2780 #if KMP_USE_DYNAMIC_LOCK
2781 
2782   int tag = KMP_EXTRACT_D_TAG(user_lock);
2783 #if USE_ITT_BUILD
2784   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2785 #endif
2786 #if KMP_USE_INLINED_TAS
2787   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2788     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2789   } else
2790 #elif KMP_USE_INLINED_FUTEX
2791   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2792     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2793   } else
2794 #endif
2795   {
2796     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2797   }
2798 
2799 #if OMPT_SUPPORT && OMPT_OPTIONAL
2800   // This is the case, if called from omp_init_lock_with_hint:
2801   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2802   if (!codeptr)
2803     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2804   if (ompt_enabled.ompt_callback_mutex_released) {
2805     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2806         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2807   }
2808 #endif
2809 
2810 #else // KMP_USE_DYNAMIC_LOCK
2811 
2812   kmp_user_lock_p lck;
2813 
2814   /* Can't use serial interval since not block structured */
2815   /* release the lock */
2816 
2817   if ((__kmp_user_lock_kind == lk_tas) &&
2818       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2819 #if KMP_OS_LINUX &&                                                            \
2820     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2821 // "fast" path implemented to fix customer performance issue
2822 #if USE_ITT_BUILD
2823     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2824 #endif /* USE_ITT_BUILD */
2825     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2826     KMP_MB();
2827 
2828 #if OMPT_SUPPORT && OMPT_OPTIONAL
2829     // This is the case, if called from omp_init_lock_with_hint:
2830     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2831     if (!codeptr)
2832       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2833     if (ompt_enabled.ompt_callback_mutex_released) {
2834       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2835           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2836     }
2837 #endif
2838 
2839     return;
2840 #else
2841     lck = (kmp_user_lock_p)user_lock;
2842 #endif
2843   }
2844 #if KMP_USE_FUTEX
2845   else if ((__kmp_user_lock_kind == lk_futex) &&
2846            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2847     lck = (kmp_user_lock_p)user_lock;
2848   }
2849 #endif
2850   else {
2851     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2852   }
2853 
2854 #if USE_ITT_BUILD
2855   __kmp_itt_lock_releasing(lck);
2856 #endif /* USE_ITT_BUILD */
2857 
2858   RELEASE_LOCK(lck, gtid);
2859 
2860 #if OMPT_SUPPORT && OMPT_OPTIONAL
2861   // This is the case, if called from omp_init_lock_with_hint:
2862   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2863   if (!codeptr)
2864     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2865   if (ompt_enabled.ompt_callback_mutex_released) {
2866     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2867         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2868   }
2869 #endif
2870 
2871 #endif // KMP_USE_DYNAMIC_LOCK
2872 }
2873 
/* release the nested lock */
2875 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2876 #if KMP_USE_DYNAMIC_LOCK
2877 
2878 #if USE_ITT_BUILD
2879   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2880 #endif
2881   int release_status =
2882       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2883   (void) release_status;
2884 
2885 #if OMPT_SUPPORT && OMPT_OPTIONAL
2886   // This is the case, if called from omp_init_lock_with_hint:
2887   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2888   if (!codeptr)
2889     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2890   if (ompt_enabled.enabled) {
2891     if (release_status == KMP_LOCK_RELEASED) {
2892       if (ompt_enabled.ompt_callback_mutex_released) {
2893         // release_lock_last
2894         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2895             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2896             codeptr);
2897       }
2898     } else if (ompt_enabled.ompt_callback_nest_lock) {
2899       // release_lock_prev
2900       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2901           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2902     }
2903   }
2904 #endif
2905 
2906 #else // KMP_USE_DYNAMIC_LOCK
2907 
2908   kmp_user_lock_p lck;
2909 
2910   /* Can't use serial interval since not block structured */
2911 
2912   if ((__kmp_user_lock_kind == lk_tas) &&
2913       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2914        OMP_NEST_LOCK_T_SIZE)) {
2915 #if KMP_OS_LINUX &&                                                            \
2916     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2917     // "fast" path implemented to fix customer performance issue
2918     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2919 #if USE_ITT_BUILD
2920     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2921 #endif /* USE_ITT_BUILD */
2922 
2923 #if OMPT_SUPPORT && OMPT_OPTIONAL
2924     int release_status = KMP_LOCK_STILL_HELD;
2925 #endif
2926 
2927     if (--(tl->lk.depth_locked) == 0) {
2928       TCW_4(tl->lk.poll, 0);
2929 #if OMPT_SUPPORT && OMPT_OPTIONAL
2930       release_status = KMP_LOCK_RELEASED;
2931 #endif
2932     }
2933     KMP_MB();
2934 
2935 #if OMPT_SUPPORT && OMPT_OPTIONAL
2936     // This is the case, if called from omp_init_lock_with_hint:
2937     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2938     if (!codeptr)
2939       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2940     if (ompt_enabled.enabled) {
2941       if (release_status == KMP_LOCK_RELEASED) {
2942         if (ompt_enabled.ompt_callback_mutex_released) {
2943           // release_lock_last
2944           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2945               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2946         }
2947       } else if (ompt_enabled.ompt_callback_nest_lock) {
2948         // release_lock_previous
2949         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2951       }
2952     }
2953 #endif
2954 
2955     return;
2956 #else
2957     lck = (kmp_user_lock_p)user_lock;
2958 #endif
2959   }
2960 #if KMP_USE_FUTEX
2961   else if ((__kmp_user_lock_kind == lk_futex) &&
2962            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2963             OMP_NEST_LOCK_T_SIZE)) {
2964     lck = (kmp_user_lock_p)user_lock;
2965   }
2966 #endif
2967   else {
2968     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2969   }
2970 
2971 #if USE_ITT_BUILD
2972   __kmp_itt_lock_releasing(lck);
2973 #endif /* USE_ITT_BUILD */
2974 
2975   int release_status;
2976   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2977 #if OMPT_SUPPORT && OMPT_OPTIONAL
2978   // This is the case, if called from omp_init_lock_with_hint:
2979   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2980   if (!codeptr)
2981     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2982   if (ompt_enabled.enabled) {
2983     if (release_status == KMP_LOCK_RELEASED) {
2984       if (ompt_enabled.ompt_callback_mutex_released) {
2985         // release_lock_last
2986         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2987             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2988       }
2989     } else if (ompt_enabled.ompt_callback_nest_lock) {
2990       // release_lock_previous
2991       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2993     }
2994   }
2995 #endif
2996 
2997 #endif // KMP_USE_DYNAMIC_LOCK
2998 }
2999 
3000 /* try to acquire the lock */
3001 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3002   KMP_COUNT_BLOCK(OMP_test_lock);
3003 
3004 #if KMP_USE_DYNAMIC_LOCK
3005   int rc;
3006   int tag = KMP_EXTRACT_D_TAG(user_lock);
3007 #if USE_ITT_BUILD
3008   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3009 #endif
3010 #if OMPT_SUPPORT && OMPT_OPTIONAL
3011   // This is the case, if called from omp_init_lock_with_hint:
3012   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3013   if (!codeptr)
3014     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3015   if (ompt_enabled.ompt_callback_mutex_acquire) {
3016     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3017         ompt_mutex_lock, omp_lock_hint_none,
3018         __ompt_get_mutex_impl_type(user_lock),
3019         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3020   }
3021 #endif
3022 #if KMP_USE_INLINED_TAS
3023   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3024     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3025   } else
3026 #elif KMP_USE_INLINED_FUTEX
3027   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3028     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3029   } else
3030 #endif
3031   {
3032     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3033   }
3034   if (rc) {
3035 #if USE_ITT_BUILD
3036     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3037 #endif
3038 #if OMPT_SUPPORT && OMPT_OPTIONAL
3039     if (ompt_enabled.ompt_callback_mutex_acquired) {
3040       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3041           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3042     }
3043 #endif
3044     return FTN_TRUE;
3045   } else {
3046 #if USE_ITT_BUILD
3047     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3048 #endif
3049     return FTN_FALSE;
3050   }
3051 
3052 #else // KMP_USE_DYNAMIC_LOCK
3053 
3054   kmp_user_lock_p lck;
3055   int rc;
3056 
3057   if ((__kmp_user_lock_kind == lk_tas) &&
3058       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3059     lck = (kmp_user_lock_p)user_lock;
3060   }
3061 #if KMP_USE_FUTEX
3062   else if ((__kmp_user_lock_kind == lk_futex) &&
3063            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3064     lck = (kmp_user_lock_p)user_lock;
3065   }
3066 #endif
3067   else {
3068     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3069   }
3070 
3071 #if USE_ITT_BUILD
3072   __kmp_itt_lock_acquiring(lck);
3073 #endif /* USE_ITT_BUILD */
3074 #if OMPT_SUPPORT && OMPT_OPTIONAL
3075   // This is the case, if called from omp_init_lock_with_hint:
3076   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3077   if (!codeptr)
3078     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3079   if (ompt_enabled.ompt_callback_mutex_acquire) {
3080     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3081         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3082         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3083   }
3084 #endif
3085 
3086   rc = TEST_LOCK(lck, gtid);
3087 #if USE_ITT_BUILD
3088   if (rc) {
3089     __kmp_itt_lock_acquired(lck);
3090   } else {
3091     __kmp_itt_lock_cancelled(lck);
3092   }
3093 #endif /* USE_ITT_BUILD */
3094 #if OMPT_SUPPORT && OMPT_OPTIONAL
3095   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3096     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3097         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3098   }
3099 #endif
3100 
3101   return (rc ? FTN_TRUE : FTN_FALSE);
3102 
3103 /* Can't use serial interval since not block structured */
3104 
3105 #endif // KMP_USE_DYNAMIC_LOCK
3106 }
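// Illustrative only, hence guarded out: user code such as the following
// reaches __kmpc_test_lock through the standard omp_test_lock() API, whose
// nonzero/zero result corresponds to the FTN_TRUE/FTN_FALSE returns above.
// The OpenMP calls are standard; the example program itself is hypothetical.
#if 0
#include <omp.h>
#include <stdio.h>
int main(void) {
  omp_lock_t l;
  omp_init_lock(&l);
#pragma omp parallel num_threads(4)
  {
    if (omp_test_lock(&l)) { // nonzero: this thread acquired the lock
      printf("thread %d acquired the lock\n", omp_get_thread_num());
      omp_unset_lock(&l);
    } // zero: the lock was busy; the thread does not block
  }
  omp_destroy_lock(&l);
  return 0;
}
#endif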
3107 
3108 /* try to acquire the lock */
3109 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3110 #if KMP_USE_DYNAMIC_LOCK
3111   int rc;
3112 #if USE_ITT_BUILD
3113   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3114 #endif
3115 #if OMPT_SUPPORT && OMPT_OPTIONAL
3116   // This is the case, if called from omp_init_lock_with_hint:
3117   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3118   if (!codeptr)
3119     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3120   if (ompt_enabled.ompt_callback_mutex_acquire) {
3121     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3122         ompt_mutex_nest_lock, omp_lock_hint_none,
3123         __ompt_get_mutex_impl_type(user_lock),
3124         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3125   }
3126 #endif
3127   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3128 #if USE_ITT_BUILD
3129   if (rc) {
3130     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3131   } else {
3132     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3133   }
3134 #endif
3135 #if OMPT_SUPPORT && OMPT_OPTIONAL
3136   if (ompt_enabled.enabled && rc) {
3137     if (rc == 1) {
3138       if (ompt_enabled.ompt_callback_mutex_acquired) {
3139         // lock_first
3140         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3141             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3142             codeptr);
3143       }
3144     } else {
3145       if (ompt_enabled.ompt_callback_nest_lock) {
3146         // lock_next
3147         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3148             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3149       }
3150     }
3151   }
3152 #endif
3153   return rc;
3154 
3155 #else // KMP_USE_DYNAMIC_LOCK
3156 
3157   kmp_user_lock_p lck;
3158   int rc;
3159 
3160   if ((__kmp_user_lock_kind == lk_tas) &&
3161       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3162        OMP_NEST_LOCK_T_SIZE)) {
3163     lck = (kmp_user_lock_p)user_lock;
3164   }
3165 #if KMP_USE_FUTEX
3166   else if ((__kmp_user_lock_kind == lk_futex) &&
3167            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3168             OMP_NEST_LOCK_T_SIZE)) {
3169     lck = (kmp_user_lock_p)user_lock;
3170   }
3171 #endif
3172   else {
3173     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3174   }
3175 
3176 #if USE_ITT_BUILD
3177   __kmp_itt_lock_acquiring(lck);
3178 #endif /* USE_ITT_BUILD */
3179 
3180 #if OMPT_SUPPORT && OMPT_OPTIONAL
3181   // This is the case, if called from omp_init_lock_with_hint:
3182   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3183   if (!codeptr)
3184     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
3192 #endif
3193 
3194   rc = TEST_NESTED_LOCK(lck, gtid);
3195 #if USE_ITT_BUILD
3196   if (rc) {
3197     __kmp_itt_lock_acquired(lck);
3198   } else {
3199     __kmp_itt_lock_cancelled(lck);
3200   }
3201 #endif /* USE_ITT_BUILD */
3202 #if OMPT_SUPPORT && OMPT_OPTIONAL
3203   if (ompt_enabled.enabled && rc) {
3204     if (rc == 1) {
3205       if (ompt_enabled.ompt_callback_mutex_acquired) {
3206         // lock_first
3207         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3208             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3209       }
3210     } else {
3211       if (ompt_enabled.ompt_callback_nest_lock) {
3212         // lock_next
3213         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3215       }
3216     }
3217   }
3218 #endif
3219   return rc;
3220 
3221 /* Can't use serial interval since not block structured */
3222 
3223 #endif // KMP_USE_DYNAMIC_LOCK
3224 }
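// Illustrative only, hence guarded out: omp_test_nest_lock() returns the new
// nesting depth on success and 0 on failure, which matches the rc values
// handled above (rc == 1 reported as a first acquisition, rc > 1 as a nested
// one). Standard OpenMP API; the example program itself is hypothetical.
#if 0
#include <omp.h>
#include <stdio.h>
int main(void) {
  omp_nest_lock_t nl;
  omp_init_nest_lock(&nl);
  int d1 = omp_test_nest_lock(&nl); // expected 1: first acquisition
  int d2 = omp_test_nest_lock(&nl); // expected 2: nested re-acquisition
  printf("nesting depths: %d %d\n", d1, d2);
  omp_unset_nest_lock(&nl);
  omp_unset_nest_lock(&nl);
  omp_destroy_nest_lock(&nl);
  return 0;
}
#endif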
3225 
3226 // Interface to fast scalable reduce methods routines
3227 
3228 // keep the selected method in a thread local structure for cross-function
3229 // usage: will be used in __kmpc_end_reduce* functions;
3230 // another solution: to re-determine the method one more time in
3231 // __kmpc_end_reduce* functions (new prototype required then)
3232 // AT: which solution is better?
3233 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3234   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3235 
3236 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3237   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3238 
3239 // description of the packed_reduction_method variable: look at the macros in
3240 // kmp.h
3241 
3242 // used in a critical section reduce block
3243 static __forceinline void
3244 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3245                                           kmp_critical_name *crit) {
3246 
3247   // this lock was visible to a customer and to the threading profile tool as a
3248   // serial overhead span (although it's used for an internal purpose only)
3249   //            why was it visible in previous implementation?
3250   //            should we keep it visible in new reduce block?
3251   kmp_user_lock_p lck;
3252 
3253 #if KMP_USE_DYNAMIC_LOCK
3254 
3255   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3256   // Check if it is initialized.
3257   if (*lk == 0) {
3258     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3259       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3260                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3261     } else {
3262       __kmp_init_indirect_csptr(crit, loc, global_tid,
3263                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3264     }
3265   }
3266   // Branch for accessing the actual lock object and set operation. This
3267   // branching is inevitable since this lock initialization does not follow the
3268   // normal dispatch path (lock table is not used).
3269   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3270     lck = (kmp_user_lock_p)lk;
3271     KMP_DEBUG_ASSERT(lck != NULL);
3272     if (__kmp_env_consistency_check) {
3273       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3274     }
3275     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3276   } else {
3277     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3278     lck = ilk->lock;
3279     KMP_DEBUG_ASSERT(lck != NULL);
3280     if (__kmp_env_consistency_check) {
3281       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3282     }
3283     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3284   }
3285 
3286 #else // KMP_USE_DYNAMIC_LOCK
3287 
3288   // We know that the fast reduction code is only emitted by Intel compilers
3289   // with 32 byte critical sections. If there isn't enough space, then we
3290   // have to use a pointer.
3291   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3292     lck = (kmp_user_lock_p)crit;
3293   } else {
3294     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3295   }
3296   KMP_DEBUG_ASSERT(lck != NULL);
3297 
3298   if (__kmp_env_consistency_check)
3299     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3300 
3301   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3302 
3303 #endif // KMP_USE_DYNAMIC_LOCK
3304 }
3305 
3306 // used in a critical section reduce block
3307 static __forceinline void
3308 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3309                                         kmp_critical_name *crit) {
3310 
3311   kmp_user_lock_p lck;
3312 
3313 #if KMP_USE_DYNAMIC_LOCK
3314 
3315   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3316     lck = (kmp_user_lock_p)crit;
3317     if (__kmp_env_consistency_check)
3318       __kmp_pop_sync(global_tid, ct_critical, loc);
3319     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3320   } else {
3321     kmp_indirect_lock_t *ilk =
3322         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3323     if (__kmp_env_consistency_check)
3324       __kmp_pop_sync(global_tid, ct_critical, loc);
3325     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3326   }
3327 
3328 #else // KMP_USE_DYNAMIC_LOCK
3329 
3330   // We know that the fast reduction code is only emitted by Intel compilers
3331   // with 32 byte critical sections. If there isn't enough space, then we have
3332   // to use a pointer.
3333   if (__kmp_base_user_lock_size > 32) {
3334     lck = *((kmp_user_lock_p *)crit);
3335     KMP_ASSERT(lck != NULL);
3336   } else {
3337     lck = (kmp_user_lock_p)crit;
3338   }
3339 
3340   if (__kmp_env_consistency_check)
3341     __kmp_pop_sync(global_tid, ct_critical, loc);
3342 
3343   __kmp_release_user_lock_with_checks(lck, global_tid);
3344 
3345 #endif // KMP_USE_DYNAMIC_LOCK
3346 } // __kmp_end_critical_section_reduce_block
3347 
3348 #if OMP_40_ENABLED
3349 static __forceinline int
3350 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3351                                      int *task_state) {
3352   kmp_team_t *team;
3353 
  // Check if we are inside a teams construct.
3355   if (th->th.th_teams_microtask) {
3356     *team_p = team = th->th.th_team;
3357     if (team->t.t_level == th->th.th_teams_level) {
3358       // This is reduction at teams construct.
3359       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3360       // Let's swap teams temporarily for the reduction.
3361       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3362       th->th.th_team = team->t.t_parent;
3363       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3364       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3365       *task_state = th->th.th_task_state;
3366       th->th.th_task_state = 0;
3367 
3368       return 1;
3369     }
3370   }
3371   return 0;
3372 }
3373 
3374 static __forceinline void
3375 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3376   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3377   th->th.th_info.ds.ds_tid = 0;
3378   th->th.th_team = team;
3379   th->th.th_team_nproc = team->t.t_nproc;
3380   th->th.th_task_team = team->t.t_task_team[task_state];
3381   th->th.th_task_state = task_state;
3382 }
3383 #endif
3384 
3385 /* 2.a.i. Reduce Block without a terminating barrier */
3386 /*!
3387 @ingroup SYNCHRONIZATION
3388 @param loc source location information
3389 @param global_tid global thread number
3390 @param num_vars number of items (variables) to be reduced
3391 @param reduce_size size of data in bytes to be reduced
3392 @param reduce_data pointer to data to be reduced
3393 @param reduce_func callback function providing reduction operation on two
3394 operands and returning result of reduction in lhs_data
3395 @param lck pointer to the unique lock data structure
3396 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3397 threads if atomic reduction needed
3398 
3399 The nowait version is used for a reduce clause with the nowait argument.
3400 */
3401 kmp_int32
3402 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3403                      size_t reduce_size, void *reduce_data,
3404                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3405                      kmp_critical_name *lck) {
3406 
3407   KMP_COUNT_BLOCK(REDUCE_nowait);
3408   int retval = 0;
3409   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3410 #if OMP_40_ENABLED
3411   kmp_info_t *th;
3412   kmp_team_t *team;
3413   int teams_swapped = 0, task_state;
3414 #endif
3415   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3416 
3417   // why do we need this initialization here at all?
3418   // Reduction clause can not be used as a stand-alone directive.
3419 
3420   // do not call __kmp_serial_initialize(), it will be called by
3421   // __kmp_parallel_initialize() if needed
3422   // possible detection of false-positive race by the threadchecker ???
3423   if (!TCR_4(__kmp_init_parallel))
3424     __kmp_parallel_initialize();
3425 
3426 #if OMP_50_ENABLED
3427   __kmp_resume_if_soft_paused();
3428 #endif
3429 
3430 // check correctness of reduce block nesting
3431 #if KMP_USE_DYNAMIC_LOCK
3432   if (__kmp_env_consistency_check)
3433     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3434 #else
3435   if (__kmp_env_consistency_check)
3436     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3437 #endif
3438 
3439 #if OMP_40_ENABLED
3440   th = __kmp_thread_from_gtid(global_tid);
3441   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3442 #endif // OMP_40_ENABLED
3443 
  // the packed_reduction_method value will be reused by the __kmp_end_reduce*
  // functions, so it must be kept in a variable
  // the variable should be a construct-specific or thread-specific property,
  // not a team-specific one
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on that construct)
  // an ident_t "loc" parameter could serve as a construct-specific property
  // (but what if loc == 0?)
  //     (and if both construct-specific and team-specific variables were
  //     shared, unnecessary extra syncs would be needed)
  // a thread-specific variable is better with regard to both issues above
  // (next construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes the 'determine' and 'set' lines (there is no need to
  // restrict them to one thread; doing so would only add unnecessary syncs)
3459 
3460   packed_reduction_method = __kmp_determine_reduction_method(
3461       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3462   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3463 
3464   if (packed_reduction_method == critical_reduce_block) {
3465 
3466     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3467     retval = 1;
3468 
3469   } else if (packed_reduction_method == empty_reduce_block) {
3470 
3471     // usage: if team size == 1, no synchronization is required ( Intel
3472     // platforms only )
3473     retval = 1;
3474 
3475   } else if (packed_reduction_method == atomic_reduce_block) {
3476 
3477     retval = 2;
3478 
3479     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3480     // won't be called by the code gen)
    //     (this is not ideal: the checking block has already been closed by
    //     this 'pop', but the atomic operation has not executed yet; it will
    //     execute slightly later, literally on the next instruction)
3485     if (__kmp_env_consistency_check)
3486       __kmp_pop_sync(global_tid, ct_reduce, loc);
3487 
3488   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3489                                    tree_reduce_block)) {
3490 
// AT: performance issue: a real barrier here
// AT:     (if the master is slow, the other threads are blocked here waiting
// for the master to arrive and release them)
// AT:     (this is not what a customer would expect when specifying the NOWAIT
// clause)
// AT:     (specifying NOWAIT brings no performance improvement here, which may
// be confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
// might be faster and more in line with the meaning of NOWAIT
// AT: TO DO: run the EPCC benchmark and compare times
3500 
3501 // this barrier should be invisible to a customer and to the threading profile
3502 // tool (it's neither a terminating barrier nor customer's code, it's
3503 // used for an internal purpose)
3504 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
3506     // JP: as long as there is a barrier in the implementation, OMPT should and
3507     // will provide the barrier events
3508     //         so we set-up the necessary frame/return addresses.
3509     ompt_frame_t *ompt_frame;
3510     if (ompt_enabled.enabled) {
3511       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3512       if (ompt_frame->enter_frame.ptr == NULL)
3513         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3514       OMPT_STORE_RETURN_ADDRESS(global_tid);
3515     }
3516 #endif
3517 #if USE_ITT_NOTIFY
3518     __kmp_threads[global_tid]->th.th_ident = loc;
3519 #endif
3520     retval =
3521         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3522                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3523     retval = (retval != 0) ? (0) : (1);
3524 #if OMPT_SUPPORT && OMPT_OPTIONAL
3525     if (ompt_enabled.enabled) {
3526       ompt_frame->enter_frame = ompt_data_none;
3527     }
3528 #endif
3529 
3530     // all other workers except master should do this pop here
3531     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3532     if (__kmp_env_consistency_check) {
3533       if (retval == 0) {
3534         __kmp_pop_sync(global_tid, ct_reduce, loc);
3535       }
3536     }
3537 
3538   } else {
3539 
3540     // should never reach this block
3541     KMP_ASSERT(0); // "unexpected method"
3542   }
3543 #if OMP_40_ENABLED
3544   if (teams_swapped) {
3545     __kmp_restore_swapped_teams(th, team, task_state);
3546   }
3547 #endif
3548   KA_TRACE(
3549       10,
3550       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3551        global_tid, packed_reduction_method, retval));
3552 
3553   return retval;
3554 }
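// Illustrative only, hence guarded out: a hedged sketch of how a front end
// might drive the nowait reduce entry points, keyed off the documented return
// values (1: combine directly then call the end function, 2: atomic path,
// 0: nothing left to do). The reducer callback and the helper function are
// hypothetical; only the __kmpc_* entry points are real.
#if 0
static void my_reducer(void *lhs, void *rhs) { // hypothetical callback
  *(int *)lhs += *(int *)rhs;
}
static void reduce_nowait_sketch(ident_t *loc, kmp_int32 gtid, int *priv_sum,
                                 int *shared_sum, kmp_critical_name *crit) {
  switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(int), priv_sum, my_reducer,
                               crit)) {
  case 1: // this thread combines into the shared result directly
    *shared_sum += *priv_sum;
    __kmpc_end_reduce_nowait(loc, gtid, crit);
    break;
  case 2: // atomic path: every thread updates the shared result atomically
    // (real code generation would emit an atomic RMW rather than a pragma)
#pragma omp atomic
    *shared_sum += *priv_sum;
    break;
  default: // 0: this thread's contribution was already combined elsewhere
    break;
  }
}
#endif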
3555 
3556 /*!
3557 @ingroup SYNCHRONIZATION
3558 @param loc source location information
3559 @param global_tid global thread id.
3560 @param lck pointer to the unique lock data structure
3561 
3562 Finish the execution of a reduce nowait.
3563 */
3564 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3565                               kmp_critical_name *lck) {
3566 
3567   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3568 
3569   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3570 
3571   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3572 
3573   if (packed_reduction_method == critical_reduce_block) {
3574 
3575     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3576 
3577   } else if (packed_reduction_method == empty_reduce_block) {
3578 
3579     // usage: if team size == 1, no synchronization is required ( on Intel
3580     // platforms only )
3581 
3582   } else if (packed_reduction_method == atomic_reduce_block) {
3583 
3584     // neither master nor other workers should get here
3585     //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely;
    // after removal this value would be caught by the 'else' branch and assert
3588 
3589   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3590                                    tree_reduce_block)) {
3591 
3592     // only master gets here
3593 
3594   } else {
3595 
3596     // should never reach this block
3597     KMP_ASSERT(0); // "unexpected method"
3598   }
3599 
3600   if (__kmp_env_consistency_check)
3601     __kmp_pop_sync(global_tid, ct_reduce, loc);
3602 
3603   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3604                 global_tid, packed_reduction_method));
3605 
3606   return;
3607 }
3608 
3609 /* 2.a.ii. Reduce Block with a terminating barrier */
3610 
3611 /*!
3612 @ingroup SYNCHRONIZATION
3613 @param loc source location information
3614 @param global_tid global thread number
3615 @param num_vars number of items (variables) to be reduced
3616 @param reduce_size size of data in bytes to be reduced
3617 @param reduce_data pointer to data to be reduced
3618 @param reduce_func callback function providing reduction operation on two
3619 operands and returning result of reduction in lhs_data
3620 @param lck pointer to the unique lock data structure
3621 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3622 threads if atomic reduction needed
3623 
3624 A blocking reduce that includes an implicit barrier.
3625 */
3626 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3627                         size_t reduce_size, void *reduce_data,
3628                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3629                         kmp_critical_name *lck) {
3630   KMP_COUNT_BLOCK(REDUCE_wait);
3631   int retval = 0;
3632   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3633 #if OMP_40_ENABLED
3634   kmp_info_t *th;
3635   kmp_team_t *team;
3636   int teams_swapped = 0, task_state;
3637 #endif
3638 
3639   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3640 
3641   // why do we need this initialization here at all?
3642   // Reduction clause can not be a stand-alone directive.
3643 
3644   // do not call __kmp_serial_initialize(), it will be called by
3645   // __kmp_parallel_initialize() if needed
3646   // possible detection of false-positive race by the threadchecker ???
3647   if (!TCR_4(__kmp_init_parallel))
3648     __kmp_parallel_initialize();
3649 
3650 #if OMP_50_ENABLED
3651   __kmp_resume_if_soft_paused();
3652 #endif
3653 
3654 // check correctness of reduce block nesting
3655 #if KMP_USE_DYNAMIC_LOCK
3656   if (__kmp_env_consistency_check)
3657     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3658 #else
3659   if (__kmp_env_consistency_check)
3660     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3661 #endif
3662 
3663 #if OMP_40_ENABLED
3664   th = __kmp_thread_from_gtid(global_tid);
3665   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3666 #endif // OMP_40_ENABLED
3667 
3668   packed_reduction_method = __kmp_determine_reduction_method(
3669       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3670   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3671 
3672   if (packed_reduction_method == critical_reduce_block) {
3673 
3674     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3675     retval = 1;
3676 
3677   } else if (packed_reduction_method == empty_reduce_block) {
3678 
3679     // usage: if team size == 1, no synchronization is required ( Intel
3680     // platforms only )
3681     retval = 1;
3682 
3683   } else if (packed_reduction_method == atomic_reduce_block) {
3684 
3685     retval = 2;
3686 
3687   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3688                                    tree_reduce_block)) {
3689 
3690 // case tree_reduce_block:
3691 // this barrier should be visible to a customer and to the threading profile
3692 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3693 #if OMPT_SUPPORT
3694     ompt_frame_t *ompt_frame;
3695     if (ompt_enabled.enabled) {
3696       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3697       if (ompt_frame->enter_frame.ptr == NULL)
3698         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3699       OMPT_STORE_RETURN_ADDRESS(global_tid);
3700     }
3701 #endif
3702 #if USE_ITT_NOTIFY
3703     __kmp_threads[global_tid]->th.th_ident =
3704         loc; // needed for correct notification of frames
3705 #endif
3706     retval =
3707         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3708                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3709     retval = (retval != 0) ? (0) : (1);
3710 #if OMPT_SUPPORT && OMPT_OPTIONAL
3711     if (ompt_enabled.enabled) {
3712       ompt_frame->enter_frame = ompt_data_none;
3713     }
3714 #endif
3715 
3716     // all other workers except master should do this pop here
3717     // ( none of other workers except master will enter __kmpc_end_reduce() )
3718     if (__kmp_env_consistency_check) {
3719       if (retval == 0) { // 0: all other workers; 1: master
3720         __kmp_pop_sync(global_tid, ct_reduce, loc);
3721       }
3722     }
3723 
3724   } else {
3725 
3726     // should never reach this block
3727     KMP_ASSERT(0); // "unexpected method"
3728   }
3729 #if OMP_40_ENABLED
3730   if (teams_swapped) {
3731     __kmp_restore_swapped_teams(th, team, task_state);
3732   }
3733 #endif
3734 
3735   KA_TRACE(10,
3736            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3737             global_tid, packed_reduction_method, retval));
3738 
3739   return retval;
3740 }
3741 
3742 /*!
3743 @ingroup SYNCHRONIZATION
3744 @param loc source location information
3745 @param global_tid global thread id.
3746 @param lck pointer to the unique lock data structure
3747 
3748 Finish the execution of a blocking reduce.
3749 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3750 start function.
3751 */
3752 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3753                        kmp_critical_name *lck) {
3754 
3755   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3756 #if OMP_40_ENABLED
3757   kmp_info_t *th;
3758   kmp_team_t *team;
3759   int teams_swapped = 0, task_state;
3760 #endif
3761 
3762   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3763 
3764 #if OMP_40_ENABLED
3765   th = __kmp_thread_from_gtid(global_tid);
3766   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3767 #endif // OMP_40_ENABLED
3768 
3769   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3770 
3771   // this barrier should be visible to a customer and to the threading profile
3772   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3773 
3774   if (packed_reduction_method == critical_reduce_block) {
3775 
3776     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3777 
3778 // TODO: implicit barrier: should be exposed
3779 #if OMPT_SUPPORT
3780     ompt_frame_t *ompt_frame;
3781     if (ompt_enabled.enabled) {
3782       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3783       if (ompt_frame->enter_frame.ptr == NULL)
3784         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3785       OMPT_STORE_RETURN_ADDRESS(global_tid);
3786     }
3787 #endif
3788 #if USE_ITT_NOTIFY
3789     __kmp_threads[global_tid]->th.th_ident = loc;
3790 #endif
3791     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3792 #if OMPT_SUPPORT && OMPT_OPTIONAL
3793     if (ompt_enabled.enabled) {
3794       ompt_frame->enter_frame = ompt_data_none;
3795     }
3796 #endif
3797 
3798   } else if (packed_reduction_method == empty_reduce_block) {
3799 
3800 // usage: if team size==1, no synchronization is required (Intel platforms only)
3801 
3802 // TODO: implicit barrier: should be exposed
3803 #if OMPT_SUPPORT
3804     ompt_frame_t *ompt_frame;
3805     if (ompt_enabled.enabled) {
3806       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3807       if (ompt_frame->enter_frame.ptr == NULL)
3808         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3809       OMPT_STORE_RETURN_ADDRESS(global_tid);
3810     }
3811 #endif
3812 #if USE_ITT_NOTIFY
3813     __kmp_threads[global_tid]->th.th_ident = loc;
3814 #endif
3815     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3816 #if OMPT_SUPPORT && OMPT_OPTIONAL
3817     if (ompt_enabled.enabled) {
3818       ompt_frame->enter_frame = ompt_data_none;
3819     }
3820 #endif
3821 
3822   } else if (packed_reduction_method == atomic_reduce_block) {
3823 
3824 #if OMPT_SUPPORT
3825     ompt_frame_t *ompt_frame;
3826     if (ompt_enabled.enabled) {
3827       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3828       if (ompt_frame->enter_frame.ptr == NULL)
3829         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3830       OMPT_STORE_RETURN_ADDRESS(global_tid);
3831     }
3832 #endif
3833 // TODO: implicit barrier: should be exposed
3834 #if USE_ITT_NOTIFY
3835     __kmp_threads[global_tid]->th.th_ident = loc;
3836 #endif
3837     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3838 #if OMPT_SUPPORT && OMPT_OPTIONAL
3839     if (ompt_enabled.enabled) {
3840       ompt_frame->enter_frame = ompt_data_none;
3841     }
3842 #endif
3843 
3844   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3845                                    tree_reduce_block)) {
3846 
3847     // only master executes here (master releases all other workers)
3848     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3849                             global_tid);
3850 
3851   } else {
3852 
3853     // should never reach this block
3854     KMP_ASSERT(0); // "unexpected method"
3855   }
3856 #if OMP_40_ENABLED
3857   if (teams_swapped) {
3858     __kmp_restore_swapped_teams(th, team, task_state);
3859   }
3860 #endif
3861 
3862   if (__kmp_env_consistency_check)
3863     __kmp_pop_sync(global_tid, ct_reduce, loc);
3864 
3865   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3866                 global_tid, packed_reduction_method));
3867 
3868   return;
3869 }
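// Illustrative only, hence guarded out: the blocking form follows the same
// shape as the nowait sketch after __kmpc_reduce_nowait() above, but uses
// __kmpc_reduce()/__kmpc_end_reduce() so the terminating barrier of the
// construct is folded into the reduction. my_reducer and the helper are
// hypothetical; only the __kmpc_* entry points are real.
#if 0
static void reduce_block_sketch(ident_t *loc, kmp_int32 gtid, int *priv_sum,
                                int *shared_sum, kmp_critical_name *crit) {
  int ret =
      __kmpc_reduce(loc, gtid, 1, sizeof(int), priv_sum, my_reducer, crit);
  if (ret == 1) { // combine directly, then release the terminating barrier
    *shared_sum += *priv_sum;
    __kmpc_end_reduce(loc, gtid, crit);
  } else if (ret == 2) { // atomic path, still followed by the end call
#pragma omp atomic
    *shared_sum += *priv_sum;
    __kmpc_end_reduce(loc, gtid, crit);
  } // ret == 0: nothing more for this thread to do
}
#endif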
3870 
3871 #undef __KMP_GET_REDUCTION_METHOD
3872 #undef __KMP_SET_REDUCTION_METHOD
3873 
3874 /* end of interface to fast scalable reduce routines */
3875 
3876 kmp_uint64 __kmpc_get_taskid() {
3877 
3878   kmp_int32 gtid;
3879   kmp_info_t *thread;
3880 
3881   gtid = __kmp_get_gtid();
3882   if (gtid < 0) {
3883     return 0;
3884   }
3885   thread = __kmp_thread_from_gtid(gtid);
3886   return thread->th.th_current_task->td_task_id;
3887 
3888 } // __kmpc_get_taskid
3889 
3890 kmp_uint64 __kmpc_get_parent_taskid() {
3891 
3892   kmp_int32 gtid;
3893   kmp_info_t *thread;
3894   kmp_taskdata_t *parent_task;
3895 
3896   gtid = __kmp_get_gtid();
3897   if (gtid < 0) {
3898     return 0;
3899   }
3900   thread = __kmp_thread_from_gtid(gtid);
3901   parent_task = thread->th.th_current_task->td_parent;
3902   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3903 
3904 } // __kmpc_get_parent_taskid
3905 
3906 #if OMP_45_ENABLED
3907 /*!
3908 @ingroup WORK_SHARING
3909 @param loc  source location information.
3910 @param gtid  global thread number.
3911 @param num_dims  number of associated doacross loops.
3912 @param dims  info on loops bounds.
3913 
3914 Initialize doacross loop information.
The compiler is expected to pass inclusive bounds,
3916 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3917 */
3918 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3919                           const struct kmp_dim *dims) {
3920   int j, idx;
3921   kmp_int64 last, trace_count;
3922   kmp_info_t *th = __kmp_threads[gtid];
3923   kmp_team_t *team = th->th.th_team;
3924   kmp_uint32 *flags;
3925   kmp_disp_t *pr_buf = th->th.th_dispatch;
3926   dispatch_shared_info_t *sh_buf;
3927 
3928   KA_TRACE(
3929       20,
3930       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3931        gtid, num_dims, !team->t.t_serialized));
3932   KMP_DEBUG_ASSERT(dims != NULL);
3933   KMP_DEBUG_ASSERT(num_dims > 0);
3934 
3935   if (team->t.t_serialized) {
3936     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3937     return; // no dependencies if team is serialized
3938   }
3939   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3940   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3941   // the next loop
3942   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3943 
3944   // Save bounds info into allocated private buffer
3945   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3946   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3947       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3948   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3949   pr_buf->th_doacross_info[0] =
3950       (kmp_int64)num_dims; // first element is number of dimensions
3951   // Save also address of num_done in order to access it later without knowing
3952   // the buffer index
3953   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3954   pr_buf->th_doacross_info[2] = dims[0].lo;
3955   pr_buf->th_doacross_info[3] = dims[0].up;
3956   pr_buf->th_doacross_info[4] = dims[0].st;
3957   last = 5;
3958   for (j = 1; j < num_dims; ++j) {
3959     kmp_int64
3960         range_length; // To keep ranges of all dimensions but the first dims[0]
3961     if (dims[j].st == 1) { // most common case
3962       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3963       range_length = dims[j].up - dims[j].lo + 1;
3964     } else {
3965       if (dims[j].st > 0) {
3966         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3967         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3968       } else { // negative increment
3969         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3970         range_length =
3971             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3972       }
3973     }
3974     pr_buf->th_doacross_info[last++] = range_length;
3975     pr_buf->th_doacross_info[last++] = dims[j].lo;
3976     pr_buf->th_doacross_info[last++] = dims[j].up;
3977     pr_buf->th_doacross_info[last++] = dims[j].st;
3978   }
3979 
3980   // Compute total trip count.
3981   // Start with range of dims[0] which we don't need to keep in the buffer.
3982   if (dims[0].st == 1) { // most common case
3983     trace_count = dims[0].up - dims[0].lo + 1;
3984   } else if (dims[0].st > 0) {
3985     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3986     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3987   } else { // negative increment
3988     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3989     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3990   }
3991   for (j = 1; j < num_dims; ++j) {
3992     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3993   }
3994   KMP_DEBUG_ASSERT(trace_count > 0);
3995 
  // Check whether the shared buffer is still occupied by another loop (the one
  // that used buffer index idx - __kmp_dispatch_num_buffers)
3998   if (idx != sh_buf->doacross_buf_idx) {
3999     // Shared buffer is occupied, wait for it to be free
4000     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4001                  __kmp_eq_4, NULL);
4002   }
4003 #if KMP_32_BIT_ARCH
4004   // Check if we are the first thread. After the CAS the first thread gets 0,
4005   // others get 1 if initialization is in progress, allocated pointer otherwise.
4006   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4007   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4008       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4009 #else
4010   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4011       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4012 #endif
4013   if (flags == NULL) {
4014     // we are the first thread, allocate the array of flags
4015     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
4016     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4017     KMP_MB();
4018     sh_buf->doacross_flags = flags;
4019   } else if (flags == (kmp_uint32 *)1) {
4020 #if KMP_32_BIT_ARCH
4021     // initialization is still in progress, need to wait
4022     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4023 #else
4024     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4025 #endif
4026       KMP_YIELD(TRUE);
4027     KMP_MB();
4028   } else {
4029     KMP_MB();
4030   }
4031   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4032   pr_buf->th_doacross_flags =
4033       sh_buf->doacross_flags; // save private copy in order to not
4034   // touch shared buffer on each iteration
4035   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4036 }
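// Illustrative only, hence guarded out: a hedged sketch of the source-level
// doacross construct that typically lowers to the __kmpc_doacross_init/
// _wait/_post/_fini entry points (the exact lowering is compiler-dependent).
// The pragmas are standard OpenMP 4.5 syntax; the loop itself is hypothetical.
#if 0
void doacross_sketch(int n, double *a) {
#pragma omp parallel for ordered(1)
  for (int i = 2; i < n; i++) { // init: lo=2, up=n-1 (inclusive), st=1
#pragma omp ordered depend(sink : i - 1) // wait: vec = { i - 1 }
    a[i] += a[i - 1];
#pragma omp ordered depend(source) // post: vec = { i }
  } // each thread calls the fini entry point when it finishes the loop
}
#endif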
4037 
4038 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4039   kmp_int32 shft, num_dims, i;
4040   kmp_uint32 flag;
4041   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4042   kmp_info_t *th = __kmp_threads[gtid];
4043   kmp_team_t *team = th->th.th_team;
4044   kmp_disp_t *pr_buf;
4045   kmp_int64 lo, up, st;
4046 
4047   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4048   if (team->t.t_serialized) {
4049     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4050     return; // no dependencies if team is serialized
4051   }
4052 
4053   // calculate sequential iteration number and check out-of-bounds condition
4054   pr_buf = th->th.th_dispatch;
4055   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4056   num_dims = pr_buf->th_doacross_info[0];
4057   lo = pr_buf->th_doacross_info[2];
4058   up = pr_buf->th_doacross_info[3];
4059   st = pr_buf->th_doacross_info[4];
4060   if (st == 1) { // most common case
4061     if (vec[0] < lo || vec[0] > up) {
4062       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4063                     "bounds [%lld,%lld]\n",
4064                     gtid, vec[0], lo, up));
4065       return;
4066     }
4067     iter_number = vec[0] - lo;
4068   } else if (st > 0) {
4069     if (vec[0] < lo || vec[0] > up) {
4070       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4071                     "bounds [%lld,%lld]\n",
4072                     gtid, vec[0], lo, up));
4073       return;
4074     }
4075     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4076   } else { // negative increment
4077     if (vec[0] > lo || vec[0] < up) {
4078       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4079                     "bounds [%lld,%lld]\n",
4080                     gtid, vec[0], lo, up));
4081       return;
4082     }
4083     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4084   }
4085   for (i = 1; i < num_dims; ++i) {
4086     kmp_int64 iter, ln;
4087     kmp_int32 j = i * 4;
4088     ln = pr_buf->th_doacross_info[j + 1];
4089     lo = pr_buf->th_doacross_info[j + 2];
4090     up = pr_buf->th_doacross_info[j + 3];
4091     st = pr_buf->th_doacross_info[j + 4];
4092     if (st == 1) {
4093       if (vec[i] < lo || vec[i] > up) {
4094         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4095                       "bounds [%lld,%lld]\n",
4096                       gtid, vec[i], lo, up));
4097         return;
4098       }
4099       iter = vec[i] - lo;
4100     } else if (st > 0) {
4101       if (vec[i] < lo || vec[i] > up) {
4102         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4103                       "bounds [%lld,%lld]\n",
4104                       gtid, vec[i], lo, up));
4105         return;
4106       }
4107       iter = (kmp_uint64)(vec[i] - lo) / st;
4108     } else { // st < 0
4109       if (vec[i] > lo || vec[i] < up) {
4110         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4111                       "bounds [%lld,%lld]\n",
4112                       gtid, vec[i], lo, up));
4113         return;
4114       }
4115       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4116     }
4117     iter_number = iter + ln * iter_number;
4118   }
4119   shft = iter_number % 32; // use 32-bit granularity
4120   iter_number >>= 5; // divided by 32
4121   flag = 1 << shft;
4122   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4123     KMP_YIELD(TRUE);
4124   }
4125   KMP_MB();
4126   KA_TRACE(20,
4127            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4128             gtid, (iter_number << 5) + shft));
4129 }
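// Worked example of the flag addressing used in the wait/post pair above and
// below (illustrative numbers only): a linearized iteration number of 70 gives
// shft = 70 % 32 = 6 and a word index of 70 >> 5 = 2, so the wait spins until
// bit 6 of th_doacross_flags[2] is set, and the matching post sets exactly
// that bit with KMP_TEST_THEN_OR32.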
4130 
4131 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4132   kmp_int32 shft, num_dims, i;
4133   kmp_uint32 flag;
4134   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4135   kmp_info_t *th = __kmp_threads[gtid];
4136   kmp_team_t *team = th->th.th_team;
4137   kmp_disp_t *pr_buf;
4138   kmp_int64 lo, st;
4139 
4140   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4141   if (team->t.t_serialized) {
4142     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4143     return; // no dependencies if team is serialized
4144   }
4145 
4146   // calculate sequential iteration number (same as in "wait" but no
4147   // out-of-bounds checks)
4148   pr_buf = th->th.th_dispatch;
4149   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4150   num_dims = pr_buf->th_doacross_info[0];
4151   lo = pr_buf->th_doacross_info[2];
4152   st = pr_buf->th_doacross_info[4];
4153   if (st == 1) { // most common case
4154     iter_number = vec[0] - lo;
4155   } else if (st > 0) {
4156     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4157   } else { // negative increment
4158     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4159   }
4160   for (i = 1; i < num_dims; ++i) {
4161     kmp_int64 iter, ln;
4162     kmp_int32 j = i * 4;
4163     ln = pr_buf->th_doacross_info[j + 1];
4164     lo = pr_buf->th_doacross_info[j + 2];
4165     st = pr_buf->th_doacross_info[j + 4];
4166     if (st == 1) {
4167       iter = vec[i] - lo;
4168     } else if (st > 0) {
4169       iter = (kmp_uint64)(vec[i] - lo) / st;
4170     } else { // st < 0
4171       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4172     }
4173     iter_number = iter + ln * iter_number;
4174   }
4175   shft = iter_number % 32; // use 32-bit granularity
4176   iter_number >>= 5; // divided by 32
4177   flag = 1 << shft;
4178   KMP_MB();
4179   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4180     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4181   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4182                 (iter_number << 5) + shft));
4183 }
4184 
4185 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4186   kmp_int32 num_done;
4187   kmp_info_t *th = __kmp_threads[gtid];
4188   kmp_team_t *team = th->th.th_team;
4189   kmp_disp_t *pr_buf = th->th.th_dispatch;
4190 
4191   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4192   if (team->t.t_serialized) {
4193     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4194     return; // nothing to do
4195   }
4196   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4197   if (num_done == th->th.th_team_nproc) {
4198     // we are the last thread, need to free shared resources
4199     int idx = pr_buf->th_doacross_buf_idx - 1;
4200     dispatch_shared_info_t *sh_buf =
4201         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4202     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4203                      (kmp_int64)&sh_buf->doacross_num_done);
4204     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4205     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4206     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4207     sh_buf->doacross_flags = NULL;
4208     sh_buf->doacross_num_done = 0;
4209     sh_buf->doacross_buf_idx +=
4210         __kmp_dispatch_num_buffers; // free buffer for future re-use
4211   }
4212   // free private resources (need to keep buffer index forever)
4213   pr_buf->th_doacross_flags = NULL;
4214   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4215   pr_buf->th_doacross_info = NULL;
4216   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4217 }
4218 #endif
4219 
4220 #if OMP_50_ENABLED
4221 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4222 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4223   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4224 }
4225 
4226 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4227   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4228 }
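// Illustrative only, hence guarded out: typical user-level usage of the two
// thin wrappers above, using the predefined omp_default_mem_alloc allocator
// (standard OpenMP 5.0 API; the example program itself is hypothetical).
#if 0
#include <omp.h>
#include <string.h>
int main(void) {
  double *buf =
      (double *)omp_alloc(1024 * sizeof(double), omp_default_mem_alloc);
  if (buf) {
    memset(buf, 0, 1024 * sizeof(double)); // use the allocation
    omp_free(buf, omp_default_mem_alloc);  // free with the matching allocator
  }
  return 0;
}
#endif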
4229 
4230 int __kmpc_get_target_offload(void) {
4231   if (!__kmp_init_serial) {
4232     __kmp_serial_initialize();
4233   }
4234   return __kmp_target_offload;
4235 }
4236 
4237 int __kmpc_pause_resource(kmp_pause_status_t level) {
4238   if (!__kmp_init_serial) {
4239     return 1; // Can't pause if runtime is not initialized
4240   }
4241   return __kmp_pause_resource(level);
4242 }
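// Illustrative only, hence guarded out: a hedged sketch of the user-level
// counterpart of the pause entry point above (omp_pause_resource_all and
// omp_pause_soft are standard OpenMP 5.0 names; the program itself is
// hypothetical). A zero return indicates the pause request succeeded.
#if 0
#include <omp.h>
#include <stdio.h>
int main(void) {
#pragma omp parallel
  { /* warm up the runtime */ }
  if (omp_pause_resource_all(omp_pause_soft) == 0)
    printf("runtime soft-paused; the next parallel region resumes it\n");
  return 0;
}
#endif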
4243 #endif // OMP_50_ENABLED
4244 
4245 // end of file //
4246