1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
28 // flags will be used in future, e.g. to implement openmp_strict library
29 // restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
 * Shut down the runtime library. This call is also optional, and even if made
 * it will do nothing unless the `KMP_IGNORE_MPPEND` environment variable is set
 * to zero.
60  */
61 void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
  // root (which can cause library shutdown).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 #if KMP_OS_WINDOWS && OMPT_SUPPORT
74   // Normal exit process on Windows does not allow worker threads of the final
75   // parallel region to finish reporting their events, so shutting down the
76   // library here fixes the issue at least for the cases where __kmpc_end() is
77   // placed properly.
78   if (ompt_enabled.enabled)
79     __kmp_internal_end_library(__kmp_gtid_get_specific());
80 #endif
81 }
82 
83 /*!
84 @ingroup THREAD_STATES
85 @param loc Source location information.
86 @return The global thread index of the active thread.
87 
88 This function can be called in any context.
89 
If the runtime has only been entered at the outermost level from a
91 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
92 that which would be returned by omp_get_thread_num() in the outermost
93 active parallel construct. (Or zero if there is no active parallel
94 construct, since the master thread is necessarily thread zero).
95 
96 If multiple non-OpenMP threads all enter an OpenMP construct then this
97 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
99 OpenMP thread ids returned by omp_get_thread_num()).
100 */
101 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
102   kmp_int32 gtid = __kmp_entry_gtid();
103 
104   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
105 
106   return gtid;
107 }
108 
109 /*!
110 @ingroup THREAD_STATES
111 @param loc Source location information.
112 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
113 
114 This function can be called in any context.
115 It returns the total number of threads under the control of the OpenMP runtime.
That is not a number that can be determined by any OpenMP standard calls, since
the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, because the runtime keeps
underlying threads alive even when they are not active (the cost of creating
and destroying OS threads is high), this call counts all such threads,
including those that are currently idle.
122 */
123 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
124   KC_TRACE(10,
125            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
126 
127   return TCR_4(__kmp_all_nth);
128 }
129 
130 /*!
131 @ingroup THREAD_STATES
132 @param loc Source location information.
133 @return The thread number of the calling thread in the innermost active parallel
134 construct.
135 */
136 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
137   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
138   return __kmp_tid_from_gtid(__kmp_entry_gtid());
139 }
140 
141 /*!
142 @ingroup THREAD_STATES
143 @param loc Source location information.
144 @return The number of threads in the innermost active parallel construct.
145 */
146 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
147   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
148 
149   return __kmp_entry_thread()->th.th_team->t.t_nproc;
150 }
151 
152 /*!
153  * @ingroup DEPRECATED
154  * @param loc location description
155  *
156  * This function need not be called. It always returns TRUE.
157  */
158 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
159 #ifndef KMP_DEBUG
160 
161   return TRUE;
162 
163 #else
164 
165   const char *semi2;
166   const char *semi3;
167   int line_no;
168 
169   if (__kmp_par_range == 0) {
170     return TRUE;
171   }
172   semi2 = loc->psource;
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   semi2 = strchr(semi2, ';');
177   if (semi2 == NULL) {
178     return TRUE;
179   }
180   semi2 = strchr(semi2 + 1, ';');
181   if (semi2 == NULL) {
182     return TRUE;
183   }
184   if (__kmp_par_range_filename[0]) {
185     const char *name = semi2 - 1;
186     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
187       name--;
188     }
189     if ((*name == '/') || (*name == ';')) {
190       name++;
191     }
192     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
193       return __kmp_par_range < 0;
194     }
195   }
196   semi3 = strchr(semi2 + 1, ';');
197   if (__kmp_par_range_routine[0]) {
198     if ((semi3 != NULL) && (semi3 > semi2) &&
199         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
200       return __kmp_par_range < 0;
201     }
202   }
203   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
204     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
205       return __kmp_par_range > 0;
206     }
207     return __kmp_par_range < 0;
208   }
209   return TRUE;
210 
211 #endif /* KMP_DEBUG */
212 }
213 
214 /*!
215 @ingroup THREAD_STATES
216 @param loc Source location information.
217 @return 1 if this thread is executing inside an active parallel region, zero if
218 not.
219 */
220 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
221   return __kmp_entry_thread()->th.th_root->r.r_active;
222 }
223 
224 /*!
225 @ingroup PARALLEL
226 @param loc source location information
227 @param global_tid global thread number
228 @param num_threads number of threads requested for this parallel construct
229 
230 Set the number of threads to be used by the next fork spawned by this thread.
231 This call is only required if the parallel construct has a `num_threads` clause.
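
For illustration only (a simplified sketch of one possible lowering, not a
specification of any compiler's output; `outlined` and `gtid` are hypothetical
names), a construct such as
@code
#pragma omp parallel num_threads(4)
@endcode
would typically be compiled into a call of this function immediately before
the fork:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined);
@endcode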
232 */
233 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
234                              kmp_int32 num_threads) {
235   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
236                 global_tid, num_threads));
237 
238   __kmp_push_num_threads(loc, global_tid, num_threads);
239 }
240 
241 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
242   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
243 
244   /* the num_threads are automatically popped */
245 }
246 
247 #if OMP_40_ENABLED
248 
249 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
250                            kmp_int32 proc_bind) {
251   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
252                 proc_bind));
253 
254   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
255 }
256 
257 #endif /* OMP_40_ENABLED */
258 
259 /*!
260 @ingroup PARALLEL
261 @param loc  source location information
262 @param argc  total number of arguments in the ellipsis
263 @param microtask  pointer to callback routine consisting of outlined parallel
264 construct
265 @param ...  pointers to shared variables that aren't global
266 
267 Do the actual fork and call the microtask in the relevant number of threads.
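
For illustration only (a hypothetical, simplified lowering; the names
`outlined` and `x` are invented here), a region such as
@code
int x = 0;
#pragma omp parallel shared(x)
  x = 1;
@endcode
could be compiled into an outlined microtask plus a call of this entry point,
with one pointer passed per shared variable:
@code
void outlined(kmp_int32 *gtid, kmp_int32 *bound_tid, int *x) { *x = 1; }
// ...
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);
@endcode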
268 */
269 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
270   int gtid = __kmp_entry_gtid();
271 
272 #if (KMP_STATS_ENABLED)
273   // If we were in a serial region, then stop the serial timer, record
274   // the event, and start parallel region timer
275   stats_state_e previous_state = KMP_GET_THREAD_STATE();
276   if (previous_state == stats_state_e::SERIAL_REGION) {
277     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
278   } else {
279     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
280   }
281   int inParallel = __kmpc_in_parallel(loc);
282   if (inParallel) {
283     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
284   } else {
285     KMP_COUNT_BLOCK(OMP_PARALLEL);
286   }
287 #endif
288 
  // maybe saving thr_state would be enough here
290   {
291     va_list ap;
292     va_start(ap, microtask);
293 
294 #if OMPT_SUPPORT
295     ompt_frame_t *ompt_frame;
296     if (ompt_enabled.enabled) {
297       kmp_info_t *master_th = __kmp_threads[gtid];
298       kmp_team_t *parent_team = master_th->th.th_team;
299       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
300       if (lwt)
301         ompt_frame = &(lwt->ompt_task_info.frame);
302       else {
303         int tid = __kmp_tid_from_gtid(gtid);
304         ompt_frame = &(
305             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
306       }
307       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
308       OMPT_STORE_RETURN_ADDRESS(gtid);
309     }
310 #endif
311 
312 #if INCLUDE_SSC_MARKS
313     SSC_MARK_FORKING();
314 #endif
315     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
316                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
317                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
318 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
319 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
320                     &ap
321 #else
322                     ap
323 #endif
324                     );
325 #if INCLUDE_SSC_MARKS
326     SSC_MARK_JOINING();
327 #endif
328     __kmp_join_call(loc, gtid
329 #if OMPT_SUPPORT
330                     ,
331                     fork_context_intel
332 #endif
333                     );
334 
335     va_end(ap);
336   }
337 
338 #if KMP_STATS_ENABLED
339   if (previous_state == stats_state_e::SERIAL_REGION) {
340     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
341   } else {
342     KMP_POP_PARTITIONED_TIMER();
343   }
344 #endif // KMP_STATS_ENABLED
345 }
346 
347 #if OMP_40_ENABLED
348 /*!
349 @ingroup PARALLEL
350 @param loc source location information
351 @param global_tid global thread number
352 @param num_teams number of teams requested for the teams construct
353 @param num_threads number of threads per team requested for the teams construct
354 
355 Set the number of teams to be used by the teams construct.
356 This call is only required if the teams construct has a `num_teams` clause
357 or a `thread_limit` clause (or both).
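
For illustration only (an assumed, simplified lowering; `outlined_teams` is a
hypothetical name), a construct such as
@code
#pragma omp teams num_teams(2) thread_limit(8)
@endcode
would typically push the requested sizes before the teams fork:
@code
__kmpc_push_num_teams(&loc, gtid, 2, 8);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams);
@endcode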
358 */
359 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
360                            kmp_int32 num_teams, kmp_int32 num_threads) {
361   KA_TRACE(20,
362            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
363             global_tid, num_teams, num_threads));
364 
365   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
366 }
367 
368 /*!
369 @ingroup PARALLEL
370 @param loc  source location information
371 @param argc  total number of arguments in the ellipsis
372 @param microtask  pointer to callback routine consisting of outlined teams
373 construct
374 @param ...  pointers to shared variables that aren't global
375 
376 Do the actual fork and call the microtask in the relevant number of threads.
377 */
378 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
379                        ...) {
380   int gtid = __kmp_entry_gtid();
381   kmp_info_t *this_thr = __kmp_threads[gtid];
382   va_list ap;
383   va_start(ap, microtask);
384 
385 #if KMP_STATS_ENABLED
386   KMP_COUNT_BLOCK(OMP_TEAMS);
387   stats_state_e previous_state = KMP_GET_THREAD_STATE();
388   if (previous_state == stats_state_e::SERIAL_REGION) {
389     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
390   } else {
391     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
392   }
393 #endif
394 
395   // remember teams entry point and nesting level
396   this_thr->th.th_teams_microtask = microtask;
397   this_thr->th.th_teams_level =
398       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
399 
400 #if OMPT_SUPPORT
401   kmp_team_t *parent_team = this_thr->th.th_team;
402   int tid = __kmp_tid_from_gtid(gtid);
403   if (ompt_enabled.enabled) {
404     parent_team->t.t_implicit_task_taskdata[tid]
405         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
406   }
407   OMPT_STORE_RETURN_ADDRESS(gtid);
408 #endif
409 
  // Check whether __kmpc_push_num_teams was called; if not, set the default
  // number of teams.
412   if (this_thr->th.th_teams_size.nteams == 0) {
413     __kmp_push_num_teams(loc, gtid, 0, 0);
414   }
415   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
416   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
417   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
418 
419   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
420                   VOLATILE_CAST(microtask_t)
421                       __kmp_teams_master, // "wrapped" task
422                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
423 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
424                   &ap
425 #else
426                   ap
427 #endif
428                   );
429   __kmp_join_call(loc, gtid
430 #if OMPT_SUPPORT
431                   ,
432                   fork_context_intel
433 #endif
434                   );
435 
436   // Pop current CG root off list
437   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
438   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
439   this_thr->th.th_cg_roots = tmp->up;
440   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
441                  " to node %p. cg_nthreads was %d\n",
442                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
443   __kmp_free(tmp);
444   // Restore current task's thread_limit from CG root
445   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
446   this_thr->th.th_current_task->td_icvs.thread_limit =
447       this_thr->th.th_cg_roots->cg_thread_limit;
448 
449   this_thr->th.th_teams_microtask = NULL;
450   this_thr->th.th_teams_level = 0;
451   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
452   va_end(ap);
453 #if KMP_STATS_ENABLED
454   if (previous_state == stats_state_e::SERIAL_REGION) {
455     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
456   } else {
457     KMP_POP_PARTITIONED_TIMER();
458   }
459 #endif // KMP_STATS_ENABLED
460 }
461 #endif /* OMP_40_ENABLED */
462 
463 // I don't think this function should ever have been exported.
464 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
465 // openmp code ever called it, but it's been exported from the RTL for so
466 // long that I'm afraid to remove the definition.
467 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
468 
469 /*!
470 @ingroup PARALLEL
471 @param loc  source location information
472 @param global_tid  global thread number
473 
474 Enter a serialized parallel construct. This interface is used to handle a
475 conditional parallel region, like this,
476 @code
477 #pragma omp parallel if (condition)
478 @endcode
479 when the condition is false.
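
For illustration only (a sketch of one possible lowering, with hypothetical
names), the conditional region above might be compiled as:
@code
if (condition) {
  __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined);
} else {
  __kmpc_serialized_parallel(&loc, gtid);
  kmp_int32 bound_tid = 0;
  outlined(&gtid, &bound_tid);
  __kmpc_end_serialized_parallel(&loc, gtid);
}
@endcode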
480 */
481 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
482 // The implementation is now in kmp_runtime.cpp so that it can share static
483 // functions with kmp_fork_call since the tasks to be done are similar in
484 // each case.
485 #if OMPT_SUPPORT
486   OMPT_STORE_RETURN_ADDRESS(global_tid);
487 #endif
488   __kmp_serialized_parallel(loc, global_tid);
489 }
490 
491 /*!
492 @ingroup PARALLEL
493 @param loc  source location information
494 @param global_tid  global thread number
495 
496 Leave a serialized parallel construct.
497 */
498 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
499   kmp_internal_control_t *top;
500   kmp_info_t *this_thr;
501   kmp_team_t *serial_team;
502 
503   KC_TRACE(10,
504            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
505 
506   /* skip all this code for autopar serialized loops since it results in
507      unacceptable overhead */
508   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
509     return;
510 
511   // Not autopar code
512   if (!TCR_4(__kmp_init_parallel))
513     __kmp_parallel_initialize();
514 
515 #if OMP_50_ENABLED
516   __kmp_resume_if_soft_paused();
517 #endif
518 
519   this_thr = __kmp_threads[global_tid];
520   serial_team = this_thr->th.th_serial_team;
521 
522 #if OMP_45_ENABLED
523   kmp_task_team_t *task_team = this_thr->th.th_task_team;
524 
525   // we need to wait for the proxy tasks before finishing the thread
526   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
527     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
528 #endif
529 
530   KMP_MB();
531   KMP_DEBUG_ASSERT(serial_team);
532   KMP_ASSERT(serial_team->t.t_serialized);
533   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
534   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
535   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
536   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
537 
538 #if OMPT_SUPPORT
539   if (ompt_enabled.enabled &&
540       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
541     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
542     if (ompt_enabled.ompt_callback_implicit_task) {
543       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
544           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
545           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
546     }
547 
    // Reset/clear the task id only after unlinking the task
549     ompt_data_t *parent_task_data;
550     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
551 
552     if (ompt_enabled.ompt_callback_parallel_end) {
553       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
554           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
555           ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
556     }
557     __ompt_lw_taskteam_unlink(this_thr);
558     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
559   }
560 #endif
561 
562   /* If necessary, pop the internal control stack values and replace the team
563    * values */
564   top = serial_team->t.t_control_stack_top;
565   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
566     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
567     serial_team->t.t_control_stack_top = top->next;
568     __kmp_free(top);
569   }
570 
571   // if( serial_team -> t.t_serialized > 1 )
572   serial_team->t.t_level--;
573 
574   /* pop dispatch buffers stack */
575   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
576   {
577     dispatch_private_info_t *disp_buffer =
578         serial_team->t.t_dispatch->th_disp_buffer;
579     serial_team->t.t_dispatch->th_disp_buffer =
580         serial_team->t.t_dispatch->th_disp_buffer->next;
581     __kmp_free(disp_buffer);
582   }
583 #if OMP_50_ENABLED
584   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
585 #endif
586 
587   --serial_team->t.t_serialized;
588   if (serial_team->t.t_serialized == 0) {
589 
590 /* return to the parallel section */
591 
592 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
593     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
594       __kmp_clear_x87_fpu_status_word();
595       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
596       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
597     }
598 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
599 
600     this_thr->th.th_team = serial_team->t.t_parent;
601     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
602 
603     /* restore values cached in the thread */
604     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
605     this_thr->th.th_team_master =
606         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
607     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
608 
609     /* TODO the below shouldn't need to be adjusted for serialized teams */
610     this_thr->th.th_dispatch =
611         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
612 
613     __kmp_pop_current_task_from_thread(this_thr);
614 
615     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
616     this_thr->th.th_current_task->td_flags.executing = 1;
617 
618     if (__kmp_tasking_mode != tskm_immediate_exec) {
619       // Copy the task team from the new child / old parent team to the thread.
620       this_thr->th.th_task_team =
621           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
622       KA_TRACE(20,
623                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
624                 "team %p\n",
625                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
626     }
627   } else {
628     if (__kmp_tasking_mode != tskm_immediate_exec) {
629       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
630                     "depth of serial team %p to %d\n",
631                     global_tid, serial_team, serial_team->t.t_serialized));
632     }
633   }
634 
635   if (__kmp_env_consistency_check)
636     __kmp_pop_parallel(global_tid, NULL);
637 #if OMPT_SUPPORT
638   if (ompt_enabled.enabled)
639     this_thr->th.ompt_thread_info.state =
640         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
641                                            : ompt_state_work_parallel);
642 #endif
643 }
644 
645 /*!
646 @ingroup SYNCHRONIZATION
647 @param loc  source location information.
648 
Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though,
depending on the memory ordering convention obeyed by the compiler, even that
may not be necessary.)
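
For example, a spin-wait loop of the form
@code
while (!flag) {
#pragma omp flush(flag)
}
@endcode
is typically compiled so that __kmpc_flush() is called on every iteration
(a sketch of the usual lowering, not a guarantee).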
652 */
653 void __kmpc_flush(ident_t *loc) {
654   KC_TRACE(10, ("__kmpc_flush: called\n"));
655 
  /* need an explicit __mf() here since the library uses volatiles instead */
657   KMP_MB(); /* Flush all pending memory write invalidates.  */
658 
659 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
660 #if KMP_MIC
661 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
662 // We shouldn't need it, though, since the ABI rules require that
663 // * If the compiler generates NGO stores it also generates the fence
664 // * If users hand-code NGO stores they should insert the fence
665 // therefore no incomplete unordered stores should be visible.
666 #else
667   // C74404
668   // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
672   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
673   if (!__kmp_cpuinfo.initialized) {
674     __kmp_query_cpuid(&__kmp_cpuinfo);
675   }
676   if (!__kmp_cpuinfo.sse2) {
677     // CPU cannot execute SSE2 instructions.
678   } else {
679 #if KMP_COMPILER_ICC
680     _mm_mfence();
681 #elif KMP_COMPILER_MSVC
682     MemoryBarrier();
683 #else
684     __sync_synchronize();
685 #endif // KMP_COMPILER_ICC
686   }
687 #endif // KMP_MIC
688 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here; move along.
690 #elif KMP_ARCH_PPC64
691 // Nothing needed here (we have a real MB above).
692 #if KMP_OS_CNK
693   // The flushing thread needs to yield here; this prevents a
694   // busy-waiting thread from saturating the pipeline. flush is
695   // often used in loops like this:
696   // while (!flag) {
697   //   #pragma omp flush(flag)
698   // }
699   // and adding the yield here is good for at least a 10x speedup
700   // when running >2 threads per core (on the NAS LU benchmark).
701   __kmp_yield();
702 #endif
703 #else
704 #error Unknown or unsupported architecture
705 #endif
706 
707 #if OMPT_SUPPORT && OMPT_OPTIONAL
708   if (ompt_enabled.ompt_callback_flush) {
709     ompt_callbacks.ompt_callback(ompt_callback_flush)(
710         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
711   }
712 #endif
713 }
714 
715 /* -------------------------------------------------------------------------- */
716 /*!
717 @ingroup SYNCHRONIZATION
718 @param loc source location information
719 @param global_tid thread id.
720 
721 Execute a barrier.
722 */
723 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
724   KMP_COUNT_BLOCK(OMP_BARRIER);
725   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
726 
727   if (!TCR_4(__kmp_init_parallel))
728     __kmp_parallel_initialize();
729 
730 #if OMP_50_ENABLED
731   __kmp_resume_if_soft_paused();
732 #endif
733 
734   if (__kmp_env_consistency_check) {
735     if (loc == 0) {
736       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
737     }
738 
739     __kmp_check_barrier(global_tid, ct_barrier, loc);
740   }
741 
742 #if OMPT_SUPPORT
743   ompt_frame_t *ompt_frame;
744   if (ompt_enabled.enabled) {
745     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
746     if (ompt_frame->enter_frame.ptr == NULL)
747       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
748     OMPT_STORE_RETURN_ADDRESS(global_tid);
749   }
750 #endif
751   __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  //   this function is called when the 'barrier' directive is present or at
  //   the implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required
758 
759   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
760 #if OMPT_SUPPORT && OMPT_OPTIONAL
761   if (ompt_enabled.enabled) {
762     ompt_frame->enter_frame = ompt_data_none;
763   }
764 #endif
765 }
766 
767 /* The BARRIER for a MASTER section is always explicit   */
768 /*!
769 @ingroup WORK_SHARING
770 @param loc  source location information.
@param global_tid  global thread number.
772 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
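
For illustration only (an assumed, simplified lowering), a <tt>master</tt>
construct is typically compiled as a guarded call pair:
@code
if (__kmpc_master(&loc, gtid)) {
  // ... body of the master region ...
  __kmpc_end_master(&loc, gtid);
}
@endcode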
773 */
774 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
775   int status = 0;
776 
777   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
778 
779   if (!TCR_4(__kmp_init_parallel))
780     __kmp_parallel_initialize();
781 
782 #if OMP_50_ENABLED
783   __kmp_resume_if_soft_paused();
784 #endif
785 
786   if (KMP_MASTER_GTID(global_tid)) {
787     KMP_COUNT_BLOCK(OMP_MASTER);
788     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
789     status = 1;
790   }
791 
792 #if OMPT_SUPPORT && OMPT_OPTIONAL
793   if (status) {
794     if (ompt_enabled.ompt_callback_master) {
795       kmp_info_t *this_thr = __kmp_threads[global_tid];
796       kmp_team_t *team = this_thr->th.th_team;
797 
798       int tid = __kmp_tid_from_gtid(global_tid);
799       ompt_callbacks.ompt_callback(ompt_callback_master)(
800           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
801           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
802           OMPT_GET_RETURN_ADDRESS(0));
803     }
804   }
805 #endif
806 
807   if (__kmp_env_consistency_check) {
808 #if KMP_USE_DYNAMIC_LOCK
809     if (status)
810       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
811     else
812       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
813 #else
814     if (status)
815       __kmp_push_sync(global_tid, ct_master, loc, NULL);
816     else
817       __kmp_check_sync(global_tid, ct_master, loc, NULL);
818 #endif
819   }
820 
821   return status;
822 }
823 
824 /*!
825 @ingroup WORK_SHARING
826 @param loc  source location information.
@param global_tid  global thread number.
828 
829 Mark the end of a <tt>master</tt> region. This should only be called by the
830 thread that executes the <tt>master</tt> region.
831 */
832 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
833   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
834 
835   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
836   KMP_POP_PARTITIONED_TIMER();
837 
838 #if OMPT_SUPPORT && OMPT_OPTIONAL
839   kmp_info_t *this_thr = __kmp_threads[global_tid];
840   kmp_team_t *team = this_thr->th.th_team;
841   if (ompt_enabled.ompt_callback_master) {
842     int tid = __kmp_tid_from_gtid(global_tid);
843     ompt_callbacks.ompt_callback(ompt_callback_master)(
844         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
845         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
846         OMPT_GET_RETURN_ADDRESS(0));
847   }
848 #endif
849 
850   if (__kmp_env_consistency_check) {
851     if (global_tid < 0)
852       KMP_WARNING(ThreadIdentInvalid);
853 
854     if (KMP_MASTER_GTID(global_tid))
855       __kmp_pop_sync(global_tid, ct_master, loc);
856   }
857 }
858 
859 /*!
860 @ingroup WORK_SHARING
861 @param loc  source location information.
862 @param gtid  global thread number.
863 
864 Start execution of an <tt>ordered</tt> construct.
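
For illustration only (a sketch of the typical lowering), the body of an
<tt>ordered</tt> region inside an ordered loop is bracketed as:
@code
__kmpc_ordered(&loc, gtid);
// ... body of the ordered region ...
__kmpc_end_ordered(&loc, gtid);
@endcode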
865 */
866 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
867   int cid = 0;
868   kmp_info_t *th;
869   KMP_DEBUG_ASSERT(__kmp_init_serial);
870 
871   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
872 
873   if (!TCR_4(__kmp_init_parallel))
874     __kmp_parallel_initialize();
875 
876 #if OMP_50_ENABLED
877   __kmp_resume_if_soft_paused();
878 #endif
879 
880 #if USE_ITT_BUILD
881   __kmp_itt_ordered_prep(gtid);
882 // TODO: ordered_wait_id
883 #endif /* USE_ITT_BUILD */
884 
885   th = __kmp_threads[gtid];
886 
887 #if OMPT_SUPPORT && OMPT_OPTIONAL
888   kmp_team_t *team;
889   ompt_wait_id_t lck;
890   void *codeptr_ra;
891   if (ompt_enabled.enabled) {
892     OMPT_STORE_RETURN_ADDRESS(gtid);
893     team = __kmp_team_from_gtid(gtid);
894     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
895     /* OMPT state update */
896     th->th.ompt_thread_info.wait_id = lck;
897     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
898 
899     /* OMPT event callback */
900     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
901     if (ompt_enabled.ompt_callback_mutex_acquire) {
902       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
903           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
904           codeptr_ra);
905     }
906   }
907 #endif
908 
909   if (th->th.th_dispatch->th_deo_fcn != 0)
910     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
911   else
912     __kmp_parallel_deo(&gtid, &cid, loc);
913 
914 #if OMPT_SUPPORT && OMPT_OPTIONAL
915   if (ompt_enabled.enabled) {
916     /* OMPT state update */
917     th->th.ompt_thread_info.state = ompt_state_work_parallel;
918     th->th.ompt_thread_info.wait_id = 0;
919 
920     /* OMPT event callback */
921     if (ompt_enabled.ompt_callback_mutex_acquired) {
922       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
923           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
924     }
925   }
926 #endif
927 
928 #if USE_ITT_BUILD
929   __kmp_itt_ordered_start(gtid);
930 #endif /* USE_ITT_BUILD */
931 }
932 
933 /*!
934 @ingroup WORK_SHARING
935 @param loc  source location information.
936 @param gtid  global thread number.
937 
938 End execution of an <tt>ordered</tt> construct.
939 */
940 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
941   int cid = 0;
942   kmp_info_t *th;
943 
944   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
945 
946 #if USE_ITT_BUILD
947   __kmp_itt_ordered_end(gtid);
948 // TODO: ordered_wait_id
949 #endif /* USE_ITT_BUILD */
950 
951   th = __kmp_threads[gtid];
952 
953   if (th->th.th_dispatch->th_dxo_fcn != 0)
954     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
955   else
956     __kmp_parallel_dxo(&gtid, &cid, loc);
957 
958 #if OMPT_SUPPORT && OMPT_OPTIONAL
959   OMPT_STORE_RETURN_ADDRESS(gtid);
960   if (ompt_enabled.ompt_callback_mutex_released) {
961     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
962         ompt_mutex_ordered,
963         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
964             ->t.t_ordered.dt.t_value,
965         OMPT_LOAD_RETURN_ADDRESS(gtid));
966   }
967 #endif
968 }
969 
970 #if KMP_USE_DYNAMIC_LOCK
971 
972 static __forceinline void
973 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
974                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
975   // Pointer to the allocated indirect lock is written to crit, while indexing
976   // is ignored.
977   void *idx;
978   kmp_indirect_lock_t **lck;
979   lck = (kmp_indirect_lock_t **)crit;
980   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
981   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
982   KMP_SET_I_LOCK_LOCATION(ilk, loc);
983   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
984   KA_TRACE(20,
985            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
986 #if USE_ITT_BUILD
987   __kmp_itt_critical_creating(ilk->lock, loc);
988 #endif
989   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
990   if (status == 0) {
991 #if USE_ITT_BUILD
992     __kmp_itt_critical_destroyed(ilk->lock);
993 #endif
994     // We don't really need to destroy the unclaimed lock here since it will be
995     // cleaned up at program exit.
996     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
997   }
998   KMP_DEBUG_ASSERT(*lck != NULL);
999 }
1000 
1001 // Fast-path acquire tas lock
1002 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1003   {                                                                            \
1004     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1005     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1006     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1007     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1008         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1009       kmp_uint32 spins;                                                        \
1010       KMP_FSYNC_PREPARE(l);                                                    \
1011       KMP_INIT_YIELD(spins);                                                   \
1012       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1013       do {                                                                     \
1014         if (TCR_4(__kmp_nth) >                                                 \
1015             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1016           KMP_YIELD(TRUE);                                                     \
1017         } else {                                                               \
1018           KMP_YIELD_SPIN(spins);                                               \
1019         }                                                                      \
1020         __kmp_spin_backoff(&backoff);                                          \
1021       } while (                                                                \
1022           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1023           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1024     }                                                                          \
1025     KMP_FSYNC_ACQUIRED(l);                                                     \
1026   }
1027 
1028 // Fast-path test tas lock
1029 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1030   {                                                                            \
1031     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1032     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1033     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1034     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1035          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1036   }
1037 
1038 // Fast-path release tas lock
1039 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1040   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1041 
1042 #if KMP_USE_FUTEX
1043 
1044 #include <sys/syscall.h>
1045 #include <unistd.h>
1046 #ifndef FUTEX_WAIT
1047 #define FUTEX_WAIT 0
1048 #endif
1049 #ifndef FUTEX_WAKE
1050 #define FUTEX_WAKE 1
1051 #endif
1052 
1053 // Fast-path acquire futex lock
1054 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1055   {                                                                            \
1056     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1057     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1058     KMP_MB();                                                                  \
1059     KMP_FSYNC_PREPARE(ftx);                                                    \
1060     kmp_int32 poll_val;                                                        \
1061     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1062                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1063                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1064       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1065       if (!cond) {                                                             \
1066         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1067                                          poll_val |                            \
1068                                              KMP_LOCK_BUSY(1, futex))) {       \
1069           continue;                                                            \
1070         }                                                                      \
1071         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1072       }                                                                        \
1073       kmp_int32 rc;                                                            \
1074       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1075                         NULL, NULL, 0)) != 0) {                                \
1076         continue;                                                              \
1077       }                                                                        \
1078       gtid_code |= 1;                                                          \
1079     }                                                                          \
1080     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1081   }
1082 
1083 // Fast-path test futex lock
1084 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1085   {                                                                            \
1086     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1087     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
1089       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1090       rc = TRUE;                                                               \
1091     } else {                                                                   \
1092       rc = FALSE;                                                              \
1093     }                                                                          \
1094   }
1095 
1096 // Fast-path release futex lock
1097 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1098   {                                                                            \
1099     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1100     KMP_MB();                                                                  \
1101     KMP_FSYNC_RELEASING(ftx);                                                  \
1102     kmp_int32 poll_val =                                                       \
1103         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1104     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1105       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1106               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1107     }                                                                          \
1108     KMP_MB();                                                                  \
1109     KMP_YIELD_OVERSUB();                                                       \
1110   }
1111 
1112 #endif // KMP_USE_FUTEX
1113 
1114 #else // KMP_USE_DYNAMIC_LOCK
1115 
1116 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1117                                                       ident_t const *loc,
1118                                                       kmp_int32 gtid) {
1119   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1120 
1121   // Because of the double-check, the following load doesn't need to be volatile
1122   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1123 
1124   if (lck == NULL) {
1125     void *idx;
1126 
1127     // Allocate & initialize the lock.
1128     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1129     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1130     __kmp_init_user_lock_with_checks(lck);
1131     __kmp_set_user_lock_location(lck, loc);
1132 #if USE_ITT_BUILD
1133     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed without ever being used, but
// that is not a problem, because this is not a real event seen by the user
// but rather sets the name for the object (lock). See kmp_itt.h for details.
1139 #endif /* USE_ITT_BUILD */
1140 
1141     // Use a cmpxchg instruction to slam the start of the critical section with
1142     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1143     // and use the lock that the other thread allocated.
1144     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1145 
1146     if (status == 0) {
1147 // Deallocate the lock and reload the value.
1148 #if USE_ITT_BUILD
1149       __kmp_itt_critical_destroyed(lck);
1150 // Let ITT know the lock is destroyed and the same memory location may be reused
1151 // for another purpose.
1152 #endif /* USE_ITT_BUILD */
1153       __kmp_destroy_user_lock_with_checks(lck);
1154       __kmp_user_lock_free(&idx, gtid, lck);
1155       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1156       KMP_DEBUG_ASSERT(lck != NULL);
1157     }
1158   }
1159   return lck;
1160 }
1161 
1162 #endif // KMP_USE_DYNAMIC_LOCK
1163 
1164 /*!
1165 @ingroup WORK_SHARING
1166 @param loc  source location information.
@param global_tid  global thread number.
1168 @param crit identity of the critical section. This could be a pointer to a lock
1169 associated with the critical section, or some other suitably unique value.
1170 
1171 Enter code protected by a `critical` construct.
1172 This function blocks until the executing thread can enter the critical section.
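
For illustration only (a simplified sketch; `crit_name` is a hypothetical,
compiler-emitted lock storage), a named critical section such as
@code
#pragma omp critical(name)
  counter++;
@endcode
would typically be compiled into a bracketing call pair:
@code
static kmp_critical_name crit_name; // zero-initialized storage for the lock
__kmpc_critical(&loc, gtid, &crit_name);
counter++;
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode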
1173 */
1174 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1175                      kmp_critical_name *crit) {
1176 #if KMP_USE_DYNAMIC_LOCK
1177 #if OMPT_SUPPORT && OMPT_OPTIONAL
1178   OMPT_STORE_RETURN_ADDRESS(global_tid);
1179 #endif // OMPT_SUPPORT
1180   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1181 #else
1182   KMP_COUNT_BLOCK(OMP_CRITICAL);
1183 #if OMPT_SUPPORT && OMPT_OPTIONAL
1184   ompt_state_t prev_state = ompt_state_undefined;
1185   ompt_thread_info_t ti;
1186 #endif
1187   kmp_user_lock_p lck;
1188 
1189   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1190 
1191   // TODO: add THR_OVHD_STATE
1192 
1193   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1194   KMP_CHECK_USER_LOCK_INIT();
1195 
1196   if ((__kmp_user_lock_kind == lk_tas) &&
1197       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1198     lck = (kmp_user_lock_p)crit;
1199   }
1200 #if KMP_USE_FUTEX
1201   else if ((__kmp_user_lock_kind == lk_futex) &&
1202            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1203     lck = (kmp_user_lock_p)crit;
1204   }
1205 #endif
1206   else { // ticket, queuing or drdpa
1207     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1208   }
1209 
1210   if (__kmp_env_consistency_check)
1211     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1212 
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team.
// Also, even if we are the uber thread, we still have to acquire the lock,
// as we have to contend with sibling threads.
1217 
1218 #if USE_ITT_BUILD
1219   __kmp_itt_critical_acquiring(lck);
1220 #endif /* USE_ITT_BUILD */
1221 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1223   void *codeptr_ra = NULL;
1224   if (ompt_enabled.enabled) {
1225     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1226     /* OMPT state update */
1227     prev_state = ti.state;
1228     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1229     ti.state = ompt_state_wait_critical;
1230 
1231     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1233     if (ompt_enabled.ompt_callback_mutex_acquire) {
1234       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1235           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1236           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1237     }
1238   }
1239 #endif
  // The value of 'crit' should be usable as a critical_id for the critical
  // section directive.
1242   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1243 
1244 #if USE_ITT_BUILD
1245   __kmp_itt_critical_acquired(lck);
1246 #endif /* USE_ITT_BUILD */
1247 #if OMPT_SUPPORT && OMPT_OPTIONAL
1248   if (ompt_enabled.enabled) {
1249     /* OMPT state update */
1250     ti.state = prev_state;
1251     ti.wait_id = 0;
1252 
1253     /* OMPT event callback */
1254     if (ompt_enabled.ompt_callback_mutex_acquired) {
1255       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1256           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1257     }
1258   }
1259 #endif
1260   KMP_POP_PARTITIONED_TIMER();
1261 
1262   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1263   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1264 #endif // KMP_USE_DYNAMIC_LOCK
1265 }
1266 
1267 #if KMP_USE_DYNAMIC_LOCK
1268 
1269 // Converts the given hint to an internal lock implementation
1270 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1271 #if KMP_USE_TSX
1272 #define KMP_TSX_LOCK(seq) lockseq_##seq
1273 #else
1274 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1275 #endif
1276 
1277 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1278 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1279 #else
1280 #define KMP_CPUINFO_RTM 0
1281 #endif
1282 
1283   // Hints that do not require further logic
1284   if (hint & kmp_lock_hint_hle)
1285     return KMP_TSX_LOCK(hle);
1286   if (hint & kmp_lock_hint_rtm)
1287     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1288   if (hint & kmp_lock_hint_adaptive)
1289     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1290 
1291   // Rule out conflicting hints first by returning the default lock
1292   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1293     return __kmp_user_lock_seq;
1294   if ((hint & omp_lock_hint_speculative) &&
1295       (hint & omp_lock_hint_nonspeculative))
1296     return __kmp_user_lock_seq;
1297 
1298   // Do not even consider speculation when it appears to be contended
1299   if (hint & omp_lock_hint_contended)
1300     return lockseq_queuing;
1301 
1302   // Uncontended lock without speculation
1303   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1304     return lockseq_tas;
1305 
1306   // HLE lock for speculation
1307   if (hint & omp_lock_hint_speculative)
1308     return KMP_TSX_LOCK(hle);
1309 
1310   return __kmp_user_lock_seq;
1311 }
1312 
1313 #if OMPT_SUPPORT && OMPT_OPTIONAL
1314 #if KMP_USE_DYNAMIC_LOCK
1315 static kmp_mutex_impl_t
1316 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1317   if (user_lock) {
1318     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1319     case 0:
1320       break;
1321 #if KMP_USE_FUTEX
1322     case locktag_futex:
1323       return kmp_mutex_impl_queuing;
1324 #endif
1325     case locktag_tas:
1326       return kmp_mutex_impl_spin;
1327 #if KMP_USE_TSX
1328     case locktag_hle:
1329       return kmp_mutex_impl_speculative;
1330 #endif
1331     default:
1332       return kmp_mutex_impl_none;
1333     }
1334     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1335   }
1336   KMP_ASSERT(ilock);
1337   switch (ilock->type) {
1338 #if KMP_USE_TSX
1339   case locktag_adaptive:
1340   case locktag_rtm:
1341     return kmp_mutex_impl_speculative;
1342 #endif
1343   case locktag_nested_tas:
1344     return kmp_mutex_impl_spin;
1345 #if KMP_USE_FUTEX
1346   case locktag_nested_futex:
1347 #endif
1348   case locktag_ticket:
1349   case locktag_queuing:
1350   case locktag_drdpa:
1351   case locktag_nested_ticket:
1352   case locktag_nested_queuing:
1353   case locktag_nested_drdpa:
1354     return kmp_mutex_impl_queuing;
1355   default:
1356     return kmp_mutex_impl_none;
1357   }
1358 }
1359 #else
1360 // For locks without dynamic binding
1361 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1362   switch (__kmp_user_lock_kind) {
1363   case lk_tas:
1364     return kmp_mutex_impl_spin;
1365 #if KMP_USE_FUTEX
1366   case lk_futex:
1367 #endif
1368   case lk_ticket:
1369   case lk_queuing:
1370   case lk_drdpa:
1371     return kmp_mutex_impl_queuing;
1372 #if KMP_USE_TSX
1373   case lk_hle:
1374   case lk_rtm:
1375   case lk_adaptive:
1376     return kmp_mutex_impl_speculative;
1377 #endif
1378   default:
1379     return kmp_mutex_impl_none;
1380   }
1381 }
1382 #endif // KMP_USE_DYNAMIC_LOCK
1383 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1384 
1385 /*!
1386 @ingroup WORK_SHARING
1387 @param loc  source location information.
1388 @param global_tid  global thread number.
1389 @param crit identity of the critical section. This could be a pointer to a lock
1390 associated with the critical section, or some other suitably unique value.
1391 @param hint the lock hint.
1392 
1393 Enter code protected by a `critical` construct with a hint. The hint value is
1394 used to suggest a lock implementation. This function blocks until the executing
1395 thread can enter the critical section unless the hint suggests use of
1396 speculative execution and the hardware supports it.
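
For illustration only (an assumed lowering; `crit_name` is hypothetical), a
hinted critical section
@code
#pragma omp critical(name) hint(omp_lock_hint_speculative)
@endcode
would pass the hint value through this entry point:
@code
__kmpc_critical_with_hint(&loc, gtid, &crit_name, omp_lock_hint_speculative);
// ... body of the critical section ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode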
1397 */
1398 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1399                                kmp_critical_name *crit, uint32_t hint) {
1400   KMP_COUNT_BLOCK(OMP_CRITICAL);
1401   kmp_user_lock_p lck;
1402 #if OMPT_SUPPORT && OMPT_OPTIONAL
1403   ompt_state_t prev_state = ompt_state_undefined;
1404   ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
1406   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1407   if (!codeptr)
1408     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1409 #endif
1410 
1411   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1412 
1413   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1414   // Check if it is initialized.
1415   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1416   if (*lk == 0) {
1417     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1418     if (KMP_IS_D_LOCK(lckseq)) {
1419       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1420                                   KMP_GET_D_TAG(lckseq));
1421     } else {
1422       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1423     }
1424   }
1425   // Branch for accessing the actual lock object and set operation. This
1426   // branching is inevitable since this lock initialization does not follow the
1427   // normal dispatch path (lock table is not used).
1428   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1429     lck = (kmp_user_lock_p)lk;
1430     if (__kmp_env_consistency_check) {
1431       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1432                       __kmp_map_hint_to_lock(hint));
1433     }
1434 #if USE_ITT_BUILD
1435     __kmp_itt_critical_acquiring(lck);
1436 #endif
1437 #if OMPT_SUPPORT && OMPT_OPTIONAL
1438     if (ompt_enabled.enabled) {
1439       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1440       /* OMPT state update */
1441       prev_state = ti.state;
1442       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1443       ti.state = ompt_state_wait_critical;
1444 
1445       /* OMPT event callback */
1446       if (ompt_enabled.ompt_callback_mutex_acquire) {
1447         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1448             ompt_mutex_critical, (unsigned int)hint,
1449             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1450             codeptr);
1451       }
1452     }
1453 #endif
1454 #if KMP_USE_INLINED_TAS
1455     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1456       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1457     } else
1458 #elif KMP_USE_INLINED_FUTEX
1459     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1460       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1461     } else
1462 #endif
1463     {
1464       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1465     }
1466   } else {
1467     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1468     lck = ilk->lock;
1469     if (__kmp_env_consistency_check) {
1470       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1471                       __kmp_map_hint_to_lock(hint));
1472     }
1473 #if USE_ITT_BUILD
1474     __kmp_itt_critical_acquiring(lck);
1475 #endif
1476 #if OMPT_SUPPORT && OMPT_OPTIONAL
1477     if (ompt_enabled.enabled) {
1478       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1479       /* OMPT state update */
1480       prev_state = ti.state;
1481       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1482       ti.state = ompt_state_wait_critical;
1483 
1484       /* OMPT event callback */
1485       if (ompt_enabled.ompt_callback_mutex_acquire) {
1486         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1487             ompt_mutex_critical, (unsigned int)hint,
1488             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1489             codeptr);
1490       }
1491     }
1492 #endif
1493     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1494   }
1495   KMP_POP_PARTITIONED_TIMER();
1496 
1497 #if USE_ITT_BUILD
1498   __kmp_itt_critical_acquired(lck);
1499 #endif /* USE_ITT_BUILD */
1500 #if OMPT_SUPPORT && OMPT_OPTIONAL
1501   if (ompt_enabled.enabled) {
1502     /* OMPT state update */
1503     ti.state = prev_state;
1504     ti.wait_id = 0;
1505 
1506     /* OMPT event callback */
1507     if (ompt_enabled.ompt_callback_mutex_acquired) {
1508       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1509           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1510     }
1511   }
1512 #endif
1513 
1514   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical_with_hint: done T#%d\n", global_tid));
1516 } // __kmpc_critical_with_hint
1517 
1518 #endif // KMP_USE_DYNAMIC_LOCK
1519 
1520 /*!
1521 @ingroup WORK_SHARING
1522 @param loc  source location information.
@param global_tid  global thread number.
1524 @param crit identity of the critical section. This could be a pointer to a lock
1525 associated with the critical section, or some other suitably unique value.
1526 
1527 Leave a critical section, releasing any lock that was held during its execution.
1528 */
1529 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1530                          kmp_critical_name *crit) {
1531   kmp_user_lock_p lck;
1532 
1533   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1534 
1535 #if KMP_USE_DYNAMIC_LOCK
1536   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1537     lck = (kmp_user_lock_p)crit;
1538     KMP_ASSERT(lck != NULL);
1539     if (__kmp_env_consistency_check) {
1540       __kmp_pop_sync(global_tid, ct_critical, loc);
1541     }
1542 #if USE_ITT_BUILD
1543     __kmp_itt_critical_releasing(lck);
1544 #endif
1545 #if KMP_USE_INLINED_TAS
1546     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1547       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1548     } else
1549 #elif KMP_USE_INLINED_FUTEX
1550     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1551       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1552     } else
1553 #endif
1554     {
1555       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1556     }
1557   } else {
1558     kmp_indirect_lock_t *ilk =
1559         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1560     KMP_ASSERT(ilk != NULL);
1561     lck = ilk->lock;
1562     if (__kmp_env_consistency_check) {
1563       __kmp_pop_sync(global_tid, ct_critical, loc);
1564     }
1565 #if USE_ITT_BUILD
1566     __kmp_itt_critical_releasing(lck);
1567 #endif
1568     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1569   }
1570 
1571 #else // KMP_USE_DYNAMIC_LOCK
1572 
1573   if ((__kmp_user_lock_kind == lk_tas) &&
1574       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1575     lck = (kmp_user_lock_p)crit;
1576   }
1577 #if KMP_USE_FUTEX
1578   else if ((__kmp_user_lock_kind == lk_futex) &&
1579            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1580     lck = (kmp_user_lock_p)crit;
1581   }
1582 #endif
1583   else { // ticket, queuing or drdpa
1584     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1585   }
1586 
1587   KMP_ASSERT(lck != NULL);
1588 
1589   if (__kmp_env_consistency_check)
1590     __kmp_pop_sync(global_tid, ct_critical, loc);
1591 
1592 #if USE_ITT_BUILD
1593   __kmp_itt_critical_releasing(lck);
1594 #endif /* USE_ITT_BUILD */
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1597   __kmp_release_user_lock_with_checks(lck, global_tid);
1598 
1599 #endif // KMP_USE_DYNAMIC_LOCK
1600 
1601 #if OMPT_SUPPORT && OMPT_OPTIONAL
1602   /* OMPT release event triggers after lock is released; place here to trigger
1603    * for all #if branches */
1604   OMPT_STORE_RETURN_ADDRESS(global_tid);
1605   if (ompt_enabled.ompt_callback_mutex_released) {
1606     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1607         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1608         OMPT_LOAD_RETURN_ADDRESS(0));
1609   }
1610 #endif
1611 
1612   KMP_POP_PARTITIONED_TIMER();
1613   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1614 }
1615 
1616 /*!
1617 @ingroup SYNCHRONIZATION
1618 @param loc source location information
1619 @param global_tid thread id.
1620 @return one if the thread should execute the master block, zero otherwise
1621 
1622 Start execution of a combined barrier and master. The barrier is executed inside
1623 this function.
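
A minimal usage sketch, with loc and gtid as supplied by the compiler:
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // Code executed only by the chosen thread.
  __kmpc_end_barrier_master(&loc, gtid); // releases the threads still waiting
}
@endcode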
1624 */
1625 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1626   int status;
1627 
1628   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1629 
1630   if (!TCR_4(__kmp_init_parallel))
1631     __kmp_parallel_initialize();
1632 
1633 #if OMP_50_ENABLED
1634   __kmp_resume_if_soft_paused();
1635 #endif
1636 
1637   if (__kmp_env_consistency_check)
1638     __kmp_check_barrier(global_tid, ct_barrier, loc);
1639 
1640 #if OMPT_SUPPORT
1641   ompt_frame_t *ompt_frame;
1642   if (ompt_enabled.enabled) {
1643     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1644     if (ompt_frame->enter_frame.ptr == NULL)
1645       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1646     OMPT_STORE_RETURN_ADDRESS(global_tid);
1647   }
1648 #endif
1649 #if USE_ITT_NOTIFY
1650   __kmp_threads[global_tid]->th.th_ident = loc;
1651 #endif
1652   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1653 #if OMPT_SUPPORT && OMPT_OPTIONAL
1654   if (ompt_enabled.enabled) {
1655     ompt_frame->enter_frame = ompt_data_none;
1656   }
1657 #endif
1658 
1659   return (status != 0) ? 0 : 1;
1660 }
1661 
1662 /*!
1663 @ingroup SYNCHRONIZATION
1664 @param loc source location information
1665 @param global_tid thread id.
1666 
1667 Complete the execution of a combined barrier and master. This function should
1668 only be called at the completion of the <tt>master</tt> code. Other threads will
1669 still be waiting at the barrier and this call releases them.
1670 */
1671 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1672   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1673 
1674   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1675 }
1676 
1677 /*!
1678 @ingroup SYNCHRONIZATION
1679 @param loc source location information
1680 @param global_tid thread id.
1681 @return one if the thread should execute the master block, zero otherwise
1682 
1683 Start execution of a combined barrier and master(nowait) construct.
1684 The barrier is executed inside this function.
There is no equivalent "end" function: the other threads are not left waiting at
the barrier, so nothing needs to be released, and the clean-up that
__kmpc_end_master would otherwise perform is done inside this call.
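
A minimal usage sketch, with loc and gtid as supplied by the compiler:
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // Code executed only by the chosen thread; no matching "end" call follows.
}
@endcode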
1686 */
1687 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1688   kmp_int32 ret;
1689 
1690   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1691 
1692   if (!TCR_4(__kmp_init_parallel))
1693     __kmp_parallel_initialize();
1694 
1695 #if OMP_50_ENABLED
1696   __kmp_resume_if_soft_paused();
1697 #endif
1698 
1699   if (__kmp_env_consistency_check) {
1700     if (loc == 0) {
1701       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1702     }
1703     __kmp_check_barrier(global_tid, ct_barrier, loc);
1704   }
1705 
1706 #if OMPT_SUPPORT
1707   ompt_frame_t *ompt_frame;
1708   if (ompt_enabled.enabled) {
1709     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1710     if (ompt_frame->enter_frame.ptr == NULL)
1711       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1712     OMPT_STORE_RETURN_ADDRESS(global_tid);
1713   }
1714 #endif
1715 #if USE_ITT_NOTIFY
1716   __kmp_threads[global_tid]->th.th_ident = loc;
1717 #endif
1718   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1719 #if OMPT_SUPPORT && OMPT_OPTIONAL
1720   if (ompt_enabled.enabled) {
1721     ompt_frame->enter_frame = ompt_data_none;
1722   }
1723 #endif
1724 
1725   ret = __kmpc_master(loc, global_tid);
1726 
1727   if (__kmp_env_consistency_check) {
    /* There is no __kmpc_end_master call, so the corresponding actions of
       __kmpc_end_master are performed here instead. */
1730 
1731     if (global_tid < 0) {
1732       KMP_WARNING(ThreadIdentInvalid);
1733     }
1734     if (ret) {
1735       /* only one thread should do the pop since only */
1736       /* one did the push (see __kmpc_master())       */
1737 
1738       __kmp_pop_sync(global_tid, ct_master, loc);
1739     }
1740   }
1741 
1742   return (ret);
1743 }
1744 
1745 /* The BARRIER for a SINGLE process section is always explicit   */
1746 /*!
1747 @ingroup WORK_SHARING
1748 @param loc  source location information
1749 @param global_tid  global thread number
1750 @return One if this thread should execute the single construct, zero otherwise.
1751 
1752 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
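
A typical lowering of a <tt>single</tt> construct without a nowait clause is
sketched below; the trailing barrier is the explicit barrier mentioned above and
is omitted when nowait is present:
@code
if (__kmpc_single(&loc, gtid)) {
  // Code of the single block, executed by exactly one thread.
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid);
@endcode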
1755 */
1756 
1757 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1758   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1759 
1760   if (rc) {
1761     // We are going to execute the single statement, so we should count it.
1762     KMP_COUNT_BLOCK(OMP_SINGLE);
1763     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1764   }
1765 
1766 #if OMPT_SUPPORT && OMPT_OPTIONAL
1767   kmp_info_t *this_thr = __kmp_threads[global_tid];
1768   kmp_team_t *team = this_thr->th.th_team;
1769   int tid = __kmp_tid_from_gtid(global_tid);
1770 
1771   if (ompt_enabled.enabled) {
1772     if (rc) {
1773       if (ompt_enabled.ompt_callback_work) {
1774         ompt_callbacks.ompt_callback(ompt_callback_work)(
1775             ompt_work_single_executor, ompt_scope_begin,
1776             &(team->t.ompt_team_info.parallel_data),
1777             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1778             1, OMPT_GET_RETURN_ADDRESS(0));
1779       }
1780     } else {
1781       if (ompt_enabled.ompt_callback_work) {
1782         ompt_callbacks.ompt_callback(ompt_callback_work)(
1783             ompt_work_single_other, ompt_scope_begin,
1784             &(team->t.ompt_team_info.parallel_data),
1785             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1786             1, OMPT_GET_RETURN_ADDRESS(0));
1787         ompt_callbacks.ompt_callback(ompt_callback_work)(
1788             ompt_work_single_other, ompt_scope_end,
1789             &(team->t.ompt_team_info.parallel_data),
1790             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1791             1, OMPT_GET_RETURN_ADDRESS(0));
1792       }
1793     }
1794   }
1795 #endif
1796 
1797   return rc;
1798 }
1799 
1800 /*!
1801 @ingroup WORK_SHARING
1802 @param loc  source location information
1803 @param global_tid  global thread number
1804 
1805 Mark the end of a <tt>single</tt> construct.  This function should
1806 only be called by the thread that executed the block of code protected
1807 by the `single` construct.
1808 */
1809 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1810   __kmp_exit_single(global_tid);
1811   KMP_POP_PARTITIONED_TIMER();
1812 
1813 #if OMPT_SUPPORT && OMPT_OPTIONAL
1814   kmp_info_t *this_thr = __kmp_threads[global_tid];
1815   kmp_team_t *team = this_thr->th.th_team;
1816   int tid = __kmp_tid_from_gtid(global_tid);
1817 
1818   if (ompt_enabled.ompt_callback_work) {
1819     ompt_callbacks.ompt_callback(ompt_callback_work)(
1820         ompt_work_single_executor, ompt_scope_end,
1821         &(team->t.ompt_team_info.parallel_data),
1822         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1823         OMPT_GET_RETURN_ADDRESS(0));
1824   }
1825 #endif
1826 }
1827 
1828 /*!
1829 @ingroup WORK_SHARING
1830 @param loc Source location
1831 @param global_tid Global thread id
1832 
1833 Mark the end of a statically scheduled loop.
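
A typical pairing with the matching static-init entry point is sketched below;
the init arguments and body() are illustrative:
@code
kmp_int32 lower = 0, upper = n - 1, stride = 1, last = 0;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1, 0);
for (kmp_int32 i = lower; i <= upper; ++i)
  body(i);
__kmpc_for_static_fini(&loc, gtid);
@endcode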
1834 */
1835 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1836   KMP_POP_PARTITIONED_TIMER();
1837   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1838 
1839 #if OMPT_SUPPORT && OMPT_OPTIONAL
1840   if (ompt_enabled.ompt_callback_work) {
1841     ompt_work_t ompt_work_type = ompt_work_loop;
1842     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1843     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1844     // Determine workshare type
1845     if (loc != NULL) {
1846       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1847         ompt_work_type = ompt_work_loop;
1848       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1849         ompt_work_type = ompt_work_sections;
1850       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1851         ompt_work_type = ompt_work_distribute;
1852       } else {
        // Use the default set above; a warning about this case is issued in
        // __kmpc_for_static_init.
1855       }
1856       KMP_DEBUG_ASSERT(ompt_work_type);
1857     }
1858     ompt_callbacks.ompt_callback(ompt_callback_work)(
1859         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1860         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1861   }
1862 #endif
1863   if (__kmp_env_consistency_check)
1864     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1865 }
1866 
1867 // User routines which take C-style arguments (call by value)
1868 // different from the Fortran equivalent routines
1869 
1870 void ompc_set_num_threads(int arg) {
1871   // !!!!! TODO: check the per-task binding
1872   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1873 }
1874 
1875 void ompc_set_dynamic(int flag) {
1876   kmp_info_t *thread;
1877 
1878   /* For the thread-private implementation of the internal controls */
1879   thread = __kmp_entry_thread();
1880 
1881   __kmp_save_internal_controls(thread);
1882 
1883   set__dynamic(thread, flag ? TRUE : FALSE);
1884 }
1885 
1886 void ompc_set_nested(int flag) {
1887   kmp_info_t *thread;
1888 
1889   /* For the thread-private internal controls implementation */
1890   thread = __kmp_entry_thread();
1891 
1892   __kmp_save_internal_controls(thread);
1893 
1894   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1895 }
1896 
1897 void ompc_set_max_active_levels(int max_active_levels) {
1898   /* TO DO */
1899   /* we want per-task implementation of this internal control */
1900 
1901   /* For the per-thread internal controls implementation */
1902   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1903 }
1904 
1905 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1906   // !!!!! TODO: check the per-task binding
1907   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1908 }
1909 
1910 int ompc_get_ancestor_thread_num(int level) {
1911   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1912 }
1913 
1914 int ompc_get_team_size(int level) {
1915   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1916 }
1917 
1918 #if OMP_50_ENABLED
1919 /* OpenMP 5.0 Affinity Format API */
1920 
1921 void ompc_set_affinity_format(char const *format) {
1922   if (!__kmp_init_serial) {
1923     __kmp_serial_initialize();
1924   }
1925   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1926                          format, KMP_STRLEN(format) + 1);
1927 }
1928 
1929 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1930   size_t format_size;
1931   if (!__kmp_init_serial) {
1932     __kmp_serial_initialize();
1933   }
1934   format_size = KMP_STRLEN(__kmp_affinity_format);
1935   if (buffer && size) {
1936     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1937                            format_size + 1);
1938   }
1939   return format_size;
1940 }
1941 
1942 void ompc_display_affinity(char const *format) {
1943   int gtid;
1944   if (!TCR_4(__kmp_init_middle)) {
1945     __kmp_middle_initialize();
1946   }
1947   gtid = __kmp_get_gtid();
1948   __kmp_aux_display_affinity(gtid, format);
1949 }
1950 
1951 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1952                              char const *format) {
1953   int gtid;
1954   size_t num_required;
1955   kmp_str_buf_t capture_buf;
1956   if (!TCR_4(__kmp_init_middle)) {
1957     __kmp_middle_initialize();
1958   }
1959   gtid = __kmp_get_gtid();
1960   __kmp_str_buf_init(&capture_buf);
1961   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1962   if (buffer && buf_size) {
1963     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1964                            capture_buf.used + 1);
1965   }
1966   __kmp_str_buf_free(&capture_buf);
1967   return num_required;
1968 }
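
/* Usage sketch for ompc_capture_affinity: a NULL buffer (or zero size) only
   queries the number of characters required, so a caller can size a buffer
   first and then capture into it. The format string and the malloc-based
   sizing are illustrative only.

     size_t needed = ompc_capture_affinity(NULL, 0, "%i %A");
     char *buf = (char *)malloc(needed + 1);
     (void)ompc_capture_affinity(buf, needed + 1, "%i %A");
*/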
1969 #endif /* OMP_50_ENABLED */
1970 
1971 void kmpc_set_stacksize(int arg) {
1972   // __kmp_aux_set_stacksize initializes the library if needed
1973   __kmp_aux_set_stacksize(arg);
1974 }
1975 
1976 void kmpc_set_stacksize_s(size_t arg) {
1977   // __kmp_aux_set_stacksize initializes the library if needed
1978   __kmp_aux_set_stacksize(arg);
1979 }
1980 
1981 void kmpc_set_blocktime(int arg) {
1982   int gtid, tid;
1983   kmp_info_t *thread;
1984 
1985   gtid = __kmp_entry_gtid();
1986   tid = __kmp_tid_from_gtid(gtid);
1987   thread = __kmp_thread_from_gtid(gtid);
1988 
1989   __kmp_aux_set_blocktime(arg, thread, tid);
1990 }
1991 
1992 void kmpc_set_library(int arg) {
1993   // __kmp_user_set_library initializes the library if needed
1994   __kmp_user_set_library((enum library_type)arg);
1995 }
1996 
1997 void kmpc_set_defaults(char const *str) {
1998   // __kmp_aux_set_defaults initializes the library if needed
1999   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2000 }
2001 
2002 void kmpc_set_disp_num_buffers(int arg) {
2003   // ignore after initialization because some teams have already
2004   // allocated dispatch buffers
2005   if (__kmp_init_serial == 0 && arg > 0)
2006     __kmp_dispatch_num_buffers = arg;
2007 }
2008 
2009 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2010 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2011   return -1;
2012 #else
2013   if (!TCR_4(__kmp_init_middle)) {
2014     __kmp_middle_initialize();
2015   }
2016   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2017 #endif
2018 }
2019 
2020 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2021 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2022   return -1;
2023 #else
2024   if (!TCR_4(__kmp_init_middle)) {
2025     __kmp_middle_initialize();
2026   }
2027   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2028 #endif
2029 }
2030 
2031 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2032 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2033   return -1;
2034 #else
2035   if (!TCR_4(__kmp_init_middle)) {
2036     __kmp_middle_initialize();
2037   }
2038   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2039 #endif
2040 }
2041 
2042 /* -------------------------------------------------------------------------- */
2043 /*!
2044 @ingroup THREADPRIVATE
2045 @param loc       source location information
2046 @param gtid      global thread number
2047 @param cpy_size  size of the cpy_data buffer
2048 @param cpy_data  pointer to data to be copied
2049 @param cpy_func  helper function to call for copying data
2050 @param didit     flag variable: 1=single thread; 0=not single thread
2051 
2052 __kmpc_copyprivate implements the interface for the private data broadcast
2053 needed for the copyprivate clause associated with a single region in an
2054 OpenMP<sup>*</sup> program (both C and Fortran).
2055 All threads participating in the parallel region call this routine.
2056 One of the threads (called the single thread) should have the <tt>didit</tt>
2057 variable set to 1 and all other threads should have that variable set to 0.
2058 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2059 
2060 The OpenMP specification forbids the use of nowait on the single region when a
2061 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2062 barrier internally to avoid race conditions, so the code generation for the
2063 single region should avoid generating a barrier after the call to @ref
2064 __kmpc_copyprivate.
2065 
2066 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2067 The <tt>loc</tt> parameter is a pointer to source location information.
2068 
Internal implementation: the single thread first copies its descriptor address
(cpy_data) to a team-private location; after a barrier, each of the other
threads calls the function pointed to by the parameter cpy_func, which performs
the copy using the two cpy_data buffers.
2073 
2074 The cpy_func routine used for the copy and the contents of the data area defined
2075 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2076 to be done. For instance, the cpy_data buffer can hold the actual data to be
2077 copied or it may hold a list of pointers to the data. The cpy_func routine must
2078 interpret the cpy_data buffer appropriately.
2079 
2080 The interface to cpy_func is as follows:
2081 @code
2082 void cpy_func( void *destination, void *source )
2083 @endcode
2084 where void *destination is the cpy_data pointer for the thread being copied to
2085 and void *source is the cpy_data pointer for the thread being copied from.
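
As an illustration only (a sketch: compute() is hypothetical, and real compilers
typically pass a structure of pointers in cpy_data rather than the data itself):
@code
static void copy_x(void *dst, void *src) {
  *(int *)dst = *(int *)src; // each pointer addresses that thread's private x
}

// ... in the code generated for the parallel region:
int x; // private copy in every thread
kmp_int32 didit = 0;
if (__kmpc_single(&loc, gtid)) {
  x = compute(); // executed by the single thread only
  didit = 1;
  __kmpc_end_single(&loc, gtid);
}
__kmpc_copyprivate(&loc, gtid, sizeof(x), &x, copy_x, didit);
// No barrier is generated after the call; __kmpc_copyprivate contains one.
@endcode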
2086 */
2087 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2088                         void *cpy_data, void (*cpy_func)(void *, void *),
2089                         kmp_int32 didit) {
2090   void **data_ptr;
2091 
2092   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2093 
2094   KMP_MB();
2095 
2096   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2097 
2098   if (__kmp_env_consistency_check) {
2099     if (loc == 0) {
2100       KMP_WARNING(ConstructIdentInvalid);
2101     }
2102   }
2103 
2104   // ToDo: Optimize the following two barriers into some kind of split barrier
2105 
2106   if (didit)
2107     *data_ptr = cpy_data;
2108 
2109 #if OMPT_SUPPORT
2110   ompt_frame_t *ompt_frame;
2111   if (ompt_enabled.enabled) {
2112     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2113     if (ompt_frame->enter_frame.ptr == NULL)
2114       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2115     OMPT_STORE_RETURN_ADDRESS(gtid);
2116   }
2117 #endif
2118 /* This barrier is not a barrier region boundary */
2119 #if USE_ITT_NOTIFY
2120   __kmp_threads[gtid]->th.th_ident = loc;
2121 #endif
2122   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2123 
2124   if (!didit)
2125     (*cpy_func)(cpy_data, *data_ptr);
2126 
2127 // Consider next barrier a user-visible barrier for barrier region boundaries
2128 // Nesting checks are already handled by the single construct checks
2129 
2130 #if OMPT_SUPPORT
2131   if (ompt_enabled.enabled) {
2132     OMPT_STORE_RETURN_ADDRESS(gtid);
2133   }
2134 #endif
2135 #if USE_ITT_NOTIFY
  // TODO: check if this is needed (e.g. tasks can overwrite the location)
  __kmp_threads[gtid]->th.th_ident = loc;
2138 #endif
2139   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2140 #if OMPT_SUPPORT && OMPT_OPTIONAL
2141   if (ompt_enabled.enabled) {
2142     ompt_frame->enter_frame = ompt_data_none;
2143   }
2144 #endif
2145 }
2146 
2147 /* -------------------------------------------------------------------------- */
2148 
2149 #define INIT_LOCK __kmp_init_user_lock_with_checks
2150 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2151 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2152 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2153 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2154 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2155   __kmp_acquire_nested_user_lock_with_checks_timed
2156 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2157 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2158 #define TEST_LOCK __kmp_test_user_lock_with_checks
2159 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2160 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2161 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2162 
2163 // TODO: Make check abort messages use location info & pass it into
2164 // with_checks routines
2165 
2166 #if KMP_USE_DYNAMIC_LOCK
2167 
2168 // internal lock initializer
2169 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2170                                                     kmp_dyna_lockseq_t seq) {
2171   if (KMP_IS_D_LOCK(seq)) {
2172     KMP_INIT_D_LOCK(lock, seq);
2173 #if USE_ITT_BUILD
2174     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2175 #endif
2176   } else {
2177     KMP_INIT_I_LOCK(lock, seq);
2178 #if USE_ITT_BUILD
2179     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2180     __kmp_itt_lock_creating(ilk->lock, loc);
2181 #endif
2182   }
2183 }
2184 
2185 // internal nest lock initializer
2186 static __forceinline void
2187 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2188                                kmp_dyna_lockseq_t seq) {
2189 #if KMP_USE_TSX
2190   // Don't have nested lock implementation for speculative locks
2191   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2192     seq = __kmp_user_lock_seq;
2193 #endif
2194   switch (seq) {
2195   case lockseq_tas:
2196     seq = lockseq_nested_tas;
2197     break;
2198 #if KMP_USE_FUTEX
2199   case lockseq_futex:
2200     seq = lockseq_nested_futex;
2201     break;
2202 #endif
2203   case lockseq_ticket:
2204     seq = lockseq_nested_ticket;
2205     break;
2206   case lockseq_queuing:
2207     seq = lockseq_nested_queuing;
2208     break;
2209   case lockseq_drdpa:
2210     seq = lockseq_nested_drdpa;
2211     break;
2212   default:
2213     seq = lockseq_nested_queuing;
2214   }
2215   KMP_INIT_I_LOCK(lock, seq);
2216 #if USE_ITT_BUILD
2217   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2218   __kmp_itt_lock_creating(ilk->lock, loc);
2219 #endif
2220 }
2221 
2222 /* initialize the lock with a hint */
2223 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2224                                 uintptr_t hint) {
2225   KMP_DEBUG_ASSERT(__kmp_init_serial);
2226   if (__kmp_env_consistency_check && user_lock == NULL) {
2227     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2228   }
2229 
2230   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2231 
2232 #if OMPT_SUPPORT && OMPT_OPTIONAL
2233   // This is the case, if called from omp_init_lock_with_hint:
2234   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2235   if (!codeptr)
2236     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2237   if (ompt_enabled.ompt_callback_lock_init) {
2238     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2239         ompt_mutex_lock, (omp_lock_hint_t)hint,
2240         __ompt_get_mutex_impl_type(user_lock),
2241         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2242   }
2243 #endif
2244 }
2245 
/* initialize the nested lock with a hint */
2247 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2248                                      void **user_lock, uintptr_t hint) {
2249   KMP_DEBUG_ASSERT(__kmp_init_serial);
2250   if (__kmp_env_consistency_check && user_lock == NULL) {
2251     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2252   }
2253 
2254   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2255 
2256 #if OMPT_SUPPORT && OMPT_OPTIONAL
2257   // This is the case, if called from omp_init_lock_with_hint:
2258   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2259   if (!codeptr)
2260     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2261   if (ompt_enabled.ompt_callback_lock_init) {
2262     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2263         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2264         __ompt_get_mutex_impl_type(user_lock),
2265         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2266   }
2267 #endif
2268 }
2269 
2270 #endif // KMP_USE_DYNAMIC_LOCK
2271 
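/* A minimal lifecycle sketch for the plain lock entry points below, with loc
   and gtid as supplied by the caller and lock_var illustrative void *-sized
   storage for the lock, passed by address:

     __kmpc_init_lock(&loc, gtid, &lock_var);
     __kmpc_set_lock(&loc, gtid, &lock_var);
     // ... code protected by the lock ...
     __kmpc_unset_lock(&loc, gtid, &lock_var);
     __kmpc_destroy_lock(&loc, gtid, &lock_var);

   The _nest_ variants follow the same pattern with a nestable lock object. */
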
2272 /* initialize the lock */
2273 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2274 #if KMP_USE_DYNAMIC_LOCK
2275 
2276   KMP_DEBUG_ASSERT(__kmp_init_serial);
2277   if (__kmp_env_consistency_check && user_lock == NULL) {
2278     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2279   }
2280   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2281 
2282 #if OMPT_SUPPORT && OMPT_OPTIONAL
2283   // This is the case, if called from omp_init_lock_with_hint:
2284   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2285   if (!codeptr)
2286     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2287   if (ompt_enabled.ompt_callback_lock_init) {
2288     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2289         ompt_mutex_lock, omp_lock_hint_none,
2290         __ompt_get_mutex_impl_type(user_lock),
2291         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2292   }
2293 #endif
2294 
2295 #else // KMP_USE_DYNAMIC_LOCK
2296 
2297   static char const *const func = "omp_init_lock";
2298   kmp_user_lock_p lck;
2299   KMP_DEBUG_ASSERT(__kmp_init_serial);
2300 
2301   if (__kmp_env_consistency_check) {
2302     if (user_lock == NULL) {
2303       KMP_FATAL(LockIsUninitialized, func);
2304     }
2305   }
2306 
2307   KMP_CHECK_USER_LOCK_INIT();
2308 
2309   if ((__kmp_user_lock_kind == lk_tas) &&
2310       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2311     lck = (kmp_user_lock_p)user_lock;
2312   }
2313 #if KMP_USE_FUTEX
2314   else if ((__kmp_user_lock_kind == lk_futex) &&
2315            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2316     lck = (kmp_user_lock_p)user_lock;
2317   }
2318 #endif
2319   else {
2320     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2321   }
2322   INIT_LOCK(lck);
2323   __kmp_set_user_lock_location(lck, loc);
2324 
2325 #if OMPT_SUPPORT && OMPT_OPTIONAL
2326   // This is the case, if called from omp_init_lock_with_hint:
2327   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2328   if (!codeptr)
2329     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2330   if (ompt_enabled.ompt_callback_lock_init) {
2331     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2332         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2333         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2334   }
2335 #endif
2336 
2337 #if USE_ITT_BUILD
2338   __kmp_itt_lock_creating(lck);
2339 #endif /* USE_ITT_BUILD */
2340 
2341 #endif // KMP_USE_DYNAMIC_LOCK
2342 } // __kmpc_init_lock
2343 
/* initialize the nested lock */
2345 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2346 #if KMP_USE_DYNAMIC_LOCK
2347 
2348   KMP_DEBUG_ASSERT(__kmp_init_serial);
2349   if (__kmp_env_consistency_check && user_lock == NULL) {
2350     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2351   }
2352   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2353 
2354 #if OMPT_SUPPORT && OMPT_OPTIONAL
2355   // This is the case, if called from omp_init_lock_with_hint:
2356   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2357   if (!codeptr)
2358     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2359   if (ompt_enabled.ompt_callback_lock_init) {
2360     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2361         ompt_mutex_nest_lock, omp_lock_hint_none,
2362         __ompt_get_mutex_impl_type(user_lock),
2363         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2364   }
2365 #endif
2366 
2367 #else // KMP_USE_DYNAMIC_LOCK
2368 
2369   static char const *const func = "omp_init_nest_lock";
2370   kmp_user_lock_p lck;
2371   KMP_DEBUG_ASSERT(__kmp_init_serial);
2372 
2373   if (__kmp_env_consistency_check) {
2374     if (user_lock == NULL) {
2375       KMP_FATAL(LockIsUninitialized, func);
2376     }
2377   }
2378 
2379   KMP_CHECK_USER_LOCK_INIT();
2380 
2381   if ((__kmp_user_lock_kind == lk_tas) &&
2382       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2383        OMP_NEST_LOCK_T_SIZE)) {
2384     lck = (kmp_user_lock_p)user_lock;
2385   }
2386 #if KMP_USE_FUTEX
2387   else if ((__kmp_user_lock_kind == lk_futex) &&
2388            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2389             OMP_NEST_LOCK_T_SIZE)) {
2390     lck = (kmp_user_lock_p)user_lock;
2391   }
2392 #endif
2393   else {
2394     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2395   }
2396 
2397   INIT_NESTED_LOCK(lck);
2398   __kmp_set_user_lock_location(lck, loc);
2399 
2400 #if OMPT_SUPPORT && OMPT_OPTIONAL
2401   // This is the case, if called from omp_init_lock_with_hint:
2402   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2403   if (!codeptr)
2404     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2405   if (ompt_enabled.ompt_callback_lock_init) {
2406     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2407         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2408         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2409   }
2410 #endif
2411 
2412 #if USE_ITT_BUILD
2413   __kmp_itt_lock_creating(lck);
2414 #endif /* USE_ITT_BUILD */
2415 
2416 #endif // KMP_USE_DYNAMIC_LOCK
2417 } // __kmpc_init_nest_lock
2418 
2419 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2420 #if KMP_USE_DYNAMIC_LOCK
2421 
2422 #if USE_ITT_BUILD
2423   kmp_user_lock_p lck;
2424   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2425     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2426   } else {
2427     lck = (kmp_user_lock_p)user_lock;
2428   }
2429   __kmp_itt_lock_destroyed(lck);
2430 #endif
2431 #if OMPT_SUPPORT && OMPT_OPTIONAL
2432   // This is the case, if called from omp_init_lock_with_hint:
2433   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2434   if (!codeptr)
2435     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2436   if (ompt_enabled.ompt_callback_lock_destroy) {
2437     kmp_user_lock_p lck;
2438     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2439       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2440     } else {
2441       lck = (kmp_user_lock_p)user_lock;
2442     }
2443     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2444         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2445   }
2446 #endif
2447   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2448 #else
2449   kmp_user_lock_p lck;
2450 
2451   if ((__kmp_user_lock_kind == lk_tas) &&
2452       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2453     lck = (kmp_user_lock_p)user_lock;
2454   }
2455 #if KMP_USE_FUTEX
2456   else if ((__kmp_user_lock_kind == lk_futex) &&
2457            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2458     lck = (kmp_user_lock_p)user_lock;
2459   }
2460 #endif
2461   else {
2462     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2463   }
2464 
2465 #if OMPT_SUPPORT && OMPT_OPTIONAL
2466   // This is the case, if called from omp_init_lock_with_hint:
2467   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2468   if (!codeptr)
2469     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2470   if (ompt_enabled.ompt_callback_lock_destroy) {
2471     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2472         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2473   }
2474 #endif
2475 
2476 #if USE_ITT_BUILD
2477   __kmp_itt_lock_destroyed(lck);
2478 #endif /* USE_ITT_BUILD */
2479   DESTROY_LOCK(lck);
2480 
2481   if ((__kmp_user_lock_kind == lk_tas) &&
2482       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2483     ;
2484   }
2485 #if KMP_USE_FUTEX
2486   else if ((__kmp_user_lock_kind == lk_futex) &&
2487            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2488     ;
2489   }
2490 #endif
2491   else {
2492     __kmp_user_lock_free(user_lock, gtid, lck);
2493   }
2494 #endif // KMP_USE_DYNAMIC_LOCK
2495 } // __kmpc_destroy_lock
2496 
/* destroy the nested lock */
2498 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2499 #if KMP_USE_DYNAMIC_LOCK
2500 
2501 #if USE_ITT_BUILD
2502   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2503   __kmp_itt_lock_destroyed(ilk->lock);
2504 #endif
2505 #if OMPT_SUPPORT && OMPT_OPTIONAL
2506   // This is the case, if called from omp_init_lock_with_hint:
2507   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2508   if (!codeptr)
2509     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2510   if (ompt_enabled.ompt_callback_lock_destroy) {
2511     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2512         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2513   }
2514 #endif
2515   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2516 
2517 #else // KMP_USE_DYNAMIC_LOCK
2518 
2519   kmp_user_lock_p lck;
2520 
2521   if ((__kmp_user_lock_kind == lk_tas) &&
2522       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2523        OMP_NEST_LOCK_T_SIZE)) {
2524     lck = (kmp_user_lock_p)user_lock;
2525   }
2526 #if KMP_USE_FUTEX
2527   else if ((__kmp_user_lock_kind == lk_futex) &&
2528            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2529             OMP_NEST_LOCK_T_SIZE)) {
2530     lck = (kmp_user_lock_p)user_lock;
2531   }
2532 #endif
2533   else {
2534     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2535   }
2536 
2537 #if OMPT_SUPPORT && OMPT_OPTIONAL
2538   // This is the case, if called from omp_init_lock_with_hint:
2539   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2540   if (!codeptr)
2541     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2542   if (ompt_enabled.ompt_callback_lock_destroy) {
2543     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2544         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2545   }
2546 #endif
2547 
2548 #if USE_ITT_BUILD
2549   __kmp_itt_lock_destroyed(lck);
2550 #endif /* USE_ITT_BUILD */
2551 
2552   DESTROY_NESTED_LOCK(lck);
2553 
2554   if ((__kmp_user_lock_kind == lk_tas) &&
2555       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2556        OMP_NEST_LOCK_T_SIZE)) {
2557     ;
2558   }
2559 #if KMP_USE_FUTEX
2560   else if ((__kmp_user_lock_kind == lk_futex) &&
2561            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2562             OMP_NEST_LOCK_T_SIZE)) {
2563     ;
2564   }
2565 #endif
2566   else {
2567     __kmp_user_lock_free(user_lock, gtid, lck);
2568   }
2569 #endif // KMP_USE_DYNAMIC_LOCK
2570 } // __kmpc_destroy_nest_lock
2571 
2572 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2573   KMP_COUNT_BLOCK(OMP_set_lock);
2574 #if KMP_USE_DYNAMIC_LOCK
2575   int tag = KMP_EXTRACT_D_TAG(user_lock);
2576 #if USE_ITT_BUILD
2577   __kmp_itt_lock_acquiring(
2578       (kmp_user_lock_p)
2579           user_lock); // itt function will get to the right lock object.
2580 #endif
2581 #if OMPT_SUPPORT && OMPT_OPTIONAL
2582   // This is the case, if called from omp_init_lock_with_hint:
2583   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2584   if (!codeptr)
2585     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2586   if (ompt_enabled.ompt_callback_mutex_acquire) {
2587     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2588         ompt_mutex_lock, omp_lock_hint_none,
2589         __ompt_get_mutex_impl_type(user_lock),
2590         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2591   }
2592 #endif
2593 #if KMP_USE_INLINED_TAS
2594   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2595     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2596   } else
2597 #elif KMP_USE_INLINED_FUTEX
2598   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2599     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2600   } else
2601 #endif
2602   {
2603     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2604   }
2605 #if USE_ITT_BUILD
2606   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2607 #endif
2608 #if OMPT_SUPPORT && OMPT_OPTIONAL
2609   if (ompt_enabled.ompt_callback_mutex_acquired) {
2610     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2611         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2612   }
2613 #endif
2614 
2615 #else // KMP_USE_DYNAMIC_LOCK
2616 
2617   kmp_user_lock_p lck;
2618 
2619   if ((__kmp_user_lock_kind == lk_tas) &&
2620       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2621     lck = (kmp_user_lock_p)user_lock;
2622   }
2623 #if KMP_USE_FUTEX
2624   else if ((__kmp_user_lock_kind == lk_futex) &&
2625            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2626     lck = (kmp_user_lock_p)user_lock;
2627   }
2628 #endif
2629   else {
2630     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2631   }
2632 
2633 #if USE_ITT_BUILD
2634   __kmp_itt_lock_acquiring(lck);
2635 #endif /* USE_ITT_BUILD */
2636 #if OMPT_SUPPORT && OMPT_OPTIONAL
2637   // This is the case, if called from omp_init_lock_with_hint:
2638   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2639   if (!codeptr)
2640     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2641   if (ompt_enabled.ompt_callback_mutex_acquire) {
2642     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2643         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2644         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2645   }
2646 #endif
2647 
2648   ACQUIRE_LOCK(lck, gtid);
2649 
2650 #if USE_ITT_BUILD
2651   __kmp_itt_lock_acquired(lck);
2652 #endif /* USE_ITT_BUILD */
2653 
2654 #if OMPT_SUPPORT && OMPT_OPTIONAL
2655   if (ompt_enabled.ompt_callback_mutex_acquired) {
2656     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2657         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2658   }
2659 #endif
2660 
2661 #endif // KMP_USE_DYNAMIC_LOCK
2662 }
2663 
2664 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2665 #if KMP_USE_DYNAMIC_LOCK
2666 
2667 #if USE_ITT_BUILD
2668   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2669 #endif
2670 #if OMPT_SUPPORT && OMPT_OPTIONAL
2671   // This is the case, if called from omp_init_lock_with_hint:
2672   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2673   if (!codeptr)
2674     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2675   if (ompt_enabled.enabled) {
2676     if (ompt_enabled.ompt_callback_mutex_acquire) {
2677       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2678           ompt_mutex_nest_lock, omp_lock_hint_none,
2679           __ompt_get_mutex_impl_type(user_lock),
2680           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2681     }
2682   }
2683 #endif
2684   int acquire_status =
2685       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2686   (void) acquire_status;
2687 #if USE_ITT_BUILD
2688   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2689 #endif
2690 
2691 #if OMPT_SUPPORT && OMPT_OPTIONAL
2692   if (ompt_enabled.enabled) {
2693     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2694       if (ompt_enabled.ompt_callback_mutex_acquired) {
2695         // lock_first
2696         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2697             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2698             codeptr);
2699       }
2700     } else {
2701       if (ompt_enabled.ompt_callback_nest_lock) {
2702         // lock_next
2703         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2704             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2705       }
2706     }
2707   }
2708 #endif
2709 
2710 #else // KMP_USE_DYNAMIC_LOCK
2711   int acquire_status;
2712   kmp_user_lock_p lck;
2713 
2714   if ((__kmp_user_lock_kind == lk_tas) &&
2715       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2716        OMP_NEST_LOCK_T_SIZE)) {
2717     lck = (kmp_user_lock_p)user_lock;
2718   }
2719 #if KMP_USE_FUTEX
2720   else if ((__kmp_user_lock_kind == lk_futex) &&
2721            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2722             OMP_NEST_LOCK_T_SIZE)) {
2723     lck = (kmp_user_lock_p)user_lock;
2724   }
2725 #endif
2726   else {
2727     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2728   }
2729 
2730 #if USE_ITT_BUILD
2731   __kmp_itt_lock_acquiring(lck);
2732 #endif /* USE_ITT_BUILD */
2733 #if OMPT_SUPPORT && OMPT_OPTIONAL
2734   // This is the case, if called from omp_init_lock_with_hint:
2735   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2736   if (!codeptr)
2737     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2738   if (ompt_enabled.enabled) {
2739     if (ompt_enabled.ompt_callback_mutex_acquire) {
2740       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2741           ompt_mutex_nest_lock, omp_lock_hint_none,
2742           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2743           codeptr);
2744     }
2745   }
2746 #endif
2747 
2748   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2749 
2750 #if USE_ITT_BUILD
2751   __kmp_itt_lock_acquired(lck);
2752 #endif /* USE_ITT_BUILD */
2753 
2754 #if OMPT_SUPPORT && OMPT_OPTIONAL
2755   if (ompt_enabled.enabled) {
2756     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2757       if (ompt_enabled.ompt_callback_mutex_acquired) {
2758         // lock_first
2759         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2760             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2761       }
2762     } else {
2763       if (ompt_enabled.ompt_callback_nest_lock) {
2764         // lock_next
2765         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2766             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2767       }
2768     }
2769   }
2770 #endif
2771 
2772 #endif // KMP_USE_DYNAMIC_LOCK
2773 }
2774 
2775 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2776 #if KMP_USE_DYNAMIC_LOCK
2777 
2778   int tag = KMP_EXTRACT_D_TAG(user_lock);
2779 #if USE_ITT_BUILD
2780   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2781 #endif
2782 #if KMP_USE_INLINED_TAS
2783   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2784     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2785   } else
2786 #elif KMP_USE_INLINED_FUTEX
2787   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2788     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2789   } else
2790 #endif
2791   {
2792     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2793   }
2794 
2795 #if OMPT_SUPPORT && OMPT_OPTIONAL
2796   // This is the case, if called from omp_init_lock_with_hint:
2797   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2798   if (!codeptr)
2799     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2800   if (ompt_enabled.ompt_callback_mutex_released) {
2801     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2802         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2803   }
2804 #endif
2805 
2806 #else // KMP_USE_DYNAMIC_LOCK
2807 
2808   kmp_user_lock_p lck;
2809 
2810   /* Can't use serial interval since not block structured */
2811   /* release the lock */
2812 
2813   if ((__kmp_user_lock_kind == lk_tas) &&
2814       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2815 #if KMP_OS_LINUX &&                                                            \
2816     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2817 // "fast" path implemented to fix customer performance issue
2818 #if USE_ITT_BUILD
2819     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2820 #endif /* USE_ITT_BUILD */
2821     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2822     KMP_MB();
2823 
2824 #if OMPT_SUPPORT && OMPT_OPTIONAL
2825     // This is the case, if called from omp_init_lock_with_hint:
2826     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2827     if (!codeptr)
2828       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2829     if (ompt_enabled.ompt_callback_mutex_released) {
2830       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2831           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2832     }
2833 #endif
2834 
2835     return;
2836 #else
2837     lck = (kmp_user_lock_p)user_lock;
2838 #endif
2839   }
2840 #if KMP_USE_FUTEX
2841   else if ((__kmp_user_lock_kind == lk_futex) &&
2842            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2843     lck = (kmp_user_lock_p)user_lock;
2844   }
2845 #endif
2846   else {
2847     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2848   }
2849 
2850 #if USE_ITT_BUILD
2851   __kmp_itt_lock_releasing(lck);
2852 #endif /* USE_ITT_BUILD */
2853 
2854   RELEASE_LOCK(lck, gtid);
2855 
2856 #if OMPT_SUPPORT && OMPT_OPTIONAL
2857   // This is the case, if called from omp_init_lock_with_hint:
2858   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2859   if (!codeptr)
2860     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2861   if (ompt_enabled.ompt_callback_mutex_released) {
2862     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2863         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2864   }
2865 #endif
2866 
2867 #endif // KMP_USE_DYNAMIC_LOCK
2868 }
2869 
/* release the nested lock */
2871 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2872 #if KMP_USE_DYNAMIC_LOCK
2873 
2874 #if USE_ITT_BUILD
2875   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2876 #endif
2877   int release_status =
2878       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2879   (void) release_status;
2880 
2881 #if OMPT_SUPPORT && OMPT_OPTIONAL
2882   // This is the case, if called from omp_init_lock_with_hint:
2883   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2884   if (!codeptr)
2885     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2886   if (ompt_enabled.enabled) {
2887     if (release_status == KMP_LOCK_RELEASED) {
2888       if (ompt_enabled.ompt_callback_mutex_released) {
2889         // release_lock_last
2890         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2891             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2892             codeptr);
2893       }
2894     } else if (ompt_enabled.ompt_callback_nest_lock) {
2895       // release_lock_prev
2896       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2897           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2898     }
2899   }
2900 #endif
2901 
2902 #else // KMP_USE_DYNAMIC_LOCK
2903 
2904   kmp_user_lock_p lck;
2905 
2906   /* Can't use serial interval since not block structured */
2907 
2908   if ((__kmp_user_lock_kind == lk_tas) &&
2909       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2910        OMP_NEST_LOCK_T_SIZE)) {
2911 #if KMP_OS_LINUX &&                                                            \
2912     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2913     // "fast" path implemented to fix customer performance issue
2914     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2915 #if USE_ITT_BUILD
2916     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2917 #endif /* USE_ITT_BUILD */
2918 
2919 #if OMPT_SUPPORT && OMPT_OPTIONAL
2920     int release_status = KMP_LOCK_STILL_HELD;
2921 #endif
2922 
2923     if (--(tl->lk.depth_locked) == 0) {
2924       TCW_4(tl->lk.poll, 0);
2925 #if OMPT_SUPPORT && OMPT_OPTIONAL
2926       release_status = KMP_LOCK_RELEASED;
2927 #endif
2928     }
2929     KMP_MB();
2930 
2931 #if OMPT_SUPPORT && OMPT_OPTIONAL
2932     // This is the case, if called from omp_init_lock_with_hint:
2933     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2934     if (!codeptr)
2935       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2936     if (ompt_enabled.enabled) {
2937       if (release_status == KMP_LOCK_RELEASED) {
2938         if (ompt_enabled.ompt_callback_mutex_released) {
2939           // release_lock_last
2940           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2941               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2942         }
2943       } else if (ompt_enabled.ompt_callback_nest_lock) {
2944         // release_lock_previous
2945         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2947       }
2948     }
2949 #endif
2950 
2951     return;
2952 #else
2953     lck = (kmp_user_lock_p)user_lock;
2954 #endif
2955   }
2956 #if KMP_USE_FUTEX
2957   else if ((__kmp_user_lock_kind == lk_futex) &&
2958            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2959             OMP_NEST_LOCK_T_SIZE)) {
2960     lck = (kmp_user_lock_p)user_lock;
2961   }
2962 #endif
2963   else {
2964     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2965   }
2966 
2967 #if USE_ITT_BUILD
2968   __kmp_itt_lock_releasing(lck);
2969 #endif /* USE_ITT_BUILD */
2970 
2971   int release_status;
2972   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2973 #if OMPT_SUPPORT && OMPT_OPTIONAL
2974   // This is the case, if called from omp_init_lock_with_hint:
2975   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2976   if (!codeptr)
2977     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2978   if (ompt_enabled.enabled) {
2979     if (release_status == KMP_LOCK_RELEASED) {
2980       if (ompt_enabled.ompt_callback_mutex_released) {
2981         // release_lock_last
2982         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2983             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2984       }
2985     } else if (ompt_enabled.ompt_callback_nest_lock) {
2986       // release_lock_previous
2987       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2989     }
2990   }
2991 #endif
2992 
2993 #endif // KMP_USE_DYNAMIC_LOCK
2994 }
2995 
2996 /* try to acquire the lock */
2997 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2998   KMP_COUNT_BLOCK(OMP_test_lock);
2999 
3000 #if KMP_USE_DYNAMIC_LOCK
3001   int rc;
3002   int tag = KMP_EXTRACT_D_TAG(user_lock);
3003 #if USE_ITT_BUILD
3004   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3005 #endif
3006 #if OMPT_SUPPORT && OMPT_OPTIONAL
3007   // This is the case, if called from omp_init_lock_with_hint:
3008   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3009   if (!codeptr)
3010     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3011   if (ompt_enabled.ompt_callback_mutex_acquire) {
3012     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3013         ompt_mutex_lock, omp_lock_hint_none,
3014         __ompt_get_mutex_impl_type(user_lock),
3015         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3016   }
3017 #endif
3018 #if KMP_USE_INLINED_TAS
3019   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3020     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3021   } else
3022 #elif KMP_USE_INLINED_FUTEX
3023   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3024     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3025   } else
3026 #endif
3027   {
3028     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3029   }
3030   if (rc) {
3031 #if USE_ITT_BUILD
3032     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3033 #endif
3034 #if OMPT_SUPPORT && OMPT_OPTIONAL
3035     if (ompt_enabled.ompt_callback_mutex_acquired) {
3036       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3037           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3038     }
3039 #endif
3040     return FTN_TRUE;
3041   } else {
3042 #if USE_ITT_BUILD
3043     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3044 #endif
3045     return FTN_FALSE;
3046   }
3047 
3048 #else // KMP_USE_DYNAMIC_LOCK
3049 
3050   kmp_user_lock_p lck;
3051   int rc;
3052 
3053   if ((__kmp_user_lock_kind == lk_tas) &&
3054       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3055     lck = (kmp_user_lock_p)user_lock;
3056   }
3057 #if KMP_USE_FUTEX
3058   else if ((__kmp_user_lock_kind == lk_futex) &&
3059            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3060     lck = (kmp_user_lock_p)user_lock;
3061   }
3062 #endif
3063   else {
3064     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3065   }
3066 
3067 #if USE_ITT_BUILD
3068   __kmp_itt_lock_acquiring(lck);
3069 #endif /* USE_ITT_BUILD */
3070 #if OMPT_SUPPORT && OMPT_OPTIONAL
3071   // This is the case, if called from omp_init_lock_with_hint:
3072   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3073   if (!codeptr)
3074     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3075   if (ompt_enabled.ompt_callback_mutex_acquire) {
3076     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3077         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3078         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3079   }
3080 #endif
3081 
3082   rc = TEST_LOCK(lck, gtid);
3083 #if USE_ITT_BUILD
3084   if (rc) {
3085     __kmp_itt_lock_acquired(lck);
3086   } else {
3087     __kmp_itt_lock_cancelled(lck);
3088   }
3089 #endif /* USE_ITT_BUILD */
3090 #if OMPT_SUPPORT && OMPT_OPTIONAL
3091   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3092     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3093         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3094   }
3095 #endif
3096 
3097   return (rc ? FTN_TRUE : FTN_FALSE);
3098 
3099 /* Can't use serial interval since not block structured */
3100 
3101 #endif // KMP_USE_DYNAMIC_LOCK
3102 }
3103 
3104 /* try to acquire the lock */
3105 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3106 #if KMP_USE_DYNAMIC_LOCK
3107   int rc;
3108 #if USE_ITT_BUILD
3109   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3110 #endif
3111 #if OMPT_SUPPORT && OMPT_OPTIONAL
3112   // This is the case, if called from omp_init_lock_with_hint:
3113   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3114   if (!codeptr)
3115     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3116   if (ompt_enabled.ompt_callback_mutex_acquire) {
3117     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3118         ompt_mutex_nest_lock, omp_lock_hint_none,
3119         __ompt_get_mutex_impl_type(user_lock),
3120         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3121   }
3122 #endif
3123   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3124 #if USE_ITT_BUILD
3125   if (rc) {
3126     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3127   } else {
3128     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3129   }
3130 #endif
3131 #if OMPT_SUPPORT && OMPT_OPTIONAL
3132   if (ompt_enabled.enabled && rc) {
3133     if (rc == 1) {
3134       if (ompt_enabled.ompt_callback_mutex_acquired) {
3135         // lock_first
3136         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3137             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3138             codeptr);
3139       }
3140     } else {
3141       if (ompt_enabled.ompt_callback_nest_lock) {
3142         // lock_next
3143         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3144             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3145       }
3146     }
3147   }
3148 #endif
3149   return rc;
3150 
3151 #else // KMP_USE_DYNAMIC_LOCK
3152 
3153   kmp_user_lock_p lck;
3154   int rc;
3155 
3156   if ((__kmp_user_lock_kind == lk_tas) &&
3157       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3158        OMP_NEST_LOCK_T_SIZE)) {
3159     lck = (kmp_user_lock_p)user_lock;
3160   }
3161 #if KMP_USE_FUTEX
3162   else if ((__kmp_user_lock_kind == lk_futex) &&
3163            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3164             OMP_NEST_LOCK_T_SIZE)) {
3165     lck = (kmp_user_lock_p)user_lock;
3166   }
3167 #endif
3168   else {
3169     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3170   }
3171 
3172 #if USE_ITT_BUILD
3173   __kmp_itt_lock_acquiring(lck);
3174 #endif /* USE_ITT_BUILD */
3175 
3176 #if OMPT_SUPPORT && OMPT_OPTIONAL
3177   // This is the case, if called from omp_init_lock_with_hint:
3178   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3179   if (!codeptr)
3180     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
3188 #endif
3189 
3190   rc = TEST_NESTED_LOCK(lck, gtid);
3191 #if USE_ITT_BUILD
3192   if (rc) {
3193     __kmp_itt_lock_acquired(lck);
3194   } else {
3195     __kmp_itt_lock_cancelled(lck);
3196   }
3197 #endif /* USE_ITT_BUILD */
3198 #if OMPT_SUPPORT && OMPT_OPTIONAL
3199   if (ompt_enabled.enabled && rc) {
3200     if (rc == 1) {
3201       if (ompt_enabled.ompt_callback_mutex_acquired) {
3202         // lock_first
3203         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3204             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3205       }
3206     } else {
3207       if (ompt_enabled.ompt_callback_nest_lock) {
3208         // lock_next
3209         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3210             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3211       }
3212     }
3213   }
3214 #endif
3215   return rc;
3216 
3217 /* Can't use serial interval since not block structured */
3218 
3219 #endif // KMP_USE_DYNAMIC_LOCK
3220 }
3221 
3222 // Interface to fast scalable reduce methods routines
3223 
3224 // keep the selected method in a thread local structure for cross-function
3225 // usage: will be used in __kmpc_end_reduce* functions;
3226 // another solution: to re-determine the method one more time in
3227 // __kmpc_end_reduce* functions (new prototype required then)
3228 // AT: which solution is better?
3229 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3230   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3231 
3232 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3233   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3234 
3235 // description of the packed_reduction_method variable: look at the macros in
3236 // kmp.h
3237 
3238 // used in a critical section reduce block
3239 static __forceinline void
3240 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3241                                           kmp_critical_name *crit) {
3242 
3243   // this lock was visible to a customer and to the threading profile tool as a
3244   // serial overhead span (although it's used for an internal purpose only)
3245   //            why was it visible in previous implementation?
3246   //            should we keep it visible in new reduce block?
3247   kmp_user_lock_p lck;
3248 
3249 #if KMP_USE_DYNAMIC_LOCK
3250 
3251   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3252   // Check if it is initialized.
3253   if (*lk == 0) {
3254     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3255       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3256                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3257     } else {
3258       __kmp_init_indirect_csptr(crit, loc, global_tid,
3259                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3260     }
3261   }
3262   // Branch for accessing the actual lock object and set operation. This
3263   // branching is inevitable since this lock initialization does not follow the
3264   // normal dispatch path (lock table is not used).
3265   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3266     lck = (kmp_user_lock_p)lk;
3267     KMP_DEBUG_ASSERT(lck != NULL);
3268     if (__kmp_env_consistency_check) {
3269       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3270     }
3271     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3272   } else {
3273     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3274     lck = ilk->lock;
3275     KMP_DEBUG_ASSERT(lck != NULL);
3276     if (__kmp_env_consistency_check) {
3277       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3278     }
3279     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3280   }
3281 
3282 #else // KMP_USE_DYNAMIC_LOCK
3283 
3284   // We know that the fast reduction code is only emitted by Intel compilers
3285   // with 32 byte critical sections. If there isn't enough space, then we
3286   // have to use a pointer.
3287   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3288     lck = (kmp_user_lock_p)crit;
3289   } else {
3290     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3291   }
3292   KMP_DEBUG_ASSERT(lck != NULL);
3293 
3294   if (__kmp_env_consistency_check)
3295     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3296 
3297   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3298 
3299 #endif // KMP_USE_DYNAMIC_LOCK
3300 }
3301 
3302 // used in a critical section reduce block
3303 static __forceinline void
3304 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3305                                         kmp_critical_name *crit) {
3306 
3307   kmp_user_lock_p lck;
3308 
3309 #if KMP_USE_DYNAMIC_LOCK
3310 
3311   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3312     lck = (kmp_user_lock_p)crit;
3313     if (__kmp_env_consistency_check)
3314       __kmp_pop_sync(global_tid, ct_critical, loc);
3315     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3316   } else {
3317     kmp_indirect_lock_t *ilk =
3318         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3319     if (__kmp_env_consistency_check)
3320       __kmp_pop_sync(global_tid, ct_critical, loc);
3321     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3322   }
3323 
3324 #else // KMP_USE_DYNAMIC_LOCK
3325 
3326   // We know that the fast reduction code is only emitted by Intel compilers
3327   // with 32 byte critical sections. If there isn't enough space, then we have
3328   // to use a pointer.
3329   if (__kmp_base_user_lock_size > 32) {
3330     lck = *((kmp_user_lock_p *)crit);
3331     KMP_ASSERT(lck != NULL);
3332   } else {
3333     lck = (kmp_user_lock_p)crit;
3334   }
3335 
3336   if (__kmp_env_consistency_check)
3337     __kmp_pop_sync(global_tid, ct_critical, loc);
3338 
3339   __kmp_release_user_lock_with_checks(lck, global_tid);
3340 
3341 #endif // KMP_USE_DYNAMIC_LOCK
3342 } // __kmp_end_critical_section_reduce_block
3343 
3344 #if OMP_40_ENABLED
3345 static __forceinline int
3346 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3347                                      int *task_state) {
3348   kmp_team_t *team;
3349 
  // Check if we are inside a teams construct.
3351   if (th->th.th_teams_microtask) {
3352     *team_p = team = th->th.th_team;
3353     if (team->t.t_level == th->th.th_teams_level) {
3354       // This is reduction at teams construct.
3355       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3356       // Let's swap teams temporarily for the reduction.
3357       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3358       th->th.th_team = team->t.t_parent;
3359       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3360       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3361       *task_state = th->th.th_task_state;
3362       th->th.th_task_state = 0;
3363 
3364       return 1;
3365     }
3366   }
3367   return 0;
3368 }
3369 
3370 static __forceinline void
3371 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3372   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3373   th->th.th_info.ds.ds_tid = 0;
3374   th->th.th_team = team;
3375   th->th.th_team_nproc = team->t.t_nproc;
3376   th->th.th_task_team = team->t.t_task_team[task_state];
3377   th->th.th_task_state = task_state;
3378 }
3379 #endif
3380 
3381 /* 2.a.i. Reduce Block without a terminating barrier */
3382 /*!
3383 @ingroup SYNCHRONIZATION
3384 @param loc source location information
3385 @param global_tid global thread number
3386 @param num_vars number of items (variables) to be reduced
3387 @param reduce_size size of data in bytes to be reduced
3388 @param reduce_data pointer to data to be reduced
3389 @param reduce_func callback function providing reduction operation on two
3390 operands and returning result of reduction in lhs_data
3391 @param lck pointer to the unique lock data structure
3392 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3393 threads if atomic reduction needed
3394 
3395 The nowait version is used for a reduce clause with the nowait argument.
3396 */
3397 kmp_int32
3398 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3399                      size_t reduce_size, void *reduce_data,
3400                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3401                      kmp_critical_name *lck) {
3402 
3403   KMP_COUNT_BLOCK(REDUCE_nowait);
3404   int retval = 0;
3405   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3406 #if OMP_40_ENABLED
3407   kmp_info_t *th;
3408   kmp_team_t *team;
3409   int teams_swapped = 0, task_state;
3410 #endif
3411   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3412 
  // why do we need this initialization here at all?
  // A reduction clause cannot be used as a stand-alone directive.
3415 
3416   // do not call __kmp_serial_initialize(), it will be called by
3417   // __kmp_parallel_initialize() if needed
3418   // possible detection of false-positive race by the threadchecker ???
3419   if (!TCR_4(__kmp_init_parallel))
3420     __kmp_parallel_initialize();
3421 
3422 #if OMP_50_ENABLED
3423   __kmp_resume_if_soft_paused();
3424 #endif
3425 
3426 // check correctness of reduce block nesting
3427 #if KMP_USE_DYNAMIC_LOCK
3428   if (__kmp_env_consistency_check)
3429     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3430 #else
3431   if (__kmp_env_consistency_check)
3432     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3433 #endif
3434 
3435 #if OMP_40_ENABLED
3436   th = __kmp_thread_from_gtid(global_tid);
3437   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3438 #endif // OMP_40_ENABLED
3439 
  // packed_reduction_method value will be reused by __kmp_end_reduce* function,
  // so the value should be kept in a variable
  // the variable should be either a construct-specific or a thread-specific
  // property, not a team-specific property
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on the next construct)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unnecessary extra syncs would be needed)
  // a thread-specific variable is better regarding the two issues above (next
  // construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes the 'determine' and 'set' lines (no need to execute
  // them by one thread only, to avoid unnecessary extra syncs)
3455 
3456   packed_reduction_method = __kmp_determine_reduction_method(
3457       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3458   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3459 
3460   if (packed_reduction_method == critical_reduce_block) {
3461 
3462     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3463     retval = 1;
3464 
3465   } else if (packed_reduction_method == empty_reduce_block) {
3466 
3467     // usage: if team size == 1, no synchronization is required ( Intel
3468     // platforms only )
3469     retval = 1;
3470 
3471   } else if (packed_reduction_method == atomic_reduce_block) {
3472 
3473     retval = 2;
3474 
    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (this is not ideal: the checking block has already been closed by
    //     this 'pop', but the atomic operation has not been executed yet; it
    //     will be executed slightly later, literally on the next instruction)
3481     if (__kmp_env_consistency_check)
3482       __kmp_pop_sync(global_tid, ct_reduce, loc);
3483 
3484   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3485                                    tree_reduce_block)) {
3486 
// AT: performance issue: a real barrier here
// AT:     (if the master goes slowly, other threads are blocked here waiting
// for the master to come and release them)
// AT:     (it's not what a customer might expect when specifying the NOWAIT
// clause)
// AT:     (specifying NOWAIT won't improve performance; it will just be
// confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
// might go faster and be more in line with the sense of NOWAIT
// AT: TO DO: run the epcc test and compare times
3496 
3497 // this barrier should be invisible to a customer and to the threading profile
3498 // tool (it's neither a terminating barrier nor customer's code, it's
3499 // used for an internal purpose)
3500 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events
    //         so we set up the necessary frame/return addresses.
3505     ompt_frame_t *ompt_frame;
3506     if (ompt_enabled.enabled) {
3507       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3508       if (ompt_frame->enter_frame.ptr == NULL)
3509         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3510       OMPT_STORE_RETURN_ADDRESS(global_tid);
3511     }
3512 #endif
3513 #if USE_ITT_NOTIFY
3514     __kmp_threads[global_tid]->th.th_ident = loc;
3515 #endif
3516     retval =
3517         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3518                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3519     retval = (retval != 0) ? (0) : (1);
3520 #if OMPT_SUPPORT && OMPT_OPTIONAL
3521     if (ompt_enabled.enabled) {
3522       ompt_frame->enter_frame = ompt_data_none;
3523     }
3524 #endif
3525 
    // all other workers except the master should do this pop here
    //     (none of the other workers will get to __kmpc_end_reduce_nowait())
3528     if (__kmp_env_consistency_check) {
3529       if (retval == 0) {
3530         __kmp_pop_sync(global_tid, ct_reduce, loc);
3531       }
3532     }
3533 
3534   } else {
3535 
3536     // should never reach this block
3537     KMP_ASSERT(0); // "unexpected method"
3538   }
3539 #if OMP_40_ENABLED
3540   if (teams_swapped) {
3541     __kmp_restore_swapped_teams(th, team, task_state);
3542   }
3543 #endif
3544   KA_TRACE(
3545       10,
3546       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3547        global_tid, packed_reduction_method, retval));
3548 
3549   return retval;
3550 }
3551 
3552 /*!
3553 @ingroup SYNCHRONIZATION
3554 @param loc source location information
3555 @param global_tid global thread id.
3556 @param lck pointer to the unique lock data structure
3557 
3558 Finish the execution of a reduce nowait.
3559 */
3560 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3561                               kmp_critical_name *lck) {
3562 
3563   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3564 
3565   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3566 
3567   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3568 
3569   if (packed_reduction_method == critical_reduce_block) {
3570 
3571     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3572 
3573   } else if (packed_reduction_method == empty_reduce_block) {
3574 
3575     // usage: if team size == 1, no synchronization is required ( on Intel
3576     // platforms only )
3577 
3578   } else if (packed_reduction_method == atomic_reduce_block) {
3579 
    // neither the master nor the other workers should get here
    //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely;
    // after removal this value would be checked by the 'else' branch and assert
3584 
3585   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3586                                    tree_reduce_block)) {
3587 
3588     // only master gets here
3589 
3590   } else {
3591 
3592     // should never reach this block
3593     KMP_ASSERT(0); // "unexpected method"
3594   }
3595 
3596   if (__kmp_env_consistency_check)
3597     __kmp_pop_sync(global_tid, ct_reduce, loc);
3598 
3599   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3600                 global_tid, packed_reduction_method));
3601 
3602   return;
3603 }
3604 
3605 /* 2.a.ii. Reduce Block with a terminating barrier */
3606 
3607 /*!
3608 @ingroup SYNCHRONIZATION
3609 @param loc source location information
3610 @param global_tid global thread number
3611 @param num_vars number of items (variables) to be reduced
3612 @param reduce_size size of data in bytes to be reduced
3613 @param reduce_data pointer to data to be reduced
3614 @param reduce_func callback function providing reduction operation on two
3615 operands and returning result of reduction in lhs_data
3616 @param lck pointer to the unique lock data structure
3617 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3618 threads if atomic reduction needed
3619 
3620 A blocking reduce that includes an implicit barrier.
3621 */
3622 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3623                         size_t reduce_size, void *reduce_data,
3624                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3625                         kmp_critical_name *lck) {
3626   KMP_COUNT_BLOCK(REDUCE_wait);
3627   int retval = 0;
3628   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3629 #if OMP_40_ENABLED
3630   kmp_info_t *th;
3631   kmp_team_t *team;
3632   int teams_swapped = 0, task_state;
3633 #endif
3634 
3635   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3636 
  // why do we need this initialization here at all?
  // A reduction clause cannot be a stand-alone directive.
3639 
3640   // do not call __kmp_serial_initialize(), it will be called by
3641   // __kmp_parallel_initialize() if needed
3642   // possible detection of false-positive race by the threadchecker ???
3643   if (!TCR_4(__kmp_init_parallel))
3644     __kmp_parallel_initialize();
3645 
3646 #if OMP_50_ENABLED
3647   __kmp_resume_if_soft_paused();
3648 #endif
3649 
3650 // check correctness of reduce block nesting
3651 #if KMP_USE_DYNAMIC_LOCK
3652   if (__kmp_env_consistency_check)
3653     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3654 #else
3655   if (__kmp_env_consistency_check)
3656     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3657 #endif
3658 
3659 #if OMP_40_ENABLED
3660   th = __kmp_thread_from_gtid(global_tid);
3661   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3662 #endif // OMP_40_ENABLED
3663 
3664   packed_reduction_method = __kmp_determine_reduction_method(
3665       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3666   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3667 
3668   if (packed_reduction_method == critical_reduce_block) {
3669 
3670     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3671     retval = 1;
3672 
3673   } else if (packed_reduction_method == empty_reduce_block) {
3674 
3675     // usage: if team size == 1, no synchronization is required ( Intel
3676     // platforms only )
3677     retval = 1;
3678 
3679   } else if (packed_reduction_method == atomic_reduce_block) {
3680 
3681     retval = 2;
3682 
3683   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3684                                    tree_reduce_block)) {
3685 
3686 // case tree_reduce_block:
3687 // this barrier should be visible to a customer and to the threading profile
3688 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3689 #if OMPT_SUPPORT
3690     ompt_frame_t *ompt_frame;
3691     if (ompt_enabled.enabled) {
3692       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3693       if (ompt_frame->enter_frame.ptr == NULL)
3694         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3695       OMPT_STORE_RETURN_ADDRESS(global_tid);
3696     }
3697 #endif
3698 #if USE_ITT_NOTIFY
3699     __kmp_threads[global_tid]->th.th_ident =
3700         loc; // needed for correct notification of frames
3701 #endif
3702     retval =
3703         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3704                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3705     retval = (retval != 0) ? (0) : (1);
3706 #if OMPT_SUPPORT && OMPT_OPTIONAL
3707     if (ompt_enabled.enabled) {
3708       ompt_frame->enter_frame = ompt_data_none;
3709     }
3710 #endif
3711 
    // all other workers except the master should do this pop here
    // (none of the workers other than the master will enter __kmpc_end_reduce())
3714     if (__kmp_env_consistency_check) {
3715       if (retval == 0) { // 0: all other workers; 1: master
3716         __kmp_pop_sync(global_tid, ct_reduce, loc);
3717       }
3718     }
3719 
3720   } else {
3721 
3722     // should never reach this block
3723     KMP_ASSERT(0); // "unexpected method"
3724   }
3725 #if OMP_40_ENABLED
3726   if (teams_swapped) {
3727     __kmp_restore_swapped_teams(th, team, task_state);
3728   }
3729 #endif
3730 
3731   KA_TRACE(10,
3732            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3733             global_tid, packed_reduction_method, retval));
3734 
3735   return retval;
3736 }
3737 
3738 /*!
3739 @ingroup SYNCHRONIZATION
3740 @param loc source location information
3741 @param global_tid global thread id.
3742 @param lck pointer to the unique lock data structure
3743 
3744 Finish the execution of a blocking reduce.
3745 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3746 start function.
3747 */
3748 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3749                        kmp_critical_name *lck) {
3750 
3751   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3752 #if OMP_40_ENABLED
3753   kmp_info_t *th;
3754   kmp_team_t *team;
3755   int teams_swapped = 0, task_state;
3756 #endif
3757 
3758   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3759 
3760 #if OMP_40_ENABLED
3761   th = __kmp_thread_from_gtid(global_tid);
3762   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3763 #endif // OMP_40_ENABLED
3764 
3765   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3766 
3767   // this barrier should be visible to a customer and to the threading profile
3768   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3769 
3770   if (packed_reduction_method == critical_reduce_block) {
3771 
3772     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3773 
3774 // TODO: implicit barrier: should be exposed
3775 #if OMPT_SUPPORT
3776     ompt_frame_t *ompt_frame;
3777     if (ompt_enabled.enabled) {
3778       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3779       if (ompt_frame->enter_frame.ptr == NULL)
3780         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3781       OMPT_STORE_RETURN_ADDRESS(global_tid);
3782     }
3783 #endif
3784 #if USE_ITT_NOTIFY
3785     __kmp_threads[global_tid]->th.th_ident = loc;
3786 #endif
3787     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3788 #if OMPT_SUPPORT && OMPT_OPTIONAL
3789     if (ompt_enabled.enabled) {
3790       ompt_frame->enter_frame = ompt_data_none;
3791     }
3792 #endif
3793 
3794   } else if (packed_reduction_method == empty_reduce_block) {
3795 
3796 // usage: if team size==1, no synchronization is required (Intel platforms only)
3797 
3798 // TODO: implicit barrier: should be exposed
3799 #if OMPT_SUPPORT
3800     ompt_frame_t *ompt_frame;
3801     if (ompt_enabled.enabled) {
3802       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3803       if (ompt_frame->enter_frame.ptr == NULL)
3804         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3805       OMPT_STORE_RETURN_ADDRESS(global_tid);
3806     }
3807 #endif
3808 #if USE_ITT_NOTIFY
3809     __kmp_threads[global_tid]->th.th_ident = loc;
3810 #endif
3811     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3812 #if OMPT_SUPPORT && OMPT_OPTIONAL
3813     if (ompt_enabled.enabled) {
3814       ompt_frame->enter_frame = ompt_data_none;
3815     }
3816 #endif
3817 
3818   } else if (packed_reduction_method == atomic_reduce_block) {
3819 
3820 #if OMPT_SUPPORT
3821     ompt_frame_t *ompt_frame;
3822     if (ompt_enabled.enabled) {
3823       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3824       if (ompt_frame->enter_frame.ptr == NULL)
3825         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3826       OMPT_STORE_RETURN_ADDRESS(global_tid);
3827     }
3828 #endif
3829 // TODO: implicit barrier: should be exposed
3830 #if USE_ITT_NOTIFY
3831     __kmp_threads[global_tid]->th.th_ident = loc;
3832 #endif
3833     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3834 #if OMPT_SUPPORT && OMPT_OPTIONAL
3835     if (ompt_enabled.enabled) {
3836       ompt_frame->enter_frame = ompt_data_none;
3837     }
3838 #endif
3839 
3840   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3841                                    tree_reduce_block)) {
3842 
3843     // only master executes here (master releases all other workers)
3844     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3845                             global_tid);
3846 
3847   } else {
3848 
3849     // should never reach this block
3850     KMP_ASSERT(0); // "unexpected method"
3851   }
3852 #if OMP_40_ENABLED
3853   if (teams_swapped) {
3854     __kmp_restore_swapped_teams(th, team, task_state);
3855   }
3856 #endif
3857 
3858   if (__kmp_env_consistency_check)
3859     __kmp_pop_sync(global_tid, ct_reduce, loc);
3860 
3861   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3862                 global_tid, packed_reduction_method));
3863 
3864   return;
3865 }
3866 
3867 #undef __KMP_GET_REDUCTION_METHOD
3868 #undef __KMP_SET_REDUCTION_METHOD
3869 
3870 /* end of interface to fast scalable reduce routines */
3871 
3872 kmp_uint64 __kmpc_get_taskid() {
3873 
3874   kmp_int32 gtid;
3875   kmp_info_t *thread;
3876 
3877   gtid = __kmp_get_gtid();
3878   if (gtid < 0) {
3879     return 0;
3880   }
3881   thread = __kmp_thread_from_gtid(gtid);
3882   return thread->th.th_current_task->td_task_id;
3883 
3884 } // __kmpc_get_taskid
3885 
3886 kmp_uint64 __kmpc_get_parent_taskid() {
3887 
3888   kmp_int32 gtid;
3889   kmp_info_t *thread;
3890   kmp_taskdata_t *parent_task;
3891 
3892   gtid = __kmp_get_gtid();
3893   if (gtid < 0) {
3894     return 0;
3895   }
3896   thread = __kmp_thread_from_gtid(gtid);
3897   parent_task = thread->th.th_current_task->td_parent;
3898   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3899 
3900 } // __kmpc_get_parent_taskid
3901 
3902 #if OMP_45_ENABLED
3903 /*!
3904 @ingroup WORK_SHARING
3905 @param loc  source location information.
3906 @param gtid  global thread number.
3907 @param num_dims  number of associated doacross loops.
3908 @param dims  info on loops bounds.
3909 
3910 Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3913 */
3914 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3915                           const struct kmp_dim *dims) {
3916   int j, idx;
3917   kmp_int64 last, trace_count;
3918   kmp_info_t *th = __kmp_threads[gtid];
3919   kmp_team_t *team = th->th.th_team;
3920   kmp_uint32 *flags;
3921   kmp_disp_t *pr_buf = th->th.th_dispatch;
3922   dispatch_shared_info_t *sh_buf;
3923 
3924   KA_TRACE(
3925       20,
3926       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3927        gtid, num_dims, !team->t.t_serialized));
3928   KMP_DEBUG_ASSERT(dims != NULL);
3929   KMP_DEBUG_ASSERT(num_dims > 0);
3930 
3931   if (team->t.t_serialized) {
3932     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3933     return; // no dependencies if team is serialized
3934   }
3935   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3936   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3937   // the next loop
3938   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3939 
3940   // Save bounds info into allocated private buffer
3941   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3942   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3943       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3944   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3945   pr_buf->th_doacross_info[0] =
3946       (kmp_int64)num_dims; // first element is number of dimensions
  // Also save the address of num_done so it can be accessed later without
  // knowing the buffer index
3949   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3950   pr_buf->th_doacross_info[2] = dims[0].lo;
3951   pr_buf->th_doacross_info[3] = dims[0].up;
3952   pr_buf->th_doacross_info[4] = dims[0].st;
3953   last = 5;
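  // Resulting layout of th_doacross_info (derived from the stores above and
  // in the loop below), e.g. for num_dims == 2:
  //   [0] num_dims                [1] &sh_buf->doacross_num_done
  //   [2] dims[0].lo   [3] dims[0].up   [4] dims[0].st
  //   [5] range_length of dims[1] [6] dims[1].lo [7] dims[1].up [8] dims[1].st
  // i.e. dimension j >= 1 occupies slots [4*j+1 .. 4*j+4].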
3954   for (j = 1; j < num_dims; ++j) {
3955     kmp_int64
3956         range_length; // To keep ranges of all dimensions but the first dims[0]
3957     if (dims[j].st == 1) { // most common case
3958       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3959       range_length = dims[j].up - dims[j].lo + 1;
3960     } else {
3961       if (dims[j].st > 0) {
3962         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3963         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3964       } else { // negative increment
3965         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3966         range_length =
3967             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3968       }
3969     }
3970     pr_buf->th_doacross_info[last++] = range_length;
3971     pr_buf->th_doacross_info[last++] = dims[j].lo;
3972     pr_buf->th_doacross_info[last++] = dims[j].up;
3973     pr_buf->th_doacross_info[last++] = dims[j].st;
3974   }
3975 
3976   // Compute total trip count.
3977   // Start with range of dims[0] which we don't need to keep in the buffer.
3978   if (dims[0].st == 1) { // most common case
3979     trace_count = dims[0].up - dims[0].lo + 1;
3980   } else if (dims[0].st > 0) {
3981     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3982     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3983   } else { // negative increment
3984     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3985     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3986   }
3987   for (j = 1; j < num_dims; ++j) {
3988     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3989   }
3990   KMP_DEBUG_ASSERT(trace_count > 0);
3991 
3992   // Check if shared buffer is not occupied by other loop (idx -
3993   // __kmp_dispatch_num_buffers)
3994   if (idx != sh_buf->doacross_buf_idx) {
3995     // Shared buffer is occupied, wait for it to be free
3996     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3997                  __kmp_eq_4, NULL);
3998   }
3999 #if KMP_32_BIT_ARCH
4000   // Check if we are the first thread. After the CAS the first thread gets 0,
4001   // others get 1 if initialization is in progress, allocated pointer otherwise.
4002   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4003   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4004       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4005 #else
4006   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4007       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4008 #endif
4009   if (flags == NULL) {
4010     // we are the first thread, allocate the array of flags
4011     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
4012     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4013     KMP_MB();
4014     sh_buf->doacross_flags = flags;
4015   } else if (flags == (kmp_uint32 *)1) {
4016 #if KMP_32_BIT_ARCH
4017     // initialization is still in progress, need to wait
4018     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4019 #else
4020     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4021 #endif
4022       KMP_YIELD(TRUE);
4023     KMP_MB();
4024   } else {
4025     KMP_MB();
4026   }
4027   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4028   pr_buf->th_doacross_flags =
4029       sh_buf->doacross_flags; // save private copy in order to not
4030   // touch shared buffer on each iteration
4031   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4032 }
4033 
4034 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4035   kmp_int32 shft, num_dims, i;
4036   kmp_uint32 flag;
4037   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4038   kmp_info_t *th = __kmp_threads[gtid];
4039   kmp_team_t *team = th->th.th_team;
4040   kmp_disp_t *pr_buf;
4041   kmp_int64 lo, up, st;
4042 
4043   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4044   if (team->t.t_serialized) {
4045     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4046     return; // no dependencies if team is serialized
4047   }
4048 
4049   // calculate sequential iteration number and check out-of-bounds condition
4050   pr_buf = th->th.th_dispatch;
4051   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4052   num_dims = pr_buf->th_doacross_info[0];
4053   lo = pr_buf->th_doacross_info[2];
4054   up = pr_buf->th_doacross_info[3];
4055   st = pr_buf->th_doacross_info[4];
4056   if (st == 1) { // most common case
4057     if (vec[0] < lo || vec[0] > up) {
4058       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4059                     "bounds [%lld,%lld]\n",
4060                     gtid, vec[0], lo, up));
4061       return;
4062     }
4063     iter_number = vec[0] - lo;
4064   } else if (st > 0) {
4065     if (vec[0] < lo || vec[0] > up) {
4066       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4067                     "bounds [%lld,%lld]\n",
4068                     gtid, vec[0], lo, up));
4069       return;
4070     }
4071     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4072   } else { // negative increment
4073     if (vec[0] > lo || vec[0] < up) {
4074       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4075                     "bounds [%lld,%lld]\n",
4076                     gtid, vec[0], lo, up));
4077       return;
4078     }
4079     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4080   }
4081   for (i = 1; i < num_dims; ++i) {
4082     kmp_int64 iter, ln;
4083     kmp_int32 j = i * 4;
4084     ln = pr_buf->th_doacross_info[j + 1];
4085     lo = pr_buf->th_doacross_info[j + 2];
4086     up = pr_buf->th_doacross_info[j + 3];
4087     st = pr_buf->th_doacross_info[j + 4];
4088     if (st == 1) {
4089       if (vec[i] < lo || vec[i] > up) {
4090         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4091                       "bounds [%lld,%lld]\n",
4092                       gtid, vec[i], lo, up));
4093         return;
4094       }
4095       iter = vec[i] - lo;
4096     } else if (st > 0) {
4097       if (vec[i] < lo || vec[i] > up) {
4098         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4099                       "bounds [%lld,%lld]\n",
4100                       gtid, vec[i], lo, up));
4101         return;
4102       }
4103       iter = (kmp_uint64)(vec[i] - lo) / st;
4104     } else { // st < 0
4105       if (vec[i] > lo || vec[i] < up) {
4106         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4107                       "bounds [%lld,%lld]\n",
4108                       gtid, vec[i], lo, up));
4109         return;
4110       }
4111       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4112     }
4113     iter_number = iter + ln * iter_number;
4114   }
4115   shft = iter_number % 32; // use 32-bit granularity
4116   iter_number >>= 5; // divided by 32
4117   flag = 1 << shft;
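  // Worked example: for linearized iteration 70, shft = 70 % 32 = 6 and
  // iter_number becomes 70 >> 5 = 2, so the loop below waits on bit 6
  // (flag 0x40) of the third 32-bit word of the flags array.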
4118   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4119     KMP_YIELD(TRUE);
4120   }
4121   KMP_MB();
4122   KA_TRACE(20,
4123            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4124             gtid, (iter_number << 5) + shft));
4125 }
4126 
4127 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4128   kmp_int32 shft, num_dims, i;
4129   kmp_uint32 flag;
4130   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4131   kmp_info_t *th = __kmp_threads[gtid];
4132   kmp_team_t *team = th->th.th_team;
4133   kmp_disp_t *pr_buf;
4134   kmp_int64 lo, st;
4135 
4136   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4137   if (team->t.t_serialized) {
4138     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4139     return; // no dependencies if team is serialized
4140   }
4141 
4142   // calculate sequential iteration number (same as in "wait" but no
4143   // out-of-bounds checks)
4144   pr_buf = th->th.th_dispatch;
4145   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4146   num_dims = pr_buf->th_doacross_info[0];
4147   lo = pr_buf->th_doacross_info[2];
4148   st = pr_buf->th_doacross_info[4];
4149   if (st == 1) { // most common case
4150     iter_number = vec[0] - lo;
4151   } else if (st > 0) {
4152     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4153   } else { // negative increment
4154     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4155   }
4156   for (i = 1; i < num_dims; ++i) {
4157     kmp_int64 iter, ln;
4158     kmp_int32 j = i * 4;
4159     ln = pr_buf->th_doacross_info[j + 1];
4160     lo = pr_buf->th_doacross_info[j + 2];
4161     st = pr_buf->th_doacross_info[j + 4];
4162     if (st == 1) {
4163       iter = vec[i] - lo;
4164     } else if (st > 0) {
4165       iter = (kmp_uint64)(vec[i] - lo) / st;
4166     } else { // st < 0
4167       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4168     }
4169     iter_number = iter + ln * iter_number;
4170   }
4171   shft = iter_number % 32; // use 32-bit granularity
4172   iter_number >>= 5; // divided by 32
4173   flag = 1 << shft;
4174   KMP_MB();
4175   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4176     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4177   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4178                 (iter_number << 5) + shft));
4179 }
4180 
4181 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4182   kmp_int32 num_done;
4183   kmp_info_t *th = __kmp_threads[gtid];
4184   kmp_team_t *team = th->th.th_team;
4185   kmp_disp_t *pr_buf = th->th.th_dispatch;
4186 
4187   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4188   if (team->t.t_serialized) {
4189     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4190     return; // nothing to do
4191   }
4192   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4193   if (num_done == th->th.th_team_nproc) {
4194     // we are the last thread, need to free shared resources
4195     int idx = pr_buf->th_doacross_buf_idx - 1;
4196     dispatch_shared_info_t *sh_buf =
4197         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4198     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4199                      (kmp_int64)&sh_buf->doacross_num_done);
4200     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4201     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4202     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4203     sh_buf->doacross_flags = NULL;
4204     sh_buf->doacross_num_done = 0;
4205     sh_buf->doacross_buf_idx +=
4206         __kmp_dispatch_num_buffers; // free buffer for future re-use
4207   }
4208   // free private resources (need to keep buffer index forever)
4209   pr_buf->th_doacross_flags = NULL;
4210   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4211   pr_buf->th_doacross_info = NULL;
4212   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4213 }
4214 #endif
4215 
4216 #if OMP_50_ENABLED
4217 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4218 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4219   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4220 }
4221 
4222 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4223   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4224 }
4225 
4226 int __kmpc_get_target_offload(void) {
4227   if (!__kmp_init_serial) {
4228     __kmp_serial_initialize();
4229   }
4230   return __kmp_target_offload;
4231 }
4232 
4233 int __kmpc_pause_resource(kmp_pause_status_t level) {
4234   if (!__kmp_init_serial) {
4235     return 1; // Can't pause if runtime is not initialized
4236   }
4237   return __kmp_pause_resource(level);
4238 }
4239 #endif // OMP_50_ENABLED
4240 
4241 // end of file //
4242