1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
28 // flags will be used in future, e.g. to implement openmp_strict library
29 // restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
57  * Shutdown the runtime library. This is also optional, and even if called will
58  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
59  * zero.
60  */
61 void __kmpc_end(ident_t *loc) {
62   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
63   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
64   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
65   // returns FALSE and __kmpc_end() will unregister this root (it can cause
66   // library shut down).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 }
74 
75 /*!
76 @ingroup THREAD_STATES
77 @param loc Source location information.
78 @return The global thread index of the active thread.
79 
80 This function can be called in any context.
81 
If the runtime has only been entered at the outermost level from a
83 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
84 that which would be returned by omp_get_thread_num() in the outermost
85 active parallel construct. (Or zero if there is no active parallel
86 construct, since the master thread is necessarily thread zero).
87 
88 If multiple non-OpenMP threads all enter an OpenMP construct then this
89 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
91 OpenMP thread ids returned by omp_get_thread_num()).
92 */
93 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
94   kmp_int32 gtid = __kmp_entry_gtid();
95 
96   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
97 
98   return gtid;
99 }
100 
101 /*!
102 @ingroup THREAD_STATES
103 @param loc Source location information.
104 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
105 
106 This function can be called in any context.
107 It returns the total number of threads under the control of the OpenMP runtime.
108 That is not a number that can be determined by any OpenMP standard calls, since
109 the library may be called from more than one non-OpenMP thread, and this
110 reflects the total over all such calls. Similarly the runtime maintains
111 underlying threads even when they are not active (since the cost of creating
112 and destroying OS threads is high), this call counts all such threads even if
113 they are not waiting for work.
114 */
kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
  KC_TRACE(10,
           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));

  // __kmp_all_nth is the runtime-wide thread count; TCR_4 reads it with the
  // runtime's 4-byte coherent-read convention.
  return TCR_4(__kmp_all_nth);
}
121 
122 /*!
123 @ingroup THREAD_STATES
124 @param loc Source location information.
125 @return The thread number of the calling thread in the innermost active parallel
126 construct.
127 */
128 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
129   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
130   return __kmp_tid_from_gtid(__kmp_entry_gtid());
131 }
132 
133 /*!
134 @ingroup THREAD_STATES
135 @param loc Source location information.
136 @return The number of threads in the innermost active parallel construct.
137 */
138 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
139   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
140 
141   return __kmp_entry_thread()->th.th_team->t.t_nproc;
142 }
143 
144 /*!
145  * @ingroup DEPRECATED
146  * @param loc location description
147  *
148  * This function need not be called. It always returns TRUE.
149  */
150 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
151 #ifndef KMP_DEBUG
152 
153   return TRUE;
154 
155 #else
156 
157   const char *semi2;
158   const char *semi3;
159   int line_no;
160 
161   if (__kmp_par_range == 0) {
162     return TRUE;
163   }
164   semi2 = loc->psource;
165   if (semi2 == NULL) {
166     return TRUE;
167   }
168   semi2 = strchr(semi2, ';');
169   if (semi2 == NULL) {
170     return TRUE;
171   }
172   semi2 = strchr(semi2 + 1, ';');
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   if (__kmp_par_range_filename[0]) {
177     const char *name = semi2 - 1;
178     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
179       name--;
180     }
181     if ((*name == '/') || (*name == ';')) {
182       name++;
183     }
184     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
185       return __kmp_par_range < 0;
186     }
187   }
188   semi3 = strchr(semi2 + 1, ';');
189   if (__kmp_par_range_routine[0]) {
190     if ((semi3 != NULL) && (semi3 > semi2) &&
191         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
192       return __kmp_par_range < 0;
193     }
194   }
195   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
196     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
197       return __kmp_par_range > 0;
198     }
199     return __kmp_par_range < 0;
200   }
201   return TRUE;
202 
203 #endif /* KMP_DEBUG */
204 }
205 
206 /*!
207 @ingroup THREAD_STATES
208 @param loc Source location information.
209 @return 1 if this thread is executing inside an active parallel region, zero if
210 not.
211 */
212 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
213   return __kmp_entry_thread()->th.th_root->r.r_active;
214 }
215 
216 /*!
217 @ingroup PARALLEL
218 @param loc source location information
219 @param global_tid global thread number
220 @param num_threads number of threads requested for this parallel construct
221 
222 Set the number of threads to be used by the next fork spawned by this thread.
223 This call is only required if the parallel construct has a `num_threads` clause.
224 */
225 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
226                              kmp_int32 num_threads) {
227   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
228                 global_tid, num_threads));
229 
230   __kmp_push_num_threads(loc, global_tid, num_threads);
231 }
232 
// Compiler-visible counterpart of __kmpc_push_num_threads. Intentionally
// empty: the pushed num_threads value is consumed (popped) automatically by
// the fork, so there is nothing left to undo here.
void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));

  /* the num_threads are automatically popped */
}
238 
239 #if OMP_40_ENABLED
240 
241 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
242                            kmp_int32 proc_bind) {
243   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
244                 proc_bind));
245 
246   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
247 }
248 
249 #endif /* OMP_40_ENABLED */
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
260 */
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  // Count nested vs. top-level parallel regions separately.
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    // Publish the parent task's frame pointer and the return address before
    // forking so OMPT tools can attribute the region. For a serialized team
    // the frame lives in the lightweight task team; otherwise it is in the
    // implicit task data of the parent team.
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    // Fork: run the compiler-outlined microtask on the team. On some
    // Linux targets the va_list is passed by reference (see tracker note).
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    // Join: the master waits for the team and tears down the region.
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  // Mirror the timer transition done on entry: return to the serial timer
  // if we came from a serial region, otherwise pop the pushed timer.
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
338 
339 #if OMP_40_ENABLED
340 /*!
341 @ingroup PARALLEL
342 @param loc source location information
343 @param global_tid global thread number
344 @param num_teams number of teams requested for the teams construct
345 @param num_threads number of threads per team requested for the teams construct
346 
347 Set the number of teams to be used by the teams construct.
348 This call is only required if the teams construct has a `num_teams` clause
349 or a `thread_limit` clause (or both).
350 */
351 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
352                            kmp_int32 num_teams, kmp_int32 num_threads) {
353   KA_TRACE(20,
354            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
355             global_tid, num_teams, num_threads));
356 
357   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
358 }
359 
360 /*!
361 @ingroup PARALLEL
362 @param loc  source location information
363 @param argc  total number of arguments in the ellipsis
364 @param microtask  pointer to callback routine consisting of outlined teams
365 construct
366 @param ...  pointers to shared variables that aren't global
367 
368 Do the actual fork and call the microtask in the relevant number of threads.
369 */
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

  KMP_COUNT_BLOCK(OMP_TEAMS);

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  // Publish the implicit task's frame and the return address so OMPT tools
  // can attribute the teams region to its call site.
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  // Fork the league of teams: __kmp_teams_master is the wrapped task that
  // each team's master runs; it invokes the user's outlined microtask.
  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  // Reset teams state on the encountering thread. The th_teams_size pair is
  // cleared with a single 64-bit store (assumes the struct occupies 64 bits
  // -- NOTE(review): confirm against the kmp_teams_size definition).
  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
}
425 #endif /* OMP_40_ENABLED */
426 
427 // I don't think this function should ever have been exported.
428 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
429 // openmp code ever called it, but it's been exported from the RTL for so
430 // long that I'm afraid to remove the definition.
431 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
432 
433 /*!
434 @ingroup PARALLEL
435 @param loc  source location information
436 @param global_tid  global thread number
437 
438 Enter a serialized parallel construct. This interface is used to handle a
439 conditional parallel region, like this,
440 @code
441 #pragma omp parallel if (condition)
442 @endcode
443 when the condition is false.
444 */
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
// The implementation is now in kmp_runtime.cpp so that it can share static
// functions with kmp_fork_call since the tasks to be done are similar in
// each case.
#if OMPT_SUPPORT
  // Capture the caller's return address before delegating so OMPT callbacks
  // fired inside __kmp_serialized_parallel report the correct code pointer.
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}
454 
455 /*!
456 @ingroup PARALLEL
457 @param loc  source location information
458 @param global_tid  global thread number
459 
460 Leave a serialized parallel construct.
461 */
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

#if OMP_45_ENABLED
  kmp_task_team_t *task_team = this_thr->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
#endif

  KMP_MB();
  // Sanity: we must really be inside a serialized region owned by this
  // thread, and not at the root team.
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  // Report implicit-task end and parallel end to OMPT tools, then unlink the
  // lightweight task team for this serialized level.
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != omp_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num);
    }

    // reset clear the task id only after unlinking the task
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = omp_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }

  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

/* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    // Restore the FP control state that was saved when the region was
    // serialized (only if inheritance of FP control is enabled).
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    // Rebind the thread to the parent team and restore its cached team info.
    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    // Still nested inside another serialized level: only the counter was
    // decremented above; nothing else to restore yet.
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? omp_state_work_serial
                                           : omp_state_work_parallel);
#endif
}
601 
602 /*!
603 @ingroup SYNCHRONIZATION
604 @param loc  source location information.
605 
606 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
607 depending on the memory ordering convention obeyed by the compiler
608 even that may not be necessary).
609 */
void __kmpc_flush(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MB(); /* Flush all pending memory write invalidates.  */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is addressed either (mfence needed).
  // Probably the non-temporal load monvtdqa instruction should also be
  // addressed.
  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
// Emit a full memory fence using whatever intrinsic the compiler provides.
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  // while (!flag) {
  //   #pragma omp flush(flag)
  // }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield(TRUE);
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Notify OMPT tools that a flush happened at this code location.
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
671 
672 /* -------------------------------------------------------------------------- */
673 /*!
674 @ingroup SYNCHRONIZATION
675 @param loc source location information
676 @param global_tid thread id.
677 
678 Execute a barrier.
679 */
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }

    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  // Publish the current task's frame (if not already set) and the return
  // address so OMPT tools can attribute barrier wait time to this call site.
  omp_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  //   this function is called when 'barrier' directive is present or
  //   implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Clear the frame pointer we may have set above, now that the barrier is
  // over.
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif
}
719 
720 /* The BARRIER for a MASTER section is always explicit   */
721 /*!
722 @ingroup WORK_SHARING
723 @param loc  source location information.
724 @param global_tid  global thread number .
725 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
726 */
kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
  int status = 0;

  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  // Only the team's master thread (gtid that maps to tid 0) executes the
  // master block.
  if (KMP_MASTER_GTID(global_tid)) {
    KMP_COUNT_BLOCK(OMP_MASTER);
    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Fire the master-begin callback only on the thread that will execute the
  // block.
  if (status) {
    if (ompt_enabled.ompt_callback_master) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;

      int tid = __kmp_tid_from_gtid(global_tid);
      ompt_callbacks.ompt_callback(ompt_callback_master)(
          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  if (__kmp_env_consistency_check) {
// The executing thread pushes the construct on its sync stack; other
// threads only validate nesting.
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL);
#endif
  }

  return status;
}
772 
773 /*!
774 @ingroup WORK_SHARING
775 @param loc  source location information.
776 @param global_tid  global thread number .
777 
778 Mark the end of a <tt>master</tt> region. This should only be called by the
779 thread that executes the <tt>master</tt> region.
780 */
void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));

  // Must be called by the same (master) thread that entered the region.
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Mirror of the scope_begin callback fired in __kmpc_master.
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  if (ompt_enabled.ompt_callback_master) {
    int tid = __kmp_tid_from_gtid(global_tid);
    ompt_callbacks.ompt_callback(ompt_callback_master)(
        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (__kmp_env_consistency_check) {
    if (global_tid < 0)
      KMP_WARNING(ThreadIdentInvalid);

    // Pop the construct pushed by __kmpc_master (only on the master thread).
    if (KMP_MASTER_GTID(global_tid))
      __kmp_pop_sync(global_tid, ct_master, loc);
  }
}
807 
808 /*!
809 @ingroup WORK_SHARING
810 @param loc  source location information.
811 @param gtid  global thread number.
812 
813 Start execution of an <tt>ordered</tt> construct.
814 */
void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if USE_ITT_BUILD
  __kmp_itt_ordered_prep(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Before blocking, record the wait state and wait id (the team's ordered
  // dispatch counter serves as the lock identity) and fire mutex-acquire.
  kmp_team_t *team;
  omp_wait_id_t lck;
  void *codeptr_ra;
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    team = __kmp_team_from_gtid(gtid);
    lck = (omp_wait_id_t)&team->t.t_ordered.dt.t_value;
    /* OMPT state update */
    th->th.ompt_thread_info.wait_id = lck;
    th->th.ompt_thread_info.state = omp_state_wait_ordered;

    /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
          (omp_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

  // Wait for our turn: use the dispatch-specific "deo" (enter-ordered)
  // routine if one is installed, otherwise the generic parallel one.
  if (th->th.th_dispatch->th_deo_fcn != 0)
    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_deo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // We own the ordered section now: leave the wait state and fire
  // mutex-acquired.
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    th->th.ompt_thread_info.state = omp_state_work_parallel;
    th->th.ompt_thread_info.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_ordered, (omp_wait_id_t)lck, codeptr_ra);
    }
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_ordered_start(gtid);
#endif /* USE_ITT_BUILD */
}
877 
878 /*!
879 @ingroup WORK_SHARING
880 @param loc  source location information.
881 @param gtid  global thread number.
882 
883 End execution of an <tt>ordered</tt> construct.
884 */
void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
  int cid = 0;
  kmp_info_t *th;

  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));

#if USE_ITT_BUILD
  __kmp_itt_ordered_end(gtid);
// TODO: ordered_wait_id
#endif /* USE_ITT_BUILD */

  th = __kmp_threads[gtid];

  // Release our turn: use the dispatch-specific "dxo" (exit-ordered)
  // routine if one is installed, otherwise the generic parallel one.
  if (th->th.th_dispatch->th_dxo_fcn != 0)
    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
  else
    __kmp_parallel_dxo(&gtid, &cid, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Fire mutex-released with the same wait id used on acquire (the team's
  // ordered dispatch counter).
  OMPT_STORE_RETURN_ADDRESS(gtid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_ordered,
        (omp_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
        OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
}
913 
914 #if KMP_USE_DYNAMIC_LOCK
915 
916 static __forceinline void
917 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
918                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
919   // Pointer to the allocated indirect lock is written to crit, while indexing
920   // is ignored.
921   void *idx;
922   kmp_indirect_lock_t **lck;
923   lck = (kmp_indirect_lock_t **)crit;
924   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
925   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
926   KMP_SET_I_LOCK_LOCATION(ilk, loc);
927   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
928   KA_TRACE(20,
929            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
930 #if USE_ITT_BUILD
931   __kmp_itt_critical_creating(ilk->lock, loc);
932 #endif
933   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
934   if (status == 0) {
935 #if USE_ITT_BUILD
936     __kmp_itt_critical_destroyed(ilk->lock);
937 #endif
938     // We don't really need to destroy the unclaimed lock here since it will be
939     // cleaned up at program exit.
940     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
941   }
942   KMP_DEBUG_ASSERT(*lck != NULL);
943 }
944 
// Fast-path acquire of a test-and-set (TAS) lock. First tries a relaxed load
// followed by an acquire CAS; on contention it spins, applying exponential
// backoff between CAS retries and yielding (unconditionally when the process
// is oversubscribed, i.e. more OpenMP threads than available processors).
// The owner is encoded as gtid + 1 so that KMP_LOCK_FREE(tas) can mean
// "unowned". Written as a macro so it can be stamped inline into several
// lock entry points; comments are kept outside the macro body on purpose.
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
      kmp_uint32 spins;                                                        \
      KMP_FSYNC_PREPARE(l);                                                    \
      KMP_INIT_YIELD(spins);                                                   \
      if (TCR_4(__kmp_nth) >                                                   \
          (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
        KMP_YIELD(TRUE);                                                       \
      } else {                                                                 \
        KMP_YIELD_SPIN(spins);                                                 \
      }                                                                        \
      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
      while (                                                                  \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {  \
        __kmp_spin_backoff(&backoff);                                          \
        if (TCR_4(__kmp_nth) >                                                 \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
          KMP_YIELD(TRUE);                                                     \
        } else {                                                               \
          KMP_YIELD_SPIN(spins);                                               \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(l);                                                     \
  }
977 
// Fast-path non-blocking try-acquire of a TAS lock: rc becomes nonzero only
// when the lock is observed free AND the acquire CAS claims it (owner encoded
// as gtid + 1, matching KMP_ACQUIRE_TAS_LOCK). Never spins or yields.
#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
  }
987 
// Fast-path release of a TAS lock: a single release-ordered store of the
// "free" value. No wakeup is required because TAS waiters spin.
#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
991 
992 #if KMP_USE_FUTEX
993 
994 #include <sys/syscall.h>
995 #include <unistd.h>
996 #ifndef FUTEX_WAIT
997 #define FUTEX_WAIT 0
998 #endif
999 #ifndef FUTEX_WAKE
1000 #define FUTEX_WAKE 1
1001 #endif
1002 
// Fast-path acquire of a futex-based lock. The poll word encodes the owner as
// (gtid + 1) << 1, with bit 0 serving as the "contended" flag that tells the
// releaser a FUTEX_WAKE is needed. A contending thread first CASes the
// contended bit on, then blocks in FUTEX_WAIT against the value it observed;
// if the word changed in the meantime (or the syscall returns nonzero) it
// simply retries the outer CAS loop.
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
    KMP_MB();                                                                  \
    KMP_FSYNC_PREPARE(ftx);                                                    \
    kmp_int32 poll_val;                                                        \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
      if (!cond) {                                                             \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
                                         poll_val |                            \
                                             KMP_LOCK_BUSY(1, futex))) {       \
          continue;                                                            \
        }                                                                      \
        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
      }                                                                        \
      kmp_int32 rc;                                                            \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
                        NULL, NULL, 0)) != 0) {                                \
        continue;                                                              \
      }                                                                        \
      gtid_code |= 1;                                                          \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(ftx);                                                   \
  }
1032 
// Fast-path non-blocking try-acquire of a futex lock: rc becomes TRUE only if
// the acquire-CAS claims the lock. Owner is encoded as (gtid + 1) << 1,
// matching KMP_ACQUIRE_FUTEX_LOCK; bit 0 (contended flag) is left clear.
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    /* Parenthesized explicitly: '+' binds tighter than '<<', so the old    */ \
    /* spelling "gtid + 1 << 1" was equivalent but misleading.              */ \
    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
      KMP_FSYNC_ACQUIRED(ftx);                                                 \
      rc = TRUE;                                                               \
    } else {                                                                   \
      rc = FALSE;                                                              \
    }                                                                          \
  }
1045 
// Fast-path release of a futex lock: atomically swap in the "free" value and,
// if the contended bit (bit 0) was set in the previous value, FUTEX_WAKE one
// sleeping waiter. Ends with a yield when the process is oversubscribed
// (more OpenMP threads than available processors).
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    KMP_MB();                                                                  \
    KMP_FSYNC_RELEASING(ftx);                                                  \
    kmp_int32 poll_val =                                                       \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
    }                                                                          \
    KMP_MB();                                                                  \
    KMP_YIELD(TCR_4(__kmp_nth) >                                               \
              (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
  }
1062 
1063 #endif // KMP_USE_FUTEX
1064 
1065 #else // KMP_USE_DYNAMIC_LOCK
1066 
// Return the user lock backing the critical section identified by crit,
// allocating and publishing it on first use. Thread-safe via a double-checked
// load plus compare-and-store: losers of the publication race free their
// freshly allocated lock and adopt the winner's.
static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
                                                      ident_t const *loc,
                                                      kmp_int32 gtid) {
  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;

  // Because of the double-check, the following load doesn't need to be volatile
  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);

  if (lck == NULL) {
    void *idx;

    // Allocate & initialize the lock.
    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
    __kmp_init_user_lock_with_checks(lck);
    __kmp_set_user_lock_location(lck, loc);
#if USE_ITT_BUILD
    __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of underlying lock. It is the only place where we can guarantee it. There
// are chances the lock will destroyed with no usage, but it is not a
// problem, because this is not real event seen by user but rather setting
// name for object (lock). See more details in kmp_itt.h.
#endif /* USE_ITT_BUILD */

    // Use a cmpxchg instruction to slam the start of the critical section with
    // the lock pointer.  If another thread beat us to it, deallocate the lock,
    // and use the lock that the other thread allocated.
    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);

    if (status == 0) {
// Deallocate the lock and reload the value.
#if USE_ITT_BUILD
      __kmp_itt_critical_destroyed(lck);
// Let ITT know the lock is destroyed and the same memory location may be reused
// for another purpose.
#endif /* USE_ITT_BUILD */
      __kmp_destroy_user_lock_with_checks(lck);
      __kmp_user_lock_free(&idx, gtid, lck);
      // Adopt the winner's lock; it must be non-NULL by now.
      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
      KMP_DEBUG_ASSERT(lck != NULL);
    }
  }
  return lck;
}
1112 
1113 #endif // KMP_USE_DYNAMIC_LOCK
1114 
1115 /*!
1116 @ingroup WORK_SHARING
1117 @param loc  source location information.
@param global_tid  global thread number.
1119 @param crit identity of the critical section. This could be a pointer to a lock
1120 associated with the critical section, or some other suitably unique value.
1121 
1122 Enter code protected by a `critical` construct.
1123 This function blocks until the executing thread can enter the critical section.
1124 */
1125 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1126                      kmp_critical_name *crit) {
1127 #if KMP_USE_DYNAMIC_LOCK
1128 #if OMPT_SUPPORT && OMPT_OPTIONAL
1129   OMPT_STORE_RETURN_ADDRESS(global_tid);
1130 #endif // OMPT_SUPPORT
1131   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1132 #else
1133   KMP_COUNT_BLOCK(OMP_CRITICAL);
1134 #if OMPT_SUPPORT && OMPT_OPTIONAL
1135   omp_state_t prev_state = omp_state_undefined;
1136   ompt_thread_info_t ti;
1137 #endif
1138   kmp_user_lock_p lck;
1139 
1140   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1141 
1142   // TODO: add THR_OVHD_STATE
1143 
1144   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1145   KMP_CHECK_USER_LOCK_INIT();
1146 
1147   if ((__kmp_user_lock_kind == lk_tas) &&
1148       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1149     lck = (kmp_user_lock_p)crit;
1150   }
1151 #if KMP_USE_FUTEX
1152   else if ((__kmp_user_lock_kind == lk_futex) &&
1153            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1154     lck = (kmp_user_lock_p)crit;
1155   }
1156 #endif
1157   else { // ticket, queuing or drdpa
1158     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1159   }
1160 
1161   if (__kmp_env_consistency_check)
1162     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1163 
1164 // since the critical directive binds to all threads, not just the current
1165 // team we have to check this even if we are in a serialized team.
1166 // also, even if we are the uber thread, we still have to conduct the lock,
1167 // as we have to contend with sibling threads.
1168 
1169 #if USE_ITT_BUILD
1170   __kmp_itt_critical_acquiring(lck);
1171 #endif /* USE_ITT_BUILD */
1172 #if OMPT_SUPPORT && OMPT_OPTIONAL
1173   OMPT_STORE_RETURN_ADDRESS(gtid);
1174   void *codeptr_ra = NULL;
1175   if (ompt_enabled.enabled) {
1176     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1177     /* OMPT state update */
1178     prev_state = ti.state;
1179     ti.wait_id = (omp_wait_id_t)lck;
1180     ti.state = omp_state_wait_critical;
1181 
1182     /* OMPT event callback */
1183     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1184     if (ompt_enabled.ompt_callback_mutex_acquire) {
1185       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1186           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1187           (omp_wait_id_t)crit, codeptr_ra);
1188     }
1189   }
1190 #endif
1191   // Value of 'crit' should be good for using as a critical_id of the critical
1192   // section directive.
1193   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1194 
1195 #if USE_ITT_BUILD
1196   __kmp_itt_critical_acquired(lck);
1197 #endif /* USE_ITT_BUILD */
1198 #if OMPT_SUPPORT && OMPT_OPTIONAL
1199   if (ompt_enabled.enabled) {
1200     /* OMPT state update */
1201     ti.state = prev_state;
1202     ti.wait_id = 0;
1203 
1204     /* OMPT event callback */
1205     if (ompt_enabled.ompt_callback_mutex_acquired) {
1206       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1207           ompt_mutex_critical, (omp_wait_id_t)crit, codeptr_ra);
1208     }
1209   }
1210 #endif
1211   KMP_POP_PARTITIONED_TIMER();
1212 
1213   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1214   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1215 #endif // KMP_USE_DYNAMIC_LOCK
1216 }
1217 
1218 #if KMP_USE_DYNAMIC_LOCK
1219 
// Converts the given hint to an internal lock implementation sequence
// (kmp_dyna_lockseq_t). TSX sequences (hle/rtm/adaptive) are chosen only when
// built with KMP_USE_TSX and, for rtm/adaptive, when the CPU reports RTM
// support; otherwise the configured default __kmp_user_lock_seq is returned.
// Mutually contradictory hint combinations also fall back to the default.
static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
#if KMP_USE_TSX
#define KMP_TSX_LOCK(seq) lockseq_##seq
#else
#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
#else
#define KMP_CPUINFO_RTM 0
#endif

  // Hints that do not require further logic
  if (hint & kmp_lock_hint_hle)
    return KMP_TSX_LOCK(hle);
  if (hint & kmp_lock_hint_rtm)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
  if (hint & kmp_lock_hint_adaptive)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

  // Rule out conflicting hints first by returning the default lock
  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
    return __kmp_user_lock_seq;
  if ((hint & omp_lock_hint_speculative) &&
      (hint & omp_lock_hint_nonspeculative))
    return __kmp_user_lock_seq;

  // Do not even consider speculation when it appears to be contended
  if (hint & omp_lock_hint_contended)
    return lockseq_queuing;

  // Uncontended lock without speculation
  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
    return lockseq_tas;

  // HLE lock for speculation
  if (hint & omp_lock_hint_speculative)
    return KMP_TSX_LOCK(hle);

  return __kmp_user_lock_seq;
}
1263 
1264 #if OMPT_SUPPORT && OMPT_OPTIONAL
1265 #if KMP_USE_DYNAMIC_LOCK
// Map a dynamically dispatched lock to the kmp_mutex_impl_t reported through
// OMPT. For a direct lock, the tag embedded in user_lock identifies the
// implementation; a tag of 0 means the lock is indirect and must be looked up
// in the indirect-lock table. Callers that already hold the indirect lock may
// pass user_lock == NULL and supply ilock directly.
static kmp_mutex_impl_t
__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
  if (user_lock) {
    switch (KMP_EXTRACT_D_TAG(user_lock)) {
    case 0: // not a direct lock; fall through to the indirect lookup below
      break;
#if KMP_USE_FUTEX
    case locktag_futex:
      return kmp_mutex_impl_queuing;
#endif
    case locktag_tas:
      return kmp_mutex_impl_spin;
#if KMP_USE_TSX
    case locktag_hle:
      return kmp_mutex_impl_speculative;
#endif
    default:
      return ompt_mutex_impl_unknown;
    }
    ilock = KMP_LOOKUP_I_LOCK(user_lock);
  }
  KMP_ASSERT(ilock);
  switch (ilock->type) {
#if KMP_USE_TSX
  case locktag_adaptive:
  case locktag_rtm:
    return kmp_mutex_impl_speculative;
#endif
  case locktag_nested_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case locktag_nested_futex:
#endif
  case locktag_ticket:
  case locktag_queuing:
  case locktag_drdpa:
  case locktag_nested_ticket:
  case locktag_nested_queuing:
  case locktag_nested_drdpa:
    return kmp_mutex_impl_queuing;
  default:
    return ompt_mutex_impl_unknown;
  }
}
1310 #else
// For locks without dynamic binding: derive the OMPT mutex implementation
// from the globally configured user lock kind (__kmp_user_lock_kind).
static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
  switch (__kmp_user_lock_kind) {
  case lk_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case lk_futex: // futex is reported as queuing; fall through
#endif
  case lk_ticket:
  case lk_queuing:
  case lk_drdpa:
    return kmp_mutex_impl_queuing;
#if KMP_USE_TSX
  case lk_hle:
  case lk_rtm:
  case lk_adaptive:
    return kmp_mutex_impl_speculative;
#endif
  default:
    return ompt_mutex_impl_unknown;
  }
}
1333 #endif // KMP_USE_DYNAMIC_LOCK
1334 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1335 
1336 /*!
1337 @ingroup WORK_SHARING
1338 @param loc  source location information.
1339 @param global_tid  global thread number.
1340 @param crit identity of the critical section. This could be a pointer to a lock
1341 associated with the critical section, or some other suitably unique value.
1342 @param hint the lock hint.
1343 
1344 Enter code protected by a `critical` construct with a hint. The hint value is
1345 used to suggest a lock implementation. This function blocks until the executing
1346 thread can enter the critical section unless the hint suggests use of
1347 speculative execution and the hardware supports it.
1348 */
void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
                               kmp_critical_name *crit, uintptr_t hint) {
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  kmp_user_lock_p lck;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  omp_state_t prev_state = omp_state_undefined;
  ompt_thread_info_t ti;
  // This is the case, if called from __kmpc_critical:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
#endif

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  if (*lk == 0) {
    // First use of this critical section: pick the lock implementation from
    // the hint and publish either a direct-lock tag or an indirect lock.
    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
    if (KMP_IS_D_LOCK(lckseq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(lckseq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    // Direct lock: the lock state lives inside the critical_name word itself.
    lck = (kmp_user_lock_p)lk;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      // NOTE(review): 'ti' is a local copy of the thread's OMPT info; the
      // state/wait_id updates below do not persist to the thread descriptor —
      // confirm this matches the intended OMPT semantics.
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (omp_wait_id_t)lck;
      ti.state = omp_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(crit), (omp_wait_id_t)crit, codeptr);
      }
    }
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
    }
  } else {
    // Indirect lock: crit holds a pointer to the allocated lock descriptor.
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (omp_wait_id_t)lck;
      ti.state = omp_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(0, ilk), (omp_wait_id_t)crit, codeptr);
      }
    }
#endif
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }
  KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (omp_wait_id_t)crit, codeptr);
    }
  }
#endif

  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
} // __kmpc_critical_with_hint
1466 
1467 #endif // KMP_USE_DYNAMIC_LOCK
1468 
1469 /*!
1470 @ingroup WORK_SHARING
1471 @param loc  source location information.
@param global_tid  global thread number.
1473 @param crit identity of the critical section. This could be a pointer to a lock
1474 associated with the critical section, or some other suitably unique value.
1475 
1476 Leave a critical section, releasing any lock that was held during its execution.
1477 */
void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
                         kmp_critical_name *crit) {
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));

#if KMP_USE_DYNAMIC_LOCK
  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    // Direct lock: the lock state lives inside the critical_name word.
    lck = (kmp_user_lock_p)crit;
    KMP_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_RELEASE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
    }
  } else {
    // Indirect lock: crit holds a pointer to the lock descriptor.
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    KMP_ASSERT(ilk != NULL);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // Mirror the lock-selection logic of __kmpc_critical(): small TAS/futex
  // locks are embedded in crit; everything else is looked up by pointer.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
  }

  KMP_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

#if USE_ITT_BUILD
  __kmp_itt_critical_releasing(lck);
#endif /* USE_ITT_BUILD */
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK

#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT release event triggers after lock is released; place here to trigger
   * for all #if branches */
  // NOTE(review): the return address is stored under global_tid but loaded
  // with index 0 — for threads other than 0 this reads another thread's slot;
  // confirm whether OMPT_LOAD_RETURN_ADDRESS(global_tid) was intended.
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_critical, (omp_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
  }
#endif

  KMP_POP_PARTITIONED_TIMER();
  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
}
1563 
1564 /*!
1565 @ingroup SYNCHRONIZATION
1566 @param loc source location information
1567 @param global_tid thread id.
1568 @return one if the thread should execute the master block, zero otherwise
1569 
1570 Start execution of a combined barrier and master. The barrier is executed inside
1571 this function.
1572 */
1573 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1574   int status;
1575 
1576   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1577 
1578   if (!TCR_4(__kmp_init_parallel))
1579     __kmp_parallel_initialize();
1580 
1581   if (__kmp_env_consistency_check)
1582     __kmp_check_barrier(global_tid, ct_barrier, loc);
1583 
1584 #if OMPT_SUPPORT
1585   omp_frame_t *ompt_frame;
1586   if (ompt_enabled.enabled) {
1587     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1588     if (ompt_frame->enter_frame == NULL)
1589       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1590     OMPT_STORE_RETURN_ADDRESS(global_tid);
1591   }
1592 #endif
1593 #if USE_ITT_NOTIFY
1594   __kmp_threads[global_tid]->th.th_ident = loc;
1595 #endif
1596   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1597 #if OMPT_SUPPORT && OMPT_OPTIONAL
1598   if (ompt_enabled.enabled) {
1599     ompt_frame->enter_frame = NULL;
1600   }
1601 #endif
1602 
1603   return (status != 0) ? 0 : 1;
1604 }
1605 
1606 /*!
1607 @ingroup SYNCHRONIZATION
1608 @param loc source location information
1609 @param global_tid thread id.
1610 
1611 Complete the execution of a combined barrier and master. This function should
1612 only be called at the completion of the <tt>master</tt> code. Other threads will
1613 still be waiting at the barrier and this call releases them.
1614 */
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));

  // Release the threads still waiting in the split barrier entered by
  // __kmpc_barrier_master().
  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}
1620 
1621 /*!
1622 @ingroup SYNCHRONIZATION
1623 @param loc source location information
1624 @param global_tid thread id.
1625 @return one if the thread should execute the master block, zero otherwise
1626 
1627 Start execution of a combined barrier and master(nowait) construct.
1628 The barrier is executed inside this function.
There is no equivalent "end" function, since the construct needs no separate
release: the master test and any required cleanup happen inside this call.
1630 */
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret;

  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  omp_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    // Record the enter frame / return address so the tool can attribute the
    // barrier to the user code that invoked it.
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  // Full (non-split) barrier first, then the master test below.
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif

  ret = __kmpc_master(loc, global_tid);

  if (__kmp_env_consistency_check) {
    /*  there's no __kmpc_end_master called; so the (stats) */
    /*  actions of __kmpc_end_master are done here          */

    if (global_tid < 0) {
      KMP_WARNING(ThreadIdentInvalid);
    }
    if (ret) {
      /* only one thread should do the pop since only */
      /* one did the push (see __kmpc_master())       */

      __kmp_pop_sync(global_tid, ct_master, loc);
    }
  }

  return (ret);
}
1684 
1685 /* The BARRIER for a SINGLE process section is always explicit   */
1686 /*!
1687 @ingroup WORK_SHARING
1688 @param loc  source location information
1689 @param global_tid  global thread number
1690 @return One if this thread should execute the single construct, zero otherwise.
1691 
1692 Test whether to execute a <tt>single</tt> construct.
1693 There are no implicit barriers in the two "single" calls, rather the compiler
1694 should introduce an explicit barrier if it is required.
1695 */
1696 
kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
  // TRUE only for the one thread that wins the single region.
  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);

  if (rc) {
    // We are going to execute the single statement, so we should count it.
    KMP_COUNT_BLOCK(OMP_SINGLE);
    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.enabled) {
    if (rc) {
      // Executor: report scope_begin here; the matching scope_end is
      // dispatched from __kmpc_end_single.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_executor, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    } else {
      // Non-executors never call __kmpc_end_single, so both scope_begin and
      // scope_end are reported here back-to-back.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_end,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    }
  }
#endif

  return rc;
}
1739 
1740 /*!
1741 @ingroup WORK_SHARING
1742 @param loc  source location information
1743 @param global_tid  global thread number
1744 
1745 Mark the end of a <tt>single</tt> construct.  This function should
1746 only be called by the thread that executed the block of code protected
1747 by the `single` construct.
1748 */
void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
  // Only the thread that executed the single block calls this (see the
  // doxygen comment above); it pops the timer pushed in __kmpc_single.
  __kmp_exit_single(global_tid);
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  // Matching scope_end for the executor's scope_begin in __kmpc_single.
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_single_executor, ompt_scope_end,
        &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
1767 
1768 /*!
1769 @ingroup WORK_SHARING
1770 @param loc Source location
1771 @param global_tid Global thread id
1772 
1773 Mark the end of a statically scheduled loop.
1774 */
void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
  KMP_POP_PARTITIONED_TIMER();
  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_work_type_t ompt_work_type = ompt_work_loop;
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    // Determine workshare type from the ident flags; defaults to loop when
    // no flag is set (or when loc is NULL).
    if (loc != NULL) {
      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
        ompt_work_type = ompt_work_loop;
      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
        ompt_work_type = ompt_work_sections;
      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
        ompt_work_type = ompt_work_distribute;
      } else {
        // use default set above.
        // a warning about this case is provided in __kmpc_for_static_init
      }
      KMP_DEBUG_ASSERT(ompt_work_type);
    }
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(global_tid, ct_pdo, loc);
}
1806 
1807 // User routines which take C-style arguments (call by value)
1808 // different from the Fortran equivalent routines
1809 
1810 void ompc_set_num_threads(int arg) {
1811   // !!!!! TODO: check the per-task binding
1812   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1813 }
1814 
1815 void ompc_set_dynamic(int flag) {
1816   kmp_info_t *thread;
1817 
1818   /* For the thread-private implementation of the internal controls */
1819   thread = __kmp_entry_thread();
1820 
1821   __kmp_save_internal_controls(thread);
1822 
1823   set__dynamic(thread, flag ? TRUE : FALSE);
1824 }
1825 
1826 void ompc_set_nested(int flag) {
1827   kmp_info_t *thread;
1828 
1829   /* For the thread-private internal controls implementation */
1830   thread = __kmp_entry_thread();
1831 
1832   __kmp_save_internal_controls(thread);
1833 
1834   set__nested(thread, flag ? TRUE : FALSE);
1835 }
1836 
1837 void ompc_set_max_active_levels(int max_active_levels) {
1838   /* TO DO */
1839   /* we want per-task implementation of this internal control */
1840 
1841   /* For the per-thread internal controls implementation */
1842   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1843 }
1844 
1845 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1846   // !!!!! TODO: check the per-task binding
1847   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1848 }
1849 
1850 int ompc_get_ancestor_thread_num(int level) {
1851   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1852 }
1853 
1854 int ompc_get_team_size(int level) {
1855   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1856 }
1857 
// kmp_set_stacksize entry point (int-sized variant).
void kmpc_set_stacksize(int arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}
1862 
// kmp_set_stacksize_s entry point (size_t variant).
void kmpc_set_stacksize_s(size_t arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}
1867 
1868 void kmpc_set_blocktime(int arg) {
1869   int gtid, tid;
1870   kmp_info_t *thread;
1871 
1872   gtid = __kmp_entry_gtid();
1873   tid = __kmp_tid_from_gtid(gtid);
1874   thread = __kmp_thread_from_gtid(gtid);
1875 
1876   __kmp_aux_set_blocktime(arg, thread, tid);
1877 }
1878 
// kmp_set_library entry point.
void kmpc_set_library(int arg) {
  // __kmp_user_set_library initializes the library if needed
  __kmp_user_set_library((enum library_type)arg);
}
1883 
1884 void kmpc_set_defaults(char const *str) {
1885   // __kmp_aux_set_defaults initializes the library if needed
1886   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1887 }
1888 
1889 void kmpc_set_disp_num_buffers(int arg) {
1890   // ignore after initialization because some teams have already
1891   // allocated dispatch buffers
1892   if (__kmp_init_serial == 0 && arg > 0)
1893     __kmp_dispatch_num_buffers = arg;
1894 }
1895 
// kmp_set_affinity_mask_proc entry point; returns -1 when affinity is
// unsupported (stub build or no affinity support).
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  // Middle initialization is required before touching affinity state.
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_set_affinity_mask_proc(proc, mask);
#endif
}
1906 
// kmp_unset_affinity_mask_proc entry point; returns -1 when affinity is
// unsupported (stub build or no affinity support).
int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  // Middle initialization is required before touching affinity state.
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#endif
}
1917 
// kmp_get_affinity_mask_proc entry point; returns -1 when affinity is
// unsupported (stub build or no affinity support).
int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  // Middle initialization is required before touching affinity state.
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_get_affinity_mask_proc(proc, mask);
#endif
}
1928 
1929 /* -------------------------------------------------------------------------- */
1930 /*!
1931 @ingroup THREADPRIVATE
1932 @param loc       source location information
1933 @param gtid      global thread number
1934 @param cpy_size  size of the cpy_data buffer
1935 @param cpy_data  pointer to data to be copied
1936 @param cpy_func  helper function to call for copying data
1937 @param didit     flag variable: 1=single thread; 0=not single thread
1938 
1939 __kmpc_copyprivate implements the interface for the private data broadcast
1940 needed for the copyprivate clause associated with a single region in an
1941 OpenMP<sup>*</sup> program (both C and Fortran).
1942 All threads participating in the parallel region call this routine.
1943 One of the threads (called the single thread) should have the <tt>didit</tt>
1944 variable set to 1 and all other threads should have that variable set to 0.
1945 All threads pass a pointer to a data buffer (cpy_data) that they have built.
1946 
1947 The OpenMP specification forbids the use of nowait on the single region when a
1948 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
1949 barrier internally to avoid race conditions, so the code generation for the
1950 single region should avoid generating a barrier after the call to @ref
1951 __kmpc_copyprivate.
1952 
1953 The <tt>gtid</tt> parameter is the global thread id for the current thread.
1954 The <tt>loc</tt> parameter is a pointer to source location information.
1955 
1956 Internal implementation: The single thread will first copy its descriptor
1957 address (cpy_data) to a team-private location, then the other threads will each
1958 call the function pointed to by the parameter cpy_func, which carries out the
1959 copy by copying the data using the cpy_data buffer.
1960 
1961 The cpy_func routine used for the copy and the contents of the data area defined
1962 by cpy_data and cpy_size may be built in any fashion that will allow the copy
1963 to be done. For instance, the cpy_data buffer can hold the actual data to be
1964 copied or it may hold a list of pointers to the data. The cpy_func routine must
1965 interpret the cpy_data buffer appropriately.
1966 
1967 The interface to cpy_func is as follows:
1968 @code
1969 void cpy_func( void *destination, void *source )
1970 @endcode
1971 where void *destination is the cpy_data pointer for the thread being copied to
1972 and void *source is the cpy_data pointer for the thread being copied from.
1973 */
void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
                        void *cpy_data, void (*cpy_func)(void *, void *),
                        kmp_int32 didit) {
  void **data_ptr;

  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));

  KMP_MB();

  // Team-private slot used to publish the single thread's buffer pointer.
  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following two barriers into some kind of split barrier

  // The single thread (didit == 1) publishes its cpy_data pointer.
  if (didit)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  omp_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame == NULL)
      ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  // First barrier: makes the published pointer visible to all threads.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // All other threads copy from the single thread's buffer via cpy_func.
  if (!didit)
    (*cpy_func)(cpy_data, *data_ptr);

// Consider next barrier a user-visible barrier for barrier region boundaries
// Nesting checks are already handled by the single construct checks

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
// tasks can overwrite the location)
#endif
  // Second barrier: keeps the source buffer alive until all copies complete.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = NULL;
  }
#endif
}
2033 
2034 /* -------------------------------------------------------------------------- */
2035 
// Aliases routing the OpenMP lock entry points below through the
// "with checks" variants of the internal user-lock layer.
#define INIT_LOCK __kmp_init_user_lock_with_checks
#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK_TIMED                                              \
  __kmp_acquire_nested_user_lock_with_checks_timed
#define RELEASE_LOCK __kmp_release_user_lock_with_checks
#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
#define TEST_LOCK __kmp_test_user_lock_with_checks
#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2049 
2050 // TODO: Make check abort messages use location info & pass it into
2051 // with_checks routines
2052 
2053 #if KMP_USE_DYNAMIC_LOCK
2054 
2055 // internal lock initializer
// internal lock initializer
// Initializes *lock as either a direct (D) or indirect (I) dynamic lock,
// depending on the requested sequence, and notifies ITT of the new lock.
static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
                                                    kmp_dyna_lockseq_t seq) {
  if (KMP_IS_D_LOCK(seq)) {
    KMP_INIT_D_LOCK(lock, seq);
#if USE_ITT_BUILD
    __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
#endif
  } else {
    KMP_INIT_I_LOCK(lock, seq);
#if USE_ITT_BUILD
    // Indirect locks store the real lock object behind a lookup table.
    kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
    __kmp_itt_lock_creating(ilk->lock, loc);
#endif
  }
}
2071 
2072 // internal nest lock initializer
// internal nest lock initializer
// Maps the requested sequence to its nested counterpart (defaulting to
// nested queuing) and initializes the lock as an indirect (I) lock.
static __forceinline void
__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
                               kmp_dyna_lockseq_t seq) {
#if KMP_USE_TSX
  // Don't have nested lock implementation for speculative locks
  if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
    seq = __kmp_user_lock_seq;
#endif
  switch (seq) {
  case lockseq_tas:
    seq = lockseq_nested_tas;
    break;
#if KMP_USE_FUTEX
  case lockseq_futex:
    seq = lockseq_nested_futex;
    break;
#endif
  case lockseq_ticket:
    seq = lockseq_nested_ticket;
    break;
  case lockseq_queuing:
    seq = lockseq_nested_queuing;
    break;
  case lockseq_drdpa:
    seq = lockseq_nested_drdpa;
    break;
  default:
    // Any other sequence falls back to nested queuing.
    seq = lockseq_nested_queuing;
  }
  KMP_INIT_I_LOCK(lock, seq);
#if USE_ITT_BUILD
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
  __kmp_itt_lock_creating(ilk->lock, loc);
#endif
}
2108 
2109 /* initialize the lock with a hint */
/* initialize the lock with a hint */
void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
                                uintptr_t hint) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
  }

  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // A stored return address is present when called via
  // omp_init_lock_with_hint; otherwise fall back to our own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, (omp_lock_hint_t)hint,
        __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
        codeptr);
  }
#endif
}
2132 
2133 /* initialize the lock with a hint */
/* initialize the nested lock with a hint */
void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
                                     void **user_lock, uintptr_t hint) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
  }

  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // A stored return address is present when called via the omp_* wrapper;
  // otherwise fall back to our own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
        __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
        codeptr);
  }
#endif
}
2156 
2157 #endif // KMP_USE_DYNAMIC_LOCK
2158 
2159 /* initialize the lock */
/* initialize the lock */
void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_lock");
  }
  // Dynamic-lock path: use the globally selected default sequence.
  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
        codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // If the lock object fits inside the user's lock word, use it in place;
  // otherwise allocate a separate lock object.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }
  INIT_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (omp_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_lock
2230 
2231 /* initialize the lock */
/* initialize the nested lock */
void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
  }
  // Dynamic-lock path: use the globally selected default sequence.
  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
        codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_nest_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // If poll word plus nesting depth fit inside the user's lock word, use it
  // in place; otherwise allocate a separate lock object.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }

  INIT_NESTED_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (omp_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_nest_lock
2305 
/* destroy the lock */
void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  // Resolve the actual lock object (direct tag 0 means indirect lock).
  kmp_user_lock_p lck;
  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
    lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
  } else {
    lck = (kmp_user_lock_p)user_lock;
  }
  __kmp_itt_lock_destroyed(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    kmp_user_lock_p lck;
    if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
      lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
    } else {
      lck = (kmp_user_lock_p)user_lock;
    }
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
  }
#endif
  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
#else
  kmp_user_lock_p lck;

  // Mirror the placement decision made in __kmpc_init_lock: in-place lock
  // word for small TAS/futex locks, table lookup otherwise.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_destroyed(lck);
#endif /* USE_ITT_BUILD */
  DESTROY_LOCK(lck);

  // Only separately-allocated lock objects need to be freed.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    ;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    ;
  }
#endif
  else {
    __kmp_user_lock_free(user_lock, gtid, lck);
  }
#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_destroy_lock
2383 
2384 /* destroy the lock */
/* destroy the lock */
void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  // Nest locks on the dynamic path are always indirect locks.
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
  __kmp_itt_lock_destroyed(ilk->lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
  }
#endif
  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // Mirror the placement decision made in __kmpc_init_nest_lock.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_destroyed(lck);
#endif /* USE_ITT_BUILD */

  DESTROY_NESTED_LOCK(lck);

  // Only separately-allocated lock objects need to be freed.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#endif
  else {
    __kmp_user_lock_free(user_lock, gtid, lck);
  }
#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_destroy_nest_lock
2458 
// Acquire a simple (non-nested) OpenMP lock, blocking until it is obtained.
void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
  KMP_COUNT_BLOCK(OMP_set_lock);
#if KMP_USE_DYNAMIC_LOCK
  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(
      (kmp_user_lock_p)
          user_lock); // itt function will get to the right lock object.
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
        codeptr);
  }
#endif
// Fast paths: inlined TAS/futex acquire, only valid when consistency
// checking is off; otherwise go through the dispatch table.
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
  } else
#endif
  {
    __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }
#if USE_ITT_BUILD
  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // Mirror the placement decision made in __kmpc_init_lock.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (omp_wait_id_t)lck, codeptr);
  }
#endif

  ACQUIRE_LOCK(lck, gtid);

#if USE_ITT_BUILD
  __kmp_itt_lock_acquired(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}
2550 
/* Acquire a nestable OpenMP lock on behalf of thread `gtid`; if the thread
   already owns the lock, the nesting depth is incremented instead of
   blocking. Backs omp_set_nest_lock().
   loc       - source location information
   gtid      - global thread id of the acquiring thread
   user_lock - address of the user-visible nest lock object */
void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled) {
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_nest_lock, omp_lock_hint_none,
          __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
          codeptr);
    }
  }
#endif
  // Dispatch through the dynamic-lock function table; acquire_status tells
  // us whether this was a first acquisition or a re-entrant (nested) one.
  int acquire_status =
      KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
      }
    } else {
      if (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
      }
    }
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK
  int acquire_status;
  kmp_user_lock_p lck;

  // If the nest lock fits inside the user-visible object, operate on it
  // directly; otherwise look the lock up in the runtime's lock table.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled) {
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_nest_lock, omp_lock_hint_none,
          __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
    }
  }
#endif

  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);

#if USE_ITT_BUILD
  __kmp_itt_lock_acquired(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
      }
    } else {
      if (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (omp_wait_id_t)lck, codeptr);
      }
    }
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}
2658 
2659 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2660 #if KMP_USE_DYNAMIC_LOCK
2661 
2662   int tag = KMP_EXTRACT_D_TAG(user_lock);
2663 #if USE_ITT_BUILD
2664   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2665 #endif
2666 #if KMP_USE_INLINED_TAS
2667   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2668     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2669   } else
2670 #elif KMP_USE_INLINED_FUTEX
2671   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2672     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2673   } else
2674 #endif
2675   {
2676     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2677   }
2678 
2679 #if OMPT_SUPPORT && OMPT_OPTIONAL
2680   // This is the case, if called from omp_init_lock_with_hint:
2681   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2682   if (!codeptr)
2683     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2684   if (ompt_enabled.ompt_callback_mutex_released) {
2685     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2686         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2687   }
2688 #endif
2689 
2690 #else // KMP_USE_DYNAMIC_LOCK
2691 
2692   kmp_user_lock_p lck;
2693 
2694   /* Can't use serial interval since not block structured */
2695   /* release the lock */
2696 
2697   if ((__kmp_user_lock_kind == lk_tas) &&
2698       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2699 #if KMP_OS_LINUX &&                                                            \
2700     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2701 // "fast" path implemented to fix customer performance issue
2702 #if USE_ITT_BUILD
2703     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2704 #endif /* USE_ITT_BUILD */
2705     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2706     KMP_MB();
2707 
2708 #if OMPT_SUPPORT && OMPT_OPTIONAL
2709     // This is the case, if called from omp_init_lock_with_hint:
2710     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2711     if (!codeptr)
2712       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2713     if (ompt_enabled.ompt_callback_mutex_released) {
2714       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2715           ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2716     }
2717 #endif
2718 
2719     return;
2720 #else
2721     lck = (kmp_user_lock_p)user_lock;
2722 #endif
2723   }
2724 #if KMP_USE_FUTEX
2725   else if ((__kmp_user_lock_kind == lk_futex) &&
2726            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2727     lck = (kmp_user_lock_p)user_lock;
2728   }
2729 #endif
2730   else {
2731     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2732   }
2733 
2734 #if USE_ITT_BUILD
2735   __kmp_itt_lock_releasing(lck);
2736 #endif /* USE_ITT_BUILD */
2737 
2738   RELEASE_LOCK(lck, gtid);
2739 
2740 #if OMPT_SUPPORT && OMPT_OPTIONAL
2741   // This is the case, if called from omp_init_lock_with_hint:
2742   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2743   if (!codeptr)
2744     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2745   if (ompt_enabled.ompt_callback_mutex_released) {
2746     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2747         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2748   }
2749 #endif
2750 
2751 #endif // KMP_USE_DYNAMIC_LOCK
2752 }
2753 
2754 /* release the lock */
2755 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2756 #if KMP_USE_DYNAMIC_LOCK
2757 
2758 #if USE_ITT_BUILD
2759   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2760 #endif
2761   int release_status =
2762       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2763 
2764 #if OMPT_SUPPORT && OMPT_OPTIONAL
2765   // This is the case, if called from omp_init_lock_with_hint:
2766   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2767   if (!codeptr)
2768     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2769   if (ompt_enabled.enabled) {
2770     if (release_status == KMP_LOCK_RELEASED) {
2771       if (ompt_enabled.ompt_callback_mutex_released) {
2772         // release_lock_last
2773         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2774             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2775       }
2776     } else if (ompt_enabled.ompt_callback_nest_lock) {
2777       // release_lock_prev
2778       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2779           ompt_scope_end, (omp_wait_id_t)user_lock, codeptr);
2780     }
2781   }
2782 #endif
2783 
2784 #else // KMP_USE_DYNAMIC_LOCK
2785 
2786   kmp_user_lock_p lck;
2787 
2788   /* Can't use serial interval since not block structured */
2789 
2790   if ((__kmp_user_lock_kind == lk_tas) &&
2791       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2792        OMP_NEST_LOCK_T_SIZE)) {
2793 #if KMP_OS_LINUX &&                                                            \
2794     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2795     // "fast" path implemented to fix customer performance issue
2796     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2797 #if USE_ITT_BUILD
2798     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2799 #endif /* USE_ITT_BUILD */
2800 
2801 #if OMPT_SUPPORT && OMPT_OPTIONAL
2802     int release_status = KMP_LOCK_STILL_HELD;
2803 #endif
2804 
2805     if (--(tl->lk.depth_locked) == 0) {
2806       TCW_4(tl->lk.poll, 0);
2807 #if OMPT_SUPPORT && OMPT_OPTIONAL
2808       release_status = KMP_LOCK_RELEASED;
2809 #endif
2810     }
2811     KMP_MB();
2812 
2813 #if OMPT_SUPPORT && OMPT_OPTIONAL
2814     // This is the case, if called from omp_init_lock_with_hint:
2815     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2816     if (!codeptr)
2817       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2818     if (ompt_enabled.enabled) {
2819       if (release_status == KMP_LOCK_RELEASED) {
2820         if (ompt_enabled.ompt_callback_mutex_released) {
2821           // release_lock_last
2822           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2823               ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2824         }
2825       } else if (ompt_enabled.ompt_callback_nest_lock) {
2826         // release_lock_previous
2827         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2828             ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr);
2829       }
2830     }
2831 #endif
2832 
2833     return;
2834 #else
2835     lck = (kmp_user_lock_p)user_lock;
2836 #endif
2837   }
2838 #if KMP_USE_FUTEX
2839   else if ((__kmp_user_lock_kind == lk_futex) &&
2840            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2841             OMP_NEST_LOCK_T_SIZE)) {
2842     lck = (kmp_user_lock_p)user_lock;
2843   }
2844 #endif
2845   else {
2846     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2847   }
2848 
2849 #if USE_ITT_BUILD
2850   __kmp_itt_lock_releasing(lck);
2851 #endif /* USE_ITT_BUILD */
2852 
2853   int release_status;
2854   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2855 #if OMPT_SUPPORT && OMPT_OPTIONAL
2856   // This is the case, if called from omp_init_lock_with_hint:
2857   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2858   if (!codeptr)
2859     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2860   if (ompt_enabled.enabled) {
2861     if (release_status == KMP_LOCK_RELEASED) {
2862       if (ompt_enabled.ompt_callback_mutex_released) {
2863         // release_lock_last
2864         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2865             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2866       }
2867     } else if (ompt_enabled.ompt_callback_nest_lock) {
2868       // release_lock_previous
2869       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2870           ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr);
2871     }
2872   }
2873 #endif
2874 
2875 #endif // KMP_USE_DYNAMIC_LOCK
2876 }
2877 
/* Try to acquire a simple (non-nested) OpenMP lock without blocking.
   Returns FTN_TRUE if the lock was acquired, FTN_FALSE otherwise.
   Backs omp_test_lock().
   loc       - source location information
   gtid      - global thread id of the testing thread
   user_lock - address of the user-visible lock object */
int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
  KMP_COUNT_BLOCK(OMP_test_lock);

#if KMP_USE_DYNAMIC_LOCK
  int rc;
  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
        codeptr);
  }
#endif
#if KMP_USE_INLINED_TAS
  // Inlined fast path for test-and-set locks (only when consistency
  // checking is off, since the inlined code skips the checks).
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
  } else
#endif
  {
    rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }
  if (rc) {
#if USE_ITT_BUILD
    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
    }
#endif
    return FTN_TRUE;
  } else {
#if USE_ITT_BUILD
    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
#endif
    return FTN_FALSE;
  }

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;
  int rc;

  // If the lock fits inside the user-visible object, operate on it
  // directly; otherwise look it up in the runtime's lock table.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case, if called from omp_init_lock_with_hint:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (omp_wait_id_t)lck, codeptr);
  }
#endif

  rc = TEST_LOCK(lck, gtid);
#if USE_ITT_BUILD
  if (rc) {
    __kmp_itt_lock_acquired(lck);
  } else {
    __kmp_itt_lock_cancelled(lck);
  }
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
  }
#endif

  return (rc ? FTN_TRUE : FTN_FALSE);

/* Can't use serial interval since not block structured */

#endif // KMP_USE_DYNAMIC_LOCK
}
2985 
2986 /* try to acquire the lock */
2987 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2988 #if KMP_USE_DYNAMIC_LOCK
2989   int rc;
2990 #if USE_ITT_BUILD
2991   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2992 #endif
2993 #if OMPT_SUPPORT && OMPT_OPTIONAL
2994   // This is the case, if called from omp_init_lock_with_hint:
2995   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2996   if (!codeptr)
2997     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2998   if (ompt_enabled.ompt_callback_mutex_acquire) {
2999     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3000         ompt_mutex_nest_lock, omp_lock_hint_none,
3001         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
3002         codeptr);
3003   }
3004 #endif
3005   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3006 #if USE_ITT_BUILD
3007   if (rc) {
3008     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3009   } else {
3010     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3011   }
3012 #endif
3013 #if OMPT_SUPPORT && OMPT_OPTIONAL
3014   if (ompt_enabled.enabled && rc) {
3015     if (rc == 1) {
3016       if (ompt_enabled.ompt_callback_mutex_acquired) {
3017         // lock_first
3018         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3019             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
3020       }
3021     } else {
3022       if (ompt_enabled.ompt_callback_nest_lock) {
3023         // lock_next
3024         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3025             ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
3026       }
3027     }
3028   }
3029 #endif
3030   return rc;
3031 
3032 #else // KMP_USE_DYNAMIC_LOCK
3033 
3034   kmp_user_lock_p lck;
3035   int rc;
3036 
3037   if ((__kmp_user_lock_kind == lk_tas) &&
3038       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3039        OMP_NEST_LOCK_T_SIZE)) {
3040     lck = (kmp_user_lock_p)user_lock;
3041   }
3042 #if KMP_USE_FUTEX
3043   else if ((__kmp_user_lock_kind == lk_futex) &&
3044            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3045             OMP_NEST_LOCK_T_SIZE)) {
3046     lck = (kmp_user_lock_p)user_lock;
3047   }
3048 #endif
3049   else {
3050     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3051   }
3052 
3053 #if USE_ITT_BUILD
3054   __kmp_itt_lock_acquiring(lck);
3055 #endif /* USE_ITT_BUILD */
3056 
3057 #if OMPT_SUPPORT && OMPT_OPTIONAL
3058   // This is the case, if called from omp_init_lock_with_hint:
3059   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3060   if (!codeptr)
3061     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3062   if (ompt_enabled.enabled) &&
3063         ompt_enabled.ompt_callback_mutex_acquire) {
3064       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3065           ompt_mutex_nest_lock, omp_lock_hint_none,
3066           __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
3067     }
3068 #endif
3069 
3070   rc = TEST_NESTED_LOCK(lck, gtid);
3071 #if USE_ITT_BUILD
3072   if (rc) {
3073     __kmp_itt_lock_acquired(lck);
3074   } else {
3075     __kmp_itt_lock_cancelled(lck);
3076   }
3077 #endif /* USE_ITT_BUILD */
3078 #if OMPT_SUPPORT && OMPT_OPTIONAL
3079   if (ompt_enabled.enabled && rc) {
3080     if (rc == 1) {
3081       if (ompt_enabled.ompt_callback_mutex_acquired) {
3082         // lock_first
3083         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3084             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
3085       }
3086     } else {
3087       if (ompt_enabled.ompt_callback_nest_lock) {
3088         // lock_next
3089         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3090             ompt_mutex_scope_begin, (omp_wait_id_t)lck, codeptr);
3091       }
3092     }
3093   }
3094 #endif
3095   return rc;
3096 
3097 /* Can't use serial interval since not block structured */
3098 
3099 #endif // KMP_USE_DYNAMIC_LOCK
3100 }
3101 
3102 // Interface to fast scalable reduce methods routines
3103 
3104 // keep the selected method in a thread local structure for cross-function
3105 // usage: will be used in __kmpc_end_reduce* functions;
3106 // another solution: to re-determine the method one more time in
3107 // __kmpc_end_reduce* functions (new prototype required then)
3108 // AT: which solution is better?
3109 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3110   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3111 
3112 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3113   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3114 
3115 // description of the packed_reduction_method variable: look at the macros in
3116 // kmp.h
3117 
// used in a critical section reduce block
// Acquire the runtime-internal critical-section lock that serializes a
// critical-path reduction. `crit` is the compiler-emitted critical name
// object; the lock is lazily initialized inside it on first use.
static __forceinline void
__kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
                                          kmp_critical_name *crit) {

  // this lock was visible to a customer and to the threading profile tool as a
  // serial overhead span (although it's used for an internal purpose only)
  //            why was it visible in previous implementation?
  //            should we keep it visible in new reduce block?
  kmp_user_lock_p lck;

#if KMP_USE_DYNAMIC_LOCK

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized.
  if (*lk == 0) {
    if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
      // Direct lock: store the lock tag inline in the critical name.
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(__kmp_user_lock_seq));
    } else {
      // Indirect lock: allocate a lock object and store a pointer to it.
      __kmp_init_indirect_csptr(crit, loc, global_tid,
                                KMP_GET_I_TAG(__kmp_user_lock_seq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    lck = (kmp_user_lock_p)lk;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
  } else {
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // We know that the fast reduction code is only emitted by Intel compilers
  // with 32 byte critical sections. If there isn't enough space, then we
  // have to use a pointer.
  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
    lck = (kmp_user_lock_p)crit;
  } else {
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }
  KMP_DEBUG_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK
}
3181 
3182 // used in a critical section reduce block
3183 static __forceinline void
3184 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3185                                         kmp_critical_name *crit) {
3186 
3187   kmp_user_lock_p lck;
3188 
3189 #if KMP_USE_DYNAMIC_LOCK
3190 
3191   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3192     lck = (kmp_user_lock_p)crit;
3193     if (__kmp_env_consistency_check)
3194       __kmp_pop_sync(global_tid, ct_critical, loc);
3195     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3196   } else {
3197     kmp_indirect_lock_t *ilk =
3198         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3199     if (__kmp_env_consistency_check)
3200       __kmp_pop_sync(global_tid, ct_critical, loc);
3201     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3202   }
3203 
3204 #else // KMP_USE_DYNAMIC_LOCK
3205 
3206   // We know that the fast reduction code is only emitted by Intel compilers
3207   // with 32 byte critical sections. If there isn't enough space, then we have
3208   // to use a pointer.
3209   if (__kmp_base_user_lock_size > 32) {
3210     lck = *((kmp_user_lock_p *)crit);
3211     KMP_ASSERT(lck != NULL);
3212   } else {
3213     lck = (kmp_user_lock_p)crit;
3214   }
3215 
3216   if (__kmp_env_consistency_check)
3217     __kmp_pop_sync(global_tid, ct_critical, loc);
3218 
3219   __kmp_release_user_lock_with_checks(lck, global_tid);
3220 
3221 #endif // KMP_USE_DYNAMIC_LOCK
3222 } // __kmp_end_critical_section_reduce_block
3223 
3224 #if OMP_40_ENABLED
3225 static __forceinline int
3226 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3227                                      int *task_state) {
3228   kmp_team_t *team;
3229 
3230   // Check if we are inside the teams construct?
3231   if (th->th.th_teams_microtask) {
3232     *team_p = team = th->th.th_team;
3233     if (team->t.t_level == th->th.th_teams_level) {
3234       // This is reduction at teams construct.
3235       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3236       // Let's swap teams temporarily for the reduction.
3237       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3238       th->th.th_team = team->t.t_parent;
3239       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3240       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3241       *task_state = th->th.th_task_state;
3242       th->th.th_task_state = 0;
3243 
3244       return 1;
3245     }
3246   }
3247   return 0;
3248 }
3249 
3250 static __forceinline void
3251 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3252   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3253   th->th.th_info.ds.ds_tid = 0;
3254   th->th.th_team = team;
3255   th->th.th_team_nproc = team->t.t_nproc;
3256   th->th.th_task_team = team->t.t_task_team[task_state];
3257   th->th.th_task_state = task_state;
3258 }
3259 #endif
3260 
3261 /* 2.a.i. Reduce Block without a terminating barrier */
3262 /*!
3263 @ingroup SYNCHRONIZATION
3264 @param loc source location information
3265 @param global_tid global thread number
3266 @param num_vars number of items (variables) to be reduced
3267 @param reduce_size size of data in bytes to be reduced
3268 @param reduce_data pointer to data to be reduced
3269 @param reduce_func callback function providing reduction operation on two
3270 operands and returning result of reduction in lhs_data
3271 @param lck pointer to the unique lock data structure
3272 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3273 threads if atomic reduction needed
3274 
3275 The nowait version is used for a reduce clause with the nowait argument.
3276 */
kmp_int32
__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                     size_t reduce_size, void *reduce_data,
                     void (*reduce_func)(void *lhs_data, void *rhs_data),
                     kmp_critical_name *lck) {

  KMP_COUNT_BLOCK(REDUCE_nowait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif
  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be used as a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

#if OMP_40_ENABLED
  // For a reduction at teams level, temporarily run in the parent team;
  // undone below via __kmp_restore_swapped_teams when teams_swapped is set.
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  // packed_reduction_method value will be reused by __kmp_end_reduce* function,
  // the value should be kept in a variable
  // the variable should be either a construct-specific or thread-specific
  // property, not a team specific property
  //     (a thread can reach the next reduce block on the next construct, reduce
  //     method may differ on the next construct)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unness extra syncs should be needed)
  // a thread-specific variable is better regarding two issues above (next
  // construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes 'determine' and 'set' lines (no need to execute by one
  // thread, to avoid unness extra syncs)

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    // Serialize the reduction in a runtime-internal critical section;
    // retval == 1 means every thread performs the reduction body.
    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    // retval == 2 tells the compiler-generated code to use atomics.
    retval = 2;

    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (it's not quite good, because the checking block has been closed by
    //     this 'pop',
    //      but atomic operation has not been executed yet, will be executed
    //      slightly later, literally on next instruction)
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_reduce, loc);

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

// AT: performance issue: a real barrier here
// AT:     (if master goes slow, other threads are blocked here waiting for the
// master to come and release them)
// AT:     (it's not what a customer might expect specifying NOWAIT clause)
// AT:     (specifying NOWAIT won't result in improvement of performance, it'll
// be confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
// might go faster and be more in line with sense of NOWAIT
// AT: TO DO: do epcc test and compare times

// this barrier should be invisible to a customer and to the threading profile
// tool (it's neither a terminating barrier nor customer's code, it's
// used for an internal purpose)
#if OMPT_SUPPORT
    // JP: can this barrier potentially leed to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events
    //         so we set-up the necessary frame/return addresses.
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    // Gather-only barrier performs the tree reduction; the barrier returns
    // nonzero on the master thread.
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

    // all other workers except master should do this pop here
    //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
    if (__kmp_env_consistency_check) {
      if (retval == 0) {
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif
  KA_TRACE(
      10,
      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
       global_tid, packed_reduction_method, retval));

  return retval;
}
3427 
3428 /*!
3429 @ingroup SYNCHRONIZATION
3430 @param loc source location information
3431 @param global_tid global thread id.
3432 @param lck pointer to the unique lock data structure
3433 
3434 Finish the execution of a reduce nowait.
3435 */
3436 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3437                               kmp_critical_name *lck) {
3438 
3439   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3440 
3441   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3442 
3443   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3444 
3445   if (packed_reduction_method == critical_reduce_block) {
3446 
3447     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3448 
3449   } else if (packed_reduction_method == empty_reduce_block) {
3450 
3451     // usage: if team size == 1, no synchronization is required ( on Intel
3452     // platforms only )
3453 
3454   } else if (packed_reduction_method == atomic_reduce_block) {
3455 
3456     // neither master nor other workers should get here
3457     //     (code gen does not generate this call in case 2: atomic reduce block)
3458     // actually it's better to remove this elseif at all;
3459     // after removal this value will checked by the 'else' and will assert
3460 
3461   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3462                                    tree_reduce_block)) {
3463 
3464     // only master gets here
3465 
3466   } else {
3467 
3468     // should never reach this block
3469     KMP_ASSERT(0); // "unexpected method"
3470   }
3471 
3472   if (__kmp_env_consistency_check)
3473     __kmp_pop_sync(global_tid, ct_reduce, loc);
3474 
3475   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3476                 global_tid, packed_reduction_method));
3477 
3478   return;
3479 }
3480 
3481 /* 2.a.ii. Reduce Block with a terminating barrier */
3482 
3483 /*!
3484 @ingroup SYNCHRONIZATION
3485 @param loc source location information
3486 @param global_tid global thread number
3487 @param num_vars number of items (variables) to be reduced
3488 @param reduce_size size of data in bytes to be reduced
3489 @param reduce_data pointer to data to be reduced
3490 @param reduce_func callback function providing reduction operation on two
3491 operands and returning result of reduction in lhs_data
3492 @param lck pointer to the unique lock data structure
3493 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3494 threads if atomic reduction needed
3495 
3496 A blocking reduce that includes an implicit barrier.
3497 */
kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                        size_t reduce_size, void *reduce_data,
                        void (*reduce_func)(void *lhs_data, void *rhs_data),
                        kmp_critical_name *lck) {
  KMP_COUNT_BLOCK(REDUCE_wait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  // Used when a reduction inside a teams construct temporarily swaps this
  // thread onto the parent team; restored before returning.
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  // Record the chosen method so __kmpc_end_reduce() can mirror this dispatch.
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

// case tree_reduce_block:
// this barrier should be visible to a customer and to the threading profile
// tool (it's a terminating barrier on constructs if NOWAIT not specified)
#if OMPT_SUPPORT
    // Set up frame/return-address bookkeeping so OMPT tools can attribute
    // the barrier events raised inside __kmp_barrier().
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident =
        loc; // needed for correct notification of frames
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
    // Map the barrier result onto the documented return values:
    // 1 for the master thread, 0 for all other workers.
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

    // all other workers except master should do this pop here
    // ( none of other workers except master will enter __kmpc_end_reduce() )
    if (__kmp_env_consistency_check) {
      if (retval == 0) { // 0: all other workers; 1: master
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  KA_TRACE(10,
           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
            global_tid, packed_reduction_method, retval));

  return retval;
}
3609 
3610 /*!
3611 @ingroup SYNCHRONIZATION
3612 @param loc source location information
3613 @param global_tid global thread id.
3614 @param lck pointer to the unique lock data structure
3615 
3616 Finish the execution of a blocking reduce.
3617 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3618 start function.
3619 */
void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
                       kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;
#if OMP_40_ENABLED
  // Mirrors the teams-reduction swap performed in __kmpc_reduce().
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
#endif

  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));

#if OMP_40_ENABLED
  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
#endif // OMP_40_ENABLED

  // Recover the method chosen by __kmpc_reduce() for this thread.
  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  // this barrier should be visible to a customer and to the threading profile
  // tool (it's a terminating barrier on constructs if NOWAIT not specified)

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    // OMPT frame/return-address setup so tools can attribute the barrier.
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (packed_reduction_method == empty_reduce_block) {

// usage: if team size==1, no synchronization is required (Intel platforms only)

// TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (packed_reduction_method == atomic_reduce_block) {

#if OMPT_SUPPORT
    omp_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame == NULL)
        ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
// TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = NULL;
    }
#endif

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master executes here (master releases all other workers)
    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                            global_tid);

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
#if OMP_40_ENABLED
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
#endif

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}
3738 
3739 #undef __KMP_GET_REDUCTION_METHOD
3740 #undef __KMP_SET_REDUCTION_METHOD
3741 
3742 /* end of interface to fast scalable reduce routines */
3743 
3744 kmp_uint64 __kmpc_get_taskid() {
3745 
3746   kmp_int32 gtid;
3747   kmp_info_t *thread;
3748 
3749   gtid = __kmp_get_gtid();
3750   if (gtid < 0) {
3751     return 0;
3752   }
3753   thread = __kmp_thread_from_gtid(gtid);
3754   return thread->th.th_current_task->td_task_id;
3755 
3756 } // __kmpc_get_taskid
3757 
3758 kmp_uint64 __kmpc_get_parent_taskid() {
3759 
3760   kmp_int32 gtid;
3761   kmp_info_t *thread;
3762   kmp_taskdata_t *parent_task;
3763 
3764   gtid = __kmp_get_gtid();
3765   if (gtid < 0) {
3766     return 0;
3767   }
3768   thread = __kmp_thread_from_gtid(gtid);
3769   parent_task = thread->th.th_current_task->td_parent;
3770   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3771 
3772 } // __kmpc_get_parent_taskid
3773 
3774 #if OMP_45_ENABLED
3775 /*!
3776 @ingroup WORK_SHARING
3777 @param loc  source location information.
3778 @param gtid  global thread number.
3779 @param num_dims  number of associated doacross loops.
3780 @param dims  info on loops bounds.
3781 
3782 Initialize doacross loop information.
We expect the compiler to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3785 */
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
  // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  // Layout: [0]=num_dims, [1]=&doacross_num_done, [2..4]=lo/up/st of dim 0,
  // then 4 entries (range_length, lo, up, st) per remaining dimension.
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check if shared buffer is not occupied by other loop (idx -
  // __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                       __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    // Publish the allocated array; replaces the sentinel value 1 that other
    // threads may be spinning on below.
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
  // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
3905 
3906 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3907   kmp_int32 shft, num_dims, i;
3908   kmp_uint32 flag;
3909   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3910   kmp_info_t *th = __kmp_threads[gtid];
3911   kmp_team_t *team = th->th.th_team;
3912   kmp_disp_t *pr_buf;
3913   kmp_int64 lo, up, st;
3914 
3915   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3916   if (team->t.t_serialized) {
3917     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3918     return; // no dependencies if team is serialized
3919   }
3920 
3921   // calculate sequential iteration number and check out-of-bounds condition
3922   pr_buf = th->th.th_dispatch;
3923   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3924   num_dims = pr_buf->th_doacross_info[0];
3925   lo = pr_buf->th_doacross_info[2];
3926   up = pr_buf->th_doacross_info[3];
3927   st = pr_buf->th_doacross_info[4];
3928   if (st == 1) { // most common case
3929     if (vec[0] < lo || vec[0] > up) {
3930       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3931                     "bounds [%lld,%lld]\n",
3932                     gtid, vec[0], lo, up));
3933       return;
3934     }
3935     iter_number = vec[0] - lo;
3936   } else if (st > 0) {
3937     if (vec[0] < lo || vec[0] > up) {
3938       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3939                     "bounds [%lld,%lld]\n",
3940                     gtid, vec[0], lo, up));
3941       return;
3942     }
3943     iter_number = (kmp_uint64)(vec[0] - lo) / st;
3944   } else { // negative increment
3945     if (vec[0] > lo || vec[0] < up) {
3946       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3947                     "bounds [%lld,%lld]\n",
3948                     gtid, vec[0], lo, up));
3949       return;
3950     }
3951     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3952   }
3953   for (i = 1; i < num_dims; ++i) {
3954     kmp_int64 iter, ln;
3955     kmp_int32 j = i * 4;
3956     ln = pr_buf->th_doacross_info[j + 1];
3957     lo = pr_buf->th_doacross_info[j + 2];
3958     up = pr_buf->th_doacross_info[j + 3];
3959     st = pr_buf->th_doacross_info[j + 4];
3960     if (st == 1) {
3961       if (vec[i] < lo || vec[i] > up) {
3962         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3963                       "bounds [%lld,%lld]\n",
3964                       gtid, vec[i], lo, up));
3965         return;
3966       }
3967       iter = vec[i] - lo;
3968     } else if (st > 0) {
3969       if (vec[i] < lo || vec[i] > up) {
3970         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3971                       "bounds [%lld,%lld]\n",
3972                       gtid, vec[i], lo, up));
3973         return;
3974       }
3975       iter = (kmp_uint64)(vec[i] - lo) / st;
3976     } else { // st < 0
3977       if (vec[i] > lo || vec[i] < up) {
3978         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3979                       "bounds [%lld,%lld]\n",
3980                       gtid, vec[i], lo, up));
3981         return;
3982       }
3983       iter = (kmp_uint64)(lo - vec[i]) / (-st);
3984     }
3985     iter_number = iter + ln * iter_number;
3986   }
3987   shft = iter_number % 32; // use 32-bit granularity
3988   iter_number >>= 5; // divided by 32
3989   flag = 1 << shft;
3990   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
3991     KMP_YIELD(TRUE);
3992   }
3993   KMP_MB();
3994   KA_TRACE(20,
3995            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
3996             gtid, (iter_number << 5) + shft));
3997 }
3998 
3999 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4000   kmp_int32 shft, num_dims, i;
4001   kmp_uint32 flag;
4002   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4003   kmp_info_t *th = __kmp_threads[gtid];
4004   kmp_team_t *team = th->th.th_team;
4005   kmp_disp_t *pr_buf;
4006   kmp_int64 lo, st;
4007 
4008   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4009   if (team->t.t_serialized) {
4010     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4011     return; // no dependencies if team is serialized
4012   }
4013 
4014   // calculate sequential iteration number (same as in "wait" but no
4015   // out-of-bounds checks)
4016   pr_buf = th->th.th_dispatch;
4017   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4018   num_dims = pr_buf->th_doacross_info[0];
4019   lo = pr_buf->th_doacross_info[2];
4020   st = pr_buf->th_doacross_info[4];
4021   if (st == 1) { // most common case
4022     iter_number = vec[0] - lo;
4023   } else if (st > 0) {
4024     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4025   } else { // negative increment
4026     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4027   }
4028   for (i = 1; i < num_dims; ++i) {
4029     kmp_int64 iter, ln;
4030     kmp_int32 j = i * 4;
4031     ln = pr_buf->th_doacross_info[j + 1];
4032     lo = pr_buf->th_doacross_info[j + 2];
4033     st = pr_buf->th_doacross_info[j + 4];
4034     if (st == 1) {
4035       iter = vec[i] - lo;
4036     } else if (st > 0) {
4037       iter = (kmp_uint64)(vec[i] - lo) / st;
4038     } else { // st < 0
4039       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4040     }
4041     iter_number = iter + ln * iter_number;
4042   }
4043   shft = iter_number % 32; // use 32-bit granularity
4044   iter_number >>= 5; // divided by 32
4045   flag = 1 << shft;
4046   KMP_MB();
4047   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4048     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4049   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4050                 (iter_number << 5) + shft));
4051 }
4052 
void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  // Tear down doacross state for this thread; the last thread to finish
  // also recycles the team's shared dispatch buffer.
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  // th_doacross_info[1] holds &sh_buf->doacross_num_done (stored at init).
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    // Bumping doacross_buf_idx releases threads spinning in
    // __kmpc_doacross_init() waiting for this buffer.
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
4086 #endif
4087 
4088 #if OMP_50_ENABLED
4089 int __kmpc_get_target_offload(void) {
4090   if (!__kmp_init_serial) {
4091     __kmp_serial_initialize();
4092   }
4093   return __kmp_target_offload;
4094 }
4095 #endif // OMP_50_ENABLED
4096 
4097 // end of file //
4098