1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
28 // flags will be used in future, e.g. to implement openmp_strict library
29 // restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made,
37  * initialization happens implicitly when other library functions are used.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
57  * Shut down the runtime library. This is also optional, and even if called will
58  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
59  * zero.
60  */
61 void __kmpc_end(ident_t *loc) {
62   // By default, __kmp_ignore_mppend() returns TRUE, which makes the
63   // __kmpc_end() call a no-op. However, this can be overridden with the
64   // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
65   // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
66   // root (which can cause library shutdown).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 }
74 
75 /*!
76 @ingroup THREAD_STATES
77 @param loc Source location information.
78 @return The global thread index of the active thread.
79 
80 This function can be called in any context.
81 
82 If the runtime has only been entered at the outermost level from a
83 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
84 that which would be returned by omp_get_thread_num() in the outermost
85 active parallel construct. (Or zero if there is no active parallel
86 construct, since the master thread is necessarily thread zero).
87 
88 If multiple non-OpenMP threads all enter an OpenMP construct then this
89 will be a unique thread identifier among all the threads created by
90 the OpenMP runtime (but the value cannot be defined in terms of
91 OpenMP thread ids returned by omp_get_thread_num()).
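
A minimal usage sketch follows (the ident_t descriptor and its name are
illustrative; in practice the compiler materializes it for the call site):
@code
extern ident_t loc_descriptor;  // hypothetical compiler-generated location info
kmp_int32 gtid = __kmpc_global_thread_num(&loc_descriptor);
// gtid remains valid for this OS thread while it is known to the runtime.
@endcode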
92 */
93 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
94   kmp_int32 gtid = __kmp_entry_gtid();
95 
96   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
97 
98   return gtid;
99 }
100 
101 /*!
102 @ingroup THREAD_STATES
103 @param loc Source location information.
104 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
105 
106 This function can be called in any context.
107 It returns the total number of threads under the control of the OpenMP runtime.
108 That is not a number that can be determined by any OpenMP standard calls, since
109 the library may be called from more than one non-OpenMP thread, and this
110 reflects the total over all such calls. Similarly, the runtime maintains
111 underlying threads even when they are not active (since the cost of creating
112 and destroying OS threads is high); this call counts all such threads even if
113 they are currently idle and merely waiting for work.
114 */
115 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
116   KC_TRACE(10,
117            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
118 
119   return TCR_4(__kmp_all_nth);
120 }
121 
122 /*!
123 @ingroup THREAD_STATES
124 @param loc Source location information.
125 @return The thread number of the calling thread in the innermost active parallel
126 construct.
127 */
128 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
129   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
130   return __kmp_tid_from_gtid(__kmp_entry_gtid());
131 }
132 
133 /*!
134 @ingroup THREAD_STATES
135 @param loc Source location information.
136 @return The number of threads in the innermost active parallel construct.
137 */
138 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
139   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
140 
141   return __kmp_entry_thread()->th.th_team->t.t_nproc;
142 }
143 
144 /*!
145  * @ingroup DEPRECATED
146  * @param loc location description
147  *
148  * This function need not be called. It always returns TRUE.
149  */
150 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
151 #ifndef KMP_DEBUG
152 
153   return TRUE;
154 
155 #else
156 
157   const char *semi2;
158   const char *semi3;
159   int line_no;
160 
161   if (__kmp_par_range == 0) {
162     return TRUE;
163   }
164   semi2 = loc->psource;
165   if (semi2 == NULL) {
166     return TRUE;
167   }
168   semi2 = strchr(semi2, ';');
169   if (semi2 == NULL) {
170     return TRUE;
171   }
172   semi2 = strchr(semi2 + 1, ';');
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   if (__kmp_par_range_filename[0]) {
177     const char *name = semi2 - 1;
178     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
179       name--;
180     }
181     if ((*name == '/') || (*name == ';')) {
182       name++;
183     }
184     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
185       return __kmp_par_range < 0;
186     }
187   }
188   semi3 = strchr(semi2 + 1, ';');
189   if (__kmp_par_range_routine[0]) {
190     if ((semi3 != NULL) && (semi3 > semi2) &&
191         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
192       return __kmp_par_range < 0;
193     }
194   }
195   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
196     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
197       return __kmp_par_range > 0;
198     }
199     return __kmp_par_range < 0;
200   }
201   return TRUE;
202 
203 #endif /* KMP_DEBUG */
204 }
205 
206 /*!
207 @ingroup THREAD_STATES
208 @param loc Source location information.
209 @return 1 if this thread is executing inside an active parallel region, zero if
210 not.
211 */
212 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
213   return __kmp_entry_thread()->th.th_root->r.r_active;
214 }
215 
216 /*!
217 @ingroup PARALLEL
218 @param loc source location information
219 @param global_tid global thread number
220 @param num_threads number of threads requested for this parallel construct
221 
222 Set the number of threads to be used by the next fork spawned by this thread.
223 This call is only required if the parallel construct has a `num_threads` clause.
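
As an illustrative sketch of the expected calling sequence (the outlined
function name is hypothetical), a construct such as
@code
#pragma omp parallel num_threads(4)
@endcode
is typically lowered to a push immediately followed by the fork:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
@endcode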
224 */
225 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
226                              kmp_int32 num_threads) {
227   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
228                 global_tid, num_threads));
229 
230   __kmp_push_num_threads(loc, global_tid, num_threads);
231 }
232 
233 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
234   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
235 
236   /* the num_threads are automatically popped */
237 }
238 
239 #if OMP_40_ENABLED
240 
241 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
242                            kmp_int32 proc_bind) {
243   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
244                 proc_bind));
245 
246   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
247 }
248 
249 #endif /* OMP_40_ENABLED */
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
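
A hedged illustration of the expected call shape (the names `loc` and
`outlined_microtask` are placeholders, not part of this interface):
@code
// Outlined body; the first two parameters are the global and bound thread ids,
// and a, b point at the shared data captured by the region.
void outlined_microtask(kmp_int32 *gtid, kmp_int32 *btid, int *a, double *b);

// Two shared-variable pointers are forwarded through the ellipsis, so argc is 2.
__kmpc_fork_call(&loc, 2, (kmpc_micro)outlined_microtask, a, b);
@endcode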
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   int inParallel = __kmpc_in_parallel(loc);
266   if (inParallel) {
267     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
268   } else {
269     KMP_COUNT_BLOCK(OMP_PARALLEL);
270   }
271 #endif
272 
273   // maybe saving thr_state is enough here
274   {
275     va_list ap;
276     va_start(ap, microtask);
277 
278 #if OMPT_SUPPORT
279     ompt_frame_t *ompt_frame;
280     if (ompt_enabled.enabled) {
281       kmp_info_t *master_th = __kmp_threads[gtid];
282       kmp_team_t *parent_team = master_th->th.th_team;
283       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
284       if (lwt)
285         ompt_frame = &(lwt->ompt_task_info.frame);
286       else {
287         int tid = __kmp_tid_from_gtid(gtid);
288         ompt_frame = &(
289             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
290       }
291       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
292       OMPT_STORE_RETURN_ADDRESS(gtid);
293     }
294 #endif
295 
296 #if INCLUDE_SSC_MARKS
297     SSC_MARK_FORKING();
298 #endif
299     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
300                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
301                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
302 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
303 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
304                     &ap
305 #else
306                     ap
307 #endif
308                     );
309 #if INCLUDE_SSC_MARKS
310     SSC_MARK_JOINING();
311 #endif
312     __kmp_join_call(loc, gtid
313 #if OMPT_SUPPORT
314                     ,
315                     fork_context_intel
316 #endif
317                     );
318 
319     va_end(ap);
320   }
321 }
322 
323 #if OMP_40_ENABLED
324 /*!
325 @ingroup PARALLEL
326 @param loc source location information
327 @param global_tid global thread number
328 @param num_teams number of teams requested for the teams construct
329 @param num_threads number of threads per team requested for the teams construct
330 
331 Set the number of teams to be used by the teams construct.
332 This call is only required if the teams construct has a `num_teams` clause
333 or a `thread_limit` clause (or both).
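
A sketch of the expected sequence (directive shown as a comment; the outlined
function name is illustrative):
@code
// #pragma omp teams num_teams(4) thread_limit(8)
__kmpc_push_num_teams(&loc, gtid, 4, 8);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams_body);
@endcode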
334 */
335 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
336                            kmp_int32 num_teams, kmp_int32 num_threads) {
337   KA_TRACE(20,
338            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
339             global_tid, num_teams, num_threads));
340 
341   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
342 }
343 
344 /*!
345 @ingroup PARALLEL
346 @param loc  source location information
347 @param argc  total number of arguments in the ellipsis
348 @param microtask  pointer to callback routine consisting of outlined teams
349 construct
350 @param ...  pointers to shared variables that aren't global
351 
352 Do the actual fork and call the microtask in the relevant number of threads.
353 */
354 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
355                        ...) {
356   int gtid = __kmp_entry_gtid();
357   kmp_info_t *this_thr = __kmp_threads[gtid];
358   va_list ap;
359   va_start(ap, microtask);
360 
361   KMP_COUNT_BLOCK(OMP_TEAMS);
362 
363   // remember teams entry point and nesting level
364   this_thr->th.th_teams_microtask = microtask;
365   this_thr->th.th_teams_level =
366       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
367 
368 #if OMPT_SUPPORT
369   kmp_team_t *parent_team = this_thr->th.th_team;
370   int tid = __kmp_tid_from_gtid(gtid);
371   if (ompt_enabled.enabled) {
372     parent_team->t.t_implicit_task_taskdata[tid]
373         .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
374   }
375   OMPT_STORE_RETURN_ADDRESS(gtid);
376 #endif
377 
378   // check if __kmpc_push_num_teams was called; otherwise set the default
379   // number of teams
380   if (this_thr->th.th_teams_size.nteams == 0) {
381     __kmp_push_num_teams(loc, gtid, 0, 0);
382   }
383   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
384   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
385   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
386 
387   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
388                   VOLATILE_CAST(microtask_t)
389                       __kmp_teams_master, // "wrapped" task
390                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
391 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
392                   &ap
393 #else
394                   ap
395 #endif
396                   );
397   __kmp_join_call(loc, gtid
398 #if OMPT_SUPPORT
399                   ,
400                   fork_context_intel
401 #endif
402                   );
403 
404   this_thr->th.th_teams_microtask = NULL;
405   this_thr->th.th_teams_level = 0;
406   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
407   va_end(ap);
408 }
409 #endif /* OMP_40_ENABLED */
410 
411 // I don't think this function should ever have been exported.
412 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
413 // openmp code ever called it, but it's been exported from the RTL for so
414 // long that I'm afraid to remove the definition.
415 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
416 
417 /*!
418 @ingroup PARALLEL
419 @param loc  source location information
420 @param global_tid  global thread number
421 
422 Enter a serialized parallel construct. This interface is used to handle a
423 conditional parallel region, like this,
424 @code
425 #pragma omp parallel if (condition)
426 @endcode
427 when the condition is false.
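
A sketch of one plausible lowering (the outlined function name is
illustrative); the serialized branch brackets a direct call to the body:
@code
if (condition) {
  __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
} else {
  kmp_int32 gtid = __kmpc_global_thread_num(&loc);
  kmp_int32 zero = 0;
  __kmpc_serialized_parallel(&loc, gtid);
  outlined_body(&gtid, &zero);  // run the region body on the current thread
  __kmpc_end_serialized_parallel(&loc, gtid);
}
@endcode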
428 */
429 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
430 // The implementation is now in kmp_runtime.cpp so that it can share static
431 // functions with kmp_fork_call since the tasks to be done are similar in
432 // each case.
433 #if OMPT_SUPPORT
434   OMPT_STORE_RETURN_ADDRESS(global_tid);
435 #endif
436   __kmp_serialized_parallel(loc, global_tid);
437 }
438 
439 /*!
440 @ingroup PARALLEL
441 @param loc  source location information
442 @param global_tid  global thread number
443 
444 Leave a serialized parallel construct.
445 */
446 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
447   kmp_internal_control_t *top;
448   kmp_info_t *this_thr;
449   kmp_team_t *serial_team;
450 
451   KC_TRACE(10,
452            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
453 
454   /* skip all this code for autopar serialized loops since it results in
455      unacceptable overhead */
456   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
457     return;
458 
459   // Not autopar code
460   if (!TCR_4(__kmp_init_parallel))
461     __kmp_parallel_initialize();
462 
463   this_thr = __kmp_threads[global_tid];
464   serial_team = this_thr->th.th_serial_team;
465 
466 #if OMP_45_ENABLED
467   kmp_task_team_t *task_team = this_thr->th.th_task_team;
468 
469   // we need to wait for the proxy tasks before finishing the thread
470   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
471     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
472 #endif
473 
474   KMP_MB();
475   KMP_DEBUG_ASSERT(serial_team);
476   KMP_ASSERT(serial_team->t.t_serialized);
477   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
478   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
479   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
480   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
481 
482 #if OMPT_SUPPORT
483   if (ompt_enabled.enabled &&
484       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
485     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL;
486     if (ompt_enabled.ompt_callback_implicit_task) {
487       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
488           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
489           __kmp_tid_from_gtid(global_tid));
490     }
491 
492     // reset/clear the task id only after unlinking the task
493     ompt_data_t *parent_task_data;
494     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
495 
496     if (ompt_enabled.ompt_callback_parallel_end) {
497       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
498           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
499           ompt_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
500     }
501     __ompt_lw_taskteam_unlink(this_thr);
502     this_thr->th.ompt_thread_info.state = omp_state_overhead;
503   }
504 #endif
505 
506   /* If necessary, pop the internal control stack values and replace the team
507    * values */
508   top = serial_team->t.t_control_stack_top;
509   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
510     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
511     serial_team->t.t_control_stack_top = top->next;
512     __kmp_free(top);
513   }
514 
515   // if( serial_team -> t.t_serialized > 1 )
516   serial_team->t.t_level--;
517 
518   /* pop dispatch buffers stack */
519   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
520   {
521     dispatch_private_info_t *disp_buffer =
522         serial_team->t.t_dispatch->th_disp_buffer;
523     serial_team->t.t_dispatch->th_disp_buffer =
524         serial_team->t.t_dispatch->th_disp_buffer->next;
525     __kmp_free(disp_buffer);
526   }
527 
528   --serial_team->t.t_serialized;
529   if (serial_team->t.t_serialized == 0) {
530 
531 /* return to the parallel section */
532 
533 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
534     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
535       __kmp_clear_x87_fpu_status_word();
536       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
537       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
538     }
539 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
540 
541     this_thr->th.th_team = serial_team->t.t_parent;
542     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
543 
544     /* restore values cached in the thread */
545     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
546     this_thr->th.th_team_master =
547         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
548     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
549 
550     /* TODO the below shouldn't need to be adjusted for serialized teams */
551     this_thr->th.th_dispatch =
552         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
553 
554     __kmp_pop_current_task_from_thread(this_thr);
555 
556     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
557     this_thr->th.th_current_task->td_flags.executing = 1;
558 
559     if (__kmp_tasking_mode != tskm_immediate_exec) {
560       // Copy the task team from the new child / old parent team to the thread.
561       this_thr->th.th_task_team =
562           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
563       KA_TRACE(20,
564                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
565                 "team %p\n",
566                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
567     }
568   } else {
569     if (__kmp_tasking_mode != tskm_immediate_exec) {
570       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
571                     "depth of serial team %p to %d\n",
572                     global_tid, serial_team, serial_team->t.t_serialized));
573     }
574   }
575 
576   if (__kmp_env_consistency_check)
577     __kmp_pop_parallel(global_tid, NULL);
578 #if OMPT_SUPPORT
579   if (ompt_enabled.enabled)
580     this_thr->th.ompt_thread_info.state =
581         ((this_thr->th.th_team_serialized) ? omp_state_work_serial
582                                            : omp_state_work_parallel);
583 #endif
584 }
585 
586 /*!
587 @ingroup SYNCHRONIZATION
588 @param loc  source location information.
589 
590 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
591 depending on the memory ordering convention obeyed by the compiler
592 even that may not be necessary).
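
A sketch of the expected lowering (purely illustrative):
@code
// #pragma omp flush
__kmpc_flush(&loc);
@endcode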
593 */
594 void __kmpc_flush(ident_t *loc) {
595   KC_TRACE(10, ("__kmpc_flush: called\n"));
596 
597   /* need an explicit __mf() here since the library uses volatile instead */
598   KMP_MB(); /* Flush all pending memory write invalidates.  */
599 
600 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
601 #if KMP_MIC
602 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
603 // We shouldn't need it, though, since the ABI rules require that
604 // * If the compiler generates NGO stores it also generates the fence
605 // * If users hand-code NGO stores they should insert the fence
606 // therefore no incomplete unordered stores should be visible.
607 #else
608   // C74404
609   // This is to address non-temporal store instructions (sfence needed).
610   // The clflush instruction is also addressed (mfence needed).
611   // Probably the non-temporal load instruction movntdqa should also be
612   // addressed.
613   // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
614   if (!__kmp_cpuinfo.initialized) {
615     __kmp_query_cpuid(&__kmp_cpuinfo);
616   }
617   if (!__kmp_cpuinfo.sse2) {
618     // CPU cannot execute SSE2 instructions.
619   } else {
620 #if KMP_COMPILER_ICC
621     _mm_mfence();
622 #elif KMP_COMPILER_MSVC
623     MemoryBarrier();
624 #else
625     __sync_synchronize();
626 #endif // KMP_COMPILER_ICC
627   }
628 #endif // KMP_MIC
629 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
630 // Nothing to see here; move along.
631 #elif KMP_ARCH_PPC64
632 // Nothing needed here (we have a real MB above).
633 #if KMP_OS_CNK
634   // The flushing thread needs to yield here; this prevents a
635   // busy-waiting thread from saturating the pipeline. flush is
636   // often used in loops like this:
637   // while (!flag) {
638   //   #pragma omp flush(flag)
639   // }
640   // and adding the yield here is good for at least a 10x speedup
641   // when running >2 threads per core (on the NAS LU benchmark).
642   __kmp_yield(TRUE);
643 #endif
644 #else
645 #error Unknown or unsupported architecture
646 #endif
647 
648 #if OMPT_SUPPORT && OMPT_OPTIONAL
649   if (ompt_enabled.ompt_callback_flush) {
650     ompt_callbacks.ompt_callback(ompt_callback_flush)(
651         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
652   }
653 #endif
654 }
655 
656 /* -------------------------------------------------------------------------- */
657 /*!
658 @ingroup SYNCHRONIZATION
659 @param loc source location information
660 @param global_tid thread id.
661 
662 Execute a barrier.
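
A sketch of a typical lowering (illustrative only); the thread id is normally
obtained once and reused:
@code
// #pragma omp barrier
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_barrier(&loc, gtid);
@endcode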
663 */
664 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
665   KMP_COUNT_BLOCK(OMP_BARRIER);
666   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
667 
668   if (!TCR_4(__kmp_init_parallel))
669     __kmp_parallel_initialize();
670 
671   if (__kmp_env_consistency_check) {
672     if (loc == 0) {
673       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
674     }
675 
676     __kmp_check_barrier(global_tid, ct_barrier, loc);
677   }
678 
679 #if OMPT_SUPPORT
680   ompt_frame_t *ompt_frame;
681   if (ompt_enabled.enabled) {
682     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
683     if (ompt_frame->enter_frame == NULL)
684       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
685     OMPT_STORE_RETURN_ADDRESS(global_tid);
686   }
687 #endif
688   __kmp_threads[global_tid]->th.th_ident = loc;
689   // TODO: explicit barrier_wait_id:
690   //   this function is called when 'barrier' directive is present or
691   //   implicit barrier at the end of a worksharing construct.
692   // 1) better to add a per-thread barrier counter to a thread data structure
693   // 2) set to 0 when a new team is created
694   // 3) no sync is required
695 
696   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
697 #if OMPT_SUPPORT && OMPT_OPTIONAL
698   if (ompt_enabled.enabled) {
699     ompt_frame->enter_frame = NULL;
700   }
701 #endif
702 }
703 
704 /* The BARRIER for a MASTER section is always explicit   */
705 /*!
706 @ingroup WORK_SHARING
707 @param loc  source location information.
708 @param global_tid  global thread number.
709 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
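
A hedged sketch of how the master/end_master pair is expected to bracket the
region body (gtid as returned by __kmpc_global_thread_num()):
@code
// #pragma omp master
if (__kmpc_master(&loc, gtid)) {
  // ... master-only body ...
  __kmpc_end_master(&loc, gtid);
}
@endcode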
710 */
711 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
712   int status = 0;
713 
714   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
715 
716   if (!TCR_4(__kmp_init_parallel))
717     __kmp_parallel_initialize();
718 
719   if (KMP_MASTER_GTID(global_tid)) {
720     KMP_COUNT_BLOCK(OMP_MASTER);
721     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
722     status = 1;
723   }
724 
725 #if OMPT_SUPPORT && OMPT_OPTIONAL
726   if (status) {
727     if (ompt_enabled.ompt_callback_master) {
728       kmp_info_t *this_thr = __kmp_threads[global_tid];
729       kmp_team_t *team = this_thr->th.th_team;
730 
731       int tid = __kmp_tid_from_gtid(global_tid);
732       ompt_callbacks.ompt_callback(ompt_callback_master)(
733           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
734           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
735           OMPT_GET_RETURN_ADDRESS(0));
736     }
737   }
738 #endif
739 
740   if (__kmp_env_consistency_check) {
741 #if KMP_USE_DYNAMIC_LOCK
742     if (status)
743       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
744     else
745       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
746 #else
747     if (status)
748       __kmp_push_sync(global_tid, ct_master, loc, NULL);
749     else
750       __kmp_check_sync(global_tid, ct_master, loc, NULL);
751 #endif
752   }
753 
754   return status;
755 }
756 
757 /*!
758 @ingroup WORK_SHARING
759 @param loc  source location information.
760 @param global_tid  global thread number.
761 
762 Mark the end of a <tt>master</tt> region. This should only be called by the
763 thread that executes the <tt>master</tt> region.
764 */
765 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
766   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
767 
768   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
769   KMP_POP_PARTITIONED_TIMER();
770 
771 #if OMPT_SUPPORT && OMPT_OPTIONAL
772   kmp_info_t *this_thr = __kmp_threads[global_tid];
773   kmp_team_t *team = this_thr->th.th_team;
774   if (ompt_enabled.ompt_callback_master) {
775     int tid = __kmp_tid_from_gtid(global_tid);
776     ompt_callbacks.ompt_callback(ompt_callback_master)(
777         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
778         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
779         OMPT_GET_RETURN_ADDRESS(0));
780   }
781 #endif
782 
783   if (__kmp_env_consistency_check) {
784     if (global_tid < 0)
785       KMP_WARNING(ThreadIdentInvalid);
786 
787     if (KMP_MASTER_GTID(global_tid))
788       __kmp_pop_sync(global_tid, ct_master, loc);
789   }
790 }
791 
792 /*!
793 @ingroup WORK_SHARING
794 @param loc  source location information.
795 @param gtid  global thread number.
796 
797 Start execution of an <tt>ordered</tt> construct.
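
A sketch of the expected bracketing (illustrative), inside the chunk of a
worksharing loop that carries an ordered clause:
@code
__kmpc_ordered(&loc, gtid);
// ... ordered-region body, executed in iteration order ...
__kmpc_end_ordered(&loc, gtid);
@endcode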
798 */
799 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
800   int cid = 0;
801   kmp_info_t *th;
802   KMP_DEBUG_ASSERT(__kmp_init_serial);
803 
804   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
805 
806   if (!TCR_4(__kmp_init_parallel))
807     __kmp_parallel_initialize();
808 
809 #if USE_ITT_BUILD
810   __kmp_itt_ordered_prep(gtid);
811 // TODO: ordered_wait_id
812 #endif /* USE_ITT_BUILD */
813 
814   th = __kmp_threads[gtid];
815 
816 #if OMPT_SUPPORT && OMPT_OPTIONAL
817   kmp_team_t *team;
818   ompt_wait_id_t lck;
819   void *codeptr_ra;
820   if (ompt_enabled.enabled) {
821     OMPT_STORE_RETURN_ADDRESS(gtid);
822     team = __kmp_team_from_gtid(gtid);
823     lck = (ompt_wait_id_t)&team->t.t_ordered.dt.t_value;
824     /* OMPT state update */
825     th->th.ompt_thread_info.wait_id = lck;
826     th->th.ompt_thread_info.state = omp_state_wait_ordered;
827 
828     /* OMPT event callback */
829     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
830     if (ompt_enabled.ompt_callback_mutex_acquire) {
831       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
832           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
833           (ompt_wait_id_t)lck, codeptr_ra);
834     }
835   }
836 #endif
837 
838   if (th->th.th_dispatch->th_deo_fcn != 0)
839     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
840   else
841     __kmp_parallel_deo(&gtid, &cid, loc);
842 
843 #if OMPT_SUPPORT && OMPT_OPTIONAL
844   if (ompt_enabled.enabled) {
845     /* OMPT state update */
846     th->th.ompt_thread_info.state = omp_state_work_parallel;
847     th->th.ompt_thread_info.wait_id = 0;
848 
849     /* OMPT event callback */
850     if (ompt_enabled.ompt_callback_mutex_acquired) {
851       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
852           ompt_mutex_ordered, (ompt_wait_id_t)lck, codeptr_ra);
853     }
854   }
855 #endif
856 
857 #if USE_ITT_BUILD
858   __kmp_itt_ordered_start(gtid);
859 #endif /* USE_ITT_BUILD */
860 }
861 
862 /*!
863 @ingroup WORK_SHARING
864 @param loc  source location information.
865 @param gtid  global thread number.
866 
867 End execution of an <tt>ordered</tt> construct.
868 */
869 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
870   int cid = 0;
871   kmp_info_t *th;
872 
873   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
874 
875 #if USE_ITT_BUILD
876   __kmp_itt_ordered_end(gtid);
877 // TODO: ordered_wait_id
878 #endif /* USE_ITT_BUILD */
879 
880   th = __kmp_threads[gtid];
881 
882   if (th->th.th_dispatch->th_dxo_fcn != 0)
883     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
884   else
885     __kmp_parallel_dxo(&gtid, &cid, loc);
886 
887 #if OMPT_SUPPORT && OMPT_OPTIONAL
888   OMPT_STORE_RETURN_ADDRESS(gtid);
889   if (ompt_enabled.ompt_callback_mutex_released) {
890     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
891         ompt_mutex_ordered,
892         (ompt_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
893         OMPT_LOAD_RETURN_ADDRESS(gtid));
894   }
895 #endif
896 }
897 
898 #if KMP_USE_DYNAMIC_LOCK
899 
900 static __forceinline void
901 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
902                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
903   // Pointer to the allocated indirect lock is written to crit, while indexing
904   // is ignored.
905   void *idx;
906   kmp_indirect_lock_t **lck;
907   lck = (kmp_indirect_lock_t **)crit;
908   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
909   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
910   KMP_SET_I_LOCK_LOCATION(ilk, loc);
911   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
912   KA_TRACE(20,
913            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
914 #if USE_ITT_BUILD
915   __kmp_itt_critical_creating(ilk->lock, loc);
916 #endif
917   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
918   if (status == 0) {
919 #if USE_ITT_BUILD
920     __kmp_itt_critical_destroyed(ilk->lock);
921 #endif
922     // We don't really need to destroy the unclaimed lock here since it will be
923     // cleaned up at program exit.
924     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
925   }
926   KMP_DEBUG_ASSERT(*lck != NULL);
927 }
928 
929 // Fast-path acquire tas lock
930 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
931   {                                                                            \
932     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
933     if (l->lk.poll != KMP_LOCK_FREE(tas) ||                                    \
934         !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),        \
935                                      KMP_LOCK_BUSY(gtid + 1, tas))) {          \
936       kmp_uint32 spins;                                                        \
937       KMP_FSYNC_PREPARE(l);                                                    \
938       KMP_INIT_YIELD(spins);                                                   \
939       if (TCR_4(__kmp_nth) >                                                   \
940           (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
941         KMP_YIELD(TRUE);                                                       \
942       } else {                                                                 \
943         KMP_YIELD_SPIN(spins);                                                 \
944       }                                                                        \
945       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
946       while (l->lk.poll != KMP_LOCK_FREE(tas) ||                               \
947              !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),   \
948                                           KMP_LOCK_BUSY(gtid + 1, tas))) {     \
949         __kmp_spin_backoff(&backoff);                                          \
950         if (TCR_4(__kmp_nth) >                                                 \
951             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
952           KMP_YIELD(TRUE);                                                     \
953         } else {                                                               \
954           KMP_YIELD_SPIN(spins);                                               \
955         }                                                                      \
956       }                                                                        \
957     }                                                                          \
958     KMP_FSYNC_ACQUIRED(l);                                                     \
959   }
960 
961 // Fast-path test tas lock
962 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
963   {                                                                            \
964     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
965     rc = l->lk.poll == KMP_LOCK_FREE(tas) &&                                   \
966          KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),        \
967                                      KMP_LOCK_BUSY(gtid + 1, tas));            \
968   }
969 
970 // Fast-path release tas lock
971 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
972   {                                                                            \
973     TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas));              \
974     KMP_MB();                                                                  \
975   }
976 
977 #if KMP_USE_FUTEX
978 
979 #include <sys/syscall.h>
980 #include <unistd.h>
981 #ifndef FUTEX_WAIT
982 #define FUTEX_WAIT 0
983 #endif
984 #ifndef FUTEX_WAKE
985 #define FUTEX_WAKE 1
986 #endif
987 
988 // Fast-path acquire futex lock
989 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
990   {                                                                            \
991     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
992     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
993     KMP_MB();                                                                  \
994     KMP_FSYNC_PREPARE(ftx);                                                    \
995     kmp_int32 poll_val;                                                        \
996     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
997                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
998                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
999       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1000       if (!cond) {                                                             \
1001         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1002                                          poll_val |                            \
1003                                              KMP_LOCK_BUSY(1, futex))) {       \
1004           continue;                                                            \
1005         }                                                                      \
1006         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1007       }                                                                        \
1008       kmp_int32 rc;                                                            \
1009       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1010                         NULL, NULL, 0)) != 0) {                                \
1011         continue;                                                              \
1012       }                                                                        \
1013       gtid_code |= 1;                                                          \
1014     }                                                                          \
1015     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1016   }
1017 
1018 // Fast-path test futex lock
1019 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1020   {                                                                            \
1021     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1022     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1023                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1024       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1025       rc = TRUE;                                                               \
1026     } else {                                                                   \
1027       rc = FALSE;                                                              \
1028     }                                                                          \
1029   }
1030 
1031 // Fast-path release futex lock
1032 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1033   {                                                                            \
1034     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1035     KMP_MB();                                                                  \
1036     KMP_FSYNC_RELEASING(ftx);                                                  \
1037     kmp_int32 poll_val =                                                       \
1038         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1039     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1040       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1041               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1042     }                                                                          \
1043     KMP_MB();                                                                  \
1044     KMP_YIELD(TCR_4(__kmp_nth) >                                               \
1045               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
1046   }
1047 
1048 #endif // KMP_USE_FUTEX
1049 
1050 #else // KMP_USE_DYNAMIC_LOCK
1051 
1052 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1053                                                       ident_t const *loc,
1054                                                       kmp_int32 gtid) {
1055   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1056 
1057   // Because of the double-check, the following load doesn't need to be volatile
1058   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1059 
1060   if (lck == NULL) {
1061     void *idx;
1062 
1063     // Allocate & initialize the lock.
1064     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1065     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1066     __kmp_init_user_lock_with_checks(lck);
1067     __kmp_set_user_lock_location(lck, loc);
1068 #if USE_ITT_BUILD
1069     __kmp_itt_critical_creating(lck);
1070 // __kmp_itt_critical_creating() should be called *before* the first usage
1071 // of the underlying lock. It is the only place where we can guarantee it.
1072 // There is a chance the lock will be destroyed without ever being used, but
1073 // that is not a problem: this is not a real event seen by the user, just the
1074 // setting of a name for an object (the lock). See more details in kmp_itt.h.
1075 #endif /* USE_ITT_BUILD */
1076 
1077     // Use a cmpxchg instruction to slam the start of the critical section with
1078     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1079     // and use the lock that the other thread allocated.
1080     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1081 
1082     if (status == 0) {
1083 // Deallocate the lock and reload the value.
1084 #if USE_ITT_BUILD
1085       __kmp_itt_critical_destroyed(lck);
1086 // Let ITT know the lock is destroyed and the same memory location may be reused
1087 // for another purpose.
1088 #endif /* USE_ITT_BUILD */
1089       __kmp_destroy_user_lock_with_checks(lck);
1090       __kmp_user_lock_free(&idx, gtid, lck);
1091       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1092       KMP_DEBUG_ASSERT(lck != NULL);
1093     }
1094   }
1095   return lck;
1096 }
1097 
1098 #endif // KMP_USE_DYNAMIC_LOCK
1099 
1100 /*!
1101 @ingroup WORK_SHARING
1102 @param loc  source location information.
1103 @param global_tid  global thread number.
1104 @param crit identity of the critical section. This could be a pointer to a lock
1105 associated with the critical section, or some other suitably unique value.
1106 
1107 Enter code protected by a `critical` construct.
1108 This function blocks until the executing thread can enter the critical section.
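
As an illustrative sketch, a named critical construct is typically lowered
around a compiler-generated, zero-initialized kmp_critical_name object shared
by every occurrence of that name (the variable name here is hypothetical):
@code
static kmp_critical_name crit_update;  // zero-initialized lock word(s)

// #pragma omp critical (update)
__kmpc_critical(&loc, gtid, &crit_update);
// ... protected body ...
__kmpc_end_critical(&loc, gtid, &crit_update);
@endcode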
1109 */
1110 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1111                      kmp_critical_name *crit) {
1112 #if KMP_USE_DYNAMIC_LOCK
1113 #if OMPT_SUPPORT && OMPT_OPTIONAL
1114   OMPT_STORE_RETURN_ADDRESS(global_tid);
1115 #endif // OMPT_SUPPORT
1116   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1117 #else
1118   KMP_COUNT_BLOCK(OMP_CRITICAL);
1119   KMP_TIME_PARTITIONED_BLOCK(
1120       OMP_critical_wait); /* Time spent waiting to enter the critical section */
1121 #if OMPT_SUPPORT && OMPT_OPTIONAL
1122   omp_state_t prev_state = omp_state_undefined;
1123   ompt_thread_info_t ti;
1124 #endif
1125   kmp_user_lock_p lck;
1126 
1127   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1128 
1129   // TODO: add THR_OVHD_STATE
1130 
1131   KMP_CHECK_USER_LOCK_INIT();
1132 
1133   if ((__kmp_user_lock_kind == lk_tas) &&
1134       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1135     lck = (kmp_user_lock_p)crit;
1136   }
1137 #if KMP_USE_FUTEX
1138   else if ((__kmp_user_lock_kind == lk_futex) &&
1139            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1140     lck = (kmp_user_lock_p)crit;
1141   }
1142 #endif
1143   else { // ticket, queuing or drdpa
1144     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1145   }
1146 
1147   if (__kmp_env_consistency_check)
1148     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1149 
1150 // Since the critical directive binds to all threads, not just the current
1151 // team, we have to check this even if we are in a serialized team.
1152 // Also, even if we are the uber thread, we still have to acquire the lock,
1153 // as we have to contend with sibling threads.
1154 
1155 #if USE_ITT_BUILD
1156   __kmp_itt_critical_acquiring(lck);
1157 #endif /* USE_ITT_BUILD */
1158 #if OMPT_SUPPORT && OMPT_OPTIONAL
1159   OMPT_STORE_RETURN_ADDRESS(global_tid);
1160   void *codeptr_ra = NULL;
1161   if (ompt_enabled.enabled) {
1162     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1163     /* OMPT state update */
1164     prev_state = ti.state;
1165     ti.wait_id = (ompt_wait_id_t)lck;
1166     ti.state = omp_state_wait_critical;
1167 
1168     /* OMPT event callback */
1169     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1170     if (ompt_enabled.ompt_callback_mutex_acquire) {
1171       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1172           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1173           (ompt_wait_id_t)crit, codeptr_ra);
1174     }
1175   }
1176 #endif
1177   // The value of 'crit' should be usable as a critical_id of the critical
1178   // section directive.
1179   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1180 
1181 #if USE_ITT_BUILD
1182   __kmp_itt_critical_acquired(lck);
1183 #endif /* USE_ITT_BUILD */
1184 #if OMPT_SUPPORT && OMPT_OPTIONAL
1185   if (ompt_enabled.enabled) {
1186     /* OMPT state update */
1187     ti.state = prev_state;
1188     ti.wait_id = 0;
1189 
1190     /* OMPT event callback */
1191     if (ompt_enabled.ompt_callback_mutex_acquired) {
1192       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1193           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr_ra);
1194     }
1195   }
1196 #endif
1197 
1198   KMP_START_EXPLICIT_TIMER(OMP_critical);
1199   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1200 #endif // KMP_USE_DYNAMIC_LOCK
1201 }
1202 
1203 #if KMP_USE_DYNAMIC_LOCK
1204 
1205 // Converts the given hint to an internal lock implementation
1206 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1207 #if KMP_USE_TSX
1208 #define KMP_TSX_LOCK(seq) lockseq_##seq
1209 #else
1210 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1211 #endif
1212 
1213 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1214 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1215 #else
1216 #define KMP_CPUINFO_RTM 0
1217 #endif
1218 
1219   // Hints that do not require further logic
1220   if (hint & kmp_lock_hint_hle)
1221     return KMP_TSX_LOCK(hle);
1222   if (hint & kmp_lock_hint_rtm)
1223     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1224   if (hint & kmp_lock_hint_adaptive)
1225     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1226 
1227   // Rule out conflicting hints first by returning the default lock
1228   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1229     return __kmp_user_lock_seq;
1230   if ((hint & omp_lock_hint_speculative) &&
1231       (hint & omp_lock_hint_nonspeculative))
1232     return __kmp_user_lock_seq;
1233 
1234   // Do not even consider speculation when it appears to be contended
1235   if (hint & omp_lock_hint_contended)
1236     return lockseq_queuing;
1237 
1238   // Uncontended lock without speculation
1239   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1240     return lockseq_tas;
1241 
1242   // HLE lock for speculation
1243   if (hint & omp_lock_hint_speculative)
1244     return KMP_TSX_LOCK(hle);
1245 
1246   return __kmp_user_lock_seq;
1247 }
1248 
1249 #if OMPT_SUPPORT && OMPT_OPTIONAL
1250 static kmp_mutex_impl_t
1251 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1252   if (user_lock) {
1253     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1254     case 0:
1255       break;
1256 #if KMP_USE_FUTEX
1257     case locktag_futex:
1258       return kmp_mutex_impl_queuing;
1259 #endif
1260     case locktag_tas:
1261       return kmp_mutex_impl_spin;
1262 #if KMP_USE_TSX
1263     case locktag_hle:
1264       return kmp_mutex_impl_speculative;
1265 #endif
1266     default:
1267       return ompt_mutex_impl_unknown;
1268     }
1269     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1270   }
1271   KMP_ASSERT(ilock);
1272   switch (ilock->type) {
1273 #if KMP_USE_TSX
1274   case locktag_adaptive:
1275   case locktag_rtm:
1276     return kmp_mutex_impl_speculative;
1277 #endif
1278   case locktag_nested_tas:
1279     return kmp_mutex_impl_spin;
1280 #if KMP_USE_FUTEX
1281   case locktag_nested_futex:
1282 #endif
1283   case locktag_ticket:
1284   case locktag_queuing:
1285   case locktag_drdpa:
1286   case locktag_nested_ticket:
1287   case locktag_nested_queuing:
1288   case locktag_nested_drdpa:
1289     return kmp_mutex_impl_queuing;
1290   default:
1291     return ompt_mutex_impl_unknown;
1292   }
1293 }
1294 
1295 // For locks without dynamic binding
1296 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1297   switch (__kmp_user_lock_kind) {
1298   case lk_tas:
1299     return kmp_mutex_impl_spin;
1300 #if KMP_USE_FUTEX
1301   case lk_futex:
1302 #endif
1303   case lk_ticket:
1304   case lk_queuing:
1305   case lk_drdpa:
1306     return kmp_mutex_impl_queuing;
1307 #if KMP_USE_TSX
1308   case lk_hle:
1309   case lk_rtm:
1310   case lk_adaptive:
1311     return kmp_mutex_impl_speculative;
1312 #endif
1313   default:
1314     return ompt_mutex_impl_unknown;
1315   }
1316 }
1317 #endif
1318 
1319 /*!
1320 @ingroup WORK_SHARING
1321 @param loc  source location information.
1322 @param global_tid  global thread number.
1323 @param crit identity of the critical section. This could be a pointer to a lock
1324 associated with the critical section, or some other suitably unique value.
1325 @param hint the lock hint.
1326 
1327 Enter code protected by a `critical` construct with a hint. The hint value is
1328 used to suggest a lock implementation. This function blocks until the executing
1329 thread can enter the critical section unless the hint suggests use of
1330 speculative execution and the hardware supports it.
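
A sketch of the expected call for a hinted critical construct (illustrative;
`crit_update` denotes a compiler-generated kmp_critical_name object as above):
@code
// #pragma omp critical (update) hint(omp_lock_hint_speculative)
__kmpc_critical_with_hint(&loc, gtid, &crit_update, omp_lock_hint_speculative);
// ... protected body ...
__kmpc_end_critical(&loc, gtid, &crit_update);
@endcode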
1331 */
1332 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1333                                kmp_critical_name *crit, uintptr_t hint) {
1334   KMP_COUNT_BLOCK(OMP_CRITICAL);
1335   kmp_user_lock_p lck;
1336 #if OMPT_SUPPORT && OMPT_OPTIONAL
1337   omp_state_t prev_state = omp_state_undefined;
1338   ompt_thread_info_t ti;
1339   // This is the case if called from __kmpc_critical:
1340   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1341   if (!codeptr)
1342     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1343 #endif
1344 
1345   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1346 
1347   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1348   // Check if it is initialized.
1349   if (*lk == 0) {
1350     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1351     if (KMP_IS_D_LOCK(lckseq)) {
1352       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1353                                   KMP_GET_D_TAG(lckseq));
1354     } else {
1355       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1356     }
1357   }
1358   // Branch for accessing the actual lock object and set operation. This
1359   // branching is inevitable since this lock initialization does not follow the
1360   // normal dispatch path (lock table is not used).
1361   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1362     lck = (kmp_user_lock_p)lk;
1363     if (__kmp_env_consistency_check) {
1364       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1365                       __kmp_map_hint_to_lock(hint));
1366     }
1367 #if USE_ITT_BUILD
1368     __kmp_itt_critical_acquiring(lck);
1369 #endif
1370 #if OMPT_SUPPORT && OMPT_OPTIONAL
1371     if (ompt_enabled.enabled) {
1372       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1373       /* OMPT state update */
1374       prev_state = ti.state;
1375       ti.wait_id = (ompt_wait_id_t)lck;
1376       ti.state = omp_state_wait_critical;
1377 
1378       /* OMPT event callback */
1379       if (ompt_enabled.ompt_callback_mutex_acquire) {
1380         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1381             ompt_mutex_critical, (unsigned int)hint,
1382             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)crit, codeptr);
1383       }
1384     }
1385 #endif
1386 #if KMP_USE_INLINED_TAS
1387     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1388       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1389     } else
1390 #elif KMP_USE_INLINED_FUTEX
1391     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1392       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1393     } else
1394 #endif
1395     {
1396       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1397     }
1398   } else {
1399     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1400     lck = ilk->lock;
1401     if (__kmp_env_consistency_check) {
1402       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1403                       __kmp_map_hint_to_lock(hint));
1404     }
1405 #if USE_ITT_BUILD
1406     __kmp_itt_critical_acquiring(lck);
1407 #endif
1408 #if OMPT_SUPPORT && OMPT_OPTIONAL
1409     if (ompt_enabled.enabled) {
1410       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1411       /* OMPT state update */
1412       prev_state = ti.state;
1413       ti.wait_id = (ompt_wait_id_t)lck;
1414       ti.state = omp_state_wait_critical;
1415 
1416       /* OMPT event callback */
1417       if (ompt_enabled.ompt_callback_mutex_acquire) {
1418         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1419             ompt_mutex_critical, (unsigned int)hint,
1420             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)crit, codeptr);
1421       }
1422     }
1423 #endif
1424     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1425   }
1426 
1427 #if USE_ITT_BUILD
1428   __kmp_itt_critical_acquired(lck);
1429 #endif /* USE_ITT_BUILD */
1430 #if OMPT_SUPPORT && OMPT_OPTIONAL
1431   if (ompt_enabled.enabled) {
1432     /* OMPT state update */
1433     ti.state = prev_state;
1434     ti.wait_id = 0;
1435 
1436     /* OMPT event callback */
1437     if (ompt_enabled.ompt_callback_mutex_acquired) {
1438       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1439           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr);
1440     }
1441   }
1442 #endif
1443 
1444   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1445   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1446 } // __kmpc_critical_with_hint
1447 
1448 #endif // KMP_USE_DYNAMIC_LOCK
1449 
1450 /*!
1451 @ingroup WORK_SHARING
1452 @param loc  source location information.
1453 @param global_tid  global thread number.
1454 @param crit identity of the critical section. This could be a pointer to a lock
1455 associated with the critical section, or some other suitably unique value.
1456 
1457 Leave a critical section, releasing any lock that was held during its execution.
1458 */
1459 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1460                          kmp_critical_name *crit) {
1461   kmp_user_lock_p lck;
1462 
1463   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1464 
1465 #if KMP_USE_DYNAMIC_LOCK
1466   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1467     lck = (kmp_user_lock_p)crit;
1468     KMP_ASSERT(lck != NULL);
1469     if (__kmp_env_consistency_check) {
1470       __kmp_pop_sync(global_tid, ct_critical, loc);
1471     }
1472 #if USE_ITT_BUILD
1473     __kmp_itt_critical_releasing(lck);
1474 #endif
1475 #if KMP_USE_INLINED_TAS
1476     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1477       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1478     } else
1479 #elif KMP_USE_INLINED_FUTEX
1480     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1481       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1482     } else
1483 #endif
1484     {
1485       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1486     }
1487   } else {
1488     kmp_indirect_lock_t *ilk =
1489         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1490     KMP_ASSERT(ilk != NULL);
1491     lck = ilk->lock;
1492     if (__kmp_env_consistency_check) {
1493       __kmp_pop_sync(global_tid, ct_critical, loc);
1494     }
1495 #if USE_ITT_BUILD
1496     __kmp_itt_critical_releasing(lck);
1497 #endif
1498     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1499   }
1500 
1501 #else // KMP_USE_DYNAMIC_LOCK
1502 
1503   if ((__kmp_user_lock_kind == lk_tas) &&
1504       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1505     lck = (kmp_user_lock_p)crit;
1506   }
1507 #if KMP_USE_FUTEX
1508   else if ((__kmp_user_lock_kind == lk_futex) &&
1509            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1510     lck = (kmp_user_lock_p)crit;
1511   }
1512 #endif
1513   else { // ticket, queuing or drdpa
1514     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1515   }
1516 
1517   KMP_ASSERT(lck != NULL);
1518 
1519   if (__kmp_env_consistency_check)
1520     __kmp_pop_sync(global_tid, ct_critical, loc);
1521 
1522 #if USE_ITT_BUILD
1523   __kmp_itt_critical_releasing(lck);
1524 #endif /* USE_ITT_BUILD */
1525   // Value of 'crit' should be good for using as a critical_id of the critical
1526   // section directive.
1527   __kmp_release_user_lock_with_checks(lck, global_tid);
1528 
1529 #endif // KMP_USE_DYNAMIC_LOCK
1530 
1531 #if OMPT_SUPPORT && OMPT_OPTIONAL
1532   /* OMPT release event triggers after lock is released; place here to trigger
1533    * for all #if branches */
1534   OMPT_STORE_RETURN_ADDRESS(global_tid);
1535   if (ompt_enabled.ompt_callback_mutex_released) {
1536     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1537         ompt_mutex_critical, (ompt_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
1538   }
1539 #endif
1540 
1541   KMP_POP_PARTITIONED_TIMER();
1542   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1543 }
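/* Usage sketch (illustrative only, not generated by this file): a compiler
   typically lowers "#pragma omp critical" to a matched pair of runtime calls
   that share one statically allocated kmp_critical_name; loc and gtid are
   assumed to be supplied by the surrounding compiler-generated code.

     static kmp_critical_name crit_name; // zero-initialized name/lock storage

     void lowered_critical_region(ident_t *loc, kmp_int32 gtid) {
       __kmpc_critical(loc, gtid, &crit_name); // enter the named region
       // ... user code protected by the critical construct ...
       __kmpc_end_critical(loc, gtid, &crit_name); // leave and release the lock
     }
*/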
1544 
1545 /*!
1546 @ingroup SYNCHRONIZATION
1547 @param loc source location information
1548 @param global_tid thread id.
1549 @return one if the thread should execute the master block, zero otherwise
1550 
1551 Start execution of a combined barrier and master. The barrier is executed inside
1552 this function.
1553 */
1554 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1555   int status;
1556 
1557   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1558 
1559   if (!TCR_4(__kmp_init_parallel))
1560     __kmp_parallel_initialize();
1561 
1562   if (__kmp_env_consistency_check)
1563     __kmp_check_barrier(global_tid, ct_barrier, loc);
1564 
1565 #if OMPT_SUPPORT
1566   ompt_frame_t *ompt_frame;
1567   if (ompt_enabled.enabled) {
1568     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1569     if (ompt_frame->enter_frame == NULL)
1570       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1571     OMPT_STORE_RETURN_ADDRESS(global_tid);
1572   }
1573 #endif
1574 #if USE_ITT_NOTIFY
1575   __kmp_threads[global_tid]->th.th_ident = loc;
1576 #endif
1577   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1578 #if OMPT_SUPPORT && OMPT_OPTIONAL
1579   if (ompt_enabled.enabled) {
1580     ompt_frame->enter_frame = NULL;
1581   }
1582 #endif
1583 
1584   return (status != 0) ? 0 : 1;
1585 }
1586 
1587 /*!
1588 @ingroup SYNCHRONIZATION
1589 @param loc source location information
1590 @param global_tid thread id.
1591 
1592 Complete the execution of a combined barrier and master. This function should
1593 only be called at the completion of the <tt>master</tt> code. Other threads will
1594 still be waiting at the barrier and this call releases them.
1595 */
1596 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1597   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1598 
1599   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1600 }
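/* Illustrative lowering (an assumption about typical compiler output): a
   combined barrier-and-master sequence can be generated as

     if (__kmpc_barrier_master(loc, gtid)) {
       // ... code executed only by the thread selected as master ...
       __kmpc_end_barrier_master(loc, gtid); // releases the waiting threads
     }
*/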
1601 
1602 /*!
1603 @ingroup SYNCHRONIZATION
1604 @param loc source location information
1605 @param global_tid thread id.
1606 @return one if the thread should execute the master block, zero otherwise
1607 
1608 Start execution of a combined barrier and master(nowait) construct.
1609 The barrier is executed inside this function.
There is no equivalent "end" function, since the barrier is completed inside
this call; the other threads do not wait for the master code to finish, so
there is nothing left to release afterwards.
1611 */
1612 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1613   kmp_int32 ret;
1614 
1615   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1616 
1617   if (!TCR_4(__kmp_init_parallel))
1618     __kmp_parallel_initialize();
1619 
1620   if (__kmp_env_consistency_check) {
1621     if (loc == 0) {
1622       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1623     }
1624     __kmp_check_barrier(global_tid, ct_barrier, loc);
1625   }
1626 
1627 #if OMPT_SUPPORT
1628   ompt_frame_t *ompt_frame;
1629   if (ompt_enabled.enabled) {
1630     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1631     if (ompt_frame->enter_frame == NULL)
1632       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1633     OMPT_STORE_RETURN_ADDRESS(global_tid);
1634   }
1635 #endif
1636 #if USE_ITT_NOTIFY
1637   __kmp_threads[global_tid]->th.th_ident = loc;
1638 #endif
1639   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1640 #if OMPT_SUPPORT && OMPT_OPTIONAL
1641   if (ompt_enabled.enabled) {
1642     ompt_frame->enter_frame = NULL;
1643   }
1644 #endif
1645 
1646   ret = __kmpc_master(loc, global_tid);
1647 
1648   if (__kmp_env_consistency_check) {
    /* There is no matching __kmpc_end_master call, so the checking/statistics
       actions of __kmpc_end_master are done here. */
1651 
1652     if (global_tid < 0) {
1653       KMP_WARNING(ThreadIdentInvalid);
1654     }
1655     if (ret) {
1656       /* only one thread should do the pop since only */
1657       /* one did the push (see __kmpc_master())       */
1658 
1659       __kmp_pop_sync(global_tid, ct_master, loc);
1660     }
1661   }
1662 
1663   return (ret);
1664 }
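/* Illustrative lowering (assumption): the nowait form needs no matching "end"
   call because the barrier already completed inside the function.

     if (__kmpc_barrier_master_nowait(loc, gtid)) {
       // ... master-only code; the other threads have already moved on ...
     }
*/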
1665 
1666 /* The BARRIER for a SINGLE process section is always explicit   */
1667 /*!
1668 @ingroup WORK_SHARING
1669 @param loc  source location information
1670 @param global_tid  global thread number
1671 @return One if this thread should execute the single construct, zero otherwise.
1672 
1673 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required (see the usage sketch
after __kmpc_end_single below).
1676 */
1677 
1678 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1679   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1680 
1681   if (rc) {
1682     // We are going to execute the single statement, so we should count it.
1683     KMP_COUNT_BLOCK(OMP_SINGLE);
1684     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1685   }
1686 
1687 #if OMPT_SUPPORT && OMPT_OPTIONAL
1688   kmp_info_t *this_thr = __kmp_threads[global_tid];
1689   kmp_team_t *team = this_thr->th.th_team;
1690   int tid = __kmp_tid_from_gtid(global_tid);
1691 
1692   if (ompt_enabled.enabled) {
1693     if (rc) {
1694       if (ompt_enabled.ompt_callback_work) {
1695         ompt_callbacks.ompt_callback(ompt_callback_work)(
1696             ompt_work_single_executor, ompt_scope_begin,
1697             &(team->t.ompt_team_info.parallel_data),
1698             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1699             1, OMPT_GET_RETURN_ADDRESS(0));
1700       }
1701     } else {
1702       if (ompt_enabled.ompt_callback_work) {
1703         ompt_callbacks.ompt_callback(ompt_callback_work)(
1704             ompt_work_single_other, ompt_scope_begin,
1705             &(team->t.ompt_team_info.parallel_data),
1706             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1707             1, OMPT_GET_RETURN_ADDRESS(0));
1708         ompt_callbacks.ompt_callback(ompt_callback_work)(
1709             ompt_work_single_other, ompt_scope_end,
1710             &(team->t.ompt_team_info.parallel_data),
1711             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1712             1, OMPT_GET_RETURN_ADDRESS(0));
1713       }
1714     }
1715   }
1716 #endif
1717 
1718   return rc;
1719 }
1720 
1721 /*!
1722 @ingroup WORK_SHARING
1723 @param loc  source location information
1724 @param global_tid  global thread number
1725 
1726 Mark the end of a <tt>single</tt> construct.  This function should
1727 only be called by the thread that executed the block of code protected
by the <tt>single</tt> construct.
1729 */
1730 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1731   __kmp_exit_single(global_tid);
1732   KMP_POP_PARTITIONED_TIMER();
1733 
1734 #if OMPT_SUPPORT && OMPT_OPTIONAL
1735   kmp_info_t *this_thr = __kmp_threads[global_tid];
1736   kmp_team_t *team = this_thr->th.th_team;
1737   int tid = __kmp_tid_from_gtid(global_tid);
1738 
1739   if (ompt_enabled.ompt_callback_work) {
1740     ompt_callbacks.ompt_callback(ompt_callback_work)(
1741         ompt_work_single_executor, ompt_scope_end,
1742         &(team->t.ompt_team_info.parallel_data),
1743         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1744         OMPT_GET_RETURN_ADDRESS(0));
1745   }
1746 #endif
1747 }
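/* Usage sketch (illustrative): a "#pragma omp single" block is typically
   lowered to the pair below, with the compiler adding an explicit barrier
   afterwards unless the construct carries a nowait clause.

     if (__kmpc_single(loc, gtid)) {
       // ... block executed by exactly one thread of the team ...
       __kmpc_end_single(loc, gtid);
     }
     __kmpc_barrier(loc, gtid); // omitted when nowait is specified
*/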
1748 
1749 /*!
1750 @ingroup WORK_SHARING
1751 @param loc Source location
1752 @param global_tid Global thread id
1753 
1754 Mark the end of a statically scheduled loop.
1755 */
1756 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1757   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1758 
1759 #if OMPT_SUPPORT && OMPT_OPTIONAL
1760   if (ompt_enabled.ompt_callback_work) {
1761     ompt_work_type_t ompt_work_type = ompt_work_loop;
1762     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1763     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1764     // Determine workshare type
1765     if (loc != NULL) {
1766       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1767         ompt_work_type = ompt_work_loop;
1768       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1769         ompt_work_type = ompt_work_sections;
1770       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1771         ompt_work_type = ompt_work_distribute;
1772       } else {
1773         // use default set above.
1774         // a warning about this case is provided in __kmpc_for_static_init
1775       }
1776       KMP_DEBUG_ASSERT(ompt_work_type);
1777     }
1778     ompt_callbacks.ompt_callback(ompt_callback_work)(
1779         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1780         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1781   }
1782 #endif
1783 
1784   if (__kmp_env_consistency_check)
1785     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1786 }
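/* Pairing sketch (an assumption about typical codegen; the matching init
   entry points and their exact signatures live elsewhere in the runtime):
   each statically scheduled loop is bracketed by an init call that computes
   this thread's chunk and by __kmpc_for_static_fini.

     kmp_int32 last = 0, lower = 0, upper = n - 1, stride = 1;
     __kmpc_for_static_init_4(loc, gtid, kmp_sch_static, &last, &lower,
                              &upper, &stride, 1, 0); // signature assumed
     for (kmp_int32 i = lower; i <= upper; ++i)
       loop_body(i); // hypothetical user loop body
     __kmpc_for_static_fini(loc, gtid);
*/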
1787 
1788 // User routines which take C-style arguments (call by value)
1789 // different from the Fortran equivalent routines
1790 
1791 void ompc_set_num_threads(int arg) {
1792   // !!!!! TODO: check the per-task binding
1793   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1794 }
1795 
1796 void ompc_set_dynamic(int flag) {
1797   kmp_info_t *thread;
1798 
1799   /* For the thread-private implementation of the internal controls */
1800   thread = __kmp_entry_thread();
1801 
1802   __kmp_save_internal_controls(thread);
1803 
1804   set__dynamic(thread, flag ? TRUE : FALSE);
1805 }
1806 
1807 void ompc_set_nested(int flag) {
1808   kmp_info_t *thread;
1809 
1810   /* For the thread-private internal controls implementation */
1811   thread = __kmp_entry_thread();
1812 
1813   __kmp_save_internal_controls(thread);
1814 
1815   set__nested(thread, flag ? TRUE : FALSE);
1816 }
1817 
1818 void ompc_set_max_active_levels(int max_active_levels) {
1819   /* TO DO */
1820   /* we want per-task implementation of this internal control */
1821 
1822   /* For the per-thread internal controls implementation */
1823   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1824 }
1825 
1826 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1827   // !!!!! TODO: check the per-task binding
1828   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1829 }
1830 
1831 int ompc_get_ancestor_thread_num(int level) {
1832   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1833 }
1834 
1835 int ompc_get_team_size(int level) {
1836   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1837 }
1838 
1839 void kmpc_set_stacksize(int arg) {
1840   // __kmp_aux_set_stacksize initializes the library if needed
1841   __kmp_aux_set_stacksize(arg);
1842 }
1843 
1844 void kmpc_set_stacksize_s(size_t arg) {
1845   // __kmp_aux_set_stacksize initializes the library if needed
1846   __kmp_aux_set_stacksize(arg);
1847 }
1848 
1849 void kmpc_set_blocktime(int arg) {
1850   int gtid, tid;
1851   kmp_info_t *thread;
1852 
1853   gtid = __kmp_entry_gtid();
1854   tid = __kmp_tid_from_gtid(gtid);
1855   thread = __kmp_thread_from_gtid(gtid);
1856 
1857   __kmp_aux_set_blocktime(arg, thread, tid);
1858 }
1859 
1860 void kmpc_set_library(int arg) {
1861   // __kmp_user_set_library initializes the library if needed
1862   __kmp_user_set_library((enum library_type)arg);
1863 }
1864 
1865 void kmpc_set_defaults(char const *str) {
1866   // __kmp_aux_set_defaults initializes the library if needed
1867   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1868 }
1869 
1870 void kmpc_set_disp_num_buffers(int arg) {
1871   // ignore after initialization because some teams have already
1872   // allocated dispatch buffers
1873   if (__kmp_init_serial == 0 && arg > 0)
1874     __kmp_dispatch_num_buffers = arg;
1875 }
1876 
1877 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1878 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1879   return -1;
1880 #else
1881   if (!TCR_4(__kmp_init_middle)) {
1882     __kmp_middle_initialize();
1883   }
1884   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1885 #endif
1886 }
1887 
1888 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1889 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1890   return -1;
1891 #else
1892   if (!TCR_4(__kmp_init_middle)) {
1893     __kmp_middle_initialize();
1894   }
1895   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1896 #endif
1897 }
1898 
1899 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1900 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1901   return -1;
1902 #else
1903   if (!TCR_4(__kmp_init_middle)) {
1904     __kmp_middle_initialize();
1905   }
1906   return __kmp_aux_get_affinity_mask_proc(proc, mask);
1907 #endif
1908 }
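/* User-level sketch (assumption about the usual mapping): these entry points
   back the kmp_* affinity-mask extension API, e.g.

     kmp_affinity_mask_t mask;
     kmp_create_affinity_mask(&mask);
     kmp_set_affinity_mask_proc(0, &mask); // add logical processor 0
     kmp_set_affinity(&mask);
     kmp_destroy_affinity_mask(&mask);

   The -1 return above indicates affinity is stubbed out or unsupported. */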
1909 
1910 /* -------------------------------------------------------------------------- */
1911 /*!
1912 @ingroup THREADPRIVATE
1913 @param loc       source location information
1914 @param gtid      global thread number
1915 @param cpy_size  size of the cpy_data buffer
1916 @param cpy_data  pointer to data to be copied
1917 @param cpy_func  helper function to call for copying data
1918 @param didit     flag variable: 1=single thread; 0=not single thread
1919 
1920 __kmpc_copyprivate implements the interface for the private data broadcast
1921 needed for the copyprivate clause associated with a single region in an
1922 OpenMP<sup>*</sup> program (both C and Fortran).
1923 All threads participating in the parallel region call this routine.
1924 One of the threads (called the single thread) should have the <tt>didit</tt>
1925 variable set to 1 and all other threads should have that variable set to 0.
1926 All threads pass a pointer to a data buffer (cpy_data) that they have built.
1927 
1928 The OpenMP specification forbids the use of nowait on the single region when a
1929 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
1930 barrier internally to avoid race conditions, so the code generation for the
1931 single region should avoid generating a barrier after the call to @ref
1932 __kmpc_copyprivate.
1933 
1934 The <tt>gtid</tt> parameter is the global thread id for the current thread.
1935 The <tt>loc</tt> parameter is a pointer to source location information.
1936 
1937 Internal implementation: The single thread will first copy its descriptor
1938 address (cpy_data) to a team-private location, then the other threads will each
1939 call the function pointed to by the parameter cpy_func, which carries out the
1940 copy by copying the data using the cpy_data buffer.
1941 
1942 The cpy_func routine used for the copy and the contents of the data area defined
1943 by cpy_data and cpy_size may be built in any fashion that will allow the copy
1944 to be done. For instance, the cpy_data buffer can hold the actual data to be
1945 copied or it may hold a list of pointers to the data. The cpy_func routine must
1946 interpret the cpy_data buffer appropriately.
1947 
1948 The interface to cpy_func is as follows:
1949 @code
1950 void cpy_func( void *destination, void *source )
1951 @endcode
1952 where void *destination is the cpy_data pointer for the thread being copied to
1953 and void *source is the cpy_data pointer for the thread being copied from.
1954 */
1955 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
1956                         void *cpy_data, void (*cpy_func)(void *, void *),
1957                         kmp_int32 didit) {
1958   void **data_ptr;
1959 
1960   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
1961 
1962   KMP_MB();
1963 
1964   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
1965 
1966   if (__kmp_env_consistency_check) {
1967     if (loc == 0) {
1968       KMP_WARNING(ConstructIdentInvalid);
1969     }
1970   }
1971 
1972   // ToDo: Optimize the following two barriers into some kind of split barrier
1973 
1974   if (didit)
1975     *data_ptr = cpy_data;
1976 
1977 #if OMPT_SUPPORT
1978   ompt_frame_t *ompt_frame;
1979   if (ompt_enabled.enabled) {
1980     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1981     if (ompt_frame->enter_frame == NULL)
1982       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1983     OMPT_STORE_RETURN_ADDRESS(gtid);
1984   }
1985 #endif
1986 /* This barrier is not a barrier region boundary */
1987 #if USE_ITT_NOTIFY
1988   __kmp_threads[gtid]->th.th_ident = loc;
1989 #endif
1990   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1991 
1992   if (!didit)
1993     (*cpy_func)(cpy_data, *data_ptr);
1994 
1995 // Consider next barrier a user-visible barrier for barrier region boundaries
1996 // Nesting checks are already handled by the single construct checks
1997 
1998 #if OMPT_SUPPORT
1999   if (ompt_enabled.enabled) {
2000     OMPT_STORE_RETURN_ADDRESS(gtid);
2001   }
2002 #endif
2003 #if USE_ITT_NOTIFY
  // TODO: check if it is needed (e.g. tasks can overwrite the location)
  __kmp_threads[gtid]->th.th_ident = loc;
2006 #endif
2007   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2008 #if OMPT_SUPPORT && OMPT_OPTIONAL
2009   if (ompt_enabled.enabled) {
2010     ompt_frame->enter_frame = NULL;
2011   }
2012 #endif
2013 }
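/* Illustrative copy helper (an assumption, normally generated by the
   compiler): when cpy_data points directly at the value to broadcast, the
   helper can be a plain byte copy of the agreed size (requires <string.h>).

     static void example_cpy_func(void *dst, void *src) {
       // both arguments are cpy_data buffers of participating threads
       memcpy(dst, src, sizeof(double));
     }

   Each thread would then call
     __kmpc_copyprivate(loc, gtid, sizeof(double), &my_value,
                        example_cpy_func, i_am_the_single_thread);
   where my_value and i_am_the_single_thread are hypothetical names and the
   latter is 1 only on the thread that executed the single block. */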
2014 
2015 /* -------------------------------------------------------------------------- */
2016 
2017 #define INIT_LOCK __kmp_init_user_lock_with_checks
2018 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2019 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2020 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2021 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2022 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2023   __kmp_acquire_nested_user_lock_with_checks_timed
2024 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2025 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2026 #define TEST_LOCK __kmp_test_user_lock_with_checks
2027 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2028 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2029 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2030 
2031 // TODO: Make check abort messages use location info & pass it into
2032 // with_checks routines
2033 
2034 #if KMP_USE_DYNAMIC_LOCK
2035 
2036 // internal lock initializer
2037 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2038                                                     kmp_dyna_lockseq_t seq) {
2039   if (KMP_IS_D_LOCK(seq)) {
2040     KMP_INIT_D_LOCK(lock, seq);
2041 #if USE_ITT_BUILD
2042     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2043 #endif
2044   } else {
2045     KMP_INIT_I_LOCK(lock, seq);
2046 #if USE_ITT_BUILD
2047     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2048     __kmp_itt_lock_creating(ilk->lock, loc);
2049 #endif
2050   }
2051 }
2052 
2053 // internal nest lock initializer
2054 static __forceinline void
2055 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2056                                kmp_dyna_lockseq_t seq) {
2057 #if KMP_USE_TSX
2058   // Don't have nested lock implementation for speculative locks
2059   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2060     seq = __kmp_user_lock_seq;
2061 #endif
2062   switch (seq) {
2063   case lockseq_tas:
2064     seq = lockseq_nested_tas;
2065     break;
2066 #if KMP_USE_FUTEX
2067   case lockseq_futex:
2068     seq = lockseq_nested_futex;
2069     break;
2070 #endif
2071   case lockseq_ticket:
2072     seq = lockseq_nested_ticket;
2073     break;
2074   case lockseq_queuing:
2075     seq = lockseq_nested_queuing;
2076     break;
2077   case lockseq_drdpa:
2078     seq = lockseq_nested_drdpa;
2079     break;
2080   default:
2081     seq = lockseq_nested_queuing;
2082   }
2083   KMP_INIT_I_LOCK(lock, seq);
2084 #if USE_ITT_BUILD
2085   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2086   __kmp_itt_lock_creating(ilk->lock, loc);
2087 #endif
2088 }
2089 
2090 /* initialize the lock with a hint */
2091 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2092                                 uintptr_t hint) {
2093   KMP_DEBUG_ASSERT(__kmp_init_serial);
2094   if (__kmp_env_consistency_check && user_lock == NULL) {
2095     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2096   }
2097 
2098   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2099 
2100 #if OMPT_SUPPORT && OMPT_OPTIONAL
2101   // This is the case, if called from omp_init_lock_with_hint:
2102   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2103   if (!codeptr)
2104     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2105   if (ompt_enabled.ompt_callback_lock_init) {
2106     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2107         ompt_mutex_lock, (omp_lock_hint_t)hint,
2108         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2109         codeptr);
2110   }
2111 #endif
2112 }
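/* User-level view (assumption about the usual mapping): the OpenMP 4.5
   routine omp_init_lock_with_hint() reaches this entry point, e.g.

     omp_lock_t l;
     omp_init_lock_with_hint(&l, omp_lock_hint_speculative);
     omp_set_lock(&l);
     // ... critical work ...
     omp_unset_lock(&l);
     omp_destroy_lock(&l);
*/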
2113 
2114 /* initialize the lock with a hint */
2115 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2116                                      void **user_lock, uintptr_t hint) {
2117   KMP_DEBUG_ASSERT(__kmp_init_serial);
2118   if (__kmp_env_consistency_check && user_lock == NULL) {
2119     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2120   }
2121 
2122   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2123 
2124 #if OMPT_SUPPORT && OMPT_OPTIONAL
2125   // This is the case, if called from omp_init_lock_with_hint:
2126   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2127   if (!codeptr)
2128     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2129   if (ompt_enabled.ompt_callback_lock_init) {
2130     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2131         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2132         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2133         codeptr);
2134   }
2135 #endif
2136 }
2137 
2138 #endif // KMP_USE_DYNAMIC_LOCK
2139 
2140 /* initialize the lock */
2141 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2142 #if KMP_USE_DYNAMIC_LOCK
2143 
2144   KMP_DEBUG_ASSERT(__kmp_init_serial);
2145   if (__kmp_env_consistency_check && user_lock == NULL) {
2146     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2147   }
2148   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2149 
2150 #if OMPT_SUPPORT && OMPT_OPTIONAL
2151   // This is the case, if called from omp_init_lock_with_hint:
2152   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2153   if (!codeptr)
2154     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2155   if (ompt_enabled.ompt_callback_lock_init) {
2156     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2157         ompt_mutex_lock, omp_lock_hint_none,
2158         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2159         codeptr);
2160   }
2161 #endif
2162 
2163 #else // KMP_USE_DYNAMIC_LOCK
2164 
2165   static char const *const func = "omp_init_lock";
2166   kmp_user_lock_p lck;
2167   KMP_DEBUG_ASSERT(__kmp_init_serial);
2168 
2169   if (__kmp_env_consistency_check) {
2170     if (user_lock == NULL) {
2171       KMP_FATAL(LockIsUninitialized, func);
2172     }
2173   }
2174 
2175   KMP_CHECK_USER_LOCK_INIT();
2176 
2177   if ((__kmp_user_lock_kind == lk_tas) &&
2178       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2179     lck = (kmp_user_lock_p)user_lock;
2180   }
2181 #if KMP_USE_FUTEX
2182   else if ((__kmp_user_lock_kind == lk_futex) &&
2183            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2184     lck = (kmp_user_lock_p)user_lock;
2185   }
2186 #endif
2187   else {
2188     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2189   }
2190   INIT_LOCK(lck);
2191   __kmp_set_user_lock_location(lck, loc);
2192 
2193 #if OMPT_SUPPORT && OMPT_OPTIONAL
2194   // This is the case, if called from omp_init_lock_with_hint:
2195   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2196   if (!codeptr)
2197     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2198   if (ompt_enabled.ompt_callback_lock_init) {
2199     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2200         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2201         (ompt_wait_id_t)user_lock, codeptr);
2202   }
2203 #endif
2204 
2205 #if USE_ITT_BUILD
2206   __kmp_itt_lock_creating(lck);
2207 #endif /* USE_ITT_BUILD */
2208 
2209 #endif // KMP_USE_DYNAMIC_LOCK
2210 } // __kmpc_init_lock
2211 
2212 /* initialize the lock */
2213 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2214 #if KMP_USE_DYNAMIC_LOCK
2215 
2216   KMP_DEBUG_ASSERT(__kmp_init_serial);
2217   if (__kmp_env_consistency_check && user_lock == NULL) {
2218     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2219   }
2220   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2221 
2222 #if OMPT_SUPPORT && OMPT_OPTIONAL
2223   // This is the case, if called from omp_init_lock_with_hint:
2224   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2225   if (!codeptr)
2226     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2227   if (ompt_enabled.ompt_callback_lock_init) {
2228     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2229         ompt_mutex_nest_lock, omp_lock_hint_none,
2230         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2231         codeptr);
2232   }
2233 #endif
2234 
2235 #else // KMP_USE_DYNAMIC_LOCK
2236 
2237   static char const *const func = "omp_init_nest_lock";
2238   kmp_user_lock_p lck;
2239   KMP_DEBUG_ASSERT(__kmp_init_serial);
2240 
2241   if (__kmp_env_consistency_check) {
2242     if (user_lock == NULL) {
2243       KMP_FATAL(LockIsUninitialized, func);
2244     }
2245   }
2246 
2247   KMP_CHECK_USER_LOCK_INIT();
2248 
2249   if ((__kmp_user_lock_kind == lk_tas) &&
2250       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2251        OMP_NEST_LOCK_T_SIZE)) {
2252     lck = (kmp_user_lock_p)user_lock;
2253   }
2254 #if KMP_USE_FUTEX
2255   else if ((__kmp_user_lock_kind == lk_futex) &&
2256            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2257             OMP_NEST_LOCK_T_SIZE)) {
2258     lck = (kmp_user_lock_p)user_lock;
2259   }
2260 #endif
2261   else {
2262     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2263   }
2264 
2265   INIT_NESTED_LOCK(lck);
2266   __kmp_set_user_lock_location(lck, loc);
2267 
2268 #if OMPT_SUPPORT && OMPT_OPTIONAL
2269   // This is the case, if called from omp_init_lock_with_hint:
2270   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2271   if (!codeptr)
2272     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2273   if (ompt_enabled.ompt_callback_lock_init) {
2274     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2275         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2276         (ompt_wait_id_t)user_lock, codeptr);
2277   }
2278 #endif
2279 
2280 #if USE_ITT_BUILD
2281   __kmp_itt_lock_creating(lck);
2282 #endif /* USE_ITT_BUILD */
2283 
2284 #endif // KMP_USE_DYNAMIC_LOCK
2285 } // __kmpc_init_nest_lock
2286 
2287 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2288 #if KMP_USE_DYNAMIC_LOCK
2289 
2290 #if USE_ITT_BUILD
2291   kmp_user_lock_p lck;
2292   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2293     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2294   } else {
2295     lck = (kmp_user_lock_p)user_lock;
2296   }
2297   __kmp_itt_lock_destroyed(lck);
2298 #endif
2299 #if OMPT_SUPPORT && OMPT_OPTIONAL
2300   // This is the case, if called from omp_init_lock_with_hint:
2301   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2302   if (!codeptr)
2303     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2304   if (ompt_enabled.ompt_callback_lock_destroy) {
2305     kmp_user_lock_p lck;
2306     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2307       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2308     } else {
2309       lck = (kmp_user_lock_p)user_lock;
2310     }
2311     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2312         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2313   }
2314 #endif
2315   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2316 #else
2317   kmp_user_lock_p lck;
2318 
2319   if ((__kmp_user_lock_kind == lk_tas) &&
2320       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2321     lck = (kmp_user_lock_p)user_lock;
2322   }
2323 #if KMP_USE_FUTEX
2324   else if ((__kmp_user_lock_kind == lk_futex) &&
2325            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2326     lck = (kmp_user_lock_p)user_lock;
2327   }
2328 #endif
2329   else {
2330     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2331   }
2332 
2333 #if OMPT_SUPPORT && OMPT_OPTIONAL
2334   // This is the case, if called from omp_init_lock_with_hint:
2335   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2336   if (!codeptr)
2337     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2338   if (ompt_enabled.ompt_callback_lock_destroy) {
2339     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2340         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2341   }
2342 #endif
2343 
2344 #if USE_ITT_BUILD
2345   __kmp_itt_lock_destroyed(lck);
2346 #endif /* USE_ITT_BUILD */
2347   DESTROY_LOCK(lck);
2348 
2349   if ((__kmp_user_lock_kind == lk_tas) &&
2350       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2351     ;
2352   }
2353 #if KMP_USE_FUTEX
2354   else if ((__kmp_user_lock_kind == lk_futex) &&
2355            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2356     ;
2357   }
2358 #endif
2359   else {
2360     __kmp_user_lock_free(user_lock, gtid, lck);
2361   }
2362 #endif // KMP_USE_DYNAMIC_LOCK
2363 } // __kmpc_destroy_lock
2364 
2365 /* destroy the lock */
2366 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2367 #if KMP_USE_DYNAMIC_LOCK
2368 
2369 #if USE_ITT_BUILD
2370   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2371   __kmp_itt_lock_destroyed(ilk->lock);
2372 #endif
2373 #if OMPT_SUPPORT && OMPT_OPTIONAL
2374   // This is the case, if called from omp_init_lock_with_hint:
2375   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2376   if (!codeptr)
2377     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2378   if (ompt_enabled.ompt_callback_lock_destroy) {
2379     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2380         ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2381   }
2382 #endif
2383   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2384 
2385 #else // KMP_USE_DYNAMIC_LOCK
2386 
2387   kmp_user_lock_p lck;
2388 
2389   if ((__kmp_user_lock_kind == lk_tas) &&
2390       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2391        OMP_NEST_LOCK_T_SIZE)) {
2392     lck = (kmp_user_lock_p)user_lock;
2393   }
2394 #if KMP_USE_FUTEX
2395   else if ((__kmp_user_lock_kind == lk_futex) &&
2396            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2397             OMP_NEST_LOCK_T_SIZE)) {
2398     lck = (kmp_user_lock_p)user_lock;
2399   }
2400 #endif
2401   else {
2402     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2403   }
2404 
2405 #if OMPT_SUPPORT && OMPT_OPTIONAL
2406   // This is the case, if called from omp_init_lock_with_hint:
2407   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2408   if (!codeptr)
2409     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2410   if (ompt_enabled.ompt_callback_lock_destroy) {
2411     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2412         ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2413   }
2414 #endif
2415 
2416 #if USE_ITT_BUILD
2417   __kmp_itt_lock_destroyed(lck);
2418 #endif /* USE_ITT_BUILD */
2419 
2420   DESTROY_NESTED_LOCK(lck);
2421 
2422   if ((__kmp_user_lock_kind == lk_tas) &&
2423       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2424        OMP_NEST_LOCK_T_SIZE)) {
2425     ;
2426   }
2427 #if KMP_USE_FUTEX
2428   else if ((__kmp_user_lock_kind == lk_futex) &&
2429            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2430             OMP_NEST_LOCK_T_SIZE)) {
2431     ;
2432   }
2433 #endif
2434   else {
2435     __kmp_user_lock_free(user_lock, gtid, lck);
2436   }
2437 #endif // KMP_USE_DYNAMIC_LOCK
2438 } // __kmpc_destroy_nest_lock
2439 
2440 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2441   KMP_COUNT_BLOCK(OMP_set_lock);
2442 #if KMP_USE_DYNAMIC_LOCK
2443   int tag = KMP_EXTRACT_D_TAG(user_lock);
2444 #if USE_ITT_BUILD
2445   __kmp_itt_lock_acquiring(
2446       (kmp_user_lock_p)
2447           user_lock); // itt function will get to the right lock object.
2448 #endif
2449 #if OMPT_SUPPORT && OMPT_OPTIONAL
2450   // This is the case, if called from omp_init_lock_with_hint:
2451   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2452   if (!codeptr)
2453     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2454   if (ompt_enabled.ompt_callback_mutex_acquire) {
2455     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2456         ompt_mutex_lock, omp_lock_hint_none,
2457         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2458         codeptr);
2459   }
2460 #endif
2461 #if KMP_USE_INLINED_TAS
2462   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2463     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2464   } else
2465 #elif KMP_USE_INLINED_FUTEX
2466   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2467     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2468   } else
2469 #endif
2470   {
2471     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2472   }
2473 #if USE_ITT_BUILD
2474   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2475 #endif
2476 #if OMPT_SUPPORT && OMPT_OPTIONAL
2477   if (ompt_enabled.ompt_callback_mutex_acquired) {
2478     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2479         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2480   }
2481 #endif
2482 
2483 #else // KMP_USE_DYNAMIC_LOCK
2484 
2485   kmp_user_lock_p lck;
2486 
2487   if ((__kmp_user_lock_kind == lk_tas) &&
2488       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2489     lck = (kmp_user_lock_p)user_lock;
2490   }
2491 #if KMP_USE_FUTEX
2492   else if ((__kmp_user_lock_kind == lk_futex) &&
2493            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2494     lck = (kmp_user_lock_p)user_lock;
2495   }
2496 #endif
2497   else {
2498     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2499   }
2500 
2501 #if USE_ITT_BUILD
2502   __kmp_itt_lock_acquiring(lck);
2503 #endif /* USE_ITT_BUILD */
2504 #if OMPT_SUPPORT && OMPT_OPTIONAL
2505   // This is the case, if called from omp_init_lock_with_hint:
2506   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2507   if (!codeptr)
2508     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2509   if (ompt_enabled.ompt_callback_mutex_acquire) {
2510     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2511         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2512         (ompt_wait_id_t)lck, codeptr);
2513   }
2514 #endif
2515 
2516   ACQUIRE_LOCK(lck, gtid);
2517 
2518 #if USE_ITT_BUILD
2519   __kmp_itt_lock_acquired(lck);
2520 #endif /* USE_ITT_BUILD */
2521 
2522 #if OMPT_SUPPORT && OMPT_OPTIONAL
2523   if (ompt_enabled.ompt_callback_mutex_acquired) {
2524     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2525         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2526   }
2527 #endif
2528 
2529 #endif // KMP_USE_DYNAMIC_LOCK
2530 }
2531 
2532 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2533 #if KMP_USE_DYNAMIC_LOCK
2534 
2535 #if USE_ITT_BUILD
2536   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2537 #endif
2538 #if OMPT_SUPPORT && OMPT_OPTIONAL
2539   // This is the case, if called from omp_init_lock_with_hint:
2540   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2541   if (!codeptr)
2542     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2543   if (ompt_enabled.enabled) {
2544     if (ompt_enabled.ompt_callback_mutex_acquire) {
2545       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2546           ompt_mutex_nest_lock, omp_lock_hint_none,
2547           __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2548           codeptr);
2549     }
2550   }
2551 #endif
2552   int acquire_status =
2553       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2554 #if USE_ITT_BUILD
2555   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2556 #endif
2557 
2558 #if OMPT_SUPPORT && OMPT_OPTIONAL
2559   if (ompt_enabled.enabled) {
2560     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2561       if (ompt_enabled.ompt_callback_mutex_acquired) {
2562         // lock_first
2563         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2564             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2565       }
2566     } else {
2567       if (ompt_enabled.ompt_callback_nest_lock) {
2568         // lock_next
2569         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2570             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
2571       }
2572     }
2573   }
2574 #endif
2575 
2576 #else // KMP_USE_DYNAMIC_LOCK
2577   int acquire_status;
2578   kmp_user_lock_p lck;
2579 
2580   if ((__kmp_user_lock_kind == lk_tas) &&
2581       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2582        OMP_NEST_LOCK_T_SIZE)) {
2583     lck = (kmp_user_lock_p)user_lock;
2584   }
2585 #if KMP_USE_FUTEX
2586   else if ((__kmp_user_lock_kind == lk_futex) &&
2587            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2588             OMP_NEST_LOCK_T_SIZE)) {
2589     lck = (kmp_user_lock_p)user_lock;
2590   }
2591 #endif
2592   else {
2593     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2594   }
2595 
2596 #if USE_ITT_BUILD
2597   __kmp_itt_lock_acquiring(lck);
2598 #endif /* USE_ITT_BUILD */
2599 #if OMPT_SUPPORT && OMPT_OPTIONAL
2600   // This is the case, if called from omp_init_lock_with_hint:
2601   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2602   if (!codeptr)
2603     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2604   if (ompt_enabled.enabled) {
2605     if (ompt_enabled.ompt_callback_mutex_acquire) {
2606       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2607           ompt_mutex_nest_lock, omp_lock_hint_none,
2608           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
2609     }
2610   }
2611 #endif
2612 
2613   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2614 
2615 #if USE_ITT_BUILD
2616   __kmp_itt_lock_acquired(lck);
2617 #endif /* USE_ITT_BUILD */
2618 
2619 #if OMPT_SUPPORT && OMPT_OPTIONAL
2620   if (ompt_enabled.enabled) {
2621     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2622       if (ompt_enabled.ompt_callback_mutex_acquired) {
2623         // lock_first
2624         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2625             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2626       }
2627     } else {
2628       if (ompt_enabled.ompt_callback_nest_lock) {
2629         // lock_next
2630         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2631             ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
2632       }
2633     }
2634   }
2635 #endif
2636 
2637 #endif // KMP_USE_DYNAMIC_LOCK
2638 }
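/* User-level sketch (assumption): nestable locks count acquisitions by the
   owning thread, so set/unset calls must pair up.

     omp_nest_lock_t nl;
     omp_init_nest_lock(&nl);
     omp_set_nest_lock(&nl); // depth 1
     omp_set_nest_lock(&nl); // depth 2; same owner, does not block
     omp_unset_nest_lock(&nl); // back to depth 1
     omp_unset_nest_lock(&nl); // fully released
     omp_destroy_nest_lock(&nl);
*/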
2639 
2640 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2641 #if KMP_USE_DYNAMIC_LOCK
2642 
2643   int tag = KMP_EXTRACT_D_TAG(user_lock);
2644 #if USE_ITT_BUILD
2645   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2646 #endif
2647 #if KMP_USE_INLINED_TAS
2648   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2649     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2650   } else
2651 #elif KMP_USE_INLINED_FUTEX
2652   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2653     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2654   } else
2655 #endif
2656   {
2657     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2658   }
2659 
2660 #if OMPT_SUPPORT && OMPT_OPTIONAL
2661   // This is the case, if called from omp_init_lock_with_hint:
2662   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2663   if (!codeptr)
2664     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2665   if (ompt_enabled.ompt_callback_mutex_released) {
2666     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2667         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2668   }
2669 #endif
2670 
2671 #else // KMP_USE_DYNAMIC_LOCK
2672 
2673   kmp_user_lock_p lck;
2674 
2675   /* Can't use serial interval since not block structured */
2676   /* release the lock */
2677 
2678   if ((__kmp_user_lock_kind == lk_tas) &&
2679       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2680 #if KMP_OS_LINUX &&                                                            \
2681     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2682 // "fast" path implemented to fix customer performance issue
2683 #if USE_ITT_BUILD
2684     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2685 #endif /* USE_ITT_BUILD */
2686     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2687     KMP_MB();
2688 
2689 #if OMPT_SUPPORT && OMPT_OPTIONAL
2690     // This is the case, if called from omp_init_lock_with_hint:
2691     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2692     if (!codeptr)
2693       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2694     if (ompt_enabled.ompt_callback_mutex_released) {
2695       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2696           ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2697     }
2698 #endif
2699 
2700     return;
2701 #else
2702     lck = (kmp_user_lock_p)user_lock;
2703 #endif
2704   }
2705 #if KMP_USE_FUTEX
2706   else if ((__kmp_user_lock_kind == lk_futex) &&
2707            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2708     lck = (kmp_user_lock_p)user_lock;
2709   }
2710 #endif
2711   else {
2712     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2713   }
2714 
2715 #if USE_ITT_BUILD
2716   __kmp_itt_lock_releasing(lck);
2717 #endif /* USE_ITT_BUILD */
2718 
2719   RELEASE_LOCK(lck, gtid);
2720 
2721 #if OMPT_SUPPORT && OMPT_OPTIONAL
2722   // This is the case, if called from omp_init_lock_with_hint:
2723   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2724   if (!codeptr)
2725     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2726   if (ompt_enabled.ompt_callback_mutex_released) {
2727     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2728         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2729   }
2730 #endif
2731 
2732 #endif // KMP_USE_DYNAMIC_LOCK
2733 }
2734 
2735 /* release the lock */
2736 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2737 #if KMP_USE_DYNAMIC_LOCK
2738 
2739 #if USE_ITT_BUILD
2740   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2741 #endif
2742   int release_status =
2743       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2744 
2745 #if OMPT_SUPPORT && OMPT_OPTIONAL
2746   // This is the case, if called from omp_init_lock_with_hint:
2747   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2748   if (!codeptr)
2749     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2750   if (ompt_enabled.enabled) {
2751     if (release_status == KMP_LOCK_RELEASED) {
2752       if (ompt_enabled.ompt_callback_mutex_released) {
2753         // release_lock_last
2754         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2755             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2756       }
2757     } else if (ompt_enabled.ompt_callback_nest_lock) {
2758       // release_lock_prev
2759       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2760           ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr);
2761     }
2762   }
2763 #endif
2764 
2765 #else // KMP_USE_DYNAMIC_LOCK
2766 
2767   kmp_user_lock_p lck;
2768 
2769   /* Can't use serial interval since not block structured */
2770 
2771   if ((__kmp_user_lock_kind == lk_tas) &&
2772       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2773        OMP_NEST_LOCK_T_SIZE)) {
2774 #if KMP_OS_LINUX &&                                                            \
2775     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2776     // "fast" path implemented to fix customer performance issue
2777     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2778 #if USE_ITT_BUILD
2779     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2780 #endif /* USE_ITT_BUILD */
2781 
2782 #if OMPT_SUPPORT && OMPT_OPTIONAL
2783     int release_status = KMP_LOCK_STILL_HELD;
2784 #endif
2785 
2786     if (--(tl->lk.depth_locked) == 0) {
2787       TCW_4(tl->lk.poll, 0);
2788 #if OMPT_SUPPORT && OMPT_OPTIONAL
2789       release_status = KMP_LOCK_RELEASED;
2790 #endif
2791     }
2792     KMP_MB();
2793 
2794 #if OMPT_SUPPORT && OMPT_OPTIONAL
2795     // This is the case, if called from omp_init_lock_with_hint:
2796     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2797     if (!codeptr)
2798       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2799     if (ompt_enabled.enabled) {
2800       if (release_status == KMP_LOCK_RELEASED) {
2801         if (ompt_enabled.ompt_callback_mutex_released) {
2802           // release_lock_last
2803           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2804               ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2805         }
2806       } else if (ompt_enabled.ompt_callback_nest_lock) {
2807         // release_lock_previous
2808         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)lck, codeptr);
2810       }
2811     }
2812 #endif
2813 
2814     return;
2815 #else
2816     lck = (kmp_user_lock_p)user_lock;
2817 #endif
2818   }
2819 #if KMP_USE_FUTEX
2820   else if ((__kmp_user_lock_kind == lk_futex) &&
2821            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2822             OMP_NEST_LOCK_T_SIZE)) {
2823     lck = (kmp_user_lock_p)user_lock;
2824   }
2825 #endif
2826   else {
2827     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2828   }
2829 
2830 #if USE_ITT_BUILD
2831   __kmp_itt_lock_releasing(lck);
2832 #endif /* USE_ITT_BUILD */
2833 
2834   int release_status;
2835   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2836 #if OMPT_SUPPORT && OMPT_OPTIONAL
2837   // This is the case, if called from omp_init_lock_with_hint:
2838   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2839   if (!codeptr)
2840     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2841   if (ompt_enabled.enabled) {
2842     if (release_status == KMP_LOCK_RELEASED) {
2843       if (ompt_enabled.ompt_callback_mutex_released) {
2844         // release_lock_last
2845         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2846             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2847       }
2848     } else if (ompt_enabled.ompt_callback_nest_lock) {
2849       // release_lock_previous
2850       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)lck, codeptr);
2852     }
2853   }
2854 #endif
2855 
2856 #endif // KMP_USE_DYNAMIC_LOCK
2857 }
2858 
2859 /* try to acquire the lock */
2860 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2861   KMP_COUNT_BLOCK(OMP_test_lock);
2862 
2863 #if KMP_USE_DYNAMIC_LOCK
2864   int rc;
2865   int tag = KMP_EXTRACT_D_TAG(user_lock);
2866 #if USE_ITT_BUILD
2867   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2868 #endif
2869 #if OMPT_SUPPORT && OMPT_OPTIONAL
2870   // This is the case, if called from omp_init_lock_with_hint:
2871   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2872   if (!codeptr)
2873     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2874   if (ompt_enabled.ompt_callback_mutex_acquire) {
2875     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2876         ompt_mutex_lock, omp_lock_hint_none,
2877         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2878         codeptr);
2879   }
2880 #endif
2881 #if KMP_USE_INLINED_TAS
2882   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2883     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2884   } else
2885 #elif KMP_USE_INLINED_FUTEX
2886   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2887     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2888   } else
2889 #endif
2890   {
2891     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2892   }
2893   if (rc) {
2894 #if USE_ITT_BUILD
2895     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2896 #endif
2897 #if OMPT_SUPPORT && OMPT_OPTIONAL
2898     if (ompt_enabled.ompt_callback_mutex_acquired) {
2899       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2900           ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2901     }
2902 #endif
2903     return FTN_TRUE;
2904   } else {
2905 #if USE_ITT_BUILD
2906     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2907 #endif
2908     return FTN_FALSE;
2909   }
2910 
2911 #else // KMP_USE_DYNAMIC_LOCK
2912 
2913   kmp_user_lock_p lck;
2914   int rc;
2915 
2916   if ((__kmp_user_lock_kind == lk_tas) &&
2917       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2918     lck = (kmp_user_lock_p)user_lock;
2919   }
2920 #if KMP_USE_FUTEX
2921   else if ((__kmp_user_lock_kind == lk_futex) &&
2922            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2923     lck = (kmp_user_lock_p)user_lock;
2924   }
2925 #endif
2926   else {
2927     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
2928   }
2929 
2930 #if USE_ITT_BUILD
2931   __kmp_itt_lock_acquiring(lck);
2932 #endif /* USE_ITT_BUILD */
2933 #if OMPT_SUPPORT && OMPT_OPTIONAL
2934   // This is the case, if called from omp_init_lock_with_hint:
2935   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2936   if (!codeptr)
2937     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2938   if (ompt_enabled.ompt_callback_mutex_acquire) {
2939     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2940         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2941         (ompt_wait_id_t)lck, codeptr);
2942   }
2943 #endif
2944 
2945   rc = TEST_LOCK(lck, gtid);
2946 #if USE_ITT_BUILD
2947   if (rc) {
2948     __kmp_itt_lock_acquired(lck);
2949   } else {
2950     __kmp_itt_lock_cancelled(lck);
2951   }
2952 #endif /* USE_ITT_BUILD */
2953 #if OMPT_SUPPORT && OMPT_OPTIONAL
2954   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
2955     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2956         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2957   }
2958 #endif
2959 
2960   return (rc ? FTN_TRUE : FTN_FALSE);
2961 
2962 /* Can't use serial interval since not block structured */
2963 
2964 #endif // KMP_USE_DYNAMIC_LOCK
2965 }
2966 
2967 /* try to acquire the lock */
2968 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2969 #if KMP_USE_DYNAMIC_LOCK
2970   int rc;
2971 #if USE_ITT_BUILD
2972   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2973 #endif
2974 #if OMPT_SUPPORT && OMPT_OPTIONAL
2975   // This is the case, if called from omp_init_lock_with_hint:
2976   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2977   if (!codeptr)
2978     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2979   if (ompt_enabled.ompt_callback_mutex_acquire) {
2980     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2981         ompt_mutex_nest_lock, omp_lock_hint_none,
2982         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2983         codeptr);
2984   }
2985 #endif
2986   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
2987 #if USE_ITT_BUILD
2988   if (rc) {
2989     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2990   } else {
2991     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2992   }
2993 #endif
2994 #if OMPT_SUPPORT && OMPT_OPTIONAL
2995   if (ompt_enabled.enabled && rc) {
2996     if (rc == 1) {
2997       if (ompt_enabled.ompt_callback_mutex_acquired) {
2998         // lock_first
2999         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3000             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
3001       }
3002     } else {
3003       if (ompt_enabled.ompt_callback_nest_lock) {
3004         // lock_next
3005         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3006             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
3007       }
3008     }
3009   }
3010 #endif
3011   return rc;
3012 
3013 #else // KMP_USE_DYNAMIC_LOCK
3014 
3015   kmp_user_lock_p lck;
3016   int rc;
3017 
3018   if ((__kmp_user_lock_kind == lk_tas) &&
3019       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3020        OMP_NEST_LOCK_T_SIZE)) {
3021     lck = (kmp_user_lock_p)user_lock;
3022   }
3023 #if KMP_USE_FUTEX
3024   else if ((__kmp_user_lock_kind == lk_futex) &&
3025            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3026             OMP_NEST_LOCK_T_SIZE)) {
3027     lck = (kmp_user_lock_p)user_lock;
3028   }
3029 #endif
3030   else {
3031     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3032   }
3033 
3034 #if USE_ITT_BUILD
3035   __kmp_itt_lock_acquiring(lck);
3036 #endif /* USE_ITT_BUILD */
3037 
3038 #if OMPT_SUPPORT && OMPT_OPTIONAL
3039   // This is the case, if called from omp_init_lock_with_hint:
3040   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3041   if (!codeptr)
3042     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
  }
3049 #endif
3050 
  rc = __kmp_test_nested_user_lock_with_checks(lck, gtid);
3052 #if USE_ITT_BUILD
3053   if (rc) {
3054     __kmp_itt_lock_acquired(lck);
3055   } else {
3056     __kmp_itt_lock_cancelled(lck);
3057   }
3058 #endif /* USE_ITT_BUILD */
3059 #if OMPT_SUPPORT && OMPT_OPTIONAL
3060   if (ompt_enabled.enabled && rc) {
3061     if (rc == 1) {
3062       if (ompt_enabled.ompt_callback_mutex_acquired) {
3063         // lock_first
3064         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3065             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
3066       }
3067     } else {
3068       if (ompt_enabled.ompt_callback_nest_lock) {
3069         // lock_next
3070         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
3072       }
3073     }
3074   }
3075 #endif
3076   return rc;
3077 
3078 /* Can't use serial interval since not block structured */
3079 
3080 #endif // KMP_USE_DYNAMIC_LOCK
3081 }
3082 
3083 // Interface to fast scalable reduce methods routines
3084 
3085 // keep the selected method in a thread local structure for cross-function
3086 // usage: will be used in __kmpc_end_reduce* functions;
3087 // another solution: to re-determine the method one more time in
3088 // __kmpc_end_reduce* functions (new prototype required then)
3089 // AT: which solution is better?
3090 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3091   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3092 
3093 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3094   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3095 
3096 // description of the packed_reduction_method variable: look at the macros in
3097 // kmp.h
3098 
3099 // used in a critical section reduce block
3100 static __forceinline void
3101 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3102                                           kmp_critical_name *crit) {
3103 
3104   // this lock was visible to a customer and to the threading profile tool as a
3105   // serial overhead span (although it's used for an internal purpose only)
3106   //            why was it visible in previous implementation?
3107   //            should we keep it visible in new reduce block?
3108   kmp_user_lock_p lck;
3109 
3110 #if KMP_USE_DYNAMIC_LOCK
3111 
3112   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3113   // Check if it is initialized.
3114   if (*lk == 0) {
3115     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3116       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3117                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3118     } else {
3119       __kmp_init_indirect_csptr(crit, loc, global_tid,
3120                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3121     }
3122   }
3123   // Branch for accessing the actual lock object and set operation. This
3124   // branching is inevitable since this lock initialization does not follow the
3125   // normal dispatch path (lock table is not used).
3126   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3127     lck = (kmp_user_lock_p)lk;
3128     KMP_DEBUG_ASSERT(lck != NULL);
3129     if (__kmp_env_consistency_check) {
3130       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3131     }
3132     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3133   } else {
3134     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3135     lck = ilk->lock;
3136     KMP_DEBUG_ASSERT(lck != NULL);
3137     if (__kmp_env_consistency_check) {
3138       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3139     }
3140     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3141   }
3142 
3143 #else // KMP_USE_DYNAMIC_LOCK
3144 
3145   // We know that the fast reduction code is only emitted by Intel compilers
3146   // with 32 byte critical sections. If there isn't enough space, then we
3147   // have to use a pointer.
3148   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3149     lck = (kmp_user_lock_p)crit;
3150   } else {
3151     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3152   }
3153   KMP_DEBUG_ASSERT(lck != NULL);
3154 
3155   if (__kmp_env_consistency_check)
3156     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3157 
3158   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3159 
3160 #endif // KMP_USE_DYNAMIC_LOCK
3161 }
3162 
3163 // used in a critical section reduce block
3164 static __forceinline void
3165 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3166                                         kmp_critical_name *crit) {
3167 
3168   kmp_user_lock_p lck;
3169 
3170 #if KMP_USE_DYNAMIC_LOCK
3171 
3172   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3173     lck = (kmp_user_lock_p)crit;
3174     if (__kmp_env_consistency_check)
3175       __kmp_pop_sync(global_tid, ct_critical, loc);
3176     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3177   } else {
3178     kmp_indirect_lock_t *ilk =
3179         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3180     if (__kmp_env_consistency_check)
3181       __kmp_pop_sync(global_tid, ct_critical, loc);
3182     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3183   }
3184 
3185 #else // KMP_USE_DYNAMIC_LOCK
3186 
3187   // We know that the fast reduction code is only emitted by Intel compilers
3188   // with 32 byte critical sections. If there isn't enough space, then we have
3189   // to use a pointer.
3190   if (__kmp_base_user_lock_size > 32) {
3191     lck = *((kmp_user_lock_p *)crit);
3192     KMP_ASSERT(lck != NULL);
3193   } else {
3194     lck = (kmp_user_lock_p)crit;
3195   }
3196 
3197   if (__kmp_env_consistency_check)
3198     __kmp_pop_sync(global_tid, ct_critical, loc);
3199 
3200   __kmp_release_user_lock_with_checks(lck, global_tid);
3201 
3202 #endif // KMP_USE_DYNAMIC_LOCK
3203 } // __kmp_end_critical_section_reduce_block
3204 
3205 #if OMP_40_ENABLED
3206 static __forceinline int
3207 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3208                                      int *task_state) {
3209   kmp_team_t *team;
3210 
  // Check whether we are inside a teams construct.
3212   if (th->th.th_teams_microtask) {
3213     *team_p = team = th->th.th_team;
3214     if (team->t.t_level == th->th.th_teams_level) {
3215       // This is reduction at teams construct.
3216       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3217       // Let's swap teams temporarily for the reduction.
3218       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3219       th->th.th_team = team->t.t_parent;
3220       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3221       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3222       *task_state = th->th.th_task_state;
3223       th->th.th_task_state = 0;
3224 
3225       return 1;
3226     }
3227   }
3228   return 0;
3229 }
3230 
3231 static __forceinline void
3232 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3233   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3234   th->th.th_info.ds.ds_tid = 0;
3235   th->th.th_team = team;
3236   th->th.th_team_nproc = team->t.t_nproc;
3237   th->th.th_task_team = team->t.t_task_team[task_state];
3238   th->th.th_task_state = task_state;
3239 }
3240 #endif
3241 
3242 /* 2.a.i. Reduce Block without a terminating barrier */
3243 /*!
3244 @ingroup SYNCHRONIZATION
3245 @param loc source location information
3246 @param global_tid global thread number
3247 @param num_vars number of items (variables) to be reduced
3248 @param reduce_size size of data in bytes to be reduced
3249 @param reduce_data pointer to data to be reduced
3250 @param reduce_func callback function providing reduction operation on two
3251 operands and returning result of reduction in lhs_data
3252 @param lck pointer to the unique lock data structure
3253 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3254 threads if atomic reduction needed
3255 
3256 The nowait version is used for a reduce clause with the nowait argument.
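
A hedged sketch of the calling pattern this entry point is designed for
(illustrative only: <tt>loc</tt>, <tt>gtid</tt>, <tt>reduce_func</tt>,
<tt>crit</tt> and the reduction variables are placeholders, and the exact code
a given compiler emits may differ):

@code
  // each thread arrives with its private partial result sum_priv
  switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(sum_priv), &sum_priv,
                               reduce_func, &crit)) {
  case 1: // combine into the shared variable, then close the reduce block
    sum_shared += sum_priv;
    __kmpc_end_reduce_nowait(loc, gtid, &crit);
    break;
  case 2: // atomic method: update the shared variable atomically;
          // code gen emits no __kmpc_end_reduce_nowait() call in this case
    break;
  default: // 0: nothing further to do on this thread
    break;
  }
@endcode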
3257 */
3258 kmp_int32
3259 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3260                      size_t reduce_size, void *reduce_data,
3261                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3262                      kmp_critical_name *lck) {
3263 
3264   KMP_COUNT_BLOCK(REDUCE_nowait);
3265   int retval = 0;
3266   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3267 #if OMP_40_ENABLED
3268   kmp_info_t *th;
3269   kmp_team_t *team;
3270   int teams_swapped = 0, task_state;
3271 #endif
3272   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3273 
3274   // why do we need this initialization here at all?
3275   // Reduction clause can not be used as a stand-alone directive.
3276 
3277   // do not call __kmp_serial_initialize(), it will be called by
3278   // __kmp_parallel_initialize() if needed
3279   // possible detection of false-positive race by the threadchecker ???
3280   if (!TCR_4(__kmp_init_parallel))
3281     __kmp_parallel_initialize();
3282 
3283 // check correctness of reduce block nesting
3284 #if KMP_USE_DYNAMIC_LOCK
3285   if (__kmp_env_consistency_check)
3286     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3287 #else
3288   if (__kmp_env_consistency_check)
3289     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3290 #endif
3291 
3292 #if OMP_40_ENABLED
3293   th = __kmp_thread_from_gtid(global_tid);
3294   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3295 #endif // OMP_40_ENABLED
3296 
3297   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3298   // the value should be kept in a variable
3299   // the variable should be either a construct-specific or thread-specific
3300   // property, not a team specific property
3301   //     (a thread can reach the next reduce block on the next construct, reduce
3302   //     method may differ on the next construct)
3303   // an ident_t "loc" parameter could be used as a construct-specific property
3304   // (what if loc == 0?)
3305   //     (if both construct-specific and team-specific variables were shared,
  //     then unnecessary extra syncs would be needed)
3307   // a thread-specific variable is better regarding two issues above (next
3308   // construct and extra syncs)
3309   // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes the 'determine' and 'set' lines (no need to have
  // only one thread execute them, which would require unnecessary extra syncs)
3312 
3313   packed_reduction_method = __kmp_determine_reduction_method(
3314       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3315   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3316 
3317   if (packed_reduction_method == critical_reduce_block) {
3318 
3319     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3320     retval = 1;
3321 
3322   } else if (packed_reduction_method == empty_reduce_block) {
3323 
3324     // usage: if team size == 1, no synchronization is required ( Intel
3325     // platforms only )
3326     retval = 1;
3327 
3328   } else if (packed_reduction_method == atomic_reduce_block) {
3329 
3330     retval = 2;
3331 
3332     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3333     // won't be called by the code gen)
    //     (this is not ideal: the checking block has already been closed by
    //      this 'pop', but the atomic operation has not executed yet; it runs
    //      slightly later, literally on the next instruction)
3338     if (__kmp_env_consistency_check)
3339       __kmp_pop_sync(global_tid, ct_reduce, loc);
3340 
3341   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3342                                    tree_reduce_block)) {
3343 
3344 // AT: performance issue: a real barrier here
3345 // AT:     (if master goes slow, other threads are blocked here waiting for the
3346 // master to come and release them)
3347 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3348 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3349 // be confusing to a customer)
3350 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3351 // might go faster and be more in line with sense of NOWAIT
3352 // AT: TO DO: do epcc test and compare times
3353 
3354 // this barrier should be invisible to a customer and to the threading profile
3355 // tool (it's neither a terminating barrier nor customer's code, it's
3356 // used for an internal purpose)
3357 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events, so we set up the necessary frame/return
    // addresses.
3362     ompt_frame_t *ompt_frame;
3363     if (ompt_enabled.enabled) {
3364       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3365       if (ompt_frame->enter_frame == NULL)
3366         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3367       OMPT_STORE_RETURN_ADDRESS(global_tid);
3368     }
3369 #endif
3370 #if USE_ITT_NOTIFY
3371     __kmp_threads[global_tid]->th.th_ident = loc;
3372 #endif
3373     retval =
3374         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3375                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3376     retval = (retval != 0) ? (0) : (1);
3377 #if OMPT_SUPPORT && OMPT_OPTIONAL
3378     if (ompt_enabled.enabled) {
3379       ompt_frame->enter_frame = NULL;
3380     }
3381 #endif
3382 
3383     // all other workers except master should do this pop here
3384     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3385     if (__kmp_env_consistency_check) {
3386       if (retval == 0) {
3387         __kmp_pop_sync(global_tid, ct_reduce, loc);
3388       }
3389     }
3390 
3391   } else {
3392 
3393     // should never reach this block
3394     KMP_ASSERT(0); // "unexpected method"
3395   }
3396 #if OMP_40_ENABLED
3397   if (teams_swapped) {
3398     __kmp_restore_swapped_teams(th, team, task_state);
3399   }
3400 #endif
3401   KA_TRACE(
3402       10,
3403       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3404        global_tid, packed_reduction_method, retval));
3405 
3406   return retval;
3407 }
3408 
3409 /*!
3410 @ingroup SYNCHRONIZATION
3411 @param loc source location information
3412 @param global_tid global thread id.
3413 @param lck pointer to the unique lock data structure
3414 
3415 Finish the execution of a reduce nowait.
3416 */
3417 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3418                               kmp_critical_name *lck) {
3419 
3420   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3421 
3422   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3423 
3424   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3425 
3426   if (packed_reduction_method == critical_reduce_block) {
3427 
3428     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3429 
3430   } else if (packed_reduction_method == empty_reduce_block) {
3431 
3432     // usage: if team size == 1, no synchronization is required ( on Intel
3433     // platforms only )
3434 
3435   } else if (packed_reduction_method == atomic_reduce_block) {
3436 
3437     // neither master nor other workers should get here
3438     //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely;
    // after removal this value would be checked by the 'else' and would assert
3441 
3442   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3443                                    tree_reduce_block)) {
3444 
3445     // only master gets here
3446 
3447   } else {
3448 
3449     // should never reach this block
3450     KMP_ASSERT(0); // "unexpected method"
3451   }
3452 
3453   if (__kmp_env_consistency_check)
3454     __kmp_pop_sync(global_tid, ct_reduce, loc);
3455 
3456   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3457                 global_tid, packed_reduction_method));
3458 
3459   return;
3460 }
3461 
3462 /* 2.a.ii. Reduce Block with a terminating barrier */
3463 
3464 /*!
3465 @ingroup SYNCHRONIZATION
3466 @param loc source location information
3467 @param global_tid global thread number
3468 @param num_vars number of items (variables) to be reduced
3469 @param reduce_size size of data in bytes to be reduced
3470 @param reduce_data pointer to data to be reduced
3471 @param reduce_func callback function providing reduction operation on two
3472 operands and returning result of reduction in lhs_data
3473 @param lck pointer to the unique lock data structure
3474 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3475 threads if atomic reduction needed
3476 
3477 A blocking reduce that includes an implicit barrier.
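
For reference, a minimal user-level fragment that a compiler may lower onto
this entry point (illustrative only; <tt>a</tt> and <tt>n</tt> are
placeholders). Since there is no nowait clause, the reduce block ends in the
construct's closing barrier, which __kmpc_reduce() and __kmpc_end_reduce()
implement between them:

@code
  int sum = 0;
#pragma omp parallel
  {
#pragma omp for reduction(+ : sum)
    for (int i = 0; i < n; ++i)
      sum += a[i];
    // all threads synchronize here before leaving the worksharing construct
  }
@endcode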
3478 */
3479 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3480                         size_t reduce_size, void *reduce_data,
3481                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3482                         kmp_critical_name *lck) {
3483   KMP_COUNT_BLOCK(REDUCE_wait);
3484   int retval = 0;
3485   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3486 #if OMP_40_ENABLED
3487   kmp_info_t *th;
3488   kmp_team_t *team;
3489   int teams_swapped = 0, task_state;
3490 #endif
3491 
3492   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3493 
3494   // why do we need this initialization here at all?
3495   // Reduction clause can not be a stand-alone directive.
3496 
3497   // do not call __kmp_serial_initialize(), it will be called by
3498   // __kmp_parallel_initialize() if needed
3499   // possible detection of false-positive race by the threadchecker ???
3500   if (!TCR_4(__kmp_init_parallel))
3501     __kmp_parallel_initialize();
3502 
3503 // check correctness of reduce block nesting
3504 #if KMP_USE_DYNAMIC_LOCK
3505   if (__kmp_env_consistency_check)
3506     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3507 #else
3508   if (__kmp_env_consistency_check)
3509     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3510 #endif
3511 
3512 #if OMP_40_ENABLED
3513   th = __kmp_thread_from_gtid(global_tid);
3514   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3515 #endif // OMP_40_ENABLED
3516 
3517   packed_reduction_method = __kmp_determine_reduction_method(
3518       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3519   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3520 
3521   if (packed_reduction_method == critical_reduce_block) {
3522 
3523     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3524     retval = 1;
3525 
3526   } else if (packed_reduction_method == empty_reduce_block) {
3527 
3528     // usage: if team size == 1, no synchronization is required ( Intel
3529     // platforms only )
3530     retval = 1;
3531 
3532   } else if (packed_reduction_method == atomic_reduce_block) {
3533 
3534     retval = 2;
3535 
3536   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3537                                    tree_reduce_block)) {
3538 
3539 // case tree_reduce_block:
3540 // this barrier should be visible to a customer and to the threading profile
3541 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3542 #if OMPT_SUPPORT
3543     ompt_frame_t *ompt_frame;
3544     if (ompt_enabled.enabled) {
3545       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3546       if (ompt_frame->enter_frame == NULL)
3547         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3548       OMPT_STORE_RETURN_ADDRESS(global_tid);
3549     }
3550 #endif
3551 #if USE_ITT_NOTIFY
3552     __kmp_threads[global_tid]->th.th_ident =
3553         loc; // needed for correct notification of frames
3554 #endif
3555     retval =
3556         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3557                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3558     retval = (retval != 0) ? (0) : (1);
3559 #if OMPT_SUPPORT && OMPT_OPTIONAL
3560     if (ompt_enabled.enabled) {
3561       ompt_frame->enter_frame = NULL;
3562     }
3563 #endif
3564 
3565     // all other workers except master should do this pop here
3566     // ( none of other workers except master will enter __kmpc_end_reduce() )
3567     if (__kmp_env_consistency_check) {
3568       if (retval == 0) { // 0: all other workers; 1: master
3569         __kmp_pop_sync(global_tid, ct_reduce, loc);
3570       }
3571     }
3572 
3573   } else {
3574 
3575     // should never reach this block
3576     KMP_ASSERT(0); // "unexpected method"
3577   }
3578 #if OMP_40_ENABLED
3579   if (teams_swapped) {
3580     __kmp_restore_swapped_teams(th, team, task_state);
3581   }
3582 #endif
3583 
3584   KA_TRACE(10,
3585            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3586             global_tid, packed_reduction_method, retval));
3587 
3588   return retval;
3589 }
3590 
3591 /*!
3592 @ingroup SYNCHRONIZATION
3593 @param loc source location information
3594 @param global_tid global thread id.
3595 @param lck pointer to the unique lock data structure
3596 
3597 Finish the execution of a blocking reduce.
3598 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3599 start function.
3600 */
3601 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3602                        kmp_critical_name *lck) {
3603 
3604   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3605 #if OMP_40_ENABLED
3606   kmp_info_t *th;
3607   kmp_team_t *team;
3608   int teams_swapped = 0, task_state;
3609 #endif
3610 
3611   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3612 
3613 #if OMP_40_ENABLED
3614   th = __kmp_thread_from_gtid(global_tid);
3615   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3616 #endif // OMP_40_ENABLED
3617 
3618   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3619 
3620   // this barrier should be visible to a customer and to the threading profile
3621   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3622 
3623   if (packed_reduction_method == critical_reduce_block) {
3624 
3625     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3626 
3627 // TODO: implicit barrier: should be exposed
3628 #if OMPT_SUPPORT
3629     ompt_frame_t *ompt_frame;
3630     if (ompt_enabled.enabled) {
3631       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3632       if (ompt_frame->enter_frame == NULL)
3633         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3634       OMPT_STORE_RETURN_ADDRESS(global_tid);
3635     }
3636 #endif
3637 #if USE_ITT_NOTIFY
3638     __kmp_threads[global_tid]->th.th_ident = loc;
3639 #endif
3640     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3641 #if OMPT_SUPPORT && OMPT_OPTIONAL
3642     if (ompt_enabled.enabled) {
3643       ompt_frame->enter_frame = NULL;
3644     }
3645 #endif
3646 
3647   } else if (packed_reduction_method == empty_reduce_block) {
3648 
3649 // usage: if team size==1, no synchronization is required (Intel platforms only)
3650 
3651 // TODO: implicit barrier: should be exposed
3652 #if OMPT_SUPPORT
3653     ompt_frame_t *ompt_frame;
3654     if (ompt_enabled.enabled) {
3655       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3656       if (ompt_frame->enter_frame == NULL)
3657         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3658       OMPT_STORE_RETURN_ADDRESS(global_tid);
3659     }
3660 #endif
3661 #if USE_ITT_NOTIFY
3662     __kmp_threads[global_tid]->th.th_ident = loc;
3663 #endif
3664     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3665 #if OMPT_SUPPORT && OMPT_OPTIONAL
3666     if (ompt_enabled.enabled) {
3667       ompt_frame->enter_frame = NULL;
3668     }
3669 #endif
3670 
3671   } else if (packed_reduction_method == atomic_reduce_block) {
3672 
3673 #if OMPT_SUPPORT
3674     ompt_frame_t *ompt_frame;
3675     if (ompt_enabled.enabled) {
3676       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3677       if (ompt_frame->enter_frame == NULL)
3678         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3679       OMPT_STORE_RETURN_ADDRESS(global_tid);
3680     }
3681 #endif
3682 // TODO: implicit barrier: should be exposed
3683 #if USE_ITT_NOTIFY
3684     __kmp_threads[global_tid]->th.th_ident = loc;
3685 #endif
3686     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3687 #if OMPT_SUPPORT && OMPT_OPTIONAL
3688     if (ompt_enabled.enabled) {
3689       ompt_frame->enter_frame = NULL;
3690     }
3691 #endif
3692 
3693   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3694                                    tree_reduce_block)) {
3695 
3696     // only master executes here (master releases all other workers)
3697     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3698                             global_tid);
3699 
3700   } else {
3701 
3702     // should never reach this block
3703     KMP_ASSERT(0); // "unexpected method"
3704   }
3705 #if OMP_40_ENABLED
3706   if (teams_swapped) {
3707     __kmp_restore_swapped_teams(th, team, task_state);
3708   }
3709 #endif
3710 
3711   if (__kmp_env_consistency_check)
3712     __kmp_pop_sync(global_tid, ct_reduce, loc);
3713 
3714   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3715                 global_tid, packed_reduction_method));
3716 
3717   return;
3718 }
3719 
3720 #undef __KMP_GET_REDUCTION_METHOD
3721 #undef __KMP_SET_REDUCTION_METHOD
3722 
3723 /* end of interface to fast scalable reduce routines */
3724 
3725 kmp_uint64 __kmpc_get_taskid() {
3726 
3727   kmp_int32 gtid;
3728   kmp_info_t *thread;
3729 
3730   gtid = __kmp_get_gtid();
3731   if (gtid < 0) {
3732     return 0;
3733   }
3734   thread = __kmp_thread_from_gtid(gtid);
3735   return thread->th.th_current_task->td_task_id;
3736 
3737 } // __kmpc_get_taskid
3738 
3739 kmp_uint64 __kmpc_get_parent_taskid() {
3740 
3741   kmp_int32 gtid;
3742   kmp_info_t *thread;
3743   kmp_taskdata_t *parent_task;
3744 
3745   gtid = __kmp_get_gtid();
3746   if (gtid < 0) {
3747     return 0;
3748   }
3749   thread = __kmp_thread_from_gtid(gtid);
3750   parent_task = thread->th.th_current_task->td_parent;
3751   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3752 
3753 } // __kmpc_get_parent_taskid
3754 
3755 #if OMP_45_ENABLED
3756 /*!
3757 @ingroup WORK_SHARING
3758 @param loc  source location information.
3759 @param gtid  global thread number.
3760 @param num_dims  number of associated doacross loops.
3761 @param dims  info on loops bounds.
3762 
3763 Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for (i = 2; i < 9; i += 2) this gives lo=2, up=8, st=2.
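
A hedged illustration of the source pattern behind the doacross entry points
(placeholder names; a compiler maps the ordered(1) loop and its depend
clauses onto __kmpc_doacross_init(), __kmpc_doacross_post(),
__kmpc_doacross_wait() and __kmpc_doacross_fini()):

@code
#pragma omp for ordered(1)
  for (i = 2; i < 9; i += 2) { // lo=2, up=8, st=2 as above
    a[i] = work(i);
#pragma omp ordered depend(source)       // post: iteration i produced a[i]
#pragma omp ordered depend(sink : i - 2) // wait: iteration i-2 finished
    b[i] = a[i] + a[i - 2];
  }
@endcode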
3766 */
3767 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3768                           struct kmp_dim *dims) {
3769   int j, idx;
3770   kmp_int64 last, trace_count;
3771   kmp_info_t *th = __kmp_threads[gtid];
3772   kmp_team_t *team = th->th.th_team;
3773   kmp_uint32 *flags;
3774   kmp_disp_t *pr_buf = th->th.th_dispatch;
3775   dispatch_shared_info_t *sh_buf;
3776 
3777   KA_TRACE(
3778       20,
3779       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3780        gtid, num_dims, !team->t.t_serialized));
3781   KMP_DEBUG_ASSERT(dims != NULL);
3782   KMP_DEBUG_ASSERT(num_dims > 0);
3783 
3784   if (team->t.t_serialized) {
3785     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3786     return; // no dependencies if team is serialized
3787   }
3788   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3789   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3790   // the next loop
3791   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3792 
3793   // Save bounds info into allocated private buffer
3794   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3795   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3796       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3797   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3798   pr_buf->th_doacross_info[0] =
3799       (kmp_int64)num_dims; // first element is number of dimensions
3800   // Save also address of num_done in order to access it later without knowing
3801   // the buffer index
3802   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3803   pr_buf->th_doacross_info[2] = dims[0].lo;
3804   pr_buf->th_doacross_info[3] = dims[0].up;
3805   pr_buf->th_doacross_info[4] = dims[0].st;
3806   last = 5;
3807   for (j = 1; j < num_dims; ++j) {
    kmp_int64 range_length; // keeps the range of this dimension; the range of
    // dims[0] is not stored in the buffer
3810     if (dims[j].st == 1) { // most common case
3811       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3812       range_length = dims[j].up - dims[j].lo + 1;
3813     } else {
3814       if (dims[j].st > 0) {
3815         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3816         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3817       } else { // negative increment
3818         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3819         range_length =
3820             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3821       }
3822     }
3823     pr_buf->th_doacross_info[last++] = range_length;
3824     pr_buf->th_doacross_info[last++] = dims[j].lo;
3825     pr_buf->th_doacross_info[last++] = dims[j].up;
3826     pr_buf->th_doacross_info[last++] = dims[j].st;
3827   }
3828 
3829   // Compute total trip count.
3830   // Start with range of dims[0] which we don't need to keep in the buffer.
3831   if (dims[0].st == 1) { // most common case
3832     trace_count = dims[0].up - dims[0].lo + 1;
3833   } else if (dims[0].st > 0) {
3834     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3835     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3836   } else { // negative increment
3837     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3838     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3839   }
3840   for (j = 1; j < num_dims; ++j) {
3841     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3842   }
3843   KMP_DEBUG_ASSERT(trace_count > 0);
3844 
3845   // Check if shared buffer is not occupied by other loop (idx -
3846   // __kmp_dispatch_num_buffers)
3847   if (idx != sh_buf->doacross_buf_idx) {
3848     // Shared buffer is occupied, wait for it to be free
3849     __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3850                        __kmp_eq_4, NULL);
3851   }
3852 #if KMP_32_BIT_ARCH
3853   // Check if we are the first thread. After the CAS the first thread gets 0,
3854   // others get 1 if initialization is in progress, allocated pointer otherwise.
3855   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3856   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3857       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3858 #else
3859   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3860       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3861 #endif
3862   if (flags == NULL) {
3863     // we are the first thread, allocate the array of flags
3864     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
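    // e.g. trace_count == 100 gives 100 / 8 + 8 == 20 bytes, i.e. 160 flag
    // bits, which covers every iteration with some padding to spare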
3865     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3866     KMP_MB();
3867     sh_buf->doacross_flags = flags;
3868   } else if (flags == (kmp_uint32 *)1) {
3869 #if KMP_32_BIT_ARCH
3870     // initialization is still in progress, need to wait
3871     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3872 #else
3873     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3874 #endif
3875       KMP_YIELD(TRUE);
3876     KMP_MB();
3877   } else {
3878     KMP_MB();
3879   }
3880   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3881   pr_buf->th_doacross_flags =
3882       sh_buf->doacross_flags; // save private copy in order to not
3883   // touch shared buffer on each iteration
3884   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3885 }
3886 
3887 void __kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec) {
3888   kmp_int32 shft, num_dims, i;
3889   kmp_uint32 flag;
3890   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3891   kmp_info_t *th = __kmp_threads[gtid];
3892   kmp_team_t *team = th->th.th_team;
3893   kmp_disp_t *pr_buf;
3894   kmp_int64 lo, up, st;
3895 
3896   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3897   if (team->t.t_serialized) {
3898     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3899     return; // no dependencies if team is serialized
3900   }
3901 
3902   // calculate sequential iteration number and check out-of-bounds condition
3903   pr_buf = th->th.th_dispatch;
3904   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3905   num_dims = pr_buf->th_doacross_info[0];
3906   lo = pr_buf->th_doacross_info[2];
3907   up = pr_buf->th_doacross_info[3];
3908   st = pr_buf->th_doacross_info[4];
3909   if (st == 1) { // most common case
3910     if (vec[0] < lo || vec[0] > up) {
3911       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3912                     "bounds [%lld,%lld]\n",
3913                     gtid, vec[0], lo, up));
3914       return;
3915     }
3916     iter_number = vec[0] - lo;
3917   } else if (st > 0) {
3918     if (vec[0] < lo || vec[0] > up) {
3919       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3920                     "bounds [%lld,%lld]\n",
3921                     gtid, vec[0], lo, up));
3922       return;
3923     }
3924     iter_number = (kmp_uint64)(vec[0] - lo) / st;
3925   } else { // negative increment
3926     if (vec[0] > lo || vec[0] < up) {
3927       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3928                     "bounds [%lld,%lld]\n",
3929                     gtid, vec[0], lo, up));
3930       return;
3931     }
3932     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3933   }
3934   for (i = 1; i < num_dims; ++i) {
3935     kmp_int64 iter, ln;
3936     kmp_int32 j = i * 4;
3937     ln = pr_buf->th_doacross_info[j + 1];
3938     lo = pr_buf->th_doacross_info[j + 2];
3939     up = pr_buf->th_doacross_info[j + 3];
3940     st = pr_buf->th_doacross_info[j + 4];
3941     if (st == 1) {
3942       if (vec[i] < lo || vec[i] > up) {
3943         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3944                       "bounds [%lld,%lld]\n",
3945                       gtid, vec[i], lo, up));
3946         return;
3947       }
3948       iter = vec[i] - lo;
3949     } else if (st > 0) {
3950       if (vec[i] < lo || vec[i] > up) {
3951         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3952                       "bounds [%lld,%lld]\n",
3953                       gtid, vec[i], lo, up));
3954         return;
3955       }
3956       iter = (kmp_uint64)(vec[i] - lo) / st;
3957     } else { // st < 0
3958       if (vec[i] > lo || vec[i] < up) {
3959         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3960                       "bounds [%lld,%lld]\n",
3961                       gtid, vec[i], lo, up));
3962         return;
3963       }
3964       iter = (kmp_uint64)(lo - vec[i]) / (-st);
3965     }
3966     iter_number = iter + ln * iter_number;
3967   }
3968   shft = iter_number % 32; // use 32-bit granularity
3969   iter_number >>= 5; // divided by 32
3970   flag = 1 << shft;
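  // Worked example of the flag addressing above: iter_number == 37 gives
  // shft == 37 % 32 == 5, word index 37 >> 5 == 1, and flag == 1 << 5 == 0x20.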
3971   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
3972     KMP_YIELD(TRUE);
3973   }
3974   KMP_MB();
3975   KA_TRACE(20,
3976            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
3977             gtid, (iter_number << 5) + shft));
3978 }
3979 
3980 void __kmpc_doacross_post(ident_t *loc, int gtid, long long *vec) {
3981   kmp_int32 shft, num_dims, i;
3982   kmp_uint32 flag;
3983   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3984   kmp_info_t *th = __kmp_threads[gtid];
3985   kmp_team_t *team = th->th.th_team;
3986   kmp_disp_t *pr_buf;
3987   kmp_int64 lo, st;
3988 
3989   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
3990   if (team->t.t_serialized) {
3991     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
3992     return; // no dependencies if team is serialized
3993   }
3994 
3995   // calculate sequential iteration number (same as in "wait" but no
3996   // out-of-bounds checks)
3997   pr_buf = th->th.th_dispatch;
3998   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3999   num_dims = pr_buf->th_doacross_info[0];
4000   lo = pr_buf->th_doacross_info[2];
4001   st = pr_buf->th_doacross_info[4];
4002   if (st == 1) { // most common case
4003     iter_number = vec[0] - lo;
4004   } else if (st > 0) {
4005     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4006   } else { // negative increment
4007     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4008   }
4009   for (i = 1; i < num_dims; ++i) {
4010     kmp_int64 iter, ln;
4011     kmp_int32 j = i * 4;
4012     ln = pr_buf->th_doacross_info[j + 1];
4013     lo = pr_buf->th_doacross_info[j + 2];
4014     st = pr_buf->th_doacross_info[j + 4];
4015     if (st == 1) {
4016       iter = vec[i] - lo;
4017     } else if (st > 0) {
4018       iter = (kmp_uint64)(vec[i] - lo) / st;
4019     } else { // st < 0
4020       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4021     }
4022     iter_number = iter + ln * iter_number;
4023   }
4024   shft = iter_number % 32; // use 32-bit granularity
4025   iter_number >>= 5; // divided by 32
4026   flag = 1 << shft;
4027   KMP_MB();
4028   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4029     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4030   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4031                 (iter_number << 5) + shft));
4032 }
4033 
4034 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4035   kmp_int32 num_done;
4036   kmp_info_t *th = __kmp_threads[gtid];
4037   kmp_team_t *team = th->th.th_team;
4038   kmp_disp_t *pr_buf = th->th.th_dispatch;
4039 
4040   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4041   if (team->t.t_serialized) {
4042     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4043     return; // nothing to do
4044   }
4045   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4046   if (num_done == th->th.th_team_nproc) {
4047     // we are the last thread, need to free shared resources
4048     int idx = pr_buf->th_doacross_buf_idx - 1;
4049     dispatch_shared_info_t *sh_buf =
4050         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4051     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4052                      (kmp_int64)&sh_buf->doacross_num_done);
4053     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4054     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4055     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4056     sh_buf->doacross_flags = NULL;
4057     sh_buf->doacross_num_done = 0;
4058     sh_buf->doacross_buf_idx +=
4059         __kmp_dispatch_num_buffers; // free buffer for future re-use
4060   }
4061   // free private resources (need to keep buffer index forever)
4062   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4063   pr_buf->th_doacross_info = NULL;
4064   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4065 }
4066 #endif
4067 
4068 // end of file //
4069