1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
28 // flags will be used in future, e.g. to implement openmp_strict library
29 // restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
57  * Shut down the runtime library. This is also optional, and even if called will
58  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
59  * zero.
60  */
61 void __kmpc_end(ident_t *loc) {
62   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
63   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
64   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
65   // returns FALSE and __kmpc_end() will unregister this root (it can cause
66   // library shut down).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 }
74 
75 /*!
76 @ingroup THREAD_STATES
77 @param loc Source location information.
78 @return The global thread index of the active thread.
79 
80 This function can be called in any context.
81 
82 If the runtime has only been entered at the outermost level from a
83 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
84 that which would be returned by omp_get_thread_num() in the outermost
85 active parallel construct. (Or zero if there is no active parallel
86 construct, since the master thread is necessarily thread zero).
87 
88 If multiple non-OpenMP threads all enter an OpenMP construct then this
89 will be a unique thread identifier among all the threads created by
90 the OpenMP runtime (but the value cannot be defined in terms of
91 OpenMP thread ids returned by omp_get_thread_num()).
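
A typical (illustrative) use in compiler-generated code is to fetch the global
thread id once and pass it to later runtime entry points. The `loc` and `gtid`
names below are assumptions for this sketch:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_barrier(&loc, gtid);
@endcode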
92 */
93 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
94   kmp_int32 gtid = __kmp_entry_gtid();
95 
96   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
97 
98   return gtid;
99 }
100 
101 /*!
102 @ingroup THREAD_STATES
103 @param loc Source location information.
104 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
105 
106 This function can be called in any context.
107 It returns the total number of threads under the control of the OpenMP runtime.
108 That is not a number that can be determined by any OpenMP standard calls, since
109 the library may be called from more than one non-OpenMP thread, and this
110 reflects the total over all such calls. Similarly, since the runtime keeps
111 underlying threads alive even when they are not active (the cost of creating
112 and destroying OS threads is high), this call counts all such threads even if
113 they are not waiting for work.
114 */
115 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
116   KC_TRACE(10,
117            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
118 
119   return TCR_4(__kmp_all_nth);
120 }
121 
122 /*!
123 @ingroup THREAD_STATES
124 @param loc Source location information.
125 @return The thread number of the calling thread in the innermost active parallel
126 construct.
127 */
128 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
129   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
130   return __kmp_tid_from_gtid(__kmp_entry_gtid());
131 }
132 
133 /*!
134 @ingroup THREAD_STATES
135 @param loc Source location information.
136 @return The number of threads in the innermost active parallel construct.
137 */
138 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
139   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
140 
141   return __kmp_entry_thread()->th.th_team->t.t_nproc;
142 }
143 
144 /*!
145  * @ingroup DEPRECATED
146  * @param loc location description
147  *
148  * This function need not be called. It always returns TRUE.
149  */
150 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
151 #ifndef KMP_DEBUG
152 
153   return TRUE;
154 
155 #else
156 
157   const char *semi2;
158   const char *semi3;
159   int line_no;
160 
161   if (__kmp_par_range == 0) {
162     return TRUE;
163   }
164   semi2 = loc->psource;
165   if (semi2 == NULL) {
166     return TRUE;
167   }
168   semi2 = strchr(semi2, ';');
169   if (semi2 == NULL) {
170     return TRUE;
171   }
172   semi2 = strchr(semi2 + 1, ';');
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   if (__kmp_par_range_filename[0]) {
177     const char *name = semi2 - 1;
178     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
179       name--;
180     }
181     if ((*name == '/') || (*name == ';')) {
182       name++;
183     }
184     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
185       return __kmp_par_range < 0;
186     }
187   }
188   semi3 = strchr(semi2 + 1, ';');
189   if (__kmp_par_range_routine[0]) {
190     if ((semi3 != NULL) && (semi3 > semi2) &&
191         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
192       return __kmp_par_range < 0;
193     }
194   }
195   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
196     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
197       return __kmp_par_range > 0;
198     }
199     return __kmp_par_range < 0;
200   }
201   return TRUE;
202 
203 #endif /* KMP_DEBUG */
204 }
205 
206 /*!
207 @ingroup THREAD_STATES
208 @param loc Source location information.
209 @return 1 if this thread is executing inside an active parallel region, zero if
210 not.
211 */
212 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
213   return __kmp_entry_thread()->th.th_root->r.r_active;
214 }
215 
216 /*!
217 @ingroup PARALLEL
218 @param loc source location information
219 @param global_tid global thread number
220 @param num_threads number of threads requested for this parallel construct
221 
222 Set the number of threads to be used by the next fork spawned by this thread.
223 This call is only required if the parallel construct has a `num_threads` clause.
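
As an illustrative sketch (the `loc`, `gtid`, and `outlined` names here are
assumptions), a compiler might lower `#pragma omp parallel num_threads(4)` to
something like:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined);
@endcode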
224 */
225 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
226                              kmp_int32 num_threads) {
227   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
228                 global_tid, num_threads));
229 
230   __kmp_push_num_threads(loc, global_tid, num_threads);
231 }
232 
233 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
234   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
235 
236   /* the num_threads are automatically popped */
237 }
238 
239 #if OMP_40_ENABLED
240 
241 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
242                            kmp_int32 proc_bind) {
243   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
244                 proc_bind));
245 
246   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
247 }
248 
249 #endif /* OMP_40_ENABLED */
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
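
As an illustration only (the `loc`, `outlined`, `a`, and `b` names are
assumptions), a parallel region sharing two variables might be lowered to a
call such as:
@code
// void outlined(kmp_int32 *global_tid, kmp_int32 *bound_tid, int *a, int *b);
__kmpc_fork_call(&loc, 2, (kmpc_micro)outlined, &a, &b);
@endcode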
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   int inParallel = __kmpc_in_parallel(loc);
266   if (inParallel) {
267     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
268   } else {
269     KMP_COUNT_BLOCK(OMP_PARALLEL);
270   }
271 #endif
272 
273   // maybe saving thr_state is enough here
274   {
275     va_list ap;
276     va_start(ap, microtask);
277 
278 #if OMPT_SUPPORT
279     ompt_frame_t *ompt_frame;
280     if (ompt_enabled.enabled) {
281       kmp_info_t *master_th = __kmp_threads[gtid];
282       kmp_team_t *parent_team = master_th->th.th_team;
283       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
284       if (lwt)
285         ompt_frame = &(lwt->ompt_task_info.frame);
286       else {
287         int tid = __kmp_tid_from_gtid(gtid);
288         ompt_frame = &(
289             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
290       }
291       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
292       OMPT_STORE_RETURN_ADDRESS(gtid);
293     }
294 #endif
295 
296 #if INCLUDE_SSC_MARKS
297     SSC_MARK_FORKING();
298 #endif
299     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
300                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
301                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
302 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
303 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
304                     &ap
305 #else
306                     ap
307 #endif
308                     );
309 #if INCLUDE_SSC_MARKS
310     SSC_MARK_JOINING();
311 #endif
312     __kmp_join_call(loc, gtid
313 #if OMPT_SUPPORT
314                     ,
315                     fork_context_intel
316 #endif
317                     );
318 
319     va_end(ap);
320   }
321 }
322 
323 #if OMP_40_ENABLED
324 /*!
325 @ingroup PARALLEL
326 @param loc source location information
327 @param global_tid global thread number
328 @param num_teams number of teams requested for the teams construct
329 @param num_threads number of threads per team requested for the teams construct
330 
331 Set the number of teams to be used by the teams construct.
332 This call is only required if the teams construct has a `num_teams` clause
333 or a `thread_limit` clause (or both).
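
As an illustrative sketch (the `loc`, `gtid`, and `outlined_teams` names are
assumptions), a compiler might lower
`#pragma omp teams num_teams(2) thread_limit(8)` to:
@code
__kmpc_push_num_teams(&loc, gtid, 2, 8);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams);
@endcode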
334 */
335 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
336                            kmp_int32 num_teams, kmp_int32 num_threads) {
337   KA_TRACE(20,
338            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
339             global_tid, num_teams, num_threads));
340 
341   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
342 }
343 
344 /*!
345 @ingroup PARALLEL
346 @param loc  source location information
347 @param argc  total number of arguments in the ellipsis
348 @param microtask  pointer to callback routine consisting of outlined teams
349 construct
350 @param ...  pointers to shared variables that aren't global
351 
352 Do the actual fork and call the microtask in the relevant number of threads.
353 */
354 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
355                        ...) {
356   int gtid = __kmp_entry_gtid();
357   kmp_info_t *this_thr = __kmp_threads[gtid];
358   va_list ap;
359   va_start(ap, microtask);
360 
361   KMP_COUNT_BLOCK(OMP_TEAMS);
362 
363   // remember teams entry point and nesting level
364   this_thr->th.th_teams_microtask = microtask;
365   this_thr->th.th_teams_level =
366       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
367 
368 #if OMPT_SUPPORT
369   kmp_team_t *parent_team = this_thr->th.th_team;
370   int tid = __kmp_tid_from_gtid(gtid);
371   if (ompt_enabled.enabled) {
372     parent_team->t.t_implicit_task_taskdata[tid]
373         .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
374   }
375   OMPT_STORE_RETURN_ADDRESS(gtid);
376 #endif
377 
378   // Check whether __kmpc_push_num_teams was called; set the default number
379   // of teams otherwise
380   if (this_thr->th.th_teams_size.nteams == 0) {
381     __kmp_push_num_teams(loc, gtid, 0, 0);
382   }
383   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
384   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
385   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
386 
387   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
388                   VOLATILE_CAST(microtask_t)
389                       __kmp_teams_master, // "wrapped" task
390                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
391 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
392                   &ap
393 #else
394                   ap
395 #endif
396                   );
397   __kmp_join_call(loc, gtid
398 #if OMPT_SUPPORT
399                   ,
400                   fork_context_intel
401 #endif
402                   );
403 
404   this_thr->th.th_teams_microtask = NULL;
405   this_thr->th.th_teams_level = 0;
406   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
407   va_end(ap);
408 }
409 #endif /* OMP_40_ENABLED */
410 
411 // I don't think this function should ever have been exported.
412 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
413 // openmp code ever called it, but it's been exported from the RTL for so
414 // long that I'm afraid to remove the definition.
415 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
416 
417 /*!
418 @ingroup PARALLEL
419 @param loc  source location information
420 @param global_tid  global thread number
421 
422 Enter a serialized parallel construct. This interface is used to handle a
423 conditional parallel region, like this,
424 @code
425 #pragma omp parallel if (condition)
426 @endcode
427 when the condition is false.
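
In that case the compiler emits a serialized region instead of a fork. A
hypothetical sketch (the `loc`, `gtid`, `bound_tid`, and `outlined` names are
assumptions) is:
@code
__kmpc_serialized_parallel(&loc, gtid);
outlined(&gtid, &bound_tid /* shared arguments would follow */);
__kmpc_end_serialized_parallel(&loc, gtid);
@endcode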
428 */
429 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
430 // The implementation is now in kmp_runtime.cpp so that it can share static
431 // functions with kmp_fork_call since the tasks to be done are similar in
432 // each case.
433 #if OMPT_SUPPORT
434   OMPT_STORE_RETURN_ADDRESS(global_tid);
435 #endif
436   __kmp_serialized_parallel(loc, global_tid);
437 }
438 
439 /*!
440 @ingroup PARALLEL
441 @param loc  source location information
442 @param global_tid  global thread number
443 
444 Leave a serialized parallel construct.
445 */
446 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
447   kmp_internal_control_t *top;
448   kmp_info_t *this_thr;
449   kmp_team_t *serial_team;
450 
451   KC_TRACE(10,
452            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
453 
454   /* skip all this code for autopar serialized loops since it results in
455      unacceptable overhead */
456   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
457     return;
458 
459   // Not autopar code
460   if (!TCR_4(__kmp_init_parallel))
461     __kmp_parallel_initialize();
462 
463   this_thr = __kmp_threads[global_tid];
464   serial_team = this_thr->th.th_serial_team;
465 
466 #if OMP_45_ENABLED
467   kmp_task_team_t *task_team = this_thr->th.th_task_team;
468 
469   // we need to wait for the proxy tasks before finishing the thread
470   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
471     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
472 #endif
473 
474   KMP_MB();
475   KMP_DEBUG_ASSERT(serial_team);
476   KMP_ASSERT(serial_team->t.t_serialized);
477   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
478   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
479   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
480   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
481 
482 #if OMPT_SUPPORT
483   if (ompt_enabled.enabled &&
484       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
485     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL;
486     if (ompt_enabled.ompt_callback_implicit_task) {
487       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
488           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
489           __kmp_tid_from_gtid(global_tid));
490     }
491 
492     // clear the task id only after unlinking the task
493     ompt_data_t *parent_task_data;
494     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
495 
496     if (ompt_enabled.ompt_callback_parallel_end) {
497       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
498           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
499           ompt_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
500     }
501     __ompt_lw_taskteam_unlink(this_thr);
502     this_thr->th.ompt_thread_info.state = omp_state_overhead;
503   }
504 #endif
505 
506   /* If necessary, pop the internal control stack values and replace the team
507    * values */
508   top = serial_team->t.t_control_stack_top;
509   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
510     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
511     serial_team->t.t_control_stack_top = top->next;
512     __kmp_free(top);
513   }
514 
515   // if( serial_team -> t.t_serialized > 1 )
516   serial_team->t.t_level--;
517 
518   /* pop dispatch buffers stack */
519   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
520   {
521     dispatch_private_info_t *disp_buffer =
522         serial_team->t.t_dispatch->th_disp_buffer;
523     serial_team->t.t_dispatch->th_disp_buffer =
524         serial_team->t.t_dispatch->th_disp_buffer->next;
525     __kmp_free(disp_buffer);
526   }
527 
528   --serial_team->t.t_serialized;
529   if (serial_team->t.t_serialized == 0) {
530 
531 /* return to the parallel section */
532 
533 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
534     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
535       __kmp_clear_x87_fpu_status_word();
536       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
537       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
538     }
539 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
540 
541     this_thr->th.th_team = serial_team->t.t_parent;
542     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
543 
544     /* restore values cached in the thread */
545     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
546     this_thr->th.th_team_master =
547         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
548     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
549 
550     /* TODO the below shouldn't need to be adjusted for serialized teams */
551     this_thr->th.th_dispatch =
552         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
553 
554     __kmp_pop_current_task_from_thread(this_thr);
555 
556     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
557     this_thr->th.th_current_task->td_flags.executing = 1;
558 
559     if (__kmp_tasking_mode != tskm_immediate_exec) {
560       // Copy the task team from the new child / old parent team to the thread.
561       this_thr->th.th_task_team =
562           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
563       KA_TRACE(20,
564                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
565                 "team %p\n",
566                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
567     }
568   } else {
569     if (__kmp_tasking_mode != tskm_immediate_exec) {
570       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
571                     "depth of serial team %p to %d\n",
572                     global_tid, serial_team, serial_team->t.t_serialized));
573     }
574   }
575 
576   if (__kmp_env_consistency_check)
577     __kmp_pop_parallel(global_tid, NULL);
578 #if OMPT_SUPPORT
579   if (ompt_enabled.enabled)
580     this_thr->th.ompt_thread_info.state =
581         ((this_thr->th.th_team_serialized) ? omp_state_work_serial
582                                            : omp_state_work_parallel);
583 #endif
584 }
585 
586 /*!
587 @ingroup SYNCHRONIZATION
588 @param loc  source location information.
589 
590 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
591 depending on the memory ordering convention obeyed by the compiler
592 even that may not be necessary).
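
A `#pragma omp flush` in user code is lowered to a single call; the `loc`
argument shown here is just an illustrative compiler-generated source location:
@code
__kmpc_flush(&loc);
@endcode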
593 */
594 void __kmpc_flush(ident_t *loc) {
595   KC_TRACE(10, ("__kmpc_flush: called\n"));
596 
597   /* need explicit __mf() here since the library uses volatile instead */
598   KMP_MB(); /* Flush all pending memory write invalidates.  */
599 
600 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
601 #if KMP_MIC
602 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
603 // We shouldn't need it, though, since the ABI rules require that
604 // * If the compiler generates NGO stores it also generates the fence
605 // * If users hand-code NGO stores they should insert the fence
606 // therefore no incomplete unordered stores should be visible.
607 #else
608   // C74404
609   // This is to address non-temporal store instructions (sfence needed).
610   // The clflush instruction is addressed as well (mfence needed).
611   // Probably the non-temporal load movntdqa instruction should also be
612   // addressed.
613   // mfence is an SSE2 instruction. Do not execute it if the CPU lacks SSE2.
614   if (!__kmp_cpuinfo.initialized) {
615     __kmp_query_cpuid(&__kmp_cpuinfo);
616   }
617   if (!__kmp_cpuinfo.sse2) {
618     // CPU cannot execute SSE2 instructions.
619   } else {
620 #if KMP_COMPILER_ICC
621     _mm_mfence();
622 #elif KMP_COMPILER_MSVC
623     MemoryBarrier();
624 #else
625     __sync_synchronize();
626 #endif // KMP_COMPILER_ICC
627   }
628 #endif // KMP_MIC
629 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
630 // Nothing to see here; move along.
631 #elif KMP_ARCH_PPC64
632 // Nothing needed here (we have a real MB above).
633 #if KMP_OS_CNK
634   // The flushing thread needs to yield here; this prevents a
635   // busy-waiting thread from saturating the pipeline. flush is
636   // often used in loops like this:
637   // while (!flag) {
638   //   #pragma omp flush(flag)
639   // }
640   // and adding the yield here is good for at least a 10x speedup
641   // when running >2 threads per core (on the NAS LU benchmark).
642   __kmp_yield(TRUE);
643 #endif
644 #else
645 #error Unknown or unsupported architecture
646 #endif
647 
648 #if OMPT_SUPPORT && OMPT_OPTIONAL
649   if (ompt_enabled.ompt_callback_flush) {
650     ompt_callbacks.ompt_callback(ompt_callback_flush)(
651         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
652   }
653 #endif
654 }
655 
656 /* -------------------------------------------------------------------------- */
657 /*!
658 @ingroup SYNCHRONIZATION
659 @param loc source location information
660 @param global_tid thread id.
661 
662 Execute a barrier.
663 */
664 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
665   KMP_COUNT_BLOCK(OMP_BARRIER);
666   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
667 
668   if (!TCR_4(__kmp_init_parallel))
669     __kmp_parallel_initialize();
670 
671   if (__kmp_env_consistency_check) {
672     if (loc == 0) {
673       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
674     }
675 
676     __kmp_check_barrier(global_tid, ct_barrier, loc);
677   }
678 
679 #if OMPT_SUPPORT
680   ompt_frame_t *ompt_frame;
681   if (ompt_enabled.enabled) {
682     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
683     if (ompt_frame->enter_frame == NULL)
684       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
685     OMPT_STORE_RETURN_ADDRESS(global_tid);
686   }
687 #endif
688   __kmp_threads[global_tid]->th.th_ident = loc;
689   // TODO: explicit barrier_wait_id:
690   //   this function is called when 'barrier' directive is present or
691   //   implicit barrier at the end of a worksharing construct.
692   // 1) better to add a per-thread barrier counter to a thread data structure
693   // 2) set to 0 when a new team is created
694   // 3) no sync is required
695 
696   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
697 #if OMPT_SUPPORT && OMPT_OPTIONAL
698   if (ompt_enabled.enabled) {
699     ompt_frame->enter_frame = NULL;
700   }
701 #endif
702 }
703 
704 /* The BARRIER for a MASTER section is always explicit   */
705 /*!
706 @ingroup WORK_SHARING
707 @param loc  source location information.
708 @param global_tid  global thread number.
709 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
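
An illustrative lowering of a <tt>master</tt> construct (the `loc` and `gtid`
names are assumptions):
@code
if (__kmpc_master(&loc, gtid)) {
  // ... code executed only by the master thread ...
  __kmpc_end_master(&loc, gtid);
}
@endcode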
710 */
711 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
712   int status = 0;
713 
714   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
715 
716   if (!TCR_4(__kmp_init_parallel))
717     __kmp_parallel_initialize();
718 
719   if (KMP_MASTER_GTID(global_tid)) {
720     KMP_COUNT_BLOCK(OMP_MASTER);
721     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
722     status = 1;
723   }
724 
725 #if OMPT_SUPPORT && OMPT_OPTIONAL
726   if (status) {
727     if (ompt_enabled.ompt_callback_master) {
728       kmp_info_t *this_thr = __kmp_threads[global_tid];
729       kmp_team_t *team = this_thr->th.th_team;
730 
731       int tid = __kmp_tid_from_gtid(global_tid);
732       ompt_callbacks.ompt_callback(ompt_callback_master)(
733           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
734           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
735           OMPT_GET_RETURN_ADDRESS(0));
736     }
737   }
738 #endif
739 
740   if (__kmp_env_consistency_check) {
741 #if KMP_USE_DYNAMIC_LOCK
742     if (status)
743       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
744     else
745       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
746 #else
747     if (status)
748       __kmp_push_sync(global_tid, ct_master, loc, NULL);
749     else
750       __kmp_check_sync(global_tid, ct_master, loc, NULL);
751 #endif
752   }
753 
754   return status;
755 }
756 
757 /*!
758 @ingroup WORK_SHARING
759 @param loc  source location information.
760 @param global_tid  global thread number.
761 
762 Mark the end of a <tt>master</tt> region. This should only be called by the
763 thread that executes the <tt>master</tt> region.
764 */
765 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
766   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
767 
768   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
769   KMP_POP_PARTITIONED_TIMER();
770 
771 #if OMPT_SUPPORT && OMPT_OPTIONAL
772   kmp_info_t *this_thr = __kmp_threads[global_tid];
773   kmp_team_t *team = this_thr->th.th_team;
774   if (ompt_enabled.ompt_callback_master) {
775     int tid = __kmp_tid_from_gtid(global_tid);
776     ompt_callbacks.ompt_callback(ompt_callback_master)(
777         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
778         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
779         OMPT_GET_RETURN_ADDRESS(0));
780   }
781 #endif
782 
783   if (__kmp_env_consistency_check) {
784     if (global_tid < 0)
785       KMP_WARNING(ThreadIdentInvalid);
786 
787     if (KMP_MASTER_GTID(global_tid))
788       __kmp_pop_sync(global_tid, ct_master, loc);
789   }
790 }
791 
792 /*!
793 @ingroup WORK_SHARING
794 @param loc  source location information.
795 @param gtid  global thread number.
796 
797 Start execution of an <tt>ordered</tt> construct.
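
An illustrative pairing inside a loop chunk (the `loc` and `gtid` names are
assumptions):
@code
__kmpc_ordered(&loc, gtid);
// ... body of the ordered region ...
__kmpc_end_ordered(&loc, gtid);
@endcode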
798 */
799 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
800   int cid = 0;
801   kmp_info_t *th;
802   KMP_DEBUG_ASSERT(__kmp_init_serial);
803 
804   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
805 
806   if (!TCR_4(__kmp_init_parallel))
807     __kmp_parallel_initialize();
808 
809 #if USE_ITT_BUILD
810   __kmp_itt_ordered_prep(gtid);
811 // TODO: ordered_wait_id
812 #endif /* USE_ITT_BUILD */
813 
814   th = __kmp_threads[gtid];
815 
816 #if OMPT_SUPPORT && OMPT_OPTIONAL
817   kmp_team_t *team;
818   ompt_wait_id_t lck;
819   void *codeptr_ra;
820   if (ompt_enabled.enabled) {
821     OMPT_STORE_RETURN_ADDRESS(gtid);
822     team = __kmp_team_from_gtid(gtid);
823     lck = (ompt_wait_id_t)&team->t.t_ordered.dt.t_value;
824     /* OMPT state update */
825     th->th.ompt_thread_info.wait_id = lck;
826     th->th.ompt_thread_info.state = omp_state_wait_ordered;
827 
828     /* OMPT event callback */
829     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
830     if (ompt_enabled.ompt_callback_mutex_acquire) {
831       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
832           ompt_mutex_ordered, omp_lock_hint_none, ompt_mutex_impl_spin,
833           (ompt_wait_id_t)lck, codeptr_ra);
834     }
835   }
836 #endif
837 
838   if (th->th.th_dispatch->th_deo_fcn != 0)
839     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
840   else
841     __kmp_parallel_deo(&gtid, &cid, loc);
842 
843 #if OMPT_SUPPORT && OMPT_OPTIONAL
844   if (ompt_enabled.enabled) {
845     /* OMPT state update */
846     th->th.ompt_thread_info.state = omp_state_work_parallel;
847     th->th.ompt_thread_info.wait_id = 0;
848 
849     /* OMPT event callback */
850     if (ompt_enabled.ompt_callback_mutex_acquired) {
851       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
852           ompt_mutex_ordered, (ompt_wait_id_t)lck, codeptr_ra);
853     }
854   }
855 #endif
856 
857 #if USE_ITT_BUILD
858   __kmp_itt_ordered_start(gtid);
859 #endif /* USE_ITT_BUILD */
860 }
861 
862 /*!
863 @ingroup WORK_SHARING
864 @param loc  source location information.
865 @param gtid  global thread number.
866 
867 End execution of an <tt>ordered</tt> construct.
868 */
869 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
870   int cid = 0;
871   kmp_info_t *th;
872 
873   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
874 
875 #if USE_ITT_BUILD
876   __kmp_itt_ordered_end(gtid);
877 // TODO: ordered_wait_id
878 #endif /* USE_ITT_BUILD */
879 
880   th = __kmp_threads[gtid];
881 
882   if (th->th.th_dispatch->th_dxo_fcn != 0)
883     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
884   else
885     __kmp_parallel_dxo(&gtid, &cid, loc);
886 
887 #if OMPT_SUPPORT && OMPT_OPTIONAL
888   OMPT_STORE_RETURN_ADDRESS(gtid);
889   if (ompt_enabled.ompt_callback_mutex_released) {
890     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
891         ompt_mutex_ordered,
892         (ompt_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
893         OMPT_LOAD_RETURN_ADDRESS(gtid));
894   }
895 #endif
896 }
897 
898 #if KMP_USE_DYNAMIC_LOCK
899 
900 static __forceinline void
901 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
902                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
903   // Pointer to the allocated indirect lock is written to crit, while indexing
904   // is ignored.
905   void *idx;
906   kmp_indirect_lock_t **lck;
907   lck = (kmp_indirect_lock_t **)crit;
908   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
909   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
910   KMP_SET_I_LOCK_LOCATION(ilk, loc);
911   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
912   KA_TRACE(20,
913            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
914 #if USE_ITT_BUILD
915   __kmp_itt_critical_creating(ilk->lock, loc);
916 #endif
917   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
918   if (status == 0) {
919 #if USE_ITT_BUILD
920     __kmp_itt_critical_destroyed(ilk->lock);
921 #endif
922     // We don't really need to destroy the unclaimed lock here since it will be
923     // cleaned up at program exit.
924     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
925   }
926   KMP_DEBUG_ASSERT(*lck != NULL);
927 }
928 
929 // Fast-path acquire tas lock
930 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
931   {                                                                            \
932     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
933     if (l->lk.poll != KMP_LOCK_FREE(tas) ||                                    \
934         !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),        \
935                                      KMP_LOCK_BUSY(gtid + 1, tas))) {          \
936       kmp_uint32 spins;                                                        \
937       KMP_FSYNC_PREPARE(l);                                                    \
938       KMP_INIT_YIELD(spins);                                                   \
939       if (TCR_4(__kmp_nth) >                                                   \
940           (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
941         KMP_YIELD(TRUE);                                                       \
942       } else {                                                                 \
943         KMP_YIELD_SPIN(spins);                                                 \
944       }                                                                        \
945       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
946       while (l->lk.poll != KMP_LOCK_FREE(tas) ||                               \
947              !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),   \
948                                           KMP_LOCK_BUSY(gtid + 1, tas))) {     \
949         __kmp_spin_backoff(&backoff);                                          \
950         if (TCR_4(__kmp_nth) >                                                 \
951             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
952           KMP_YIELD(TRUE);                                                     \
953         } else {                                                               \
954           KMP_YIELD_SPIN(spins);                                               \
955         }                                                                      \
956       }                                                                        \
957     }                                                                          \
958     KMP_FSYNC_ACQUIRED(l);                                                     \
959   }
960 
961 // Fast-path test tas lock
962 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
963   {                                                                            \
964     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
965     rc = l->lk.poll == KMP_LOCK_FREE(tas) &&                                   \
966          KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),        \
967                                      KMP_LOCK_BUSY(gtid + 1, tas));            \
968   }
969 
970 // Fast-path release tas lock
971 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
972   {                                                                            \
973     TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas));              \
974     KMP_MB();                                                                  \
975   }
976 
977 #if KMP_USE_FUTEX
978 
979 #include <sys/syscall.h>
980 #include <unistd.h>
981 #ifndef FUTEX_WAIT
982 #define FUTEX_WAIT 0
983 #endif
984 #ifndef FUTEX_WAKE
985 #define FUTEX_WAKE 1
986 #endif
987 
988 // Fast-path acquire futex lock
989 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
990   {                                                                            \
991     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
992     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
993     KMP_MB();                                                                  \
994     KMP_FSYNC_PREPARE(ftx);                                                    \
995     kmp_int32 poll_val;                                                        \
996     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
997                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
998                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
999       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1000       if (!cond) {                                                             \
1001         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1002                                          poll_val |                            \
1003                                              KMP_LOCK_BUSY(1, futex))) {       \
1004           continue;                                                            \
1005         }                                                                      \
1006         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1007       }                                                                        \
1008       kmp_int32 rc;                                                            \
1009       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1010                         NULL, NULL, 0)) != 0) {                                \
1011         continue;                                                              \
1012       }                                                                        \
1013       gtid_code |= 1;                                                          \
1014     }                                                                          \
1015     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1016   }
1017 
1018 // Fast-path test futex lock
1019 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1020   {                                                                            \
1021     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1022     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1023                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1024       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1025       rc = TRUE;                                                               \
1026     } else {                                                                   \
1027       rc = FALSE;                                                              \
1028     }                                                                          \
1029   }
1030 
1031 // Fast-path release futex lock
1032 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1033   {                                                                            \
1034     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1035     KMP_MB();                                                                  \
1036     KMP_FSYNC_RELEASING(ftx);                                                  \
1037     kmp_int32 poll_val =                                                       \
1038         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1039     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1040       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1041               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1042     }                                                                          \
1043     KMP_MB();                                                                  \
1044     KMP_YIELD(TCR_4(__kmp_nth) >                                               \
1045               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
1046   }
1047 
1048 #endif // KMP_USE_FUTEX
1049 
1050 #else // KMP_USE_DYNAMIC_LOCK
1051 
1052 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1053                                                       ident_t const *loc,
1054                                                       kmp_int32 gtid) {
1055   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1056 
1057   // Because of the double-check, the following load doesn't need to be volatile
1058   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1059 
1060   if (lck == NULL) {
1061     void *idx;
1062 
1063     // Allocate & initialize the lock.
1064     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1065     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1066     __kmp_init_user_lock_with_checks(lck);
1067     __kmp_set_user_lock_location(lck, loc);
1068 #if USE_ITT_BUILD
1069     __kmp_itt_critical_creating(lck);
1070 // __kmp_itt_critical_creating() should be called *before* the first usage
1071 // of the underlying lock. It is the only place where we can guarantee it.
1072 // There is a chance the lock will be destroyed without ever being used, but
1073 // that is not a problem, because this is not a real event seen by the user
1074 // but rather setting a name for the object (lock). See kmp_itt.h for details.
1075 #endif /* USE_ITT_BUILD */
1076 
1077     // Use a cmpxchg instruction to slam the start of the critical section with
1078     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1079     // and use the lock that the other thread allocated.
1080     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1081 
1082     if (status == 0) {
1083 // Deallocate the lock and reload the value.
1084 #if USE_ITT_BUILD
1085       __kmp_itt_critical_destroyed(lck);
1086 // Let ITT know the lock is destroyed and the same memory location may be reused
1087 // for another purpose.
1088 #endif /* USE_ITT_BUILD */
1089       __kmp_destroy_user_lock_with_checks(lck);
1090       __kmp_user_lock_free(&idx, gtid, lck);
1091       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1092       KMP_DEBUG_ASSERT(lck != NULL);
1093     }
1094   }
1095   return lck;
1096 }
1097 
1098 #endif // KMP_USE_DYNAMIC_LOCK
1099 
1100 /*!
1101 @ingroup WORK_SHARING
1102 @param loc  source location information.
1103 @param global_tid  global thread number.
1104 @param crit identity of the critical section. This could be a pointer to a lock
1105 associated with the critical section, or some other suitably unique value.
1106 
1107 Enter code protected by a `critical` construct.
1108 This function blocks until the executing thread can enter the critical section.
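
An illustrative lowering of a named <tt>critical</tt> construct (the `loc` and
`gtid` names are assumptions; the lock word must be a shared, zero-initialized
kmp_critical_name object):
@code
static kmp_critical_name crit_my_name = {0};
__kmpc_critical(&loc, gtid, &crit_my_name);
// ... code protected by the critical section ...
__kmpc_end_critical(&loc, gtid, &crit_my_name);
@endcode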
1109 */
1110 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1111                      kmp_critical_name *crit) {
1112 #if KMP_USE_DYNAMIC_LOCK
1113 #if OMPT_SUPPORT && OMPT_OPTIONAL
1114   OMPT_STORE_RETURN_ADDRESS(global_tid);
1115 #endif // OMPT_SUPPORT
1116   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1117 #else
1118   KMP_COUNT_BLOCK(OMP_CRITICAL);
1119   KMP_TIME_PARTITIONED_BLOCK(
1120       OMP_critical_wait); /* Time spent waiting to enter the critical section */
1121 #if OMPT_SUPPORT && OMPT_OPTIONAL
1122   omp_state_t prev_state = omp_state_undefined;
1123   ompt_thread_info_t ti;
1124 #endif
1125   kmp_user_lock_p lck;
1126 
1127   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1128 
1129   // TODO: add THR_OVHD_STATE
1130 
1131   KMP_CHECK_USER_LOCK_INIT();
1132 
1133   if ((__kmp_user_lock_kind == lk_tas) &&
1134       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1135     lck = (kmp_user_lock_p)crit;
1136   }
1137 #if KMP_USE_FUTEX
1138   else if ((__kmp_user_lock_kind == lk_futex) &&
1139            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1140     lck = (kmp_user_lock_p)crit;
1141   }
1142 #endif
1143   else { // ticket, queuing or drdpa
1144     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1145   }
1146 
1147   if (__kmp_env_consistency_check)
1148     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1149 
1150 // Since the critical directive binds to all threads, not just the current
1151 // team, we have to check this even if we are in a serialized team.
1152 // Also, even if we are the uber thread, we still have to acquire the lock,
1153 // as we have to contend with sibling threads.
1154 
1155 #if USE_ITT_BUILD
1156   __kmp_itt_critical_acquiring(lck);
1157 #endif /* USE_ITT_BUILD */
1158 #if OMPT_SUPPORT && OMPT_OPTIONAL
1159   OMPT_STORE_RETURN_ADDRESS(global_tid);
1160   void *codeptr_ra = NULL;
1161   if (ompt_enabled.enabled) {
1162     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1163     /* OMPT state update */
1164     prev_state = ti.state;
1165     ti.wait_id = (ompt_wait_id_t)lck;
1166     ti.state = omp_state_wait_critical;
1167 
1168     /* OMPT event callback */
1169     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1170     if (ompt_enabled.ompt_callback_mutex_acquire) {
1171       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1172           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1173           (ompt_wait_id_t)crit, codeptr_ra);
1174     }
1175   }
1176 #endif
1177   // The value of 'crit' should be usable as a critical_id of the critical
1178   // section directive.
1179   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1180 
1181 #if USE_ITT_BUILD
1182   __kmp_itt_critical_acquired(lck);
1183 #endif /* USE_ITT_BUILD */
1184 #if OMPT_SUPPORT && OMPT_OPTIONAL
1185   if (ompt_enabled.enabled) {
1186     /* OMPT state update */
1187     ti.state = prev_state;
1188     ti.wait_id = 0;
1189 
1190     /* OMPT event callback */
1191     if (ompt_enabled.ompt_callback_mutex_acquired) {
1192       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1193           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr_ra);
1194     }
1195   }
1196 #endif
1197 
1198   KMP_START_EXPLICIT_TIMER(OMP_critical);
1199   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1200 #endif // KMP_USE_DYNAMIC_LOCK
1201 }
1202 
1203 #if KMP_USE_DYNAMIC_LOCK
1204 
1205 // Converts the given hint to an internal lock implementation
1206 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1207 #if KMP_USE_TSX
1208 #define KMP_TSX_LOCK(seq) lockseq_##seq
1209 #else
1210 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1211 #endif
1212 
1213 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1214 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1215 #else
1216 #define KMP_CPUINFO_RTM 0
1217 #endif
1218 
1219   // Hints that do not require further logic
1220   if (hint & kmp_lock_hint_hle)
1221     return KMP_TSX_LOCK(hle);
1222   if (hint & kmp_lock_hint_rtm)
1223     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1224   if (hint & kmp_lock_hint_adaptive)
1225     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1226 
1227   // Rule out conflicting hints first by returning the default lock
1228   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1229     return __kmp_user_lock_seq;
1230   if ((hint & omp_lock_hint_speculative) &&
1231       (hint & omp_lock_hint_nonspeculative))
1232     return __kmp_user_lock_seq;
1233 
1234   // Do not even consider speculation when it appears to be contended
1235   if (hint & omp_lock_hint_contended)
1236     return lockseq_queuing;
1237 
1238   // Uncontended lock without speculation
1239   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1240     return lockseq_tas;
1241 
1242   // HLE lock for speculation
1243   if (hint & omp_lock_hint_speculative)
1244     return KMP_TSX_LOCK(hle);
1245 
1246   return __kmp_user_lock_seq;
1247 }
1248 
1249 #if OMPT_SUPPORT && OMPT_OPTIONAL
1250 static ompt_mutex_impl_t
1251 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1252   if (user_lock) {
1253     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1254     case 0:
1255       break;
1256 #if KMP_USE_FUTEX
1257     case locktag_futex:
1258       return ompt_mutex_impl_queuing;
1259 #endif
1260     case locktag_tas:
1261       return ompt_mutex_impl_spin;
1262 #if KMP_USE_TSX
1263     case locktag_hle:
1264       return ompt_mutex_impl_speculative;
1265 #endif
1266     default:
1267       return ompt_mutex_impl_unknown;
1268     }
1269     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1270   }
1271   KMP_ASSERT(ilock);
1272   switch (ilock->type) {
1273 #if KMP_USE_TSX
1274   case locktag_adaptive:
1275   case locktag_rtm:
1276     return ompt_mutex_impl_speculative;
1277 #endif
1278   case locktag_nested_tas:
1279     return ompt_mutex_impl_spin;
1280 #if KMP_USE_FUTEX
1281   case locktag_nested_futex:
1282 #endif
1283   case locktag_ticket:
1284   case locktag_queuing:
1285   case locktag_drdpa:
1286   case locktag_nested_ticket:
1287   case locktag_nested_queuing:
1288   case locktag_nested_drdpa:
1289     return ompt_mutex_impl_queuing;
1290   default:
1291     return ompt_mutex_impl_unknown;
1292   }
1293 }
1294 
1295 // For locks without dynamic binding
1296 static ompt_mutex_impl_t __ompt_get_mutex_impl_type() {
1297   switch (__kmp_user_lock_kind) {
1298   case lk_tas:
1299     return ompt_mutex_impl_spin;
1300 #if KMP_USE_FUTEX
1301   case lk_futex:
1302 #endif
1303   case lk_ticket:
1304   case lk_queuing:
1305   case lk_drdpa:
1306     return ompt_mutex_impl_queuing;
1307 #if KMP_USE_TSX
1308   case lk_hle:
1309   case lk_rtm:
1310   case lk_adaptive:
1311     return ompt_mutex_impl_speculative;
1312 #endif
1313   default:
1314     return ompt_mutex_impl_unknown;
1315   }
1316 }
1317 #endif
1318 
1319 /*!
1320 @ingroup WORK_SHARING
1321 @param loc  source location information.
1322 @param global_tid  global thread number.
1323 @param crit identity of the critical section. This could be a pointer to a lock
1324 associated with the critical section, or some other suitably unique value.
1325 @param hint the lock hint.
1326 
1327 Enter code protected by a `critical` construct with a hint. The hint value is
1328 used to suggest a lock implementation. This function blocks until the executing
1329 thread can enter the critical section unless the hint suggests use of
1330 speculative execution and the hardware supports it.
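
An illustrative lowering of a <tt>critical</tt> construct carrying a
speculative hint (the `loc` and `gtid` names are assumptions):
@code
static kmp_critical_name crit_name = {0};
__kmpc_critical_with_hint(&loc, gtid, &crit_name, omp_lock_hint_speculative);
// ... code protected by the critical section ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode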
1331 */
1332 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1333                                kmp_critical_name *crit, uintptr_t hint) {
1334   KMP_COUNT_BLOCK(OMP_CRITICAL);
1335   kmp_user_lock_p lck;
1336 #if OMPT_SUPPORT && OMPT_OPTIONAL
1337   omp_state_t prev_state = omp_state_undefined;
1338   ompt_thread_info_t ti;
1339   // This is the case if called from __kmpc_critical:
1340   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1341   if (!codeptr)
1342     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1343 #endif
1344 
1345   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1346 
1347   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1348   // Check if it is initialized.
1349   if (*lk == 0) {
1350     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1351     if (KMP_IS_D_LOCK(lckseq)) {
1352       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1353                                   KMP_GET_D_TAG(lckseq));
1354     } else {
1355       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1356     }
1357   }
1358   // Branch for accessing the actual lock object and set operation. This
1359   // branching is inevitable since this lock initialization does not follow the
1360   // normal dispatch path (lock table is not used).
1361   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1362     lck = (kmp_user_lock_p)lk;
1363     if (__kmp_env_consistency_check) {
1364       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1365                       __kmp_map_hint_to_lock(hint));
1366     }
1367 #if USE_ITT_BUILD
1368     __kmp_itt_critical_acquiring(lck);
1369 #endif
1370 #if OMPT_SUPPORT && OMPT_OPTIONAL
1371     if (ompt_enabled.enabled) {
1372       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1373       /* OMPT state update */
1374       prev_state = ti.state;
1375       ti.wait_id = (ompt_wait_id_t)lck;
1376       ti.state = omp_state_wait_critical;
1377 
1378       /* OMPT event callback */
1379       if (ompt_enabled.ompt_callback_mutex_acquire) {
1380         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1381             ompt_mutex_critical, (unsigned int)hint,
1382             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)crit, codeptr);
1383       }
1384     }
1385 #endif
1386 #if KMP_USE_INLINED_TAS
1387     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1388       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1389     } else
1390 #elif KMP_USE_INLINED_FUTEX
1391     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1392       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1393     } else
1394 #endif
1395     {
1396       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1397     }
1398   } else {
1399     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1400     lck = ilk->lock;
1401     if (__kmp_env_consistency_check) {
1402       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1403                       __kmp_map_hint_to_lock(hint));
1404     }
1405 #if USE_ITT_BUILD
1406     __kmp_itt_critical_acquiring(lck);
1407 #endif
1408 #if OMPT_SUPPORT && OMPT_OPTIONAL
1409     if (ompt_enabled.enabled) {
1410       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1411       /* OMPT state update */
1412       prev_state = ti.state;
1413       ti.wait_id = (ompt_wait_id_t)lck;
1414       ti.state = omp_state_wait_critical;
1415 
1416       /* OMPT event callback */
1417       if (ompt_enabled.ompt_callback_mutex_acquire) {
1418         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1419             ompt_mutex_critical, (unsigned int)hint,
1420             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)crit, codeptr);
1421       }
1422     }
1423 #endif
1424     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1425   }
1426 
1427 #if USE_ITT_BUILD
1428   __kmp_itt_critical_acquired(lck);
1429 #endif /* USE_ITT_BUILD */
1430 #if OMPT_SUPPORT && OMPT_OPTIONAL
1431   if (ompt_enabled.enabled) {
1432     /* OMPT state update */
1433     ti.state = prev_state;
1434     ti.wait_id = 0;
1435 
1436     /* OMPT event callback */
1437     if (ompt_enabled.ompt_callback_mutex_acquired) {
1438       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1439           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr);
1440     }
1441   }
1442 #endif
1443 
1444   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1445   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1446 } // __kmpc_critical_with_hint
1447 
1448 #endif // KMP_USE_DYNAMIC_LOCK
1449 
1450 /*!
1451 @ingroup WORK_SHARING
1452 @param loc  source location information.
1453 @param global_tid  global thread number.
1454 @param crit identity of the critical section. This could be a pointer to a lock
1455 associated with the critical section, or some other suitably unique value.
1456 
1457 Leave a critical section, releasing any lock that was held during its execution.
1458 */
1459 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1460                          kmp_critical_name *crit) {
1461   kmp_user_lock_p lck;
1462 
1463   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1464 
1465 #if KMP_USE_DYNAMIC_LOCK
1466   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1467     lck = (kmp_user_lock_p)crit;
1468     KMP_ASSERT(lck != NULL);
1469     if (__kmp_env_consistency_check) {
1470       __kmp_pop_sync(global_tid, ct_critical, loc);
1471     }
1472 #if USE_ITT_BUILD
1473     __kmp_itt_critical_releasing(lck);
1474 #endif
1475 #if KMP_USE_INLINED_TAS
1476     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1477       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1478     } else
1479 #elif KMP_USE_INLINED_FUTEX
1480     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1481       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1482     } else
1483 #endif
1484     {
1485       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1486     }
1487   } else {
1488     kmp_indirect_lock_t *ilk =
1489         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1490     KMP_ASSERT(ilk != NULL);
1491     lck = ilk->lock;
1492     if (__kmp_env_consistency_check) {
1493       __kmp_pop_sync(global_tid, ct_critical, loc);
1494     }
1495 #if USE_ITT_BUILD
1496     __kmp_itt_critical_releasing(lck);
1497 #endif
1498     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1499   }
1500 
1501 #else // KMP_USE_DYNAMIC_LOCK
1502 
1503   if ((__kmp_user_lock_kind == lk_tas) &&
1504       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1505     lck = (kmp_user_lock_p)crit;
1506   }
1507 #if KMP_USE_FUTEX
1508   else if ((__kmp_user_lock_kind == lk_futex) &&
1509            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1510     lck = (kmp_user_lock_p)crit;
1511   }
1512 #endif
1513   else { // ticket, queuing or drdpa
1514     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1515   }
1516 
1517   KMP_ASSERT(lck != NULL);
1518 
1519   if (__kmp_env_consistency_check)
1520     __kmp_pop_sync(global_tid, ct_critical, loc);
1521 
1522 #if USE_ITT_BUILD
1523   __kmp_itt_critical_releasing(lck);
1524 #endif /* USE_ITT_BUILD */
1525   // Value of 'crit' should be good for using as a critical_id of the critical
1526   // section directive.
1527   __kmp_release_user_lock_with_checks(lck, global_tid);
1528 
1529 #endif // KMP_USE_DYNAMIC_LOCK
1530 
1531 #if OMPT_SUPPORT && OMPT_OPTIONAL
1532   /* OMPT release event triggers after lock is released; place here to trigger
1533    * for all #if branches */
1534   OMPT_STORE_RETURN_ADDRESS(global_tid);
1535   if (ompt_enabled.ompt_callback_mutex_released) {
1536     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1537         ompt_mutex_critical, (ompt_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
1538   }
1539 #endif
1540 
1541   KMP_POP_PARTITIONED_TIMER();
1542   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1543 }
1544 
1545 /*!
1546 @ingroup SYNCHRONIZATION
1547 @param loc source location information
1548 @param global_tid thread id.
1549 @return one if the thread should execute the master block, zero otherwise
1550 
1551 Start execution of a combined barrier and master. The barrier is executed inside
1552 this function.
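
For illustration only, a sketch of how generated code might use this entry
point together with @ref __kmpc_end_barrier_master (the exact lowering is
compiler-specific; <tt>loc</tt> and <tt>global_tid</tt> are assumed in scope):
@code
if (__kmpc_barrier_master(&loc, global_tid)) {
  /* only the thread selected at the barrier executes this block */
  __kmpc_end_barrier_master(&loc, global_tid); /* release the waiting threads */
}
@endcode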
1553 */
1554 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1555   int status;
1556 
1557   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1558 
1559   if (!TCR_4(__kmp_init_parallel))
1560     __kmp_parallel_initialize();
1561 
1562   if (__kmp_env_consistency_check)
1563     __kmp_check_barrier(global_tid, ct_barrier, loc);
1564 
1565 #if OMPT_SUPPORT
1566   ompt_frame_t *ompt_frame;
1567   if (ompt_enabled.enabled) {
1568     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1569     if (ompt_frame->enter_frame == NULL)
1570       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1571     OMPT_STORE_RETURN_ADDRESS(global_tid);
1572   }
1573 #endif
1574 #if USE_ITT_NOTIFY
1575   __kmp_threads[global_tid]->th.th_ident = loc;
1576 #endif
1577   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1578 #if OMPT_SUPPORT && OMPT_OPTIONAL
1579   if (ompt_enabled.enabled) {
1580     ompt_frame->enter_frame = NULL;
1581   }
1582 #endif
1583 
1584   return (status != 0) ? 0 : 1;
1585 }
1586 
1587 /*!
1588 @ingroup SYNCHRONIZATION
1589 @param loc source location information
1590 @param global_tid thread id.
1591 
1592 Complete the execution of a combined barrier and master. This function should
1593 only be called at the completion of the <tt>master</tt> code. Other threads will
1594 still be waiting at the barrier and this call releases them.
1595 */
1596 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1597   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1598 
1599   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1600 }
1601 
1602 /*!
1603 @ingroup SYNCHRONIZATION
1604 @param loc source location information
1605 @param global_tid thread id.
1606 @return one if the thread should execute the master block, zero otherwise
1607 
1608 Start execution of a combined barrier and master(nowait) construct.
1609 The barrier is executed inside this function.
There is no equivalent "end" function, since the barrier here is not split:
all threads are released inside this function, so the master block needs no
matching "end" call to release waiters.
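
For illustration only, a sketch of generated code (compiler-specific;
<tt>loc</tt> and <tt>global_tid</tt> assumed in scope):
@code
if (__kmpc_barrier_master_nowait(&loc, global_tid)) {
  /* master code; the other threads have already left the barrier */
}
@endcode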
1611 */
1612 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1613   kmp_int32 ret;
1614 
1615   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1616 
1617   if (!TCR_4(__kmp_init_parallel))
1618     __kmp_parallel_initialize();
1619 
1620   if (__kmp_env_consistency_check) {
1621     if (loc == 0) {
1622       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1623     }
1624     __kmp_check_barrier(global_tid, ct_barrier, loc);
1625   }
1626 
1627 #if OMPT_SUPPORT
1628   ompt_frame_t *ompt_frame;
1629   if (ompt_enabled.enabled) {
1630     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1631     if (ompt_frame->enter_frame == NULL)
1632       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1633     OMPT_STORE_RETURN_ADDRESS(global_tid);
1634   }
1635 #endif
1636 #if USE_ITT_NOTIFY
1637   __kmp_threads[global_tid]->th.th_ident = loc;
1638 #endif
1639   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1640 #if OMPT_SUPPORT && OMPT_OPTIONAL
1641   if (ompt_enabled.enabled) {
1642     ompt_frame->enter_frame = NULL;
1643   }
1644 #endif
1645 
1646   ret = __kmpc_master(loc, global_tid);
1647 
1648   if (__kmp_env_consistency_check) {
1649     /*  there's no __kmpc_end_master called; so the (stats) */
1650     /*  actions of __kmpc_end_master are done here          */
1651 
1652     if (global_tid < 0) {
1653       KMP_WARNING(ThreadIdentInvalid);
1654     }
1655     if (ret) {
1656       /* only one thread should do the pop since only */
1657       /* one did the push (see __kmpc_master())       */
1658 
1659       __kmp_pop_sync(global_tid, ct_master, loc);
1660     }
1661   }
1662 
1663   return (ret);
1664 }
1665 
1666 /* The BARRIER for a SINGLE process section is always explicit   */
1667 /*!
1668 @ingroup WORK_SHARING
1669 @param loc  source location information
1670 @param global_tid  global thread number
1671 @return One if this thread should execute the single construct, zero otherwise.
1672 
1673 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
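
For illustration only, the usual calling pattern is sketched below
(<tt>loc</tt> and <tt>global_tid</tt> assumed in scope); the trailing barrier
is generated only when the construct carries no <tt>nowait</tt> clause:
@code
if (__kmpc_single(&loc, global_tid)) {
  /* block of code executed by exactly one thread */
  __kmpc_end_single(&loc, global_tid);
}
__kmpc_barrier(&loc, global_tid); /* omitted for single nowait */
@endcode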
1676 */
1677 
1678 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1679   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1680 
1681   if (rc) {
1682     // We are going to execute the single statement, so we should count it.
1683     KMP_COUNT_BLOCK(OMP_SINGLE);
1684     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1685   }
1686 
1687 #if OMPT_SUPPORT && OMPT_OPTIONAL
1688   kmp_info_t *this_thr = __kmp_threads[global_tid];
1689   kmp_team_t *team = this_thr->th.th_team;
1690   int tid = __kmp_tid_from_gtid(global_tid);
1691 
1692   if (ompt_enabled.enabled) {
1693     if (rc) {
1694       if (ompt_enabled.ompt_callback_work) {
1695         ompt_callbacks.ompt_callback(ompt_callback_work)(
1696             ompt_work_single_executor, ompt_scope_begin,
1697             &(team->t.ompt_team_info.parallel_data),
1698             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1699             1, OMPT_GET_RETURN_ADDRESS(0));
1700       }
1701     } else {
1702       if (ompt_enabled.ompt_callback_work) {
1703         ompt_callbacks.ompt_callback(ompt_callback_work)(
1704             ompt_work_single_other, ompt_scope_begin,
1705             &(team->t.ompt_team_info.parallel_data),
1706             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1707             1, OMPT_GET_RETURN_ADDRESS(0));
1708         ompt_callbacks.ompt_callback(ompt_callback_work)(
1709             ompt_work_single_other, ompt_scope_end,
1710             &(team->t.ompt_team_info.parallel_data),
1711             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1712             1, OMPT_GET_RETURN_ADDRESS(0));
1713       }
1714     }
1715   }
1716 #endif
1717 
1718   return rc;
1719 }
1720 
1721 /*!
1722 @ingroup WORK_SHARING
1723 @param loc  source location information
1724 @param global_tid  global thread number
1725 
1726 Mark the end of a <tt>single</tt> construct.  This function should
1727 only be called by the thread that executed the block of code protected
1728 by the `single` construct.
1729 */
1730 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1731   __kmp_exit_single(global_tid);
1732   KMP_POP_PARTITIONED_TIMER();
1733 
1734 #if OMPT_SUPPORT && OMPT_OPTIONAL
1735   kmp_info_t *this_thr = __kmp_threads[global_tid];
1736   kmp_team_t *team = this_thr->th.th_team;
1737   int tid = __kmp_tid_from_gtid(global_tid);
1738 
1739   if (ompt_enabled.ompt_callback_work) {
1740     ompt_callbacks.ompt_callback(ompt_callback_work)(
1741         ompt_work_single_executor, ompt_scope_end,
1742         &(team->t.ompt_team_info.parallel_data),
1743         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1744         OMPT_GET_RETURN_ADDRESS(0));
1745   }
1746 #endif
1747 }
1748 
1749 /*!
1750 @ingroup WORK_SHARING
1751 @param loc Source location
1752 @param global_tid Global thread id
1753 
1754 Mark the end of a statically scheduled loop.
1755 */
1756 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1757   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1758 
1759 #if OMPT_SUPPORT && OMPT_OPTIONAL
1760   if (ompt_enabled.ompt_callback_work) {
1761     ompt_work_type_t ompt_work_type = ompt_work_loop;
1762     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1763     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1764     // Determine workshare type
1765     if (loc != NULL) {
1766       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1767         ompt_work_type = ompt_work_loop;
1768       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1769         ompt_work_type = ompt_work_sections;
1770       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1771         ompt_work_type = ompt_work_distribute;
1772       } else {
1773         // use default set above.
1774         // a warning about this case is provided in __kmpc_for_static_init
1775       }
1776       KMP_DEBUG_ASSERT(ompt_work_type);
1777     }
1778     ompt_callbacks.ompt_callback(ompt_callback_work)(
1779         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1780         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1781   }
1782 #endif
1783 
1784   if (__kmp_env_consistency_check)
1785     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1786 }
1787 
1788 // User routines which take C-style arguments (call by value)
1789 // different from the Fortran equivalent routines
1790 
1791 void ompc_set_num_threads(int arg) {
1792   // !!!!! TODO: check the per-task binding
1793   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1794 }
1795 
1796 void ompc_set_dynamic(int flag) {
1797   kmp_info_t *thread;
1798 
1799   /* For the thread-private implementation of the internal controls */
1800   thread = __kmp_entry_thread();
1801 
1802   __kmp_save_internal_controls(thread);
1803 
1804   set__dynamic(thread, flag ? TRUE : FALSE);
1805 }
1806 
1807 void ompc_set_nested(int flag) {
1808   kmp_info_t *thread;
1809 
1810   /* For the thread-private internal controls implementation */
1811   thread = __kmp_entry_thread();
1812 
1813   __kmp_save_internal_controls(thread);
1814 
1815   set__nested(thread, flag ? TRUE : FALSE);
1816 }
1817 
1818 void ompc_set_max_active_levels(int max_active_levels) {
1819   /* TO DO */
1820   /* we want per-task implementation of this internal control */
1821 
1822   /* For the per-thread internal controls implementation */
1823   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1824 }
1825 
1826 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1827   // !!!!! TODO: check the per-task binding
1828   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1829 }
1830 
1831 int ompc_get_ancestor_thread_num(int level) {
1832   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1833 }
1834 
1835 int ompc_get_team_size(int level) {
1836   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1837 }
1838 
1839 void kmpc_set_stacksize(int arg) {
1840   // __kmp_aux_set_stacksize initializes the library if needed
1841   __kmp_aux_set_stacksize(arg);
1842 }
1843 
1844 void kmpc_set_stacksize_s(size_t arg) {
1845   // __kmp_aux_set_stacksize initializes the library if needed
1846   __kmp_aux_set_stacksize(arg);
1847 }
1848 
1849 void kmpc_set_blocktime(int arg) {
1850   int gtid, tid;
1851   kmp_info_t *thread;
1852 
1853   gtid = __kmp_entry_gtid();
1854   tid = __kmp_tid_from_gtid(gtid);
1855   thread = __kmp_thread_from_gtid(gtid);
1856 
1857   __kmp_aux_set_blocktime(arg, thread, tid);
1858 }
1859 
1860 void kmpc_set_library(int arg) {
1861   // __kmp_user_set_library initializes the library if needed
1862   __kmp_user_set_library((enum library_type)arg);
1863 }
1864 
1865 void kmpc_set_defaults(char const *str) {
1866   // __kmp_aux_set_defaults initializes the library if needed
1867   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1868 }
1869 
1870 void kmpc_set_disp_num_buffers(int arg) {
1871   // ignore after initialization because some teams have already
1872   // allocated dispatch buffers
1873   if (__kmp_init_serial == 0 && arg > 0)
1874     __kmp_dispatch_num_buffers = arg;
1875 }
1876 
1877 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1878 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1879   return -1;
1880 #else
1881   if (!TCR_4(__kmp_init_middle)) {
1882     __kmp_middle_initialize();
1883   }
1884   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1885 #endif
1886 }
1887 
1888 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1889 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1890   return -1;
1891 #else
1892   if (!TCR_4(__kmp_init_middle)) {
1893     __kmp_middle_initialize();
1894   }
1895   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1896 #endif
1897 }
1898 
1899 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1900 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1901   return -1;
1902 #else
1903   if (!TCR_4(__kmp_init_middle)) {
1904     __kmp_middle_initialize();
1905   }
1906   return __kmp_aux_get_affinity_mask_proc(proc, mask);
1907 #endif
1908 }
1909 
1910 /* -------------------------------------------------------------------------- */
1911 /*!
1912 @ingroup THREADPRIVATE
1913 @param loc       source location information
1914 @param gtid      global thread number
1915 @param cpy_size  size of the cpy_data buffer
1916 @param cpy_data  pointer to data to be copied
1917 @param cpy_func  helper function to call for copying data
1918 @param didit     flag variable: 1=single thread; 0=not single thread
1919 
1920 __kmpc_copyprivate implements the interface for the private data broadcast
1921 needed for the copyprivate clause associated with a single region in an
1922 OpenMP<sup>*</sup> program (both C and Fortran).
1923 All threads participating in the parallel region call this routine.
1924 One of the threads (called the single thread) should have the <tt>didit</tt>
1925 variable set to 1 and all other threads should have that variable set to 0.
1926 All threads pass a pointer to a data buffer (cpy_data) that they have built.
1927 
1928 The OpenMP specification forbids the use of nowait on the single region when a
1929 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
1930 barrier internally to avoid race conditions, so the code generation for the
1931 single region should avoid generating a barrier after the call to @ref
1932 __kmpc_copyprivate.
1933 
1934 The <tt>gtid</tt> parameter is the global thread id for the current thread.
1935 The <tt>loc</tt> parameter is a pointer to source location information.
1936 
Internal implementation: The single thread will first copy its descriptor
address (cpy_data) to a team-private location, then the other threads will each
call the function pointed to by the parameter cpy_func, which carries out the
copy using the two cpy_data buffers (its own as destination, the single
thread's as source).
1941 
1942 The cpy_func routine used for the copy and the contents of the data area defined
1943 by cpy_data and cpy_size may be built in any fashion that will allow the copy
1944 to be done. For instance, the cpy_data buffer can hold the actual data to be
1945 copied or it may hold a list of pointers to the data. The cpy_func routine must
1946 interpret the cpy_data buffer appropriately.
1947 
1948 The interface to cpy_func is as follows:
1949 @code
1950 void cpy_func( void *destination, void *source )
1951 @endcode
1952 where void *destination is the cpy_data pointer for the thread being copied to
1953 and void *source is the cpy_data pointer for the thread being copied from.
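
For illustration only (the real cpy_func is generated by the compiler), a
helper for the case where each thread's cpy_data points directly at a small
structure of a hypothetical type might look like:
@code
typedef struct { double value; int count; } example_payload_t;

static void example_cpy_func(void *destination, void *source) {
  /* destination is the copying thread's cpy_data; source is the single
     thread's cpy_data */
  *(example_payload_t *)destination = *(example_payload_t *)source;
}
@endcode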
1954 */
1955 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
1956                         void *cpy_data, void (*cpy_func)(void *, void *),
1957                         kmp_int32 didit) {
1958   void **data_ptr;
1959 
1960   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
1961 
1962   KMP_MB();
1963 
1964   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
1965 
1966   if (__kmp_env_consistency_check) {
1967     if (loc == 0) {
1968       KMP_WARNING(ConstructIdentInvalid);
1969     }
1970   }
1971 
1972   // ToDo: Optimize the following two barriers into some kind of split barrier
1973 
1974   if (didit)
1975     *data_ptr = cpy_data;
1976 
1977 #if OMPT_SUPPORT
1978   ompt_frame_t *ompt_frame;
1979   if (ompt_enabled.enabled) {
1980     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1981     if (ompt_frame->enter_frame == NULL)
1982       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1983     OMPT_STORE_RETURN_ADDRESS(gtid);
1984   }
1985 #endif
1986 /* This barrier is not a barrier region boundary */
1987 #if USE_ITT_NOTIFY
1988   __kmp_threads[gtid]->th.th_ident = loc;
1989 #endif
1990   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1991 
1992   if (!didit)
1993     (*cpy_func)(cpy_data, *data_ptr);
1994 
1995 // Consider next barrier a user-visible barrier for barrier region boundaries
1996 // Nesting checks are already handled by the single construct checks
1997 
1998 #if OMPT_SUPPORT
1999   if (ompt_enabled.enabled) {
2000     OMPT_STORE_RETURN_ADDRESS(gtid);
2001   }
2002 #endif
2003 #if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
                                          // tasks can overwrite the location)
2006 #endif
2007   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2008 #if OMPT_SUPPORT && OMPT_OPTIONAL
2009   if (ompt_enabled.enabled) {
2010     ompt_frame->enter_frame = NULL;
2011   }
2012 #endif
2013 }
2014 
2015 /* -------------------------------------------------------------------------- */
2016 
2017 #define INIT_LOCK __kmp_init_user_lock_with_checks
2018 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2019 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2020 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2021 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2022 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2023   __kmp_acquire_nested_user_lock_with_checks_timed
2024 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2025 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2026 #define TEST_LOCK __kmp_test_user_lock_with_checks
2027 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2028 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2029 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2030 
2031 // TODO: Make check abort messages use location info & pass it into
2032 // with_checks routines
2033 
2034 #if KMP_USE_DYNAMIC_LOCK
2035 
2036 // internal lock initializer
2037 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2038                                                     kmp_dyna_lockseq_t seq) {
2039   if (KMP_IS_D_LOCK(seq)) {
2040     KMP_INIT_D_LOCK(lock, seq);
2041 #if USE_ITT_BUILD
2042     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2043 #endif
2044   } else {
2045     KMP_INIT_I_LOCK(lock, seq);
2046 #if USE_ITT_BUILD
2047     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2048     __kmp_itt_lock_creating(ilk->lock, loc);
2049 #endif
2050   }
2051 }
2052 
2053 // internal nest lock initializer
2054 static __forceinline void
2055 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2056                                kmp_dyna_lockseq_t seq) {
2057 #if KMP_USE_TSX
2058   // Don't have nested lock implementation for speculative locks
2059   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2060     seq = __kmp_user_lock_seq;
2061 #endif
2062   switch (seq) {
2063   case lockseq_tas:
2064     seq = lockseq_nested_tas;
2065     break;
2066 #if KMP_USE_FUTEX
2067   case lockseq_futex:
2068     seq = lockseq_nested_futex;
2069     break;
2070 #endif
2071   case lockseq_ticket:
2072     seq = lockseq_nested_ticket;
2073     break;
2074   case lockseq_queuing:
2075     seq = lockseq_nested_queuing;
2076     break;
2077   case lockseq_drdpa:
2078     seq = lockseq_nested_drdpa;
2079     break;
2080   default:
2081     seq = lockseq_nested_queuing;
2082   }
2083   KMP_INIT_I_LOCK(lock, seq);
2084 #if USE_ITT_BUILD
2085   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2086   __kmp_itt_lock_creating(ilk->lock, loc);
2087 #endif
2088 }
2089 
2090 /* initialize the lock with a hint */
2091 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2092                                 uintptr_t hint) {
2093   KMP_DEBUG_ASSERT(__kmp_init_serial);
2094   if (__kmp_env_consistency_check && user_lock == NULL) {
2095     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2096   }
2097 
2098   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2099 
2100 #if OMPT_SUPPORT && OMPT_OPTIONAL
2101   // This is the case, if called from omp_init_lock_with_hint:
2102   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2103   if (!codeptr)
2104     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2105   if (ompt_enabled.ompt_callback_lock_init) {
2106     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2107         ompt_mutex_lock, (omp_lock_hint_t)hint,
2108         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2109         codeptr);
2110   }
2111 #endif
2112 }
2113 
2114 /* initialize the lock with a hint */
2115 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2116                                      void **user_lock, uintptr_t hint) {
2117   KMP_DEBUG_ASSERT(__kmp_init_serial);
2118   if (__kmp_env_consistency_check && user_lock == NULL) {
2119     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2120   }
2121 
2122   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2123 
2124 #if OMPT_SUPPORT && OMPT_OPTIONAL
2125   // This is the case, if called from omp_init_lock_with_hint:
2126   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2127   if (!codeptr)
2128     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2129   if (ompt_enabled.ompt_callback_lock_init) {
2130     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2131         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2132         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2133         codeptr);
2134   }
2135 #endif
2136 }
2137 
2138 #endif // KMP_USE_DYNAMIC_LOCK
2139 
2140 /* initialize the lock */
2141 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2142 #if KMP_USE_DYNAMIC_LOCK
2143 
2144   KMP_DEBUG_ASSERT(__kmp_init_serial);
2145   if (__kmp_env_consistency_check && user_lock == NULL) {
2146     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2147   }
2148   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2149 
2150 #if OMPT_SUPPORT && OMPT_OPTIONAL
2151   // This is the case, if called from omp_init_lock_with_hint:
2152   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2153   if (!codeptr)
2154     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2155   if (ompt_enabled.ompt_callback_lock_init) {
2156     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2157         ompt_mutex_lock, omp_lock_hint_none,
2158         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2159         codeptr);
2160   }
2161 #endif
2162 
2163 #else // KMP_USE_DYNAMIC_LOCK
2164 
2165   static char const *const func = "omp_init_lock";
2166   kmp_user_lock_p lck;
2167   KMP_DEBUG_ASSERT(__kmp_init_serial);
2168 
2169   if (__kmp_env_consistency_check) {
2170     if (user_lock == NULL) {
2171       KMP_FATAL(LockIsUninitialized, func);
2172     }
2173   }
2174 
2175   KMP_CHECK_USER_LOCK_INIT();
2176 
2177   if ((__kmp_user_lock_kind == lk_tas) &&
2178       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2179     lck = (kmp_user_lock_p)user_lock;
2180   }
2181 #if KMP_USE_FUTEX
2182   else if ((__kmp_user_lock_kind == lk_futex) &&
2183            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2184     lck = (kmp_user_lock_p)user_lock;
2185   }
2186 #endif
2187   else {
2188     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2189   }
2190   INIT_LOCK(lck);
2191   __kmp_set_user_lock_location(lck, loc);
2192 
2193 #if OMPT_SUPPORT && OMPT_OPTIONAL
2194   // This is the case, if called from omp_init_lock_with_hint:
2195   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2196   if (!codeptr)
2197     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2198   if (ompt_enabled.ompt_callback_lock_init) {
2199     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2200         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2201         (ompt_wait_id_t)user_lock, codeptr);
2202   }
2203 #endif
2204 
2205 #if USE_ITT_BUILD
2206   __kmp_itt_lock_creating(lck);
2207 #endif /* USE_ITT_BUILD */
2208 
2209 #endif // KMP_USE_DYNAMIC_LOCK
2210 } // __kmpc_init_lock
2211 
2212 /* initialize the lock */
2213 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2214 #if KMP_USE_DYNAMIC_LOCK
2215 
2216   KMP_DEBUG_ASSERT(__kmp_init_serial);
2217   if (__kmp_env_consistency_check && user_lock == NULL) {
2218     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2219   }
2220   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2221 
2222 #if OMPT_SUPPORT && OMPT_OPTIONAL
2223   // This is the case, if called from omp_init_lock_with_hint:
2224   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2225   if (!codeptr)
2226     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2227   if (ompt_enabled.ompt_callback_lock_init) {
2228     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2229         ompt_mutex_nest_lock, omp_lock_hint_none,
2230         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2231         codeptr);
2232   }
2233 #endif
2234 
2235 #else // KMP_USE_DYNAMIC_LOCK
2236 
2237   static char const *const func = "omp_init_nest_lock";
2238   kmp_user_lock_p lck;
2239   KMP_DEBUG_ASSERT(__kmp_init_serial);
2240 
2241   if (__kmp_env_consistency_check) {
2242     if (user_lock == NULL) {
2243       KMP_FATAL(LockIsUninitialized, func);
2244     }
2245   }
2246 
2247   KMP_CHECK_USER_LOCK_INIT();
2248 
2249   if ((__kmp_user_lock_kind == lk_tas) &&
2250       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2251        OMP_NEST_LOCK_T_SIZE)) {
2252     lck = (kmp_user_lock_p)user_lock;
2253   }
2254 #if KMP_USE_FUTEX
2255   else if ((__kmp_user_lock_kind == lk_futex) &&
2256            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2257             OMP_NEST_LOCK_T_SIZE)) {
2258     lck = (kmp_user_lock_p)user_lock;
2259   }
2260 #endif
2261   else {
2262     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2263   }
2264 
2265   INIT_NESTED_LOCK(lck);
2266   __kmp_set_user_lock_location(lck, loc);
2267 
2268 #if OMPT_SUPPORT && OMPT_OPTIONAL
2269   // This is the case, if called from omp_init_lock_with_hint:
2270   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2271   if (!codeptr)
2272     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2273   if (ompt_enabled.ompt_callback_lock_init) {
2274     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2275         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2276         (ompt_wait_id_t)user_lock, codeptr);
2277   }
2278 #endif
2279 
2280 #if USE_ITT_BUILD
2281   __kmp_itt_lock_creating(lck);
2282 #endif /* USE_ITT_BUILD */
2283 
2284 #endif // KMP_USE_DYNAMIC_LOCK
2285 } // __kmpc_init_nest_lock
2286 
2287 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2288 #if KMP_USE_DYNAMIC_LOCK
2289 
2290 #if USE_ITT_BUILD
2291   kmp_user_lock_p lck;
2292   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2293     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2294   } else {
2295     lck = (kmp_user_lock_p)user_lock;
2296   }
2297   __kmp_itt_lock_destroyed(lck);
2298 #endif
2299 #if OMPT_SUPPORT && OMPT_OPTIONAL
2300   // This is the case, if called from omp_init_lock_with_hint:
2301   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2302   if (!codeptr)
2303     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2304   if (ompt_enabled.ompt_callback_lock_destroy) {
2305     kmp_user_lock_p lck;
2306     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2307       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2308     } else {
2309       lck = (kmp_user_lock_p)user_lock;
2310     }
2311     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2312         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2313   }
2314 #endif
2315   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2316 #else
2317   kmp_user_lock_p lck;
2318 
2319   if ((__kmp_user_lock_kind == lk_tas) &&
2320       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2321     lck = (kmp_user_lock_p)user_lock;
2322   }
2323 #if KMP_USE_FUTEX
2324   else if ((__kmp_user_lock_kind == lk_futex) &&
2325            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2326     lck = (kmp_user_lock_p)user_lock;
2327   }
2328 #endif
2329   else {
2330     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2331   }
2332 
2333 #if OMPT_SUPPORT && OMPT_OPTIONAL
2334   // This is the case, if called from omp_init_lock_with_hint:
2335   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2336   if (!codeptr)
2337     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2338   if (ompt_enabled.ompt_callback_lock_destroy) {
2339     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2340         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2341   }
2342 #endif
2343 
2344 #if USE_ITT_BUILD
2345   __kmp_itt_lock_destroyed(lck);
2346 #endif /* USE_ITT_BUILD */
2347   DESTROY_LOCK(lck);
2348 
2349   if ((__kmp_user_lock_kind == lk_tas) &&
2350       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2351     ;
2352   }
2353 #if KMP_USE_FUTEX
2354   else if ((__kmp_user_lock_kind == lk_futex) &&
2355            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2356     ;
2357   }
2358 #endif
2359   else {
2360     __kmp_user_lock_free(user_lock, gtid, lck);
2361   }
2362 #endif // KMP_USE_DYNAMIC_LOCK
2363 } // __kmpc_destroy_lock
2364 
2365 /* destroy the lock */
2366 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2367 #if KMP_USE_DYNAMIC_LOCK
2368 
2369 #if USE_ITT_BUILD
2370   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2371   __kmp_itt_lock_destroyed(ilk->lock);
2372 #endif
2373 #if OMPT_SUPPORT && OMPT_OPTIONAL
2374   // This is the case, if called from omp_init_lock_with_hint:
2375   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2376   if (!codeptr)
2377     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2378   if (ompt_enabled.ompt_callback_lock_destroy) {
2379     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2380         ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2381   }
2382 #endif
2383   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2384 
2385 #else // KMP_USE_DYNAMIC_LOCK
2386 
2387   kmp_user_lock_p lck;
2388 
2389   if ((__kmp_user_lock_kind == lk_tas) &&
2390       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2391        OMP_NEST_LOCK_T_SIZE)) {
2392     lck = (kmp_user_lock_p)user_lock;
2393   }
2394 #if KMP_USE_FUTEX
2395   else if ((__kmp_user_lock_kind == lk_futex) &&
2396            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2397             OMP_NEST_LOCK_T_SIZE)) {
2398     lck = (kmp_user_lock_p)user_lock;
2399   }
2400 #endif
2401   else {
2402     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2403   }
2404 
2405 #if OMPT_SUPPORT && OMPT_OPTIONAL
2406   // This is the case, if called from omp_init_lock_with_hint:
2407   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2408   if (!codeptr)
2409     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2410   if (ompt_enabled.ompt_callback_lock_destroy) {
2411     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2412         ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2413   }
2414 #endif
2415 
2416 #if USE_ITT_BUILD
2417   __kmp_itt_lock_destroyed(lck);
2418 #endif /* USE_ITT_BUILD */
2419 
2420   DESTROY_NESTED_LOCK(lck);
2421 
2422   if ((__kmp_user_lock_kind == lk_tas) &&
2423       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2424        OMP_NEST_LOCK_T_SIZE)) {
2425     ;
2426   }
2427 #if KMP_USE_FUTEX
2428   else if ((__kmp_user_lock_kind == lk_futex) &&
2429            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2430             OMP_NEST_LOCK_T_SIZE)) {
2431     ;
2432   }
2433 #endif
2434   else {
2435     __kmp_user_lock_free(user_lock, gtid, lck);
2436   }
2437 #endif // KMP_USE_DYNAMIC_LOCK
2438 } // __kmpc_destroy_nest_lock
2439 
2440 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2441   KMP_COUNT_BLOCK(OMP_set_lock);
2442 #if KMP_USE_DYNAMIC_LOCK
2443   int tag = KMP_EXTRACT_D_TAG(user_lock);
2444 #if USE_ITT_BUILD
2445   __kmp_itt_lock_acquiring(
2446       (kmp_user_lock_p)
2447           user_lock); // itt function will get to the right lock object.
2448 #endif
2449 #if OMPT_SUPPORT && OMPT_OPTIONAL
2450   // This is the case, if called from omp_init_lock_with_hint:
2451   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2452   if (!codeptr)
2453     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2454   if (ompt_enabled.ompt_callback_mutex_acquire) {
2455     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2456         ompt_mutex_lock, omp_lock_hint_none,
2457         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2458         codeptr);
2459   }
2460 #endif
2461 #if KMP_USE_INLINED_TAS
2462   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2463     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2464   } else
2465 #elif KMP_USE_INLINED_FUTEX
2466   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2467     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2468   } else
2469 #endif
2470   {
2471     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2472   }
2473 #if USE_ITT_BUILD
2474   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2475 #endif
2476 #if OMPT_SUPPORT && OMPT_OPTIONAL
2477   if (ompt_enabled.ompt_callback_mutex_acquired) {
2478     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2479         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2480   }
2481 #endif
2482 
2483 #else // KMP_USE_DYNAMIC_LOCK
2484 
2485   kmp_user_lock_p lck;
2486 
2487   if ((__kmp_user_lock_kind == lk_tas) &&
2488       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2489     lck = (kmp_user_lock_p)user_lock;
2490   }
2491 #if KMP_USE_FUTEX
2492   else if ((__kmp_user_lock_kind == lk_futex) &&
2493            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2494     lck = (kmp_user_lock_p)user_lock;
2495   }
2496 #endif
2497   else {
2498     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2499   }
2500 
2501 #if USE_ITT_BUILD
2502   __kmp_itt_lock_acquiring(lck);
2503 #endif /* USE_ITT_BUILD */
2504 #if OMPT_SUPPORT && OMPT_OPTIONAL
2505   // This is the case, if called from omp_init_lock_with_hint:
2506   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2507   if (!codeptr)
2508     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2509   if (ompt_enabled.ompt_callback_mutex_acquire) {
2510     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2511         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2512         (ompt_wait_id_t)lck, codeptr);
2513   }
2514 #endif
2515 
2516   ACQUIRE_LOCK(lck, gtid);
2517 
2518 #if USE_ITT_BUILD
2519   __kmp_itt_lock_acquired(lck);
2520 #endif /* USE_ITT_BUILD */
2521 
2522 #if OMPT_SUPPORT && OMPT_OPTIONAL
2523   if (ompt_enabled.ompt_callback_mutex_acquired) {
2524     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2525         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2526   }
2527 #endif
2528 
2529 #endif // KMP_USE_DYNAMIC_LOCK
2530 }
2531 
2532 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2533 #if KMP_USE_DYNAMIC_LOCK
2534 
2535 #if USE_ITT_BUILD
2536   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2537 #endif
2538 #if OMPT_SUPPORT && OMPT_OPTIONAL
2539   // This is the case, if called from omp_init_lock_with_hint:
2540   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2541   if (!codeptr)
2542     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2543   if (ompt_enabled.enabled) {
2544     if (ompt_enabled.ompt_callback_mutex_acquire) {
2545       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2546           ompt_mutex_nest_lock, omp_lock_hint_none,
2547           __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2548           codeptr);
2549     }
2550   }
2551 #endif
2552   int acquire_status =
2553       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2554 #if USE_ITT_BUILD
2555   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2556 #endif
2557 
2558 #if OMPT_SUPPORT && OMPT_OPTIONAL
2559   if (ompt_enabled.enabled) {
2560     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2561       if (ompt_enabled.ompt_callback_mutex_acquired) {
2562         // lock_first
2563         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2564             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2565       }
2566     } else {
2567       if (ompt_enabled.ompt_callback_nest_lock) {
2568         // lock_next
2569         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2570             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
2571       }
2572     }
2573   }
2574 #endif
2575 
2576 #else // KMP_USE_DYNAMIC_LOCK
2577   int acquire_status;
2578   kmp_user_lock_p lck;
2579 
2580   if ((__kmp_user_lock_kind == lk_tas) &&
2581       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2582        OMP_NEST_LOCK_T_SIZE)) {
2583     lck = (kmp_user_lock_p)user_lock;
2584   }
2585 #if KMP_USE_FUTEX
2586   else if ((__kmp_user_lock_kind == lk_futex) &&
2587            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2588             OMP_NEST_LOCK_T_SIZE)) {
2589     lck = (kmp_user_lock_p)user_lock;
2590   }
2591 #endif
2592   else {
2593     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2594   }
2595 
2596 #if USE_ITT_BUILD
2597   __kmp_itt_lock_acquiring(lck);
2598 #endif /* USE_ITT_BUILD */
2599 #if OMPT_SUPPORT && OMPT_OPTIONAL
2600   // This is the case, if called from omp_init_lock_with_hint:
2601   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2602   if (!codeptr)
2603     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2604   if (ompt_enabled.enabled) {
2605     if (ompt_enabled.ompt_callback_mutex_acquire) {
2606       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2607           ompt_mutex_nest_lock, omp_lock_hint_none,
2608           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
2609     }
2610   }
2611 #endif
2612 
2613   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2614 
2615 #if USE_ITT_BUILD
2616   __kmp_itt_lock_acquired(lck);
2617 #endif /* USE_ITT_BUILD */
2618 
2619 #if OMPT_SUPPORT && OMPT_OPTIONAL
2620   if (ompt_enabled.enabled) {
2621     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2622       if (ompt_enabled.ompt_callback_mutex_acquired) {
2623         // lock_first
2624         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2625             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2626       }
2627     } else {
2628       if (ompt_enabled.ompt_callback_nest_lock) {
2629         // lock_next
2630         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2631             ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
2632       }
2633     }
2634   }
2635 #endif
2636 
2637 #endif // KMP_USE_DYNAMIC_LOCK
2638 }
2639 
2640 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2641 #if KMP_USE_DYNAMIC_LOCK
2642 
2643   int tag = KMP_EXTRACT_D_TAG(user_lock);
2644 #if USE_ITT_BUILD
2645   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2646 #endif
2647 #if KMP_USE_INLINED_TAS
2648   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2649     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2650   } else
2651 #elif KMP_USE_INLINED_FUTEX
2652   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2653     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2654   } else
2655 #endif
2656   {
2657     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2658   }
2659 
2660 #if OMPT_SUPPORT && OMPT_OPTIONAL
2661   // This is the case, if called from omp_init_lock_with_hint:
2662   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2663   if (!codeptr)
2664     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2665   if (ompt_enabled.ompt_callback_mutex_released) {
2666     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2667         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2668   }
2669 #endif
2670 
2671 #else // KMP_USE_DYNAMIC_LOCK
2672 
2673   kmp_user_lock_p lck;
2674 
2675   /* Can't use serial interval since not block structured */
2676   /* release the lock */
2677 
2678   if ((__kmp_user_lock_kind == lk_tas) &&
2679       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2680 #if KMP_OS_LINUX &&                                                            \
2681     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2682 // "fast" path implemented to fix customer performance issue
2683 #if USE_ITT_BUILD
2684     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2685 #endif /* USE_ITT_BUILD */
2686     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2687     KMP_MB();
2688 
2689 #if OMPT_SUPPORT && OMPT_OPTIONAL
2690     // This is the case, if called from omp_init_lock_with_hint:
2691     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2692     if (!codeptr)
2693       codeptr = OMPT_GET_RETURN_ADDRESS(0);
    if (ompt_enabled.ompt_callback_mutex_released) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
          ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
    }
2698 #endif
2699 
2700     return;
2701 #else
2702     lck = (kmp_user_lock_p)user_lock;
2703 #endif
2704   }
2705 #if KMP_USE_FUTEX
2706   else if ((__kmp_user_lock_kind == lk_futex) &&
2707            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2708     lck = (kmp_user_lock_p)user_lock;
2709   }
2710 #endif
2711   else {
2712     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2713   }
2714 
2715 #if USE_ITT_BUILD
2716   __kmp_itt_lock_releasing(lck);
2717 #endif /* USE_ITT_BUILD */
2718 
2719   RELEASE_LOCK(lck, gtid);
2720 
2721 #if OMPT_SUPPORT && OMPT_OPTIONAL
2722   // This is the case, if called from omp_init_lock_with_hint:
2723   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2724   if (!codeptr)
2725     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2726   if (ompt_enabled.ompt_callback_mutex_released) {
2727     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2728         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2729   }
2730 #endif
2731 
2732 #endif // KMP_USE_DYNAMIC_LOCK
2733 }
2734 
2735 /* release the lock */
2736 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2737 #if KMP_USE_DYNAMIC_LOCK
2738 
2739 #if USE_ITT_BUILD
2740   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2741 #endif
2742   int release_status =
2743       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2744 
2745 #if OMPT_SUPPORT && OMPT_OPTIONAL
2746   // This is the case, if called from omp_init_lock_with_hint:
2747   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2748   if (!codeptr)
2749     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2750   if (ompt_enabled.enabled) {
2751     if (release_status == KMP_LOCK_RELEASED) {
2752       if (ompt_enabled.ompt_callback_mutex_released) {
2753         // release_lock_last
2754         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2755             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2756       }
2757     } else if (ompt_enabled.ompt_callback_nest_lock) {
2758       // release_lock_prev
2759       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2760           ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr);
2761     }
2762   }
2763 #endif
2764 
2765 #else // KMP_USE_DYNAMIC_LOCK
2766 
2767   kmp_user_lock_p lck;
2768 
2769   /* Can't use serial interval since not block structured */
2770 
2771   if ((__kmp_user_lock_kind == lk_tas) &&
2772       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2773        OMP_NEST_LOCK_T_SIZE)) {
2774 #if KMP_OS_LINUX &&                                                            \
2775     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2776     // "fast" path implemented to fix customer performance issue
2777     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2778 #if USE_ITT_BUILD
2779     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2780 #endif /* USE_ITT_BUILD */
2781 
2782 #if OMPT_SUPPORT && OMPT_OPTIONAL
2783     int release_status = KMP_LOCK_STILL_HELD;
2784 #endif
2785 
2786     if (--(tl->lk.depth_locked) == 0) {
2787       TCW_4(tl->lk.poll, 0);
2788 #if OMPT_SUPPORT && OMPT_OPTIONAL
2789       release_status = KMP_LOCK_RELEASED;
2790 #endif
2791     }
2792     KMP_MB();
2793 
2794 #if OMPT_SUPPORT && OMPT_OPTIONAL
2795     // This is the case, if called from omp_init_lock_with_hint:
2796     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2797     if (!codeptr)
2798       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2799     if (ompt_enabled.enabled) {
2800       if (release_status == KMP_LOCK_RELEASED) {
2801         if (ompt_enabled.ompt_callback_mutex_released) {
2802           // release_lock_last
2803           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
              ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
        }
      } else if (ompt_enabled.ompt_callback_nest_lock) {
        // release_lock_previous
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr);
2810       }
2811     }
2812 #endif
2813 
2814     return;
2815 #else
2816     lck = (kmp_user_lock_p)user_lock;
2817 #endif
2818   }
2819 #if KMP_USE_FUTEX
2820   else if ((__kmp_user_lock_kind == lk_futex) &&
2821            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2822             OMP_NEST_LOCK_T_SIZE)) {
2823     lck = (kmp_user_lock_p)user_lock;
2824   }
2825 #endif
2826   else {
2827     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2828   }
2829 
2830 #if USE_ITT_BUILD
2831   __kmp_itt_lock_releasing(lck);
2832 #endif /* USE_ITT_BUILD */
2833 
2834   int release_status;
2835   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2836 #if OMPT_SUPPORT && OMPT_OPTIONAL
2837   // This is the case, if called from omp_init_lock_with_hint:
2838   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2839   if (!codeptr)
2840     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2841   if (ompt_enabled.enabled) {
2842     if (release_status == KMP_LOCK_RELEASED) {
2843       if (ompt_enabled.ompt_callback_mutex_released) {
2844         // release_lock_last
2845         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2846             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2847       }
2848     } else if (ompt_enabled.ompt_callback_nest_lock) {
2849       // release_lock_previous
2850       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)lck, codeptr);
2852     }
2853   }
2854 #endif
2855 
2856 #endif // KMP_USE_DYNAMIC_LOCK
2857 }
2858 
2859 /* try to acquire the lock */
2860 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2861   KMP_COUNT_BLOCK(OMP_test_lock);
2862 
2863 #if KMP_USE_DYNAMIC_LOCK
2864   int rc;
2865   int tag = KMP_EXTRACT_D_TAG(user_lock);
2866 #if USE_ITT_BUILD
2867   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2868 #endif
2869 #if OMPT_SUPPORT && OMPT_OPTIONAL
2870   // This is the case, if called from omp_init_lock_with_hint:
2871   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2872   if (!codeptr)
2873     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2874   if (ompt_enabled.ompt_callback_mutex_acquire) {
2875     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2876         ompt_mutex_lock, omp_lock_hint_none,
2877         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2878         codeptr);
2879   }
2880 #endif
2881 #if KMP_USE_INLINED_TAS
2882   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2883     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2884   } else
2885 #elif KMP_USE_INLINED_FUTEX
2886   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2887     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2888   } else
2889 #endif
2890   {
2891     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2892   }
2893   if (rc) {
2894 #if USE_ITT_BUILD
2895     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2896 #endif
2897 #if OMPT_SUPPORT && OMPT_OPTIONAL
2898     if (ompt_enabled.ompt_callback_mutex_acquired) {
2899       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2900           ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2901     }
2902 #endif
2903     return FTN_TRUE;
2904   } else {
2905 #if USE_ITT_BUILD
2906     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2907 #endif
2908     return FTN_FALSE;
2909   }
2910 
2911 #else // KMP_USE_DYNAMIC_LOCK
2912 
2913   kmp_user_lock_p lck;
2914   int rc;
2915 
2916   if ((__kmp_user_lock_kind == lk_tas) &&
2917       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2918     lck = (kmp_user_lock_p)user_lock;
2919   }
2920 #if KMP_USE_FUTEX
2921   else if ((__kmp_user_lock_kind == lk_futex) &&
2922            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2923     lck = (kmp_user_lock_p)user_lock;
2924   }
2925 #endif
2926   else {
2927     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
2928   }
2929 
2930 #if USE_ITT_BUILD
2931   __kmp_itt_lock_acquiring(lck);
2932 #endif /* USE_ITT_BUILD */
2933 #if OMPT_SUPPORT && OMPT_OPTIONAL
2934   // This is the case, if called from omp_init_lock_with_hint:
2935   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2936   if (!codeptr)
2937     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2938   if (ompt_enabled.ompt_callback_mutex_acquire) {
2939     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2940         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2941         (ompt_wait_id_t)lck, codeptr);
2942   }
2943 #endif
2944 
2945   rc = TEST_LOCK(lck, gtid);
2946 #if USE_ITT_BUILD
2947   if (rc) {
2948     __kmp_itt_lock_acquired(lck);
2949   } else {
2950     __kmp_itt_lock_cancelled(lck);
2951   }
2952 #endif /* USE_ITT_BUILD */
2953 #if OMPT_SUPPORT && OMPT_OPTIONAL
2954   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
2955     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2956         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2957   }
2958 #endif
2959 
2960   return (rc ? FTN_TRUE : FTN_FALSE);
2961 
2962 /* Can't use serial interval since not block structured */
2963 
2964 #endif // KMP_USE_DYNAMIC_LOCK
2965 }
2966 
2967 /* try to acquire the lock */
2968 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2969 #if KMP_USE_DYNAMIC_LOCK
2970   int rc;
2971 #if USE_ITT_BUILD
2972   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2973 #endif
2974 #if OMPT_SUPPORT && OMPT_OPTIONAL
2975   // This is the case, if called from omp_init_lock_with_hint:
2976   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2977   if (!codeptr)
2978     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2979   if (ompt_enabled.ompt_callback_mutex_acquire) {
2980     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2981         ompt_mutex_nest_lock, omp_lock_hint_none,
2982         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2983         codeptr);
2984   }
2985 #endif
2986   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
2987 #if USE_ITT_BUILD
2988   if (rc) {
2989     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2990   } else {
2991     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2992   }
2993 #endif
2994 #if OMPT_SUPPORT && OMPT_OPTIONAL
2995   if (ompt_enabled.enabled && rc) {
2996     if (rc == 1) {
2997       if (ompt_enabled.ompt_callback_mutex_acquired) {
2998         // lock_first
2999         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3000             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
3001       }
3002     } else {
3003       if (ompt_enabled.ompt_callback_nest_lock) {
3004         // lock_next
3005         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3006             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
3007       }
3008     }
3009   }
3010 #endif
3011   return rc;
3012 
3013 #else // KMP_USE_DYNAMIC_LOCK
3014 
3015   kmp_user_lock_p lck;
3016   int rc;
3017 
3018   if ((__kmp_user_lock_kind == lk_tas) &&
3019       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3020        OMP_NEST_LOCK_T_SIZE)) {
3021     lck = (kmp_user_lock_p)user_lock;
3022   }
3023 #if KMP_USE_FUTEX
3024   else if ((__kmp_user_lock_kind == lk_futex) &&
3025            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3026             OMP_NEST_LOCK_T_SIZE)) {
3027     lck = (kmp_user_lock_p)user_lock;
3028   }
3029 #endif
3030   else {
3031     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3032   }
3033 
3034 #if USE_ITT_BUILD
3035   __kmp_itt_lock_acquiring(lck);
3036 #endif /* USE_ITT_BUILD */
3037 
3038 #if OMPT_SUPPORT && OMPT_OPTIONAL
3039   // This is the case, if called from omp_init_lock_with_hint:
3040   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3041   if (!codeptr)
3042     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
  }
3049 #endif
3050 
3051   rc = TEST_NESTED_LOCK(lck, gtid);
3052 #if USE_ITT_BUILD
3053   if (rc) {
3054     __kmp_itt_lock_acquired(lck);
3055   } else {
3056     __kmp_itt_lock_cancelled(lck);
3057   }
3058 #endif /* USE_ITT_BUILD */
3059 #if OMPT_SUPPORT && OMPT_OPTIONAL
3060   if (ompt_enabled.enabled && rc) {
3061     if (rc == 1) {
3062       if (ompt_enabled.ompt_callback_mutex_acquired) {
3063         // lock_first
3064         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3065             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
3066       }
3067     } else {
3068       if (ompt_enabled.ompt_callback_nest_lock) {
3069         // lock_next
3070         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3071             ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
3072       }
3073     }
3074   }
3075 #endif
3076   return rc;
3077 
3078 /* Can't use serial interval since not block structured */
3079 
3080 #endif // KMP_USE_DYNAMIC_LOCK
3081 }
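// Illustrative only -- a hedged sketch, not part of the runtime: user code such
// as
//   omp_nest_lock_t l;
//   omp_init_nest_lock(&l);
//   if (omp_test_nest_lock(&l)) { // returns the new nesting count, 0 on failure
//     ...
//     omp_unset_nest_lock(&l);
//   }
// typically reaches this entry point through the user-visible lock wrappers.
// Here rc is expected to be the nesting count on success, which is why the OMPT
// code above distinguishes rc == 1 (lock_first) from rc > 1 (lock_next).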
3082 
3083 // Interface to fast scalable reduce methods routines
3084 
3085 // Keep the selected method in a thread-local structure for cross-function
3086 // usage: it will be used in the __kmpc_end_reduce* functions.
3087 // Another solution would be to re-determine the method one more time in the
3088 // __kmpc_end_reduce* functions (a new prototype would be required then).
3089 // AT: which solution is better?
3090 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3091   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3092 
3093 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3094   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3095 
3096 // description of the packed_reduction_method variable: look at the macros in
3097 // kmp.h
3098 
3099 // used in a critical section reduce block
3100 static __forceinline void
3101 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3102                                           kmp_critical_name *crit) {
3103 
3104   // This lock was visible to a customer and to the threading profile tool as a
3105   // serial overhead span (although it is used for an internal purpose only).
3106   //            Why was it visible in the previous implementation?
3107   //            Should we keep it visible in the new reduce block?
3108   kmp_user_lock_p lck;
3109 
3110 #if KMP_USE_DYNAMIC_LOCK
3111 
3112   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3113   // Check if it is initialized.
3114   if (*lk == 0) {
3115     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3116       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3117                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3118     } else {
3119       __kmp_init_indirect_csptr(crit, loc, global_tid,
3120                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3121     }
3122   }
3123   // Branch for accessing the actual lock object and set operation. This
3124   // branching is inevitable since this lock initialization does not follow the
3125   // normal dispatch path (lock table is not used).
3126   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3127     lck = (kmp_user_lock_p)lk;
3128     KMP_DEBUG_ASSERT(lck != NULL);
3129     if (__kmp_env_consistency_check) {
3130       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3131     }
3132     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3133   } else {
3134     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3135     lck = ilk->lock;
3136     KMP_DEBUG_ASSERT(lck != NULL);
3137     if (__kmp_env_consistency_check) {
3138       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3139     }
3140     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3141   }
3142 
3143 #else // KMP_USE_DYNAMIC_LOCK
3144 
3145   // We know that the fast reduction code is only emitted by Intel compilers
3146   // with 32 byte critical sections. If there isn't enough space, then we
3147   // have to use a pointer.
3148   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3149     lck = (kmp_user_lock_p)crit;
3150   } else {
3151     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3152   }
3153   KMP_DEBUG_ASSERT(lck != NULL);
3154 
3155   if (__kmp_env_consistency_check)
3156     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3157 
3158   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3159 
3160 #endif // KMP_USE_DYNAMIC_LOCK
3161 }
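// Note: this helper and __kmp_end_critical_section_reduce_block() below are
// used as a pair. When the critical_reduce_block method is selected,
// __kmpc_reduce{,_nowait}() acquires the lock here around the user-supplied
// reduction code, and the matching __kmpc_end_reduce{,_nowait}() call releases
// it.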
3162 
3163 // used in a critical section reduce block
3164 static __forceinline void
3165 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3166                                         kmp_critical_name *crit) {
3167 
3168   kmp_user_lock_p lck;
3169 
3170 #if KMP_USE_DYNAMIC_LOCK
3171 
3172   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3173     lck = (kmp_user_lock_p)crit;
3174     if (__kmp_env_consistency_check)
3175       __kmp_pop_sync(global_tid, ct_critical, loc);
3176     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3177   } else {
3178     kmp_indirect_lock_t *ilk =
3179         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3180     if (__kmp_env_consistency_check)
3181       __kmp_pop_sync(global_tid, ct_critical, loc);
3182     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3183   }
3184 
3185 #else // KMP_USE_DYNAMIC_LOCK
3186 
3187   // We know that the fast reduction code is only emitted by Intel compilers
3188   // with 32 byte critical sections. If there isn't enough space, then we have
3189   // to use a pointer.
3190   if (__kmp_base_user_lock_size > 32) {
3191     lck = *((kmp_user_lock_p *)crit);
3192     KMP_ASSERT(lck != NULL);
3193   } else {
3194     lck = (kmp_user_lock_p)crit;
3195   }
3196 
3197   if (__kmp_env_consistency_check)
3198     __kmp_pop_sync(global_tid, ct_critical, loc);
3199 
3200   __kmp_release_user_lock_with_checks(lck, global_tid);
3201 
3202 #endif // KMP_USE_DYNAMIC_LOCK
3203 } // __kmp_end_critical_section_reduce_block
3204 
3205 /* 2.a.i. Reduce Block without a terminating barrier */
3206 /*!
3207 @ingroup SYNCHRONIZATION
3208 @param loc source location information
3209 @param global_tid global thread number
3210 @param num_vars number of items (variables) to be reduced
3211 @param reduce_size size of data in bytes to be reduced
3212 @param reduce_data pointer to data to be reduced
3213 @param reduce_func callback function providing reduction operation on two
3214 operands and returning result of reduction in lhs_data
3215 @param lck pointer to the unique lock data structure
3216 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3217 threads if atomic reduction needed
3218 
3219 The nowait version is used for a reduce clause with the nowait argument.
3220 */
3221 kmp_int32
3222 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3223                      size_t reduce_size, void *reduce_data,
3224                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3225                      kmp_critical_name *lck) {
3226 
3227   KMP_COUNT_BLOCK(REDUCE_nowait);
3228   int retval = 0;
3229   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3230 #if OMP_40_ENABLED
3231   kmp_team_t *team;
3232   kmp_info_t *th;
3233   int teams_swapped = 0, task_state;
3234 #endif
3235   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3236 
3237   // why do we need this initialization here at all?
3238   // A reduction clause cannot be used as a stand-alone directive.
3239 
3240   // do not call __kmp_serial_initialize(), it will be called by
3241   // __kmp_parallel_initialize() if needed
3242   // possible detection of false-positive race by the threadchecker ???
3243   if (!TCR_4(__kmp_init_parallel))
3244     __kmp_parallel_initialize();
3245 
3246 // check correctness of reduce block nesting
3247 #if KMP_USE_DYNAMIC_LOCK
3248   if (__kmp_env_consistency_check)
3249     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3250 #else
3251   if (__kmp_env_consistency_check)
3252     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3253 #endif
3254 
3255 #if OMP_40_ENABLED
3256   th = __kmp_thread_from_gtid(global_tid);
3257   if (th->th.th_teams_microtask) { // AC: check if we are inside the teams
3258     // construct?
3259     team = th->th.th_team;
3260     if (team->t.t_level == th->th.th_teams_level) {
3261       // this is reduction at teams construct
3262       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3263       // Let's swap teams temporarily for the reduction barrier
3264       teams_swapped = 1;
3265       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3266       th->th.th_team = team->t.t_parent;
3267       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3268       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3269       task_state = th->th.th_task_state;
3270       th->th.th_task_state = 0;
3271     }
3272   }
3273 #endif // OMP_40_ENABLED
3274 
3275   // The packed_reduction_method value will be reused by the __kmp_end_reduce*
3276   // function, so the value should be kept in a variable.
3277   // The variable should be either a construct-specific or a thread-specific
3278   // property, not a team-specific property
3279   //     (a thread can reach the next reduce block on the next construct, and
3280   //     the reduce method may differ on the next construct).
3281   // An ident_t "loc" parameter could be used as a construct-specific property
3282   // (but what if loc == 0?)
3283   //     (also, if both construct-specific and team-specific variables were
3284   //     shared, unnecessary extra syncs would be needed).
3285   // A thread-specific variable is better regarding the two issues above (next
3286   // construct and extra syncs).
3287   // A thread-specific "th_local.reduction_method" variable is used currently.
3288   // Each thread executes the 'determine' and 'set' lines (no need to restrict
3289   // this to a single thread, which would require extra syncs).
3290 
3291   packed_reduction_method = __kmp_determine_reduction_method(
3292       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3293   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3294 
3295   if (packed_reduction_method == critical_reduce_block) {
3296 
3297     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3298     retval = 1;
3299 
3300   } else if (packed_reduction_method == empty_reduce_block) {
3301 
3302     // usage: if team size == 1, no synchronization is required (Intel
3303     // platforms only)
3304     retval = 1;
3305 
3306   } else if (packed_reduction_method == atomic_reduce_block) {
3307 
3308     retval = 2;
3309 
3310     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3311     // won't be called by the code gen)
3312     //     (this is not ideal: the checking block is closed by this 'pop'
3313     //      even though the atomic operation has not been executed yet;
3314     //      it will be executed slightly later, literally on the next
3315     //      instruction)
3316     if (__kmp_env_consistency_check)
3317       __kmp_pop_sync(global_tid, ct_reduce, loc);
3318 
3319   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3320                                    tree_reduce_block)) {
3321 
3322 // AT: performance issue: a real barrier here
3323 // AT:     (if the master goes slow, other threads are blocked here waiting for
3324 // the master to come and release them)
3325 // AT:     (this is not what a customer might expect when specifying NOWAIT)
3326 // AT:     (specifying NOWAIT won't improve performance; it will just be
3327 // confusing to a customer)
3328 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3329 // might go faster and be more in line with the sense of NOWAIT
3330 // AT: TODO: run the EPCC tests and compare times
3331 
3332 // this barrier should be invisible to a customer and to the threading profile
3333 // tool (it's neither a terminating barrier nor customer's code, it's
3334 // used for an internal purpose)
3335 #if OMPT_SUPPORT
3336     // JP: can this barrier potentially lead to task scheduling?
3337     // JP: as long as there is a barrier in the implementation, OMPT should and
3338     // will provide the barrier events,
3339     //         so we set up the necessary frame/return addresses.
3340     ompt_frame_t *ompt_frame;
3341     if (ompt_enabled.enabled) {
3342       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3343       if (ompt_frame->enter_frame == NULL)
3344         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3345       OMPT_STORE_RETURN_ADDRESS(global_tid);
3346     }
3347 #endif
3348 #if USE_ITT_NOTIFY
3349     __kmp_threads[global_tid]->th.th_ident = loc;
3350 #endif
3351     retval =
3352         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3353                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3354     retval = (retval != 0) ? (0) : (1);
3355 #if OMPT_SUPPORT && OMPT_OPTIONAL
3356     if (ompt_enabled.enabled) {
3357       ompt_frame->enter_frame = NULL;
3358     }
3359 #endif
3360 
3361     // all other workers except master should do this pop here
3362     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3363     if (__kmp_env_consistency_check) {
3364       if (retval == 0) {
3365         __kmp_pop_sync(global_tid, ct_reduce, loc);
3366       }
3367     }
3368 
3369   } else {
3370 
3371     // should never reach this block
3372     KMP_ASSERT(0); // "unexpected method"
3373   }
3374 #if OMP_40_ENABLED
3375   if (teams_swapped) {
3376     // Restore thread structure
3377     th->th.th_info.ds.ds_tid = 0;
3378     th->th.th_team = team;
3379     th->th.th_team_nproc = team->t.t_nproc;
3380     th->th.th_task_team = team->t.t_task_team[task_state];
3381     th->th.th_task_state = task_state;
3382   }
3383 #endif
3384   KA_TRACE(
3385       10,
3386       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3387        global_tid, packed_reduction_method, retval));
3388 
3389   return retval;
3390 }
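// Illustrative only -- a hedged sketch of how a compiler might lower
// "reduction(+:x) nowait" onto this interface (variable and function names are
// hypothetical, and exact code generation is compiler-dependent):
//
//   kmp_int32 res = __kmpc_reduce_nowait(loc, gtid, 1, sizeof(x), red_list,
//                                        my_reduce_func, &red_crit);
//   if (res == 1) {        // combine non-atomically, then close the block
//     x += x_private;
//     __kmpc_end_reduce_nowait(loc, gtid, &red_crit);
//   } else if (res == 2) { // combine with an atomic update; per the comments
//     x += x_private;      // above, __kmpc_end_reduce_nowait() is not called
//   }                      // for the atomic method
//   // res == 0: a tree reduction already folded this thread's contribution.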
3391 
3392 /*!
3393 @ingroup SYNCHRONIZATION
3394 @param loc source location information
3395 @param global_tid global thread id.
3396 @param lck pointer to the unique lock data structure
3397 
3398 Finish the execution of a reduce nowait.
3399 */
3400 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3401                               kmp_critical_name *lck) {
3402 
3403   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3404 
3405   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3406 
3407   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3408 
3409   if (packed_reduction_method == critical_reduce_block) {
3410 
3411     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3412 
3413   } else if (packed_reduction_method == empty_reduce_block) {
3414 
3415     // usage: if team size == 1, no synchronization is required (on Intel
3416     // platforms only)
3417 
3418   } else if (packed_reduction_method == atomic_reduce_block) {
3419 
3420     // neither master nor other workers should get here
3421     //     (code gen does not generate this call in case 2: atomic reduce block)
3422     // actually it's better to remove this else-if entirely;
3423     // after removal this value will be checked by the 'else' and will assert
3424 
3425   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3426                                    tree_reduce_block)) {
3427 
3428     // only master gets here
3429 
3430   } else {
3431 
3432     // should never reach this block
3433     KMP_ASSERT(0); // "unexpected method"
3434   }
3435 
3436   if (__kmp_env_consistency_check)
3437     __kmp_pop_sync(global_tid, ct_reduce, loc);
3438 
3439   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3440                 global_tid, packed_reduction_method));
3441 
3442   return;
3443 }
3444 
3445 /* 2.a.ii. Reduce Block with a terminating barrier */
3446 
3447 /*!
3448 @ingroup SYNCHRONIZATION
3449 @param loc source location information
3450 @param global_tid global thread number
3451 @param num_vars number of items (variables) to be reduced
3452 @param reduce_size size of data in bytes to be reduced
3453 @param reduce_data pointer to data to be reduced
3454 @param reduce_func callback function providing reduction operation on two
3455 operands and returning result of reduction in lhs_data
3456 @param lck pointer to the unique lock data structure
3457 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3458 threads if atomic reduction needed
3459 
3460 A blocking reduce that includes an implicit barrier.
3461 */
3462 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3463                         size_t reduce_size, void *reduce_data,
3464                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3465                         kmp_critical_name *lck) {
3466   KMP_COUNT_BLOCK(REDUCE_wait);
3467   int retval = 0;
3468   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3469 
3470   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3471 
3472   // why do we need this initialization here at all?
3473   // A reduction clause cannot be a stand-alone directive.
3474 
3475   // do not call __kmp_serial_initialize(), it will be called by
3476   // __kmp_parallel_initialize() if needed
3477   // possible detection of false-positive race by the threadchecker ???
3478   if (!TCR_4(__kmp_init_parallel))
3479     __kmp_parallel_initialize();
3480 
3481 // check correctness of reduce block nesting
3482 #if KMP_USE_DYNAMIC_LOCK
3483   if (__kmp_env_consistency_check)
3484     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3485 #else
3486   if (__kmp_env_consistency_check)
3487     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3488 #endif
3489 
3490   packed_reduction_method = __kmp_determine_reduction_method(
3491       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3492   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3493 
3494   if (packed_reduction_method == critical_reduce_block) {
3495 
3496     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3497     retval = 1;
3498 
3499   } else if (packed_reduction_method == empty_reduce_block) {
3500 
3501     // usage: if team size == 1, no synchronization is required (Intel
3502     // platforms only)
3503     retval = 1;
3504 
3505   } else if (packed_reduction_method == atomic_reduce_block) {
3506 
3507     retval = 2;
3508 
3509   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3510                                    tree_reduce_block)) {
3511 
3512 // case tree_reduce_block:
3513 // this barrier should be visible to a customer and to the threading profile
3514 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3515 #if OMPT_SUPPORT
3516     ompt_frame_t *ompt_frame;
3517     if (ompt_enabled.enabled) {
3518       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3519       if (ompt_frame->enter_frame == NULL)
3520         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3521       OMPT_STORE_RETURN_ADDRESS(global_tid);
3522     }
3523 #endif
3524 #if USE_ITT_NOTIFY
3525     __kmp_threads[global_tid]->th.th_ident =
3526         loc; // needed for correct notification of frames
3527 #endif
3528     retval =
3529         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3530                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3531     retval = (retval != 0) ? (0) : (1);
3532 #if OMPT_SUPPORT && OMPT_OPTIONAL
3533     if (ompt_enabled.enabled) {
3534       ompt_frame->enter_frame = NULL;
3535     }
3536 #endif
3537 
3538     // all other workers except master should do this pop here
3539     // ( none of other workers except master will enter __kmpc_end_reduce() )
3540     if (__kmp_env_consistency_check) {
3541       if (retval == 0) { // 0: all other workers; 1: master
3542         __kmp_pop_sync(global_tid, ct_reduce, loc);
3543       }
3544     }
3545 
3546   } else {
3547 
3548     // should never reach this block
3549     KMP_ASSERT(0); // "unexpected method"
3550   }
3551 
3552   KA_TRACE(10,
3553            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3554             global_tid, packed_reduction_method, retval));
3555 
3556   return retval;
3557 }
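// Illustrative only -- a hedged sketch of the blocking counterpart of the
// __kmpc_reduce_nowait() example above (names are hypothetical, exact code
// generation is compiler-dependent):
//
//   kmp_int32 res = __kmpc_reduce(loc, gtid, 1, sizeof(x), red_list,
//                                 my_reduce_func, &red_crit);
//   if (res == 1) {        // combine non-atomically
//     x += x_private;
//     __kmpc_end_reduce(loc, gtid, &red_crit);
//   } else if (res == 2) { // combine with an atomic update; unlike the nowait
//     x += x_private;      // case, __kmpc_end_reduce() is still expected here
//     __kmpc_end_reduce(loc, gtid, &red_crit); // so its barrier is executed
//   }
//   // res == 0: a tree reduction folded this thread's contribution; the thread
//   // waits at the barrier inside __kmpc_reduce() until the master releases it
//   // from __kmpc_end_reduce().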
3558 
3559 /*!
3560 @ingroup SYNCHRONIZATION
3561 @param loc source location information
3562 @param global_tid global thread id.
3563 @param lck pointer to the unique lock data structure
3564 
3565 Finish the execution of a blocking reduce.
3566 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3567 start function.
3568 */
3569 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3570                        kmp_critical_name *lck) {
3571 
3572   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3573 
3574   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3575 
3576   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3577 
3578   // this barrier should be visible to a customer and to the threading profile
3579   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3580 
3581   if (packed_reduction_method == critical_reduce_block) {
3582 
3583     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3584 
3585 // TODO: implicit barrier: should be exposed
3586 #if OMPT_SUPPORT
3587     ompt_frame_t *ompt_frame;
3588     if (ompt_enabled.enabled) {
3589       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3590       if (ompt_frame->enter_frame == NULL)
3591         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3592       OMPT_STORE_RETURN_ADDRESS(global_tid);
3593     }
3594 #endif
3595 #if USE_ITT_NOTIFY
3596     __kmp_threads[global_tid]->th.th_ident = loc;
3597 #endif
3598     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3599 #if OMPT_SUPPORT && OMPT_OPTIONAL
3600     if (ompt_enabled.enabled) {
3601       ompt_frame->enter_frame = NULL;
3602     }
3603 #endif
3604 
3605   } else if (packed_reduction_method == empty_reduce_block) {
3606 
3607 // usage: if team size==1, no synchronization is required (Intel platforms only)
3608 
3609 // TODO: implicit barrier: should be exposed
3610 #if OMPT_SUPPORT
3611     ompt_frame_t *ompt_frame;
3612     if (ompt_enabled.enabled) {
3613       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3614       if (ompt_frame->enter_frame == NULL)
3615         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3616       OMPT_STORE_RETURN_ADDRESS(global_tid);
3617     }
3618 #endif
3619 #if USE_ITT_NOTIFY
3620     __kmp_threads[global_tid]->th.th_ident = loc;
3621 #endif
3622     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3623 #if OMPT_SUPPORT && OMPT_OPTIONAL
3624     if (ompt_enabled.enabled) {
3625       ompt_frame->enter_frame = NULL;
3626     }
3627 #endif
3628 
3629   } else if (packed_reduction_method == atomic_reduce_block) {
3630 
3631 #if OMPT_SUPPORT
3632     ompt_frame_t *ompt_frame;
3633     if (ompt_enabled.enabled) {
3634       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3635       if (ompt_frame->enter_frame == NULL)
3636         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3637       OMPT_STORE_RETURN_ADDRESS(global_tid);
3638     }
3639 #endif
3640 // TODO: implicit barrier: should be exposed
3641 #if USE_ITT_NOTIFY
3642     __kmp_threads[global_tid]->th.th_ident = loc;
3643 #endif
3644     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3645 #if OMPT_SUPPORT && OMPT_OPTIONAL
3646     if (ompt_enabled.enabled) {
3647       ompt_frame->enter_frame = NULL;
3648     }
3649 #endif
3650 
3651   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3652                                    tree_reduce_block)) {
3653 
3654     // only master executes here (master releases all other workers)
3655     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3656                             global_tid);
3657 
3658   } else {
3659 
3660     // should never reach this block
3661     KMP_ASSERT(0); // "unexpected method"
3662   }
3663 
3664   if (__kmp_env_consistency_check)
3665     __kmp_pop_sync(global_tid, ct_reduce, loc);
3666 
3667   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3668                 global_tid, packed_reduction_method));
3669 
3670   return;
3671 }
3672 
3673 #undef __KMP_GET_REDUCTION_METHOD
3674 #undef __KMP_SET_REDUCTION_METHOD
3675 
3676 /* end of interface to fast scalable reduce routines */
3677 
3678 kmp_uint64 __kmpc_get_taskid() {
3679 
3680   kmp_int32 gtid;
3681   kmp_info_t *thread;
3682 
3683   gtid = __kmp_get_gtid();
3684   if (gtid < 0) {
3685     return 0;
3686   }
3687   thread = __kmp_thread_from_gtid(gtid);
3688   return thread->th.th_current_task->td_task_id;
3689 
3690 } // __kmpc_get_taskid
3691 
3692 kmp_uint64 __kmpc_get_parent_taskid() {
3693 
3694   kmp_int32 gtid;
3695   kmp_info_t *thread;
3696   kmp_taskdata_t *parent_task;
3697 
3698   gtid = __kmp_get_gtid();
3699   if (gtid < 0) {
3700     return 0;
3701   }
3702   thread = __kmp_thread_from_gtid(gtid);
3703   parent_task = thread->th.th_current_task->td_parent;
3704   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3705 
3706 } // __kmpc_get_parent_taskid
3707 
3708 #if OMP_45_ENABLED
3709 /*!
3710 @ingroup WORK_SHARING
3711 @param loc  source location information.
3712 @param gtid  global thread number.
3713 @param num_dims  number of associated doacross loops.
3714 @param dims  info on loops bounds.
3715 
3716 Initialize doacross loop information.
3717 Expect the compiler to send us inclusive bounds,
3718 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3719 */
3720 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3721                           struct kmp_dim *dims) {
3722   int j, idx;
3723   kmp_int64 last, trace_count;
3724   kmp_info_t *th = __kmp_threads[gtid];
3725   kmp_team_t *team = th->th.th_team;
3726   kmp_uint32 *flags;
3727   kmp_disp_t *pr_buf = th->th.th_dispatch;
3728   dispatch_shared_info_t *sh_buf;
3729 
3730   KA_TRACE(
3731       20,
3732       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3733        gtid, num_dims, !team->t.t_serialized));
3734   KMP_DEBUG_ASSERT(dims != NULL);
3735   KMP_DEBUG_ASSERT(num_dims > 0);
3736 
3737   if (team->t.t_serialized) {
3738     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3739     return; // no dependencies if team is serialized
3740   }
3741   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3742   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3743   // the next loop
3744   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3745 
3746   // Save bounds info into allocated private buffer
3747   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3748   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3749       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3750   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3751   pr_buf->th_doacross_info[0] =
3752       (kmp_int64)num_dims; // first element is number of dimensions
3753   // Also save the address of num_done in order to access it later without
3754   // knowing the buffer index
3755   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3756   pr_buf->th_doacross_info[2] = dims[0].lo;
3757   pr_buf->th_doacross_info[3] = dims[0].up;
3758   pr_buf->th_doacross_info[4] = dims[0].st;
3759   last = 5;
3760   for (j = 1; j < num_dims; ++j) {
3761     kmp_int64
3762         range_length; // To keep ranges of all dimensions but the first dims[0]
3763     if (dims[j].st == 1) { // most common case
3764       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3765       range_length = dims[j].up - dims[j].lo + 1;
3766     } else {
3767       if (dims[j].st > 0) {
3768         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3769         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3770       } else { // negative increment
3771         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3772         range_length =
3773             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3774       }
3775     }
3776     pr_buf->th_doacross_info[last++] = range_length;
3777     pr_buf->th_doacross_info[last++] = dims[j].lo;
3778     pr_buf->th_doacross_info[last++] = dims[j].up;
3779     pr_buf->th_doacross_info[last++] = dims[j].st;
3780   }
3781 
3782   // Compute total trip count.
3783   // Start with range of dims[0] which we don't need to keep in the buffer.
3784   if (dims[0].st == 1) { // most common case
3785     trace_count = dims[0].up - dims[0].lo + 1;
3786   } else if (dims[0].st > 0) {
3787     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3788     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3789   } else { // negative increment
3790     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3791     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3792   }
3793   for (j = 1; j < num_dims; ++j) {
3794     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3795   }
3796   KMP_DEBUG_ASSERT(trace_count > 0);
3797 
3798   // Check that the shared buffer is not still occupied by another loop (one
3799   // with index idx - __kmp_dispatch_num_buffers)
3800   if (idx != sh_buf->doacross_buf_idx) {
3801     // Shared buffer is occupied, wait for it to be free
3802     __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3803                        __kmp_eq_4, NULL);
3804   }
3805   // Check if we are the first thread. After the CAS the first thread gets 0,
3806   // others get 1 if initialization is in progress, allocated pointer otherwise.
3807   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3808       (kmp_int64 *)&sh_buf->doacross_flags, NULL, (kmp_int64)1);
3809   if (flags == NULL) {
3810     // we are the first thread, allocate the array of flags
3811     kmp_int64 size =
3812         trace_count / 8 + 8; // in bytes, use single bit per iteration
3813     sh_buf->doacross_flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3814   } else if ((kmp_int64)flags == 1) {
3815     // initialization is still in progress, need to wait
3816     while ((volatile kmp_int64)sh_buf->doacross_flags == 1) {
3817       KMP_YIELD(TRUE);
3818     }
3819   }
3820   KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags >
3821                    1); // check value of pointer
3822   pr_buf->th_doacross_flags =
3823       sh_buf->doacross_flags; // save private copy in order to not
3824   // touch shared buffer on each iteration
3825   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3826 }
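// Illustrative only -- a hedged sketch of how a 1-D doacross loop might be
// lowered onto the init/wait/post/fini entry points (exact code generation is
// compiler-dependent). For
//   #pragma omp for ordered(1)
//   for (i = 2; i < 9; i += 2) {
//     #pragma omp ordered depend(sink: i - 2)
//     ...
//     #pragma omp ordered depend(source)
//   }
// each thread in the team would execute roughly:
//   struct kmp_dim dim = {/*lo*/ 2, /*up*/ 8, /*st*/ 2}; // inclusive bounds
//   __kmpc_doacross_init(loc, gtid, 1, &dim);
//   for (/* each iteration i assigned to this thread */) {
//     long long sink_vec = i - 2;
//     __kmpc_doacross_wait(loc, gtid, &sink_vec); // depend(sink: i - 2)
//     /* ... loop body ... */
//     long long src_vec = i;
//     __kmpc_doacross_post(loc, gtid, &src_vec);  // depend(source)
//   }
//   __kmpc_doacross_fini(loc, gtid);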
3827 
3828 void __kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec) {
3829   kmp_int32 shft, num_dims, i;
3830   kmp_uint32 flag;
3831   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3832   kmp_info_t *th = __kmp_threads[gtid];
3833   kmp_team_t *team = th->th.th_team;
3834   kmp_disp_t *pr_buf;
3835   kmp_int64 lo, up, st;
3836 
3837   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3838   if (team->t.t_serialized) {
3839     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3840     return; // no dependencies if team is serialized
3841   }
3842 
3843   // calculate sequential iteration number and check out-of-bounds condition
3844   pr_buf = th->th.th_dispatch;
3845   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3846   num_dims = pr_buf->th_doacross_info[0];
3847   lo = pr_buf->th_doacross_info[2];
3848   up = pr_buf->th_doacross_info[3];
3849   st = pr_buf->th_doacross_info[4];
3850   if (st == 1) { // most common case
3851     if (vec[0] < lo || vec[0] > up) {
3852       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3853                     "bounds [%lld,%lld]\n",
3854                     gtid, vec[0], lo, up));
3855       return;
3856     }
3857     iter_number = vec[0] - lo;
3858   } else if (st > 0) {
3859     if (vec[0] < lo || vec[0] > up) {
3860       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3861                     "bounds [%lld,%lld]\n",
3862                     gtid, vec[0], lo, up));
3863       return;
3864     }
3865     iter_number = (kmp_uint64)(vec[0] - lo) / st;
3866   } else { // negative increment
3867     if (vec[0] > lo || vec[0] < up) {
3868       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3869                     "bounds [%lld,%lld]\n",
3870                     gtid, vec[0], lo, up));
3871       return;
3872     }
3873     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3874   }
3875   for (i = 1; i < num_dims; ++i) {
3876     kmp_int64 iter, ln;
3877     kmp_int32 j = i * 4;
3878     ln = pr_buf->th_doacross_info[j + 1];
3879     lo = pr_buf->th_doacross_info[j + 2];
3880     up = pr_buf->th_doacross_info[j + 3];
3881     st = pr_buf->th_doacross_info[j + 4];
3882     if (st == 1) {
3883       if (vec[i] < lo || vec[i] > up) {
3884         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3885                       "bounds [%lld,%lld]\n",
3886                       gtid, vec[i], lo, up));
3887         return;
3888       }
3889       iter = vec[i] - lo;
3890     } else if (st > 0) {
3891       if (vec[i] < lo || vec[i] > up) {
3892         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3893                       "bounds [%lld,%lld]\n",
3894                       gtid, vec[i], lo, up));
3895         return;
3896       }
3897       iter = (kmp_uint64)(vec[i] - lo) / st;
3898     } else { // st < 0
3899       if (vec[i] > lo || vec[i] < up) {
3900         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3901                       "bounds [%lld,%lld]\n",
3902                       gtid, vec[i], lo, up));
3903         return;
3904       }
3905       iter = (kmp_uint64)(lo - vec[i]) / (-st);
3906     }
3907     iter_number = iter + ln * iter_number;
3908   }
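  // Worked example (illustrative only): with two dimensions and a kept range
  // length ln == 10 for the second one, the iteration vector (3, 7) linearizes
  // to iter_number == 3 * 10 + 7 == 37, which is tracked as bit 37 % 32 == 5 of
  // flag word 37 / 32 == 1 below.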
3909   shft = iter_number % 32; // use 32-bit granularity
3910   iter_number >>= 5; // divided by 32
3911   flag = 1 << shft;
3912   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
3913     KMP_YIELD(TRUE);
3914   }
3915   KA_TRACE(20,
3916            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
3917             gtid, (iter_number << 5) + shft));
3918 }
3919 
3920 void __kmpc_doacross_post(ident_t *loc, int gtid, long long *vec) {
3921   kmp_int32 shft, num_dims, i;
3922   kmp_uint32 flag;
3923   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3924   kmp_info_t *th = __kmp_threads[gtid];
3925   kmp_team_t *team = th->th.th_team;
3926   kmp_disp_t *pr_buf;
3927   kmp_int64 lo, st;
3928 
3929   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
3930   if (team->t.t_serialized) {
3931     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
3932     return; // no dependencies if team is serialized
3933   }
3934 
3935   // calculate sequential iteration number (same as in "wait" but no
3936   // out-of-bounds checks)
3937   pr_buf = th->th.th_dispatch;
3938   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3939   num_dims = pr_buf->th_doacross_info[0];
3940   lo = pr_buf->th_doacross_info[2];
3941   st = pr_buf->th_doacross_info[4];
3942   if (st == 1) { // most common case
3943     iter_number = vec[0] - lo;
3944   } else if (st > 0) {
3945     iter_number = (kmp_uint64)(vec[0] - lo) / st;
3946   } else { // negative increment
3947     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3948   }
3949   for (i = 1; i < num_dims; ++i) {
3950     kmp_int64 iter, ln;
3951     kmp_int32 j = i * 4;
3952     ln = pr_buf->th_doacross_info[j + 1];
3953     lo = pr_buf->th_doacross_info[j + 2];
3954     st = pr_buf->th_doacross_info[j + 4];
3955     if (st == 1) {
3956       iter = vec[i] - lo;
3957     } else if (st > 0) {
3958       iter = (kmp_uint64)(vec[i] - lo) / st;
3959     } else { // st < 0
3960       iter = (kmp_uint64)(lo - vec[i]) / (-st);
3961     }
3962     iter_number = iter + ln * iter_number;
3963   }
3964   shft = iter_number % 32; // use 32-bit granularity
3965   iter_number >>= 5; // divided by 32
3966   flag = 1 << shft;
3967   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
3968     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
3969   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
3970                 (iter_number << 5) + shft));
3971 }
3972 
3973 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
3974   kmp_int64 num_done;
3975   kmp_info_t *th = __kmp_threads[gtid];
3976   kmp_team_t *team = th->th.th_team;
3977   kmp_disp_t *pr_buf = th->th.th_dispatch;
3978 
3979   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
3980   if (team->t.t_serialized) {
3981     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
3982     return; // nothing to do
3983   }
3984   num_done = KMP_TEST_THEN_INC64((kmp_int64 *)pr_buf->th_doacross_info[1]) + 1;
3985   if (num_done == th->th.th_team_nproc) {
3986     // we are the last thread, need to free shared resources
3987     int idx = pr_buf->th_doacross_buf_idx - 1;
3988     dispatch_shared_info_t *sh_buf =
3989         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3990     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
3991                      (kmp_int64)&sh_buf->doacross_num_done);
3992     KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
3993     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
3994     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
3995     sh_buf->doacross_flags = NULL;
3996     sh_buf->doacross_num_done = 0;
3997     sh_buf->doacross_buf_idx +=
3998         __kmp_dispatch_num_buffers; // free buffer for future re-use
3999   }
4000   // free private resources (need to keep buffer index forever)
4001   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4002   pr_buf->th_doacross_info = NULL;
4003   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4004 }
4005 #endif
4006 
4007 // end of file //
4008