1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
28 // flags will be used in future, e.g. to implement openmp_strict library
29 // restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is omitted,
37  * the library is initialized implicitly on first use of its other functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
57  * Shut down the runtime library. This is also optional, and even if called it
58  * will do nothing unless the `KMP_IGNORE_MPPEND` environment variable is set
59  * to zero.
60  */
61 void __kmpc_end(ident_t *loc) {
62   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
63   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
64   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
65   // returns FALSE and __kmpc_end() will unregister this root (it can cause
66   // library shut down).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 }
74 
75 /*!
76 @ingroup THREAD_STATES
77 @param loc Source location information.
78 @return The global thread index of the active thread.
79 
80 This function can be called in any context.
81 
82 If the runtime has only been entered at the outermost level from a
83 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
84 that which would be returned by omp_get_thread_num() in the outermost
85 active parallel construct. (Or zero if there is no active parallel
86 construct, since the master thread is necessarily thread zero).
87 
88 If multiple non-OpenMP threads all enter an OpenMP construct then this
89 will be a unique thread identifier among all the threads created by
90 the OpenMP runtime (but the value cannot be defined in terms of
91 OpenMP thread ids returned by omp_get_thread_num()).
92 */
93 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
94   kmp_int32 gtid = __kmp_entry_gtid();
95 
96   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
97 
98   return gtid;
99 }
100 
101 /*!
102 @ingroup THREAD_STATES
103 @param loc Source location information.
104 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
105 
106 This function can be called in any context.
107 It returns the total number of threads under the control of the OpenMP runtime.
108 That is not a number that can be determined by any OpenMP standard calls, since
109 the library may be called from more than one non-OpenMP thread, and this
110 reflects the total over all such calls. Similarly, since the runtime maintains
111 underlying threads even when they are not active (because the cost of creating
112 and destroying OS threads is high), this call counts all such threads, even
113 those that are currently idle, waiting for work.
114 */
115 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
116   KC_TRACE(10,
117            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
118 
119   return TCR_4(__kmp_all_nth);
120 }
121 
122 /*!
123 @ingroup THREAD_STATES
124 @param loc Source location information.
125 @return The thread number of the calling thread in the innermost active parallel
126 construct.
127 */
128 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
129   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
130   return __kmp_tid_from_gtid(__kmp_entry_gtid());
131 }
132 
133 /*!
134 @ingroup THREAD_STATES
135 @param loc Source location information.
136 @return The number of threads in the innermost active parallel construct.
137 */
138 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
139   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
140 
141   return __kmp_entry_thread()->th.th_team->t.t_nproc;
142 }
143 
144 /*!
145  * @ingroup DEPRECATED
146  * @param loc location description
147  *
148  * This function need not be called. It always returns TRUE.
149  */
150 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
151 #ifndef KMP_DEBUG
152 
153   return TRUE;
154 
155 #else
156 
157   const char *semi2;
158   const char *semi3;
159   int line_no;
160 
161   if (__kmp_par_range == 0) {
162     return TRUE;
163   }
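  // Note: loc->psource is expected to be a string of semicolon-separated
  // fields, roughly ";file;routine;line_first;line_last;;" (see ident_t in
  // kmp.h); the scans below pick out the file basename, the routine name and
  // the first line number to match them against the __kmp_par_range_* filters.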
164   semi2 = loc->psource;
165   if (semi2 == NULL) {
166     return TRUE;
167   }
168   semi2 = strchr(semi2, ';');
169   if (semi2 == NULL) {
170     return TRUE;
171   }
172   semi2 = strchr(semi2 + 1, ';');
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   if (__kmp_par_range_filename[0]) {
177     const char *name = semi2 - 1;
178     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
179       name--;
180     }
181     if ((*name == '/') || (*name == ';')) {
182       name++;
183     }
184     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
185       return __kmp_par_range < 0;
186     }
187   }
188   semi3 = strchr(semi2 + 1, ';');
189   if (__kmp_par_range_routine[0]) {
190     if ((semi3 != NULL) && (semi3 > semi2) &&
191         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
192       return __kmp_par_range < 0;
193     }
194   }
195   if ((semi3 != NULL) && (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1)) {
196     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
197       return __kmp_par_range > 0;
198     }
199     return __kmp_par_range < 0;
200   }
201   return TRUE;
202 
203 #endif /* KMP_DEBUG */
204 }
205 
206 /*!
207 @ingroup THREAD_STATES
208 @param loc Source location information.
209 @return 1 if this thread is executing inside an active parallel region, zero if
210 not.
211 */
212 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
213   return __kmp_entry_thread()->th.th_root->r.r_active;
214 }
215 
216 /*!
217 @ingroup PARALLEL
218 @param loc source location information
219 @param global_tid global thread number
220 @param num_threads number of threads requested for this parallel construct
221 
222 Set the number of threads to be used by the next fork spawned by this thread.
223 This call is only required if the parallel construct has a `num_threads` clause.
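
As an illustration only (the exact sequence a compiler emits varies by compiler
and version), a region such as
@code
#pragma omp parallel num_threads(4)
  do_work();
@endcode
might be lowered to something like
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_fn);
@endcode
where `loc` and `outlined_fn` stand for the compiler-generated source location
descriptor and outlined parallel region.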
224 */
225 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
226                              kmp_int32 num_threads) {
227   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
228                 global_tid, num_threads));
229 
230   __kmp_push_num_threads(loc, global_tid, num_threads);
231 }
232 
233 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
234   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
235 
236   /* the num_threads are automatically popped */
237 }
238 
239 #if OMP_40_ENABLED
240 
241 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
242                            kmp_int32 proc_bind) {
243   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
244                 proc_bind));
245 
246   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
247 }
248 
249 #endif /* OMP_40_ENABLED */
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
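
For illustration, assuming a compiler-generated outlined routine named
`outlined_fn` (real names and calling sequences are compiler specific), a
parallel region with one shared variable
@code
int x = 0;
#pragma omp parallel shared(x)
  do_work(&x);
@endcode
could be lowered roughly as
@code
void outlined_fn(kmp_int32 *gtid, kmp_int32 *btid, int *x) { do_work(x); }
...
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined_fn, &x);
@endcode
with the address of each shared variable passed through the ellipsis and
`argc` giving the number of such pointers.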
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   // If we were in a serial region, then stop the serial timer, record
266   // the event, and start parallel region timer
267   stats_state_e previous_state = KMP_GET_THREAD_STATE();
268   if (previous_state == stats_state_e::SERIAL_REGION) {
269     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
270   } else {
271     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
272   }
273   int inParallel = __kmpc_in_parallel(loc);
274   if (inParallel) {
275     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
276   } else {
277     KMP_COUNT_BLOCK(OMP_PARALLEL);
278   }
279 #endif
280 
281   // maybe saving just thr_state would be enough here
282   {
283     va_list ap;
284     va_start(ap, microtask);
285 
286 #if OMPT_SUPPORT
287     omp_frame_t *ompt_frame;
288     if (ompt_enabled.enabled) {
289       kmp_info_t *master_th = __kmp_threads[gtid];
290       kmp_team_t *parent_team = master_th->th.th_team;
291       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
292       if (lwt)
293         ompt_frame = &(lwt->ompt_task_info.frame);
294       else {
295         int tid = __kmp_tid_from_gtid(gtid);
296         ompt_frame = &(
297             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
298       }
299       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
300       OMPT_STORE_RETURN_ADDRESS(gtid);
301     }
302 #endif
303 
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_FORKING();
306 #endif
307     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
308                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
309                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
310 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
311 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
312                     &ap
313 #else
314                     ap
315 #endif
316                     );
317 #if INCLUDE_SSC_MARKS
318     SSC_MARK_JOINING();
319 #endif
320     __kmp_join_call(loc, gtid
321 #if OMPT_SUPPORT
322                     ,
323                     fork_context_intel
324 #endif
325                     );
326 
327     va_end(ap);
328   }
329 
330 #if KMP_STATS_ENABLED
331   if (previous_state == stats_state_e::SERIAL_REGION) {
332     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
333   } else {
334     KMP_POP_PARTITIONED_TIMER();
335   }
336 #endif // KMP_STATS_ENABLED
337 }
338 
339 #if OMP_40_ENABLED
340 /*!
341 @ingroup PARALLEL
342 @param loc source location information
343 @param global_tid global thread number
344 @param num_teams number of teams requested for the teams construct
345 @param num_threads number of threads per team requested for the teams construct
346 
347 Set the number of teams to be used by the teams construct.
348 This call is only required if the teams construct has a `num_teams` clause
349 or a `thread_limit` clause (or both).
350 */
351 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
352                            kmp_int32 num_teams, kmp_int32 num_threads) {
353   KA_TRACE(20,
354            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
355             global_tid, num_teams, num_threads));
356 
357   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
358 }
359 
360 /*!
361 @ingroup PARALLEL
362 @param loc  source location information
363 @param argc  total number of arguments in the ellipsis
364 @param microtask  pointer to callback routine consisting of outlined teams
365 construct
366 @param ...  pointers to shared variables that aren't global
367 
368 Do the actual fork and call the microtask in the relevant number of threads.
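
For illustration only (actual generated code is compiler specific), a construct
such as
@code
#pragma omp teams num_teams(2) thread_limit(8)
  do_work();
@endcode
is typically lowered to a __kmpc_push_num_teams() call followed by
@code
__kmpc_push_num_teams(&loc, gtid, 2, 8);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)teams_outlined_fn);
@endcode
where `teams_outlined_fn` stands for the compiler-generated outlined teams
region.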
369 */
370 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
371                        ...) {
372   int gtid = __kmp_entry_gtid();
373   kmp_info_t *this_thr = __kmp_threads[gtid];
374   va_list ap;
375   va_start(ap, microtask);
376 
377   KMP_COUNT_BLOCK(OMP_TEAMS);
378 
379   // remember teams entry point and nesting level
380   this_thr->th.th_teams_microtask = microtask;
381   this_thr->th.th_teams_level =
382       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
383 
384 #if OMPT_SUPPORT
385   kmp_team_t *parent_team = this_thr->th.th_team;
386   int tid = __kmp_tid_from_gtid(gtid);
387   if (ompt_enabled.enabled) {
388     parent_team->t.t_implicit_task_taskdata[tid]
389         .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
390   }
391   OMPT_STORE_RETURN_ADDRESS(gtid);
392 #endif
393 
394   // check if __kmpc_push_num_teams was called; if not, set the default
395   // number of teams
396   if (this_thr->th.th_teams_size.nteams == 0) {
397     __kmp_push_num_teams(loc, gtid, 0, 0);
398   }
399   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
400   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
401   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
402 
403   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
404                   VOLATILE_CAST(microtask_t)
405                       __kmp_teams_master, // "wrapped" task
406                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
407 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
408                   &ap
409 #else
410                   ap
411 #endif
412                   );
413   __kmp_join_call(loc, gtid
414 #if OMPT_SUPPORT
415                   ,
416                   fork_context_intel
417 #endif
418                   );
419 
420   this_thr->th.th_teams_microtask = NULL;
421   this_thr->th.th_teams_level = 0;
422   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
423   va_end(ap);
424 }
425 #endif /* OMP_40_ENABLED */
426 
427 // I don't think this function should ever have been exported.
428 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
429 // openmp code ever called it, but it's been exported from the RTL for so
430 // long that I'm afraid to remove the definition.
431 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
432 
433 /*!
434 @ingroup PARALLEL
435 @param loc  source location information
436 @param global_tid  global thread number
437 
438 Enter a serialized parallel construct. This interface is used to handle a
439 conditional parallel region, like this,
440 @code
441 #pragma omp parallel if (condition)
442 @endcode
443 when the condition is false.
444 */
445 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
446 // The implementation is now in kmp_runtime.cpp so that it can share static
447 // functions with kmp_fork_call since the tasks to be done are similar in
448 // each case.
449 #if OMPT_SUPPORT
450   OMPT_STORE_RETURN_ADDRESS(global_tid);
451 #endif
452   __kmp_serialized_parallel(loc, global_tid);
453 }
454 
455 /*!
456 @ingroup PARALLEL
457 @param loc  source location information
458 @param global_tid  global thread number
459 
460 Leave a serialized parallel construct.
461 */
462 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
463   kmp_internal_control_t *top;
464   kmp_info_t *this_thr;
465   kmp_team_t *serial_team;
466 
467   KC_TRACE(10,
468            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
469 
470   /* skip all this code for autopar serialized loops since it results in
471      unacceptable overhead */
472   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
473     return;
474 
475   // Not autopar code
476   if (!TCR_4(__kmp_init_parallel))
477     __kmp_parallel_initialize();
478 
479   this_thr = __kmp_threads[global_tid];
480   serial_team = this_thr->th.th_serial_team;
481 
482 #if OMP_45_ENABLED
483   kmp_task_team_t *task_team = this_thr->th.th_task_team;
484 
485   // we need to wait for the proxy tasks before finishing the thread
486   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
487     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
488 #endif
489 
490   KMP_MB();
491   KMP_DEBUG_ASSERT(serial_team);
492   KMP_ASSERT(serial_team->t.t_serialized);
493   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
494   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
495   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
496   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
497 
498 #if OMPT_SUPPORT
499   if (ompt_enabled.enabled &&
500       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
501     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL;
502     if (ompt_enabled.ompt_callback_implicit_task) {
503       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
504           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
505           OMPT_CUR_TASK_INFO(this_thr)->thread_num);
506     }
507 
508     // clear the task id only after unlinking the task
509     ompt_data_t *parent_task_data;
510     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
511 
512     if (ompt_enabled.ompt_callback_parallel_end) {
513       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
514           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
515           ompt_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
516     }
517     __ompt_lw_taskteam_unlink(this_thr);
518     this_thr->th.ompt_thread_info.state = omp_state_overhead;
519   }
520 #endif
521 
522   /* If necessary, pop the internal control stack values and replace the team
523    * values */
524   top = serial_team->t.t_control_stack_top;
525   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
526     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
527     serial_team->t.t_control_stack_top = top->next;
528     __kmp_free(top);
529   }
530 
531   // if( serial_team -> t.t_serialized > 1 )
532   serial_team->t.t_level--;
533 
534   /* pop dispatch buffers stack */
535   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
536   {
537     dispatch_private_info_t *disp_buffer =
538         serial_team->t.t_dispatch->th_disp_buffer;
539     serial_team->t.t_dispatch->th_disp_buffer =
540         serial_team->t.t_dispatch->th_disp_buffer->next;
541     __kmp_free(disp_buffer);
542   }
543 
544   --serial_team->t.t_serialized;
545   if (serial_team->t.t_serialized == 0) {
546 
547 /* return to the parallel section */
548 
549 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
550     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
551       __kmp_clear_x87_fpu_status_word();
552       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
553       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
554     }
555 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
556 
557     this_thr->th.th_team = serial_team->t.t_parent;
558     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
559 
560     /* restore values cached in the thread */
561     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
562     this_thr->th.th_team_master =
563         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
564     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
565 
566     /* TODO the below shouldn't need to be adjusted for serialized teams */
567     this_thr->th.th_dispatch =
568         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
569 
570     __kmp_pop_current_task_from_thread(this_thr);
571 
572     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
573     this_thr->th.th_current_task->td_flags.executing = 1;
574 
575     if (__kmp_tasking_mode != tskm_immediate_exec) {
576       // Copy the task team from the new child / old parent team to the thread.
577       this_thr->th.th_task_team =
578           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
579       KA_TRACE(20,
580                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
581                 "team %p\n",
582                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
583     }
584   } else {
585     if (__kmp_tasking_mode != tskm_immediate_exec) {
586       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
587                     "depth of serial team %p to %d\n",
588                     global_tid, serial_team, serial_team->t.t_serialized));
589     }
590   }
591 
592   if (__kmp_env_consistency_check)
593     __kmp_pop_parallel(global_tid, NULL);
594 #if OMPT_SUPPORT
595   if (ompt_enabled.enabled)
596     this_thr->th.ompt_thread_info.state =
597         ((this_thr->th.th_team_serialized) ? omp_state_work_serial
598                                            : omp_state_work_parallel);
599 #endif
600 }
601 
602 /*!
603 @ingroup SYNCHRONIZATION
604 @param loc  source location information.
605 
606 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though,
607 depending on the memory ordering convention obeyed by the compiler,
608 even that may not be necessary.)
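
For example, a compiler typically lowers
@code
#pragma omp flush
@endcode
to a single call
@code
__kmpc_flush(&loc);
@endcode
where `loc` is the compiler-generated source location descriptor.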
609 */
610 void __kmpc_flush(ident_t *loc) {
611   KC_TRACE(10, ("__kmpc_flush: called\n"));
612 
613   /* need explicit __mf() here since the library uses volatile, not fences */
614   KMP_MB(); /* Flush all pending memory write invalidates.  */
615 
616 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
617 #if KMP_MIC
618 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
619 // We shouldn't need it, though, since the ABI rules require that
620 // * If the compiler generates NGO stores it also generates the fence
621 // * If users hand-code NGO stores they should insert the fence
622 // therefore no incomplete unordered stores should be visible.
623 #else
624   // C74404
625   // This is to address non-temporal store instructions (sfence needed).
626   // The clflush instruction is also addressed here (mfence needed).
627   // Probably the non-temporal load movntdqa instruction should also be
628   // addressed.
629   // mfence is an SSE2 instruction. Do not execute it if the CPU lacks SSE2.
630   if (!__kmp_cpuinfo.initialized) {
631     __kmp_query_cpuid(&__kmp_cpuinfo);
632   }
633   if (!__kmp_cpuinfo.sse2) {
634     // CPU cannot execute SSE2 instructions.
635   } else {
636 #if KMP_COMPILER_ICC
637     _mm_mfence();
638 #elif KMP_COMPILER_MSVC
639     MemoryBarrier();
640 #else
641     __sync_synchronize();
642 #endif // KMP_COMPILER_ICC
643   }
644 #endif // KMP_MIC
645 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
646 // Nothing to see here; move along.
647 #elif KMP_ARCH_PPC64
648 // Nothing needed here (we have a real MB above).
649 #if KMP_OS_CNK
650   // The flushing thread needs to yield here; this prevents a
651   // busy-waiting thread from saturating the pipeline. flush is
652   // often used in loops like this:
653   // while (!flag) {
654   //   #pragma omp flush(flag)
655   // }
656   // and adding the yield here is good for at least a 10x speedup
657   // when running >2 threads per core (on the NAS LU benchmark).
658   __kmp_yield(TRUE);
659 #endif
660 #else
661 #error Unknown or unsupported architecture
662 #endif
663 
664 #if OMPT_SUPPORT && OMPT_OPTIONAL
665   if (ompt_enabled.ompt_callback_flush) {
666     ompt_callbacks.ompt_callback(ompt_callback_flush)(
667         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
668   }
669 #endif
670 }
671 
672 /* -------------------------------------------------------------------------- */
673 /*!
674 @ingroup SYNCHRONIZATION
675 @param loc source location information
676 @param global_tid thread id.
677 
678 Execute a barrier.
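
For illustration (a sketch; exact lowering is compiler specific),
@code
#pragma omp barrier
@endcode
becomes a call such as
@code
__kmpc_barrier(&loc, gtid);
@endcode
where `gtid` is the global thread number, usually obtained earlier via
__kmpc_global_thread_num().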
679 */
680 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
681   KMP_COUNT_BLOCK(OMP_BARRIER);
682   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
683 
684   if (!TCR_4(__kmp_init_parallel))
685     __kmp_parallel_initialize();
686 
687   if (__kmp_env_consistency_check) {
688     if (loc == 0) {
689       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
690     }
691 
692     __kmp_check_barrier(global_tid, ct_barrier, loc);
693   }
694 
695 #if OMPT_SUPPORT
696   omp_frame_t *ompt_frame;
697   if (ompt_enabled.enabled) {
698     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
699     if (ompt_frame->enter_frame == NULL)
700       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
701     OMPT_STORE_RETURN_ADDRESS(global_tid);
702   }
703 #endif
704   __kmp_threads[global_tid]->th.th_ident = loc;
705   // TODO: explicit barrier_wait_id:
706   //   this function is called when 'barrier' directive is present or
707   //   implicit barrier at the end of a worksharing construct.
708   // 1) better to add a per-thread barrier counter to a thread data structure
709   // 2) set to 0 when a new team is created
710   // 3) no sync is required
711 
712   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
713 #if OMPT_SUPPORT && OMPT_OPTIONAL
714   if (ompt_enabled.enabled) {
715     ompt_frame->enter_frame = NULL;
716   }
717 #endif
718 }
719 
720 /* The BARRIER for a MASTER section is always explicit   */
721 /*!
722 @ingroup WORK_SHARING
723 @param loc  source location information.
724 @param global_tid  global thread number.
725 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
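
For illustration (a sketch; `loc` and `gtid` are the compiler-supplied location
and global thread number), a construct such as
@code
#pragma omp master
  { body(); }
@endcode
is lowered roughly as
@code
if (__kmpc_master(&loc, gtid)) {
  body();
  __kmpc_end_master(&loc, gtid);
}
@endcode
so only the thread for which this call returns 1 executes the block and then
closes the region with __kmpc_end_master().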
726 */
727 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
728   int status = 0;
729 
730   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
731 
732   if (!TCR_4(__kmp_init_parallel))
733     __kmp_parallel_initialize();
734 
735   if (KMP_MASTER_GTID(global_tid)) {
736     KMP_COUNT_BLOCK(OMP_MASTER);
737     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
738     status = 1;
739   }
740 
741 #if OMPT_SUPPORT && OMPT_OPTIONAL
742   if (status) {
743     if (ompt_enabled.ompt_callback_master) {
744       kmp_info_t *this_thr = __kmp_threads[global_tid];
745       kmp_team_t *team = this_thr->th.th_team;
746 
747       int tid = __kmp_tid_from_gtid(global_tid);
748       ompt_callbacks.ompt_callback(ompt_callback_master)(
749           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
750           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
751           OMPT_GET_RETURN_ADDRESS(0));
752     }
753   }
754 #endif
755 
756   if (__kmp_env_consistency_check) {
757 #if KMP_USE_DYNAMIC_LOCK
758     if (status)
759       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
760     else
761       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
762 #else
763     if (status)
764       __kmp_push_sync(global_tid, ct_master, loc, NULL);
765     else
766       __kmp_check_sync(global_tid, ct_master, loc, NULL);
767 #endif
768   }
769 
770   return status;
771 }
772 
773 /*!
774 @ingroup WORK_SHARING
775 @param loc  source location information.
776 @param global_tid  global thread number.
777 
778 Mark the end of a <tt>master</tt> region. This should only be called by the
779 thread that executes the <tt>master</tt> region.
780 */
781 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
782   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
783 
784   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
785   KMP_POP_PARTITIONED_TIMER();
786 
787 #if OMPT_SUPPORT && OMPT_OPTIONAL
788   kmp_info_t *this_thr = __kmp_threads[global_tid];
789   kmp_team_t *team = this_thr->th.th_team;
790   if (ompt_enabled.ompt_callback_master) {
791     int tid = __kmp_tid_from_gtid(global_tid);
792     ompt_callbacks.ompt_callback(ompt_callback_master)(
793         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
794         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
795         OMPT_GET_RETURN_ADDRESS(0));
796   }
797 #endif
798 
799   if (__kmp_env_consistency_check) {
800     if (global_tid < 0)
801       KMP_WARNING(ThreadIdentInvalid);
802 
803     if (KMP_MASTER_GTID(global_tid))
804       __kmp_pop_sync(global_tid, ct_master, loc);
805   }
806 }
807 
808 /*!
809 @ingroup WORK_SHARING
810 @param loc  source location information.
811 @param gtid  global thread number.
812 
813 Start execution of an <tt>ordered</tt> construct.
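
For illustration (a sketch; the lowering of the enclosing ordered worksharing
loop is omitted), the body of
@code
#pragma omp ordered
  { body(i); }
@endcode
is bracketed as
@code
__kmpc_ordered(&loc, gtid);
body(i);
__kmpc_end_ordered(&loc, gtid);
@endcode
so iterations execute the block in the loop's sequential order.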
814 */
815 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
816   int cid = 0;
817   kmp_info_t *th;
818   KMP_DEBUG_ASSERT(__kmp_init_serial);
819 
820   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
821 
822   if (!TCR_4(__kmp_init_parallel))
823     __kmp_parallel_initialize();
824 
825 #if USE_ITT_BUILD
826   __kmp_itt_ordered_prep(gtid);
827 // TODO: ordered_wait_id
828 #endif /* USE_ITT_BUILD */
829 
830   th = __kmp_threads[gtid];
831 
832 #if OMPT_SUPPORT && OMPT_OPTIONAL
833   kmp_team_t *team;
834   omp_wait_id_t lck;
835   void *codeptr_ra;
836   if (ompt_enabled.enabled) {
837     OMPT_STORE_RETURN_ADDRESS(gtid);
838     team = __kmp_team_from_gtid(gtid);
839     lck = (omp_wait_id_t)&team->t.t_ordered.dt.t_value;
840     /* OMPT state update */
841     th->th.ompt_thread_info.wait_id = lck;
842     th->th.ompt_thread_info.state = omp_state_wait_ordered;
843 
844     /* OMPT event callback */
845     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
846     if (ompt_enabled.ompt_callback_mutex_acquire) {
847       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
848           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
849           (omp_wait_id_t)lck, codeptr_ra);
850     }
851   }
852 #endif
853 
854   if (th->th.th_dispatch->th_deo_fcn != 0)
855     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
856   else
857     __kmp_parallel_deo(&gtid, &cid, loc);
858 
859 #if OMPT_SUPPORT && OMPT_OPTIONAL
860   if (ompt_enabled.enabled) {
861     /* OMPT state update */
862     th->th.ompt_thread_info.state = omp_state_work_parallel;
863     th->th.ompt_thread_info.wait_id = 0;
864 
865     /* OMPT event callback */
866     if (ompt_enabled.ompt_callback_mutex_acquired) {
867       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
868           ompt_mutex_ordered, (omp_wait_id_t)lck, codeptr_ra);
869     }
870   }
871 #endif
872 
873 #if USE_ITT_BUILD
874   __kmp_itt_ordered_start(gtid);
875 #endif /* USE_ITT_BUILD */
876 }
877 
878 /*!
879 @ingroup WORK_SHARING
880 @param loc  source location information.
881 @param gtid  global thread number.
882 
883 End execution of an <tt>ordered</tt> construct.
884 */
885 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
886   int cid = 0;
887   kmp_info_t *th;
888 
889   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
890 
891 #if USE_ITT_BUILD
892   __kmp_itt_ordered_end(gtid);
893 // TODO: ordered_wait_id
894 #endif /* USE_ITT_BUILD */
895 
896   th = __kmp_threads[gtid];
897 
898   if (th->th.th_dispatch->th_dxo_fcn != 0)
899     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
900   else
901     __kmp_parallel_dxo(&gtid, &cid, loc);
902 
903 #if OMPT_SUPPORT && OMPT_OPTIONAL
904   OMPT_STORE_RETURN_ADDRESS(gtid);
905   if (ompt_enabled.ompt_callback_mutex_released) {
906     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
907         ompt_mutex_ordered,
908         (omp_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
909         OMPT_LOAD_RETURN_ADDRESS(gtid));
910   }
911 #endif
912 }
913 
914 #if KMP_USE_DYNAMIC_LOCK
915 
916 static __forceinline void
917 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
918                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
919   // Pointer to the allocated indirect lock is written to crit, while indexing
920   // is ignored.
921   void *idx;
922   kmp_indirect_lock_t **lck;
923   lck = (kmp_indirect_lock_t **)crit;
924   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
925   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
926   KMP_SET_I_LOCK_LOCATION(ilk, loc);
927   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
928   KA_TRACE(20,
929            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
930 #if USE_ITT_BUILD
931   __kmp_itt_critical_creating(ilk->lock, loc);
932 #endif
933   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
934   if (status == 0) {
935 #if USE_ITT_BUILD
936     __kmp_itt_critical_destroyed(ilk->lock);
937 #endif
938     // We don't really need to destroy the unclaimed lock here since it will be
939     // cleaned up at program exit.
940     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
941   }
942   KMP_DEBUG_ASSERT(*lck != NULL);
943 }
944 
945 // Fast-path acquire tas lock
946 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
947   {                                                                            \
948     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
949     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
950     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
951     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
952         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
953       kmp_uint32 spins;                                                        \
954       KMP_FSYNC_PREPARE(l);                                                    \
955       KMP_INIT_YIELD(spins);                                                   \
956       if (TCR_4(__kmp_nth) >                                                   \
957           (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
958         KMP_YIELD(TRUE);                                                       \
959       } else {                                                                 \
960         KMP_YIELD_SPIN(spins);                                                 \
961       }                                                                        \
962       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
963       while (                                                                  \
964           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
965           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {  \
966         __kmp_spin_backoff(&backoff);                                          \
967         if (TCR_4(__kmp_nth) >                                                 \
968             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
969           KMP_YIELD(TRUE);                                                     \
970         } else {                                                               \
971           KMP_YIELD_SPIN(spins);                                               \
972         }                                                                      \
973       }                                                                        \
974     }                                                                          \
975     KMP_FSYNC_ACQUIRED(l);                                                     \
976   }
977 
978 // Fast-path test tas lock
979 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
980   {                                                                            \
981     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
982     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
983     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
984     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
985          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
986   }
987 
988 // Fast-path release tas lock
989 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
990   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
991 
992 #if KMP_USE_FUTEX
993 
994 #include <sys/syscall.h>
995 #include <unistd.h>
996 #ifndef FUTEX_WAIT
997 #define FUTEX_WAIT 0
998 #endif
999 #ifndef FUTEX_WAKE
1000 #define FUTEX_WAKE 1
1001 #endif
1002 
1003 // Fast-path acquire futex lock
1004 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1005   {                                                                            \
1006     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1007     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1008     KMP_MB();                                                                  \
1009     KMP_FSYNC_PREPARE(ftx);                                                    \
1010     kmp_int32 poll_val;                                                        \
1011     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1012                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1013                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1014       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1015       if (!cond) {                                                             \
1016         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1017                                          poll_val |                            \
1018                                              KMP_LOCK_BUSY(1, futex))) {       \
1019           continue;                                                            \
1020         }                                                                      \
1021         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1022       }                                                                        \
1023       kmp_int32 rc;                                                            \
1024       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1025                         NULL, NULL, 0)) != 0) {                                \
1026         continue;                                                              \
1027       }                                                                        \
1028       gtid_code |= 1;                                                          \
1029     }                                                                          \
1030     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1031   }
1032 
1033 // Fast-path test futex lock
1034 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1035   {                                                                            \
1036     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1037     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1038                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1039       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1040       rc = TRUE;                                                               \
1041     } else {                                                                   \
1042       rc = FALSE;                                                              \
1043     }                                                                          \
1044   }
1045 
1046 // Fast-path release futex lock
1047 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1048   {                                                                            \
1049     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1050     KMP_MB();                                                                  \
1051     KMP_FSYNC_RELEASING(ftx);                                                  \
1052     kmp_int32 poll_val =                                                       \
1053         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1054     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1055       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1056               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1057     }                                                                          \
1058     KMP_MB();                                                                  \
1059     KMP_YIELD(TCR_4(__kmp_nth) >                                               \
1060               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
1061   }
1062 
1063 #endif // KMP_USE_FUTEX
1064 
1065 #else // KMP_USE_DYNAMIC_LOCK
1066 
1067 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1068                                                       ident_t const *loc,
1069                                                       kmp_int32 gtid) {
1070   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1071 
1072   // Because of the double-check, the following load doesn't need to be volatile
1073   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1074 
1075   if (lck == NULL) {
1076     void *idx;
1077 
1078     // Allocate & initialize the lock.
1079     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1080     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1081     __kmp_init_user_lock_with_checks(lck);
1082     __kmp_set_user_lock_location(lck, loc);
1083 #if USE_ITT_BUILD
1084     __kmp_itt_critical_creating(lck);
1085 // __kmp_itt_critical_creating() should be called *before* the first usage
1086 // of the underlying lock. It is the only place where we can guarantee it.
1087 // There is a chance the lock will be destroyed without ever being used, but
1088 // that is not a problem: this is not a real event seen by the user, just the
1089 // setting of a name for the object (lock). See more details in kmp_itt.h.
1090 #endif /* USE_ITT_BUILD */
1091 
1092     // Use a cmpxchg instruction to slam the start of the critical section with
1093     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1094     // and use the lock that the other thread allocated.
1095     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1096 
1097     if (status == 0) {
1098 // Deallocate the lock and reload the value.
1099 #if USE_ITT_BUILD
1100       __kmp_itt_critical_destroyed(lck);
1101 // Let ITT know the lock is destroyed and the same memory location may be reused
1102 // for another purpose.
1103 #endif /* USE_ITT_BUILD */
1104       __kmp_destroy_user_lock_with_checks(lck);
1105       __kmp_user_lock_free(&idx, gtid, lck);
1106       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1107       KMP_DEBUG_ASSERT(lck != NULL);
1108     }
1109   }
1110   return lck;
1111 }
1112 
1113 #endif // KMP_USE_DYNAMIC_LOCK
1114 
1115 /*!
1116 @ingroup WORK_SHARING
1117 @param loc  source location information.
1118 @param global_tid  global thread number.
1119 @param crit identity of the critical section. This could be a pointer to a lock
1120 associated with the critical section, or some other suitably unique value.
1121 
1122 Enter code protected by a `critical` construct.
1123 This function blocks until the executing thread can enter the critical section.
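
For illustration (a sketch; the actual variable name is compiler generated), a
construct such as
@code
#pragma omp critical (name)
  { body(); }
@endcode
is lowered using a zero-initialized, statically allocated kmp_critical_name
that is shared by every use of the same critical name:
@code
static kmp_critical_name crit_name_var; // initially all zeros
...
__kmpc_critical(&loc, gtid, &crit_name_var);
body();
__kmpc_end_critical(&loc, gtid, &crit_name_var);
@endcode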
1124 */
1125 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1126                      kmp_critical_name *crit) {
1127 #if KMP_USE_DYNAMIC_LOCK
1128 #if OMPT_SUPPORT && OMPT_OPTIONAL
1129   OMPT_STORE_RETURN_ADDRESS(global_tid);
1130 #endif // OMPT_SUPPORT
1131   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1132 #else
1133   KMP_COUNT_BLOCK(OMP_CRITICAL);
1134 #if OMPT_SUPPORT && OMPT_OPTIONAL
1135   omp_state_t prev_state = omp_state_undefined;
1136   ompt_thread_info_t ti;
1137 #endif
1138   kmp_user_lock_p lck;
1139 
1140   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1141 
1142   // TODO: add THR_OVHD_STATE
1143 
1144   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1145   KMP_CHECK_USER_LOCK_INIT();
1146 
1147   if ((__kmp_user_lock_kind == lk_tas) &&
1148       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1149     lck = (kmp_user_lock_p)crit;
1150   }
1151 #if KMP_USE_FUTEX
1152   else if ((__kmp_user_lock_kind == lk_futex) &&
1153            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1154     lck = (kmp_user_lock_p)crit;
1155   }
1156 #endif
1157   else { // ticket, queuing or drdpa
1158     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1159   }
1160 
1161   if (__kmp_env_consistency_check)
1162     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1163 
1164 // Since the critical directive binds to all threads, not just the current
1165 // team, we have to check this even if we are in a serialized team.
1166 // Also, even if we are the uber thread, we still have to acquire the lock,
1167 // as we have to contend with sibling threads.
1168 
1169 #if USE_ITT_BUILD
1170   __kmp_itt_critical_acquiring(lck);
1171 #endif /* USE_ITT_BUILD */
1172 #if OMPT_SUPPORT && OMPT_OPTIONAL
1173   OMPT_STORE_RETURN_ADDRESS(global_tid);
1174   void *codeptr_ra = NULL;
1175   if (ompt_enabled.enabled) {
1176     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1177     /* OMPT state update */
1178     prev_state = ti.state;
1179     ti.wait_id = (omp_wait_id_t)lck;
1180     ti.state = omp_state_wait_critical;
1181 
1182     /* OMPT event callback */
1183     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1184     if (ompt_enabled.ompt_callback_mutex_acquire) {
1185       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1186           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1187           (omp_wait_id_t)crit, codeptr_ra);
1188     }
1189   }
1190 #endif
1191   // Value of 'crit' should be good for using as a critical_id of the critical
1192   // section directive.
1193   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1194 
1195 #if USE_ITT_BUILD
1196   __kmp_itt_critical_acquired(lck);
1197 #endif /* USE_ITT_BUILD */
1198 #if OMPT_SUPPORT && OMPT_OPTIONAL
1199   if (ompt_enabled.enabled) {
1200     /* OMPT state update */
1201     ti.state = prev_state;
1202     ti.wait_id = 0;
1203 
1204     /* OMPT event callback */
1205     if (ompt_enabled.ompt_callback_mutex_acquired) {
1206       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1207           ompt_mutex_critical, (omp_wait_id_t)crit, codeptr_ra);
1208     }
1209   }
1210 #endif
1211   KMP_POP_PARTITIONED_TIMER();
1212 
1213   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1214   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1215 #endif // KMP_USE_DYNAMIC_LOCK
1216 }
1217 
1218 #if KMP_USE_DYNAMIC_LOCK
1219 
1220 // Converts the given hint to an internal lock implementation
1221 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1222 #if KMP_USE_TSX
1223 #define KMP_TSX_LOCK(seq) lockseq_##seq
1224 #else
1225 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1226 #endif
1227 
1228 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1229 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1230 #else
1231 #define KMP_CPUINFO_RTM 0
1232 #endif
1233 
1234   // Hints that do not require further logic
1235   if (hint & kmp_lock_hint_hle)
1236     return KMP_TSX_LOCK(hle);
1237   if (hint & kmp_lock_hint_rtm)
1238     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1239   if (hint & kmp_lock_hint_adaptive)
1240     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1241 
1242   // Rule out conflicting hints first by returning the default lock
1243   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1244     return __kmp_user_lock_seq;
1245   if ((hint & omp_lock_hint_speculative) &&
1246       (hint & omp_lock_hint_nonspeculative))
1247     return __kmp_user_lock_seq;
1248 
1249   // Do not even consider speculation when it appears to be contended
1250   if (hint & omp_lock_hint_contended)
1251     return lockseq_queuing;
1252 
1253   // Uncontended lock without speculation
1254   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1255     return lockseq_tas;
1256 
1257   // HLE lock for speculation
1258   if (hint & omp_lock_hint_speculative)
1259     return KMP_TSX_LOCK(hle);
1260 
1261   return __kmp_user_lock_seq;
1262 }
1263 
1264 #if OMPT_SUPPORT && OMPT_OPTIONAL
1265 #if KMP_USE_DYNAMIC_LOCK
1266 static kmp_mutex_impl_t
1267 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1268   if (user_lock) {
1269     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1270     case 0:
1271       break;
1272 #if KMP_USE_FUTEX
1273     case locktag_futex:
1274       return kmp_mutex_impl_queuing;
1275 #endif
1276     case locktag_tas:
1277       return kmp_mutex_impl_spin;
1278 #if KMP_USE_TSX
1279     case locktag_hle:
1280       return kmp_mutex_impl_speculative;
1281 #endif
1282     default:
1283       return ompt_mutex_impl_unknown;
1284     }
1285     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1286   }
1287   KMP_ASSERT(ilock);
1288   switch (ilock->type) {
1289 #if KMP_USE_TSX
1290   case locktag_adaptive:
1291   case locktag_rtm:
1292     return kmp_mutex_impl_speculative;
1293 #endif
1294   case locktag_nested_tas:
1295     return kmp_mutex_impl_spin;
1296 #if KMP_USE_FUTEX
1297   case locktag_nested_futex:
1298 #endif
1299   case locktag_ticket:
1300   case locktag_queuing:
1301   case locktag_drdpa:
1302   case locktag_nested_ticket:
1303   case locktag_nested_queuing:
1304   case locktag_nested_drdpa:
1305     return kmp_mutex_impl_queuing;
1306   default:
1307     return ompt_mutex_impl_unknown;
1308   }
1309 }
1310 #else
1311 // For locks without dynamic binding
1312 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1313   switch (__kmp_user_lock_kind) {
1314   case lk_tas:
1315     return kmp_mutex_impl_spin;
1316 #if KMP_USE_FUTEX
1317   case lk_futex:
1318 #endif
1319   case lk_ticket:
1320   case lk_queuing:
1321   case lk_drdpa:
1322     return kmp_mutex_impl_queuing;
1323 #if KMP_USE_TSX
1324   case lk_hle:
1325   case lk_rtm:
1326   case lk_adaptive:
1327     return kmp_mutex_impl_speculative;
1328 #endif
1329   default:
1330     return ompt_mutex_impl_unknown;
1331   }
1332 }
1333 #endif // KMP_USE_DYNAMIC_LOCK
1334 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1335 
1336 /*!
1337 @ingroup WORK_SHARING
1338 @param loc  source location information.
1339 @param global_tid  global thread number.
1340 @param crit identity of the critical section. This could be a pointer to a lock
1341 associated with the critical section, or some other suitably unique value.
1342 @param hint the lock hint.
1343 
1344 Enter code protected by a `critical` construct with a hint. The hint value is
1345 used to suggest a lock implementation. This function blocks until the executing
1346 thread can enter the critical section unless the hint suggests use of
1347 speculative execution and the hardware supports it.
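
For illustration (a sketch; the hint constants come from omp.h and the
generated names are compiler specific), a hinted critical such as
@code
#pragma omp critical (name) hint(omp_lock_hint_speculative)
  { body(); }
@endcode
is lowered like an ordinary critical but enters through this entry point:
@code
__kmpc_critical_with_hint(&loc, gtid, &crit_name_var,
                          omp_lock_hint_speculative);
body();
__kmpc_end_critical(&loc, gtid, &crit_name_var);
@endcode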
1348 */
1349 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1350                                kmp_critical_name *crit, uintptr_t hint) {
1351   KMP_COUNT_BLOCK(OMP_CRITICAL);
1352   kmp_user_lock_p lck;
1353 #if OMPT_SUPPORT && OMPT_OPTIONAL
1354   omp_state_t prev_state = omp_state_undefined;
1355   ompt_thread_info_t ti;
1356   // This is the case if called from __kmpc_critical:
1357   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1358   if (!codeptr)
1359     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1360 #endif
1361 
1362   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1363 
1364   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1365   // Check if it is initialized.
1366   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1367   if (*lk == 0) {
1368     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1369     if (KMP_IS_D_LOCK(lckseq)) {
1370       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1371                                   KMP_GET_D_TAG(lckseq));
1372     } else {
1373       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1374     }
1375   }
1376   // Branch for accessing the actual lock object and set operation. This
1377   // branching is inevitable since this lock initialization does not follow the
1378   // normal dispatch path (lock table is not used).
1379   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1380     lck = (kmp_user_lock_p)lk;
1381     if (__kmp_env_consistency_check) {
1382       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1383                       __kmp_map_hint_to_lock(hint));
1384     }
1385 #if USE_ITT_BUILD
1386     __kmp_itt_critical_acquiring(lck);
1387 #endif
1388 #if OMPT_SUPPORT && OMPT_OPTIONAL
1389     if (ompt_enabled.enabled) {
1390       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1391       /* OMPT state update */
1392       prev_state = ti.state;
1393       ti.wait_id = (omp_wait_id_t)lck;
1394       ti.state = omp_state_wait_critical;
1395 
1396       /* OMPT event callback */
1397       if (ompt_enabled.ompt_callback_mutex_acquire) {
1398         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1399             ompt_mutex_critical, (unsigned int)hint,
1400             __ompt_get_mutex_impl_type(crit), (omp_wait_id_t)crit, codeptr);
1401       }
1402     }
1403 #endif
1404 #if KMP_USE_INLINED_TAS
1405     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1406       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1407     } else
1408 #elif KMP_USE_INLINED_FUTEX
1409     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1410       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1411     } else
1412 #endif
1413     {
1414       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1415     }
1416   } else {
1417     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1418     lck = ilk->lock;
1419     if (__kmp_env_consistency_check) {
1420       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1421                       __kmp_map_hint_to_lock(hint));
1422     }
1423 #if USE_ITT_BUILD
1424     __kmp_itt_critical_acquiring(lck);
1425 #endif
1426 #if OMPT_SUPPORT && OMPT_OPTIONAL
1427     if (ompt_enabled.enabled) {
1428       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1429       /* OMPT state update */
1430       prev_state = ti.state;
1431       ti.wait_id = (omp_wait_id_t)lck;
1432       ti.state = omp_state_wait_critical;
1433 
1434       /* OMPT event callback */
1435       if (ompt_enabled.ompt_callback_mutex_acquire) {
1436         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1437             ompt_mutex_critical, (unsigned int)hint,
1438             __ompt_get_mutex_impl_type(0, ilk), (omp_wait_id_t)crit, codeptr);
1439       }
1440     }
1441 #endif
1442     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1443   }
1444   KMP_POP_PARTITIONED_TIMER();
1445 
1446 #if USE_ITT_BUILD
1447   __kmp_itt_critical_acquired(lck);
1448 #endif /* USE_ITT_BUILD */
1449 #if OMPT_SUPPORT && OMPT_OPTIONAL
1450   if (ompt_enabled.enabled) {
1451     /* OMPT state update */
1452     ti.state = prev_state;
1453     ti.wait_id = 0;
1454 
1455     /* OMPT event callback */
1456     if (ompt_enabled.ompt_callback_mutex_acquired) {
1457       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1458           ompt_mutex_critical, (omp_wait_id_t)crit, codeptr);
1459     }
1460   }
1461 #endif
1462 
1463   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1464   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1465 } // __kmpc_critical_with_hint
1466 
1467 #endif // KMP_USE_DYNAMIC_LOCK
1468 
1469 /*!
1470 @ingroup WORK_SHARING
1471 @param loc  source location information.
@param global_tid  global thread number.
1473 @param crit identity of the critical section. This could be a pointer to a lock
1474 associated with the critical section, or some other suitably unique value.
1475 
1476 Leave a critical section, releasing any lock that was held during its execution.
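
For illustration only (an assumed lowering, with <tt>loc</tt> and <tt>gtid</tt>
being the usual source location and global thread id, and <tt>crit_name</tt> a
hypothetical statically allocated name), a compiler might pair the enter and
leave calls as:
@code
static kmp_critical_name crit_name = {0};
__kmpc_critical(&loc, gtid, &crit_name);
/* ... body of the critical section ... */
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode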
1477 */
1478 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1479                          kmp_critical_name *crit) {
1480   kmp_user_lock_p lck;
1481 
1482   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1483 
1484 #if KMP_USE_DYNAMIC_LOCK
1485   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1486     lck = (kmp_user_lock_p)crit;
1487     KMP_ASSERT(lck != NULL);
1488     if (__kmp_env_consistency_check) {
1489       __kmp_pop_sync(global_tid, ct_critical, loc);
1490     }
1491 #if USE_ITT_BUILD
1492     __kmp_itt_critical_releasing(lck);
1493 #endif
1494 #if KMP_USE_INLINED_TAS
1495     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1496       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1497     } else
1498 #elif KMP_USE_INLINED_FUTEX
1499     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1500       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1501     } else
1502 #endif
1503     {
1504       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1505     }
1506   } else {
1507     kmp_indirect_lock_t *ilk =
1508         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1509     KMP_ASSERT(ilk != NULL);
1510     lck = ilk->lock;
1511     if (__kmp_env_consistency_check) {
1512       __kmp_pop_sync(global_tid, ct_critical, loc);
1513     }
1514 #if USE_ITT_BUILD
1515     __kmp_itt_critical_releasing(lck);
1516 #endif
1517     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1518   }
1519 
1520 #else // KMP_USE_DYNAMIC_LOCK
1521 
1522   if ((__kmp_user_lock_kind == lk_tas) &&
1523       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1524     lck = (kmp_user_lock_p)crit;
1525   }
1526 #if KMP_USE_FUTEX
1527   else if ((__kmp_user_lock_kind == lk_futex) &&
1528            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1529     lck = (kmp_user_lock_p)crit;
1530   }
1531 #endif
1532   else { // ticket, queuing or drdpa
1533     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1534   }
1535 
1536   KMP_ASSERT(lck != NULL);
1537 
1538   if (__kmp_env_consistency_check)
1539     __kmp_pop_sync(global_tid, ct_critical, loc);
1540 
1541 #if USE_ITT_BUILD
1542   __kmp_itt_critical_releasing(lck);
1543 #endif /* USE_ITT_BUILD */
1544   // Value of 'crit' should be good for using as a critical_id of the critical
1545   // section directive.
1546   __kmp_release_user_lock_with_checks(lck, global_tid);
1547 
1548 #endif // KMP_USE_DYNAMIC_LOCK
1549 
1550 #if OMPT_SUPPORT && OMPT_OPTIONAL
1551   /* OMPT release event triggers after lock is released; place here to trigger
1552    * for all #if branches */
1553   OMPT_STORE_RETURN_ADDRESS(global_tid);
1554   if (ompt_enabled.ompt_callback_mutex_released) {
1555     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1556         ompt_mutex_critical, (omp_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
1557   }
1558 #endif
1559 
1560   KMP_POP_PARTITIONED_TIMER();
1561   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1562 }
1563 
1564 /*!
1565 @ingroup SYNCHRONIZATION
1566 @param loc source location information
1567 @param global_tid thread id.
1568 @return one if the thread should execute the master block, zero otherwise
1569 
1570 Start execution of a combined barrier and master. The barrier is executed inside
1571 this function.
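
For illustration only (assumed usage, with <tt>loc</tt> and <tt>gtid</tt> as
usual), the combined construct pairs this call with
@ref __kmpc_end_barrier_master:
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  /* ... master block ... */
  __kmpc_end_barrier_master(&loc, gtid); /* releases the waiting threads */
}
@endcode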
1572 */
1573 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1574   int status;
1575 
1576   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1577 
1578   if (!TCR_4(__kmp_init_parallel))
1579     __kmp_parallel_initialize();
1580 
1581   if (__kmp_env_consistency_check)
1582     __kmp_check_barrier(global_tid, ct_barrier, loc);
1583 
1584 #if OMPT_SUPPORT
1585   omp_frame_t *ompt_frame;
1586   if (ompt_enabled.enabled) {
1587     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1588     if (ompt_frame->enter_frame == NULL)
1589       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1590     OMPT_STORE_RETURN_ADDRESS(global_tid);
1591   }
1592 #endif
1593 #if USE_ITT_NOTIFY
1594   __kmp_threads[global_tid]->th.th_ident = loc;
1595 #endif
1596   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1597 #if OMPT_SUPPORT && OMPT_OPTIONAL
1598   if (ompt_enabled.enabled) {
1599     ompt_frame->enter_frame = NULL;
1600   }
1601 #endif
1602 
1603   return (status != 0) ? 0 : 1;
1604 }
1605 
1606 /*!
1607 @ingroup SYNCHRONIZATION
1608 @param loc source location information
1609 @param global_tid thread id.
1610 
1611 Complete the execution of a combined barrier and master. This function should
1612 only be called at the completion of the <tt>master</tt> code. Other threads will
1613 still be waiting at the barrier and this call releases them.
1614 */
1615 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1616   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1617 
1618   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1619 }
1620 
1621 /*!
1622 @ingroup SYNCHRONIZATION
1623 @param loc source location information
1624 @param global_tid thread id.
1625 @return one if the thread should execute the master block, zero otherwise
1626 
1627 Start execution of a combined barrier and master(nowait) construct.
1628 The barrier is executed inside this function.
There is no equivalent "end" function, since the other threads do not wait for
the master block to complete; they continue past the barrier immediately.
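
For illustration only (assumed usage):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  /* ... master block; no "end" call is needed, the other threads have
     already continued past the barrier ... */
}
@endcode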
1630 */
1631 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1632   kmp_int32 ret;
1633 
1634   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1635 
1636   if (!TCR_4(__kmp_init_parallel))
1637     __kmp_parallel_initialize();
1638 
1639   if (__kmp_env_consistency_check) {
1640     if (loc == 0) {
1641       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1642     }
1643     __kmp_check_barrier(global_tid, ct_barrier, loc);
1644   }
1645 
1646 #if OMPT_SUPPORT
1647   omp_frame_t *ompt_frame;
1648   if (ompt_enabled.enabled) {
1649     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1650     if (ompt_frame->enter_frame == NULL)
1651       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1652     OMPT_STORE_RETURN_ADDRESS(global_tid);
1653   }
1654 #endif
1655 #if USE_ITT_NOTIFY
1656   __kmp_threads[global_tid]->th.th_ident = loc;
1657 #endif
1658   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1659 #if OMPT_SUPPORT && OMPT_OPTIONAL
1660   if (ompt_enabled.enabled) {
1661     ompt_frame->enter_frame = NULL;
1662   }
1663 #endif
1664 
1665   ret = __kmpc_master(loc, global_tid);
1666 
1667   if (__kmp_env_consistency_check) {
1668     /*  there's no __kmpc_end_master called; so the (stats) */
1669     /*  actions of __kmpc_end_master are done here          */
1670 
1671     if (global_tid < 0) {
1672       KMP_WARNING(ThreadIdentInvalid);
1673     }
1674     if (ret) {
1675       /* only one thread should do the pop since only */
1676       /* one did the push (see __kmpc_master())       */
1677 
1678       __kmp_pop_sync(global_tid, ct_master, loc);
1679     }
1680   }
1681 
1682   return (ret);
1683 }
1684 
1685 /* The BARRIER for a SINGLE process section is always explicit   */
1686 /*!
1687 @ingroup WORK_SHARING
1688 @param loc  source location information
1689 @param global_tid  global thread number
1690 @return One if this thread should execute the single construct, zero otherwise.
1691 
1692 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
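
For illustration only (an assumed lowering; emitting the trailing barrier is
the compiler's responsibility unless a <tt>nowait</tt> clause is present):
@code
if (__kmpc_single(&loc, gtid)) {
  /* ... body of the single region ... */
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); /* omitted when nowait is specified */
@endcode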
1695 */
1696 
1697 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1698   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1699 
1700   if (rc) {
1701     // We are going to execute the single statement, so we should count it.
1702     KMP_COUNT_BLOCK(OMP_SINGLE);
1703     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1704   }
1705 
1706 #if OMPT_SUPPORT && OMPT_OPTIONAL
1707   kmp_info_t *this_thr = __kmp_threads[global_tid];
1708   kmp_team_t *team = this_thr->th.th_team;
1709   int tid = __kmp_tid_from_gtid(global_tid);
1710 
1711   if (ompt_enabled.enabled) {
1712     if (rc) {
1713       if (ompt_enabled.ompt_callback_work) {
1714         ompt_callbacks.ompt_callback(ompt_callback_work)(
1715             ompt_work_single_executor, ompt_scope_begin,
1716             &(team->t.ompt_team_info.parallel_data),
1717             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1718             1, OMPT_GET_RETURN_ADDRESS(0));
1719       }
1720     } else {
1721       if (ompt_enabled.ompt_callback_work) {
1722         ompt_callbacks.ompt_callback(ompt_callback_work)(
1723             ompt_work_single_other, ompt_scope_begin,
1724             &(team->t.ompt_team_info.parallel_data),
1725             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1726             1, OMPT_GET_RETURN_ADDRESS(0));
1727         ompt_callbacks.ompt_callback(ompt_callback_work)(
1728             ompt_work_single_other, ompt_scope_end,
1729             &(team->t.ompt_team_info.parallel_data),
1730             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1731             1, OMPT_GET_RETURN_ADDRESS(0));
1732       }
1733     }
1734   }
1735 #endif
1736 
1737   return rc;
1738 }
1739 
1740 /*!
1741 @ingroup WORK_SHARING
1742 @param loc  source location information
1743 @param global_tid  global thread number
1744 
1745 Mark the end of a <tt>single</tt> construct.  This function should
1746 only be called by the thread that executed the block of code protected
by the <tt>single</tt> construct.
1748 */
1749 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1750   __kmp_exit_single(global_tid);
1751   KMP_POP_PARTITIONED_TIMER();
1752 
1753 #if OMPT_SUPPORT && OMPT_OPTIONAL
1754   kmp_info_t *this_thr = __kmp_threads[global_tid];
1755   kmp_team_t *team = this_thr->th.th_team;
1756   int tid = __kmp_tid_from_gtid(global_tid);
1757 
1758   if (ompt_enabled.ompt_callback_work) {
1759     ompt_callbacks.ompt_callback(ompt_callback_work)(
1760         ompt_work_single_executor, ompt_scope_end,
1761         &(team->t.ompt_team_info.parallel_data),
1762         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1763         OMPT_GET_RETURN_ADDRESS(0));
1764   }
1765 #endif
1766 }
1767 
1768 /*!
1769 @ingroup WORK_SHARING
1770 @param loc Source location
1771 @param global_tid Global thread id
1772 
1773 Mark the end of a statically scheduled loop.
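
For illustration only (an assumed lowering; the init call shown lives in the
static scheduling code, not in this file, and <tt>N</tt> is a hypothetical
trip count):
@code
kmp_int32 last, lb = 0, ub = N - 1, st = 1;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lb, &ub, &st, 1, 1);
for (kmp_int32 i = lb; i <= ub; i += st) {
  /* ... loop body ... */
}
__kmpc_for_static_fini(&loc, gtid);
@endcode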
1774 */
1775 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1776   KMP_POP_PARTITIONED_TIMER();
1777   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1778 
1779 #if OMPT_SUPPORT && OMPT_OPTIONAL
1780   if (ompt_enabled.ompt_callback_work) {
1781     ompt_work_type_t ompt_work_type = ompt_work_loop;
1782     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1783     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1784     // Determine workshare type
1785     if (loc != NULL) {
1786       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1787         ompt_work_type = ompt_work_loop;
1788       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1789         ompt_work_type = ompt_work_sections;
1790       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1791         ompt_work_type = ompt_work_distribute;
1792       } else {
1793         // use default set above.
1794         // a warning about this case is provided in __kmpc_for_static_init
1795       }
1796       KMP_DEBUG_ASSERT(ompt_work_type);
1797     }
1798     ompt_callbacks.ompt_callback(ompt_callback_work)(
1799         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1800         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1801   }
1802 #endif
1803   if (__kmp_env_consistency_check)
1804     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1805 }
1806 
// User routines which take C-style arguments (call by value), in contrast to
// the Fortran equivalent routines, which pass their arguments by reference
1809 
1810 void ompc_set_num_threads(int arg) {
1811   // !!!!! TODO: check the per-task binding
1812   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1813 }
1814 
1815 void ompc_set_dynamic(int flag) {
1816   kmp_info_t *thread;
1817 
1818   /* For the thread-private implementation of the internal controls */
1819   thread = __kmp_entry_thread();
1820 
1821   __kmp_save_internal_controls(thread);
1822 
1823   set__dynamic(thread, flag ? TRUE : FALSE);
1824 }
1825 
1826 void ompc_set_nested(int flag) {
1827   kmp_info_t *thread;
1828 
1829   /* For the thread-private internal controls implementation */
1830   thread = __kmp_entry_thread();
1831 
1832   __kmp_save_internal_controls(thread);
1833 
1834   set__nested(thread, flag ? TRUE : FALSE);
1835 }
1836 
1837 void ompc_set_max_active_levels(int max_active_levels) {
1838   /* TO DO */
1839   /* we want per-task implementation of this internal control */
1840 
1841   /* For the per-thread internal controls implementation */
1842   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1843 }
1844 
1845 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1846   // !!!!! TODO: check the per-task binding
1847   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1848 }
1849 
1850 int ompc_get_ancestor_thread_num(int level) {
1851   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1852 }
1853 
1854 int ompc_get_team_size(int level) {
1855   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1856 }
1857 
1858 void kmpc_set_stacksize(int arg) {
1859   // __kmp_aux_set_stacksize initializes the library if needed
1860   __kmp_aux_set_stacksize(arg);
1861 }
1862 
1863 void kmpc_set_stacksize_s(size_t arg) {
1864   // __kmp_aux_set_stacksize initializes the library if needed
1865   __kmp_aux_set_stacksize(arg);
1866 }
1867 
1868 void kmpc_set_blocktime(int arg) {
1869   int gtid, tid;
1870   kmp_info_t *thread;
1871 
1872   gtid = __kmp_entry_gtid();
1873   tid = __kmp_tid_from_gtid(gtid);
1874   thread = __kmp_thread_from_gtid(gtid);
1875 
1876   __kmp_aux_set_blocktime(arg, thread, tid);
1877 }
1878 
1879 void kmpc_set_library(int arg) {
1880   // __kmp_user_set_library initializes the library if needed
1881   __kmp_user_set_library((enum library_type)arg);
1882 }
1883 
1884 void kmpc_set_defaults(char const *str) {
1885   // __kmp_aux_set_defaults initializes the library if needed
1886   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1887 }
1888 
1889 void kmpc_set_disp_num_buffers(int arg) {
1890   // ignore after initialization because some teams have already
1891   // allocated dispatch buffers
1892   if (__kmp_init_serial == 0 && arg > 0)
1893     __kmp_dispatch_num_buffers = arg;
1894 }
1895 
1896 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1897 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1898   return -1;
1899 #else
1900   if (!TCR_4(__kmp_init_middle)) {
1901     __kmp_middle_initialize();
1902   }
1903   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1904 #endif
1905 }
1906 
1907 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1908 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1909   return -1;
1910 #else
1911   if (!TCR_4(__kmp_init_middle)) {
1912     __kmp_middle_initialize();
1913   }
1914   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1915 #endif
1916 }
1917 
1918 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1919 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1920   return -1;
1921 #else
1922   if (!TCR_4(__kmp_init_middle)) {
1923     __kmp_middle_initialize();
1924   }
1925   return __kmp_aux_get_affinity_mask_proc(proc, mask);
1926 #endif
1927 }
1928 
1929 /* -------------------------------------------------------------------------- */
1930 /*!
1931 @ingroup THREADPRIVATE
1932 @param loc       source location information
1933 @param gtid      global thread number
1934 @param cpy_size  size of the cpy_data buffer
1935 @param cpy_data  pointer to data to be copied
1936 @param cpy_func  helper function to call for copying data
1937 @param didit     flag variable: 1=single thread; 0=not single thread
1938 
1939 __kmpc_copyprivate implements the interface for the private data broadcast
1940 needed for the copyprivate clause associated with a single region in an
1941 OpenMP<sup>*</sup> program (both C and Fortran).
1942 All threads participating in the parallel region call this routine.
1943 One of the threads (called the single thread) should have the <tt>didit</tt>
1944 variable set to 1 and all other threads should have that variable set to 0.
1945 All threads pass a pointer to a data buffer (cpy_data) that they have built.
1946 
1947 The OpenMP specification forbids the use of nowait on the single region when a
1948 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
1949 barrier internally to avoid race conditions, so the code generation for the
1950 single region should avoid generating a barrier after the call to @ref
1951 __kmpc_copyprivate.
1952 
1953 The <tt>gtid</tt> parameter is the global thread id for the current thread.
1954 The <tt>loc</tt> parameter is a pointer to source location information.
1955 
Internal implementation: The single thread will first copy its descriptor
address (cpy_data) to a team-private location, then the other threads will each
call the function pointed to by the parameter cpy_func, which carries out the
copy using the cpy_data buffers: the calling thread's buffer is the destination
and the single thread's buffer is the source.
1960 
1961 The cpy_func routine used for the copy and the contents of the data area defined
1962 by cpy_data and cpy_size may be built in any fashion that will allow the copy
1963 to be done. For instance, the cpy_data buffer can hold the actual data to be
1964 copied or it may hold a list of pointers to the data. The cpy_func routine must
1965 interpret the cpy_data buffer appropriately.
1966 
1967 The interface to cpy_func is as follows:
1968 @code
1969 void cpy_func( void *destination, void *source )
1970 @endcode
1971 where void *destination is the cpy_data pointer for the thread being copied to
1972 and void *source is the cpy_data pointer for the thread being copied from.
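
For illustration only (assumed usage; <tt>copy_int</tt> and the literal value
are hypothetical), broadcasting a single <tt>int</tt> might look like:
@code
static void copy_int(void *dst, void *src) { *(int *)dst = *(int *)src; }

int priv;                 /* each thread's private copy */
kmp_int32 didit = 0;
if (__kmpc_single(&loc, gtid)) {
  priv = 42;              /* the single thread produces the value */
  didit = 1;
  __kmpc_end_single(&loc, gtid);
}
__kmpc_copyprivate(&loc, gtid, sizeof(int), &priv, copy_int, didit);
/* no extra barrier needs to be generated here: __kmpc_copyprivate barriers
   internally */
@endcode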
1973 */
1974 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
1975                         void *cpy_data, void (*cpy_func)(void *, void *),
1976                         kmp_int32 didit) {
1977   void **data_ptr;
1978 
1979   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
1980 
1981   KMP_MB();
1982 
1983   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
1984 
1985   if (__kmp_env_consistency_check) {
1986     if (loc == 0) {
1987       KMP_WARNING(ConstructIdentInvalid);
1988     }
1989   }
1990 
1991   // ToDo: Optimize the following two barriers into some kind of split barrier
1992 
1993   if (didit)
1994     *data_ptr = cpy_data;
1995 
1996 #if OMPT_SUPPORT
1997   omp_frame_t *ompt_frame;
1998   if (ompt_enabled.enabled) {
1999     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2000     if (ompt_frame->enter_frame == NULL)
2001       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
2002     OMPT_STORE_RETURN_ADDRESS(gtid);
2003   }
2004 #endif
2005 /* This barrier is not a barrier region boundary */
2006 #if USE_ITT_NOTIFY
2007   __kmp_threads[gtid]->th.th_ident = loc;
2008 #endif
2009   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2010 
2011   if (!didit)
2012     (*cpy_func)(cpy_data, *data_ptr);
2013 
2014 // Consider next barrier a user-visible barrier for barrier region boundaries
2015 // Nesting checks are already handled by the single construct checks
2016 
2017 #if OMPT_SUPPORT
2018   if (ompt_enabled.enabled) {
2019     OMPT_STORE_RETURN_ADDRESS(gtid);
2020   }
2021 #endif
2022 #if USE_ITT_NOTIFY
2023   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2024 // tasks can overwrite the location)
2025 #endif
2026   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2027 #if OMPT_SUPPORT && OMPT_OPTIONAL
2028   if (ompt_enabled.enabled) {
2029     ompt_frame->enter_frame = NULL;
2030   }
2031 #endif
2032 }
2033 
2034 /* -------------------------------------------------------------------------- */
2035 
2036 #define INIT_LOCK __kmp_init_user_lock_with_checks
2037 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2038 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2039 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2040 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2041 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2042   __kmp_acquire_nested_user_lock_with_checks_timed
2043 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2044 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2045 #define TEST_LOCK __kmp_test_user_lock_with_checks
2046 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2047 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2048 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2049 
2050 // TODO: Make check abort messages use location info & pass it into
2051 // with_checks routines
2052 
2053 #if KMP_USE_DYNAMIC_LOCK
2054 
2055 // internal lock initializer
2056 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2057                                                     kmp_dyna_lockseq_t seq) {
2058   if (KMP_IS_D_LOCK(seq)) {
2059     KMP_INIT_D_LOCK(lock, seq);
2060 #if USE_ITT_BUILD
2061     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2062 #endif
2063   } else {
2064     KMP_INIT_I_LOCK(lock, seq);
2065 #if USE_ITT_BUILD
2066     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2067     __kmp_itt_lock_creating(ilk->lock, loc);
2068 #endif
2069   }
2070 }
2071 
2072 // internal nest lock initializer
2073 static __forceinline void
2074 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2075                                kmp_dyna_lockseq_t seq) {
2076 #if KMP_USE_TSX
2077   // Don't have nested lock implementation for speculative locks
2078   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2079     seq = __kmp_user_lock_seq;
2080 #endif
2081   switch (seq) {
2082   case lockseq_tas:
2083     seq = lockseq_nested_tas;
2084     break;
2085 #if KMP_USE_FUTEX
2086   case lockseq_futex:
2087     seq = lockseq_nested_futex;
2088     break;
2089 #endif
2090   case lockseq_ticket:
2091     seq = lockseq_nested_ticket;
2092     break;
2093   case lockseq_queuing:
2094     seq = lockseq_nested_queuing;
2095     break;
2096   case lockseq_drdpa:
2097     seq = lockseq_nested_drdpa;
2098     break;
2099   default:
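    // Any other lock sequence has no dedicated nested variant; fall back to
    // the nested queuing lock.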
2100     seq = lockseq_nested_queuing;
2101   }
2102   KMP_INIT_I_LOCK(lock, seq);
2103 #if USE_ITT_BUILD
2104   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2105   __kmp_itt_lock_creating(ilk->lock, loc);
2106 #endif
2107 }
2108 
2109 /* initialize the lock with a hint */
2110 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2111                                 uintptr_t hint) {
2112   KMP_DEBUG_ASSERT(__kmp_init_serial);
2113   if (__kmp_env_consistency_check && user_lock == NULL) {
2114     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2115   }
2116 
2117   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2118 
2119 #if OMPT_SUPPORT && OMPT_OPTIONAL
2120   // This is the case, if called from omp_init_lock_with_hint:
2121   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2122   if (!codeptr)
2123     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2124   if (ompt_enabled.ompt_callback_lock_init) {
2125     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2126         ompt_mutex_lock, (omp_lock_hint_t)hint,
2127         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2128         codeptr);
2129   }
2130 #endif
2131 }
2132 
2133 /* initialize the lock with a hint */
2134 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2135                                      void **user_lock, uintptr_t hint) {
2136   KMP_DEBUG_ASSERT(__kmp_init_serial);
2137   if (__kmp_env_consistency_check && user_lock == NULL) {
2138     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2139   }
2140 
2141   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2142 
2143 #if OMPT_SUPPORT && OMPT_OPTIONAL
2144   // This is the case, if called from omp_init_lock_with_hint:
2145   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2146   if (!codeptr)
2147     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2148   if (ompt_enabled.ompt_callback_lock_init) {
2149     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2150         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2151         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2152         codeptr);
2153   }
2154 #endif
2155 }
2156 
2157 #endif // KMP_USE_DYNAMIC_LOCK
2158 
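// For illustration only: an assumed call sequence (loc/gtid handling and the
// cast are the caller's responsibility) showing how these entry points are
// typically driven for a user lock:
//   omp_lock_t lk;
//   __kmpc_init_lock(&loc, gtid, (void **)&lk);
//   __kmpc_set_lock(&loc, gtid, (void **)&lk);
//   /* ... exclusive work ... */
//   __kmpc_unset_lock(&loc, gtid, (void **)&lk);
//   __kmpc_destroy_lock(&loc, gtid, (void **)&lk);
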
2159 /* initialize the lock */
2160 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2161 #if KMP_USE_DYNAMIC_LOCK
2162 
2163   KMP_DEBUG_ASSERT(__kmp_init_serial);
2164   if (__kmp_env_consistency_check && user_lock == NULL) {
2165     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2166   }
2167   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2168 
2169 #if OMPT_SUPPORT && OMPT_OPTIONAL
2170   // This is the case, if called from omp_init_lock_with_hint:
2171   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2172   if (!codeptr)
2173     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2174   if (ompt_enabled.ompt_callback_lock_init) {
2175     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2176         ompt_mutex_lock, omp_lock_hint_none,
2177         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2178         codeptr);
2179   }
2180 #endif
2181 
2182 #else // KMP_USE_DYNAMIC_LOCK
2183 
2184   static char const *const func = "omp_init_lock";
2185   kmp_user_lock_p lck;
2186   KMP_DEBUG_ASSERT(__kmp_init_serial);
2187 
2188   if (__kmp_env_consistency_check) {
2189     if (user_lock == NULL) {
2190       KMP_FATAL(LockIsUninitialized, func);
2191     }
2192   }
2193 
2194   KMP_CHECK_USER_LOCK_INIT();
2195 
2196   if ((__kmp_user_lock_kind == lk_tas) &&
2197       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2198     lck = (kmp_user_lock_p)user_lock;
2199   }
2200 #if KMP_USE_FUTEX
2201   else if ((__kmp_user_lock_kind == lk_futex) &&
2202            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2203     lck = (kmp_user_lock_p)user_lock;
2204   }
2205 #endif
2206   else {
2207     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2208   }
2209   INIT_LOCK(lck);
2210   __kmp_set_user_lock_location(lck, loc);
2211 
2212 #if OMPT_SUPPORT && OMPT_OPTIONAL
2213   // This is the case, if called from omp_init_lock_with_hint:
2214   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2215   if (!codeptr)
2216     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2217   if (ompt_enabled.ompt_callback_lock_init) {
2218     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2219         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2220         (omp_wait_id_t)user_lock, codeptr);
2221   }
2222 #endif
2223 
2224 #if USE_ITT_BUILD
2225   __kmp_itt_lock_creating(lck);
2226 #endif /* USE_ITT_BUILD */
2227 
2228 #endif // KMP_USE_DYNAMIC_LOCK
2229 } // __kmpc_init_lock
2230 
2231 /* initialize the lock */
2232 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2233 #if KMP_USE_DYNAMIC_LOCK
2234 
2235   KMP_DEBUG_ASSERT(__kmp_init_serial);
2236   if (__kmp_env_consistency_check && user_lock == NULL) {
2237     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2238   }
2239   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2240 
2241 #if OMPT_SUPPORT && OMPT_OPTIONAL
2242   // This is the case, if called from omp_init_lock_with_hint:
2243   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2244   if (!codeptr)
2245     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2246   if (ompt_enabled.ompt_callback_lock_init) {
2247     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2248         ompt_mutex_nest_lock, omp_lock_hint_none,
2249         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2250         codeptr);
2251   }
2252 #endif
2253 
2254 #else // KMP_USE_DYNAMIC_LOCK
2255 
2256   static char const *const func = "omp_init_nest_lock";
2257   kmp_user_lock_p lck;
2258   KMP_DEBUG_ASSERT(__kmp_init_serial);
2259 
2260   if (__kmp_env_consistency_check) {
2261     if (user_lock == NULL) {
2262       KMP_FATAL(LockIsUninitialized, func);
2263     }
2264   }
2265 
2266   KMP_CHECK_USER_LOCK_INIT();
2267 
2268   if ((__kmp_user_lock_kind == lk_tas) &&
2269       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2270        OMP_NEST_LOCK_T_SIZE)) {
2271     lck = (kmp_user_lock_p)user_lock;
2272   }
2273 #if KMP_USE_FUTEX
2274   else if ((__kmp_user_lock_kind == lk_futex) &&
2275            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2276             OMP_NEST_LOCK_T_SIZE)) {
2277     lck = (kmp_user_lock_p)user_lock;
2278   }
2279 #endif
2280   else {
2281     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2282   }
2283 
2284   INIT_NESTED_LOCK(lck);
2285   __kmp_set_user_lock_location(lck, loc);
2286 
2287 #if OMPT_SUPPORT && OMPT_OPTIONAL
2288   // This is the case, if called from omp_init_lock_with_hint:
2289   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2290   if (!codeptr)
2291     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2292   if (ompt_enabled.ompt_callback_lock_init) {
2293     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2294         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2295         (omp_wait_id_t)user_lock, codeptr);
2296   }
2297 #endif
2298 
2299 #if USE_ITT_BUILD
2300   __kmp_itt_lock_creating(lck);
2301 #endif /* USE_ITT_BUILD */
2302 
2303 #endif // KMP_USE_DYNAMIC_LOCK
2304 } // __kmpc_init_nest_lock
2305 
2306 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2307 #if KMP_USE_DYNAMIC_LOCK
2308 
2309 #if USE_ITT_BUILD
2310   kmp_user_lock_p lck;
2311   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2312     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2313   } else {
2314     lck = (kmp_user_lock_p)user_lock;
2315   }
2316   __kmp_itt_lock_destroyed(lck);
2317 #endif
2318 #if OMPT_SUPPORT && OMPT_OPTIONAL
2319   // This is the case, if called from omp_init_lock_with_hint:
2320   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2321   if (!codeptr)
2322     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2323   if (ompt_enabled.ompt_callback_lock_destroy) {
2324     kmp_user_lock_p lck;
2325     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2326       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2327     } else {
2328       lck = (kmp_user_lock_p)user_lock;
2329     }
2330     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2331         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2332   }
2333 #endif
2334   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2335 #else
2336   kmp_user_lock_p lck;
2337 
2338   if ((__kmp_user_lock_kind == lk_tas) &&
2339       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2340     lck = (kmp_user_lock_p)user_lock;
2341   }
2342 #if KMP_USE_FUTEX
2343   else if ((__kmp_user_lock_kind == lk_futex) &&
2344            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2345     lck = (kmp_user_lock_p)user_lock;
2346   }
2347 #endif
2348   else {
2349     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2350   }
2351 
2352 #if OMPT_SUPPORT && OMPT_OPTIONAL
2353   // This is the case, if called from omp_init_lock_with_hint:
2354   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2355   if (!codeptr)
2356     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2357   if (ompt_enabled.ompt_callback_lock_destroy) {
2358     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2359         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2360   }
2361 #endif
2362 
2363 #if USE_ITT_BUILD
2364   __kmp_itt_lock_destroyed(lck);
2365 #endif /* USE_ITT_BUILD */
2366   DESTROY_LOCK(lck);
2367 
2368   if ((__kmp_user_lock_kind == lk_tas) &&
2369       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2370     ;
2371   }
2372 #if KMP_USE_FUTEX
2373   else if ((__kmp_user_lock_kind == lk_futex) &&
2374            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2375     ;
2376   }
2377 #endif
2378   else {
2379     __kmp_user_lock_free(user_lock, gtid, lck);
2380   }
2381 #endif // KMP_USE_DYNAMIC_LOCK
2382 } // __kmpc_destroy_lock
2383 
2384 /* destroy the lock */
2385 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2386 #if KMP_USE_DYNAMIC_LOCK
2387 
2388 #if USE_ITT_BUILD
2389   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2390   __kmp_itt_lock_destroyed(ilk->lock);
2391 #endif
2392 #if OMPT_SUPPORT && OMPT_OPTIONAL
2393   // This is the case, if called from omp_init_lock_with_hint:
2394   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2395   if (!codeptr)
2396     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2397   if (ompt_enabled.ompt_callback_lock_destroy) {
2398     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2399         ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2400   }
2401 #endif
2402   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2403 
2404 #else // KMP_USE_DYNAMIC_LOCK
2405 
2406   kmp_user_lock_p lck;
2407 
2408   if ((__kmp_user_lock_kind == lk_tas) &&
2409       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2410        OMP_NEST_LOCK_T_SIZE)) {
2411     lck = (kmp_user_lock_p)user_lock;
2412   }
2413 #if KMP_USE_FUTEX
2414   else if ((__kmp_user_lock_kind == lk_futex) &&
2415            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2416             OMP_NEST_LOCK_T_SIZE)) {
2417     lck = (kmp_user_lock_p)user_lock;
2418   }
2419 #endif
2420   else {
2421     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2422   }
2423 
2424 #if OMPT_SUPPORT && OMPT_OPTIONAL
2425   // This is the case, if called from omp_init_lock_with_hint:
2426   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2427   if (!codeptr)
2428     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2429   if (ompt_enabled.ompt_callback_lock_destroy) {
2430     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2431         ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2432   }
2433 #endif
2434 
2435 #if USE_ITT_BUILD
2436   __kmp_itt_lock_destroyed(lck);
2437 #endif /* USE_ITT_BUILD */
2438 
2439   DESTROY_NESTED_LOCK(lck);
2440 
2441   if ((__kmp_user_lock_kind == lk_tas) &&
2442       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2443        OMP_NEST_LOCK_T_SIZE)) {
2444     ;
2445   }
2446 #if KMP_USE_FUTEX
2447   else if ((__kmp_user_lock_kind == lk_futex) &&
2448            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2449             OMP_NEST_LOCK_T_SIZE)) {
2450     ;
2451   }
2452 #endif
2453   else {
2454     __kmp_user_lock_free(user_lock, gtid, lck);
2455   }
2456 #endif // KMP_USE_DYNAMIC_LOCK
2457 } // __kmpc_destroy_nest_lock
2458 
2459 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2460   KMP_COUNT_BLOCK(OMP_set_lock);
2461 #if KMP_USE_DYNAMIC_LOCK
2462   int tag = KMP_EXTRACT_D_TAG(user_lock);
2463 #if USE_ITT_BUILD
2464   __kmp_itt_lock_acquiring(
2465       (kmp_user_lock_p)
2466           user_lock); // itt function will get to the right lock object.
2467 #endif
2468 #if OMPT_SUPPORT && OMPT_OPTIONAL
2469   // This is the case, if called from omp_init_lock_with_hint:
2470   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2471   if (!codeptr)
2472     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2473   if (ompt_enabled.ompt_callback_mutex_acquire) {
2474     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2475         ompt_mutex_lock, omp_lock_hint_none,
2476         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2477         codeptr);
2478   }
2479 #endif
2480 #if KMP_USE_INLINED_TAS
2481   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2482     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2483   } else
2484 #elif KMP_USE_INLINED_FUTEX
2485   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2486     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2487   } else
2488 #endif
2489   {
2490     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2491   }
2492 #if USE_ITT_BUILD
2493   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2494 #endif
2495 #if OMPT_SUPPORT && OMPT_OPTIONAL
2496   if (ompt_enabled.ompt_callback_mutex_acquired) {
2497     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2498         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2499   }
2500 #endif
2501 
2502 #else // KMP_USE_DYNAMIC_LOCK
2503 
2504   kmp_user_lock_p lck;
2505 
2506   if ((__kmp_user_lock_kind == lk_tas) &&
2507       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2508     lck = (kmp_user_lock_p)user_lock;
2509   }
2510 #if KMP_USE_FUTEX
2511   else if ((__kmp_user_lock_kind == lk_futex) &&
2512            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2513     lck = (kmp_user_lock_p)user_lock;
2514   }
2515 #endif
2516   else {
2517     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2518   }
2519 
2520 #if USE_ITT_BUILD
2521   __kmp_itt_lock_acquiring(lck);
2522 #endif /* USE_ITT_BUILD */
2523 #if OMPT_SUPPORT && OMPT_OPTIONAL
2524   // This is the case, if called from omp_init_lock_with_hint:
2525   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2526   if (!codeptr)
2527     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2528   if (ompt_enabled.ompt_callback_mutex_acquire) {
2529     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2530         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2531         (omp_wait_id_t)lck, codeptr);
2532   }
2533 #endif
2534 
2535   ACQUIRE_LOCK(lck, gtid);
2536 
2537 #if USE_ITT_BUILD
2538   __kmp_itt_lock_acquired(lck);
2539 #endif /* USE_ITT_BUILD */
2540 
2541 #if OMPT_SUPPORT && OMPT_OPTIONAL
2542   if (ompt_enabled.ompt_callback_mutex_acquired) {
2543     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2544         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2545   }
2546 #endif
2547 
2548 #endif // KMP_USE_DYNAMIC_LOCK
2549 }
2550 
2551 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2552 #if KMP_USE_DYNAMIC_LOCK
2553 
2554 #if USE_ITT_BUILD
2555   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2556 #endif
2557 #if OMPT_SUPPORT && OMPT_OPTIONAL
2558   // This is the case, if called from omp_init_lock_with_hint:
2559   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2560   if (!codeptr)
2561     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2562   if (ompt_enabled.enabled) {
2563     if (ompt_enabled.ompt_callback_mutex_acquire) {
2564       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2565           ompt_mutex_nest_lock, omp_lock_hint_none,
2566           __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2567           codeptr);
2568     }
2569   }
2570 #endif
2571   int acquire_status =
2572       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2573   (void) acquire_status;
2574 #if USE_ITT_BUILD
2575   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2576 #endif
2577 
2578 #if OMPT_SUPPORT && OMPT_OPTIONAL
2579   if (ompt_enabled.enabled) {
2580     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2581       if (ompt_enabled.ompt_callback_mutex_acquired) {
2582         // lock_first
2583         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2584             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2585       }
2586     } else {
2587       if (ompt_enabled.ompt_callback_nest_lock) {
2588         // lock_next
2589         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2590             ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
2591       }
2592     }
2593   }
2594 #endif
2595 
2596 #else // KMP_USE_DYNAMIC_LOCK
2597   int acquire_status;
2598   kmp_user_lock_p lck;
2599 
2600   if ((__kmp_user_lock_kind == lk_tas) &&
2601       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2602        OMP_NEST_LOCK_T_SIZE)) {
2603     lck = (kmp_user_lock_p)user_lock;
2604   }
2605 #if KMP_USE_FUTEX
2606   else if ((__kmp_user_lock_kind == lk_futex) &&
2607            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2608             OMP_NEST_LOCK_T_SIZE)) {
2609     lck = (kmp_user_lock_p)user_lock;
2610   }
2611 #endif
2612   else {
2613     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2614   }
2615 
2616 #if USE_ITT_BUILD
2617   __kmp_itt_lock_acquiring(lck);
2618 #endif /* USE_ITT_BUILD */
2619 #if OMPT_SUPPORT && OMPT_OPTIONAL
2620   // This is the case, if called from omp_init_lock_with_hint:
2621   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2622   if (!codeptr)
2623     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2624   if (ompt_enabled.enabled) {
2625     if (ompt_enabled.ompt_callback_mutex_acquire) {
2626       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2627           ompt_mutex_nest_lock, omp_lock_hint_none,
2628           __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
2629     }
2630   }
2631 #endif
2632 
2633   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2634 
2635 #if USE_ITT_BUILD
2636   __kmp_itt_lock_acquired(lck);
2637 #endif /* USE_ITT_BUILD */
2638 
2639 #if OMPT_SUPPORT && OMPT_OPTIONAL
2640   if (ompt_enabled.enabled) {
2641     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2642       if (ompt_enabled.ompt_callback_mutex_acquired) {
2643         // lock_first
2644         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2645             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2646       }
2647     } else {
2648       if (ompt_enabled.ompt_callback_nest_lock) {
2649         // lock_next
2650         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2651             ompt_scope_begin, (omp_wait_id_t)lck, codeptr);
2652       }
2653     }
2654   }
2655 #endif
2656 
2657 #endif // KMP_USE_DYNAMIC_LOCK
2658 }
2659 
2660 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2661 #if KMP_USE_DYNAMIC_LOCK
2662 
2663   int tag = KMP_EXTRACT_D_TAG(user_lock);
2664 #if USE_ITT_BUILD
2665   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2666 #endif
2667 #if KMP_USE_INLINED_TAS
2668   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2669     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2670   } else
2671 #elif KMP_USE_INLINED_FUTEX
2672   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2673     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2674   } else
2675 #endif
2676   {
2677     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2678   }
2679 
2680 #if OMPT_SUPPORT && OMPT_OPTIONAL
2681   // This is the case, if called from omp_init_lock_with_hint:
2682   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2683   if (!codeptr)
2684     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2685   if (ompt_enabled.ompt_callback_mutex_released) {
2686     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2687         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2688   }
2689 #endif
2690 
2691 #else // KMP_USE_DYNAMIC_LOCK
2692 
2693   kmp_user_lock_p lck;
2694 
2695   /* Can't use serial interval since not block structured */
2696   /* release the lock */
2697 
2698   if ((__kmp_user_lock_kind == lk_tas) &&
2699       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2700 #if KMP_OS_LINUX &&                                                            \
2701     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2702 // "fast" path implemented to fix customer performance issue
2703 #if USE_ITT_BUILD
2704     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2705 #endif /* USE_ITT_BUILD */
2706     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2707     KMP_MB();
2708 
2709 #if OMPT_SUPPORT && OMPT_OPTIONAL
2710     // This is the case, if called from omp_init_lock_with_hint:
2711     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2712     if (!codeptr)
2713       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2714     if (ompt_enabled.ompt_callback_mutex_released) {
2715       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2716           ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2717     }
2718 #endif
2719 
2720     return;
2721 #else
2722     lck = (kmp_user_lock_p)user_lock;
2723 #endif
2724   }
2725 #if KMP_USE_FUTEX
2726   else if ((__kmp_user_lock_kind == lk_futex) &&
2727            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2728     lck = (kmp_user_lock_p)user_lock;
2729   }
2730 #endif
2731   else {
2732     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2733   }
2734 
2735 #if USE_ITT_BUILD
2736   __kmp_itt_lock_releasing(lck);
2737 #endif /* USE_ITT_BUILD */
2738 
2739   RELEASE_LOCK(lck, gtid);
2740 
2741 #if OMPT_SUPPORT && OMPT_OPTIONAL
2742   // This is the case, if called from omp_init_lock_with_hint:
2743   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2744   if (!codeptr)
2745     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2746   if (ompt_enabled.ompt_callback_mutex_released) {
2747     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2748         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2749   }
2750 #endif
2751 
2752 #endif // KMP_USE_DYNAMIC_LOCK
2753 }
2754 
2755 /* release the lock */
2756 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2757 #if KMP_USE_DYNAMIC_LOCK
2758 
2759 #if USE_ITT_BUILD
2760   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2761 #endif
2762   int release_status =
2763       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2764   (void) release_status;
2765 
2766 #if OMPT_SUPPORT && OMPT_OPTIONAL
2767   // This is the case, if called from omp_init_lock_with_hint:
2768   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2769   if (!codeptr)
2770     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2771   if (ompt_enabled.enabled) {
2772     if (release_status == KMP_LOCK_RELEASED) {
2773       if (ompt_enabled.ompt_callback_mutex_released) {
2774         // release_lock_last
2775         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2776             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2777       }
2778     } else if (ompt_enabled.ompt_callback_nest_lock) {
2779       // release_lock_prev
2780       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2781           ompt_scope_end, (omp_wait_id_t)user_lock, codeptr);
2782     }
2783   }
2784 #endif
2785 
2786 #else // KMP_USE_DYNAMIC_LOCK
2787 
2788   kmp_user_lock_p lck;
2789 
2790   /* Can't use serial interval since not block structured */
2791 
2792   if ((__kmp_user_lock_kind == lk_tas) &&
2793       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2794        OMP_NEST_LOCK_T_SIZE)) {
2795 #if KMP_OS_LINUX &&                                                            \
2796     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2797     // "fast" path implemented to fix customer performance issue
2798     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2799 #if USE_ITT_BUILD
2800     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2801 #endif /* USE_ITT_BUILD */
2802 
2803 #if OMPT_SUPPORT && OMPT_OPTIONAL
2804     int release_status = KMP_LOCK_STILL_HELD;
2805 #endif
2806 
2807     if (--(tl->lk.depth_locked) == 0) {
2808       TCW_4(tl->lk.poll, 0);
2809 #if OMPT_SUPPORT && OMPT_OPTIONAL
2810       release_status = KMP_LOCK_RELEASED;
2811 #endif
2812     }
2813     KMP_MB();
2814 
2815 #if OMPT_SUPPORT && OMPT_OPTIONAL
2816     // This is the case, if called from omp_init_lock_with_hint:
2817     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2818     if (!codeptr)
2819       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2820     if (ompt_enabled.enabled) {
2821       if (release_status == KMP_LOCK_RELEASED) {
2822         if (ompt_enabled.ompt_callback_mutex_released) {
2823           // release_lock_last
2824           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2825               ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2826         }
2827       } else if (ompt_enabled.ompt_callback_nest_lock) {
2828         // release_lock_previous
2829         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (omp_wait_id_t)lck, codeptr);
2831       }
2832     }
2833 #endif
2834 
2835     return;
2836 #else
2837     lck = (kmp_user_lock_p)user_lock;
2838 #endif
2839   }
2840 #if KMP_USE_FUTEX
2841   else if ((__kmp_user_lock_kind == lk_futex) &&
2842            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2843             OMP_NEST_LOCK_T_SIZE)) {
2844     lck = (kmp_user_lock_p)user_lock;
2845   }
2846 #endif
2847   else {
2848     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2849   }
2850 
2851 #if USE_ITT_BUILD
2852   __kmp_itt_lock_releasing(lck);
2853 #endif /* USE_ITT_BUILD */
2854 
2855   int release_status;
2856   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2857 #if OMPT_SUPPORT && OMPT_OPTIONAL
2858   // This is the case, if called from omp_init_lock_with_hint:
2859   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2860   if (!codeptr)
2861     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2862   if (ompt_enabled.enabled) {
2863     if (release_status == KMP_LOCK_RELEASED) {
2864       if (ompt_enabled.ompt_callback_mutex_released) {
2865         // release_lock_last
2866         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2867             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2868       }
2869     } else if (ompt_enabled.ompt_callback_nest_lock) {
2870       // release_lock_previous
2871       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (omp_wait_id_t)lck, codeptr);
2873     }
2874   }
2875 #endif
2876 
2877 #endif // KMP_USE_DYNAMIC_LOCK
2878 }
2879 
2880 /* try to acquire the lock */
2881 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2882   KMP_COUNT_BLOCK(OMP_test_lock);
2883 
2884 #if KMP_USE_DYNAMIC_LOCK
2885   int rc;
2886   int tag = KMP_EXTRACT_D_TAG(user_lock);
2887 #if USE_ITT_BUILD
2888   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2889 #endif
2890 #if OMPT_SUPPORT && OMPT_OPTIONAL
2891   // This is the case, if called from omp_init_lock_with_hint:
2892   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2893   if (!codeptr)
2894     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2895   if (ompt_enabled.ompt_callback_mutex_acquire) {
2896     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2897         ompt_mutex_lock, omp_lock_hint_none,
2898         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2899         codeptr);
2900   }
2901 #endif
2902 #if KMP_USE_INLINED_TAS
2903   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2904     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2905   } else
2906 #elif KMP_USE_INLINED_FUTEX
2907   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2908     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2909   } else
2910 #endif
2911   {
2912     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2913   }
2914   if (rc) {
2915 #if USE_ITT_BUILD
2916     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2917 #endif
2918 #if OMPT_SUPPORT && OMPT_OPTIONAL
2919     if (ompt_enabled.ompt_callback_mutex_acquired) {
2920       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2921           ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2922     }
2923 #endif
2924     return FTN_TRUE;
2925   } else {
2926 #if USE_ITT_BUILD
2927     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2928 #endif
2929     return FTN_FALSE;
2930   }
2931 
2932 #else // KMP_USE_DYNAMIC_LOCK
2933 
2934   kmp_user_lock_p lck;
2935   int rc;
2936 
2937   if ((__kmp_user_lock_kind == lk_tas) &&
2938       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2939     lck = (kmp_user_lock_p)user_lock;
2940   }
2941 #if KMP_USE_FUTEX
2942   else if ((__kmp_user_lock_kind == lk_futex) &&
2943            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2944     lck = (kmp_user_lock_p)user_lock;
2945   }
2946 #endif
2947   else {
2948     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
2949   }
2950 
2951 #if USE_ITT_BUILD
2952   __kmp_itt_lock_acquiring(lck);
2953 #endif /* USE_ITT_BUILD */
2954 #if OMPT_SUPPORT && OMPT_OPTIONAL
2955   // This is the case, if called from omp_init_lock_with_hint:
2956   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2957   if (!codeptr)
2958     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2959   if (ompt_enabled.ompt_callback_mutex_acquire) {
2960     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2961         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2962         (omp_wait_id_t)lck, codeptr);
2963   }
2964 #endif
2965 
2966   rc = TEST_LOCK(lck, gtid);
2967 #if USE_ITT_BUILD
2968   if (rc) {
2969     __kmp_itt_lock_acquired(lck);
2970   } else {
2971     __kmp_itt_lock_cancelled(lck);
2972   }
2973 #endif /* USE_ITT_BUILD */
2974 #if OMPT_SUPPORT && OMPT_OPTIONAL
2975   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
2976     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2977         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2978   }
2979 #endif
2980 
2981   return (rc ? FTN_TRUE : FTN_FALSE);
2982 
2983 /* Can't use serial interval since not block structured */
2984 
2985 #endif // KMP_USE_DYNAMIC_LOCK
2986 }
2987 
2988 /* try to acquire the lock */
2989 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2990 #if KMP_USE_DYNAMIC_LOCK
2991   int rc;
2992 #if USE_ITT_BUILD
2993   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2994 #endif
2995 #if OMPT_SUPPORT && OMPT_OPTIONAL
2996   // This is the case, if called from omp_init_lock_with_hint:
2997   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2998   if (!codeptr)
2999     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3000   if (ompt_enabled.ompt_callback_mutex_acquire) {
3001     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3002         ompt_mutex_nest_lock, omp_lock_hint_none,
3003         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
3004         codeptr);
3005   }
3006 #endif
3007   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3008 #if USE_ITT_BUILD
3009   if (rc) {
3010     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3011   } else {
3012     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3013   }
3014 #endif
3015 #if OMPT_SUPPORT && OMPT_OPTIONAL
3016   if (ompt_enabled.enabled && rc) {
3017     if (rc == 1) {
3018       if (ompt_enabled.ompt_callback_mutex_acquired) {
3019         // lock_first
3020         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3021             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
3022       }
3023     } else {
3024       if (ompt_enabled.ompt_callback_nest_lock) {
3025         // lock_next
3026         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3027             ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
3028       }
3029     }
3030   }
3031 #endif
3032   return rc;
3033 
3034 #else // KMP_USE_DYNAMIC_LOCK
3035 
3036   kmp_user_lock_p lck;
3037   int rc;
3038 
3039   if ((__kmp_user_lock_kind == lk_tas) &&
3040       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3041        OMP_NEST_LOCK_T_SIZE)) {
3042     lck = (kmp_user_lock_p)user_lock;
3043   }
3044 #if KMP_USE_FUTEX
3045   else if ((__kmp_user_lock_kind == lk_futex) &&
3046            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3047             OMP_NEST_LOCK_T_SIZE)) {
3048     lck = (kmp_user_lock_p)user_lock;
3049   }
3050 #endif
3051   else {
3052     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3053   }
3054 
3055 #if USE_ITT_BUILD
3056   __kmp_itt_lock_acquiring(lck);
3057 #endif /* USE_ITT_BUILD */
3058 
3059 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // A return address may already have been stored; this is the case if called
  // from omp_init_lock_with_hint:
3061   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3062   if (!codeptr)
3063     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
  }
3070 #endif
3071 
3072   rc = TEST_NESTED_LOCK(lck, gtid);
3073 #if USE_ITT_BUILD
3074   if (rc) {
3075     __kmp_itt_lock_acquired(lck);
3076   } else {
3077     __kmp_itt_lock_cancelled(lck);
3078   }
3079 #endif /* USE_ITT_BUILD */
3080 #if OMPT_SUPPORT && OMPT_OPTIONAL
3081   if (ompt_enabled.enabled && rc) {
3082     if (rc == 1) {
3083       if (ompt_enabled.ompt_callback_mutex_acquired) {
3084         // lock_first
3085         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3086             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
3087       }
3088     } else {
3089       if (ompt_enabled.ompt_callback_nest_lock) {
3090         // lock_next
3091         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (omp_wait_id_t)lck, codeptr);
3093       }
3094     }
3095   }
3096 #endif
3097   return rc;
3098 
3099 /* Can't use serial interval since not block structured */
3100 
3101 #endif // KMP_USE_DYNAMIC_LOCK
3102 }
3103 
3104 // Interface to fast scalable reduce methods routines
3105 
// keep the selected method in a thread-local structure for cross-function
// usage: it will be read in the __kmpc_end_reduce* functions;
// an alternative would be to re-determine the method one more time in the
// __kmpc_end_reduce* functions (a new prototype would be required then)
// AT: which solution is better?
3111 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3112   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3113 
3114 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3115   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3116 
3117 // description of the packed_reduction_method variable: look at the macros in
3118 // kmp.h
3119 
3120 // used in a critical section reduce block
3121 static __forceinline void
3122 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3123                                           kmp_critical_name *crit) {
3124 
3125   // this lock was visible to a customer and to the threading profile tool as a
3126   // serial overhead span (although it's used for an internal purpose only)
3127   //            why was it visible in previous implementation?
3128   //            should we keep it visible in new reduce block?
3129   kmp_user_lock_p lck;
3130 
3131 #if KMP_USE_DYNAMIC_LOCK
3132 
3133   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3134   // Check if it is initialized.
3135   if (*lk == 0) {
3136     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3137       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3138                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3139     } else {
3140       __kmp_init_indirect_csptr(crit, loc, global_tid,
3141                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3142     }
3143   }
3144   // Branch for accessing the actual lock object and set operation. This
3145   // branching is inevitable since this lock initialization does not follow the
3146   // normal dispatch path (lock table is not used).
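  // (A non-zero direct-lock tag means the lock word itself encodes a direct
  // lock; a zero tag means the word holds a pointer to a kmp_indirect_lock_t.)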
3147   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3148     lck = (kmp_user_lock_p)lk;
3149     KMP_DEBUG_ASSERT(lck != NULL);
3150     if (__kmp_env_consistency_check) {
3151       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3152     }
3153     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3154   } else {
3155     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3156     lck = ilk->lock;
3157     KMP_DEBUG_ASSERT(lck != NULL);
3158     if (__kmp_env_consistency_check) {
3159       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3160     }
3161     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3162   }
3163 
3164 #else // KMP_USE_DYNAMIC_LOCK
3165 
3166   // We know that the fast reduction code is only emitted by Intel compilers
3167   // with 32 byte critical sections. If there isn't enough space, then we
3168   // have to use a pointer.
3169   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3170     lck = (kmp_user_lock_p)crit;
3171   } else {
3172     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3173   }
3174   KMP_DEBUG_ASSERT(lck != NULL);
3175 
3176   if (__kmp_env_consistency_check)
3177     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3178 
3179   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3180 
3181 #endif // KMP_USE_DYNAMIC_LOCK
3182 }
3183 
3184 // used in a critical section reduce block
3185 static __forceinline void
3186 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3187                                         kmp_critical_name *crit) {
3188 
3189   kmp_user_lock_p lck;
3190 
3191 #if KMP_USE_DYNAMIC_LOCK
3192 
3193   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3194     lck = (kmp_user_lock_p)crit;
3195     if (__kmp_env_consistency_check)
3196       __kmp_pop_sync(global_tid, ct_critical, loc);
3197     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3198   } else {
3199     kmp_indirect_lock_t *ilk =
3200         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3201     if (__kmp_env_consistency_check)
3202       __kmp_pop_sync(global_tid, ct_critical, loc);
3203     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3204   }
3205 
3206 #else // KMP_USE_DYNAMIC_LOCK
3207 
3208   // We know that the fast reduction code is only emitted by Intel compilers
3209   // with 32 byte critical sections. If there isn't enough space, then we have
3210   // to use a pointer.
3211   if (__kmp_base_user_lock_size > 32) {
3212     lck = *((kmp_user_lock_p *)crit);
3213     KMP_ASSERT(lck != NULL);
3214   } else {
3215     lck = (kmp_user_lock_p)crit;
3216   }
3217 
3218   if (__kmp_env_consistency_check)
3219     __kmp_pop_sync(global_tid, ct_critical, loc);
3220 
3221   __kmp_release_user_lock_with_checks(lck, global_tid);
3222 
3223 #endif // KMP_USE_DYNAMIC_LOCK
3224 } // __kmp_end_critical_section_reduce_block
3225 
3226 #if OMP_40_ENABLED
3227 static __forceinline int
3228 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3229                                      int *task_state) {
3230   kmp_team_t *team;
3231 
  // Check if we are inside a teams construct.
3233   if (th->th.th_teams_microtask) {
3234     *team_p = team = th->th.th_team;
3235     if (team->t.t_level == th->th.th_teams_level) {
3236       // This is reduction at teams construct.
3237       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3238       // Let's swap teams temporarily for the reduction.
3239       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3240       th->th.th_team = team->t.t_parent;
3241       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3242       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3243       *task_state = th->th.th_task_state;
3244       th->th.th_task_state = 0;
3245 
3246       return 1;
3247     }
3248   }
3249   return 0;
3250 }
3251 
3252 static __forceinline void
3253 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3254   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3255   th->th.th_info.ds.ds_tid = 0;
3256   th->th.th_team = team;
3257   th->th.th_team_nproc = team->t.t_nproc;
3258   th->th.th_task_team = team->t.t_task_team[task_state];
3259   th->th.th_task_state = task_state;
3260 }
3261 #endif
3262 
3263 /* 2.a.i. Reduce Block without a terminating barrier */
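
/* Illustrative sketch (not part of the runtime): roughly how a compiler might
   lower "#pragma omp for reduction(+ : sum) nowait" onto these entry points.
   The names priv_sum, sum, reduce_func and crit are hypothetical placeholders;
   exact code generation is compiler-specific.

     switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(int), &priv_sum,
                                  reduce_func, &crit)) {
     case 1: // this thread combines non-atomically (critical/empty/tree master)
       sum += priv_sum;
       __kmpc_end_reduce_nowait(loc, gtid, &crit);
       break;
     case 2: // every thread combines via an atomic update;
             // __kmpc_end_reduce_nowait() is not called in this case
       // #pragma omp atomic
       sum += priv_sum;
       break;
     default: // 0: data was already combined by the tree-reduction barrier
       break;
     }
*/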
3264 /*!
3265 @ingroup SYNCHRONIZATION
3266 @param loc source location information
3267 @param global_tid global thread number
3268 @param num_vars number of items (variables) to be reduced
3269 @param reduce_size size of data in bytes to be reduced
3270 @param reduce_data pointer to data to be reduced
3271 @param reduce_func callback function providing reduction operation on two
3272 operands and returning result of reduction in lhs_data
3273 @param lck pointer to the unique lock data structure
3274 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3275 threads if atomic reduction needed
3276 
3277 The nowait version is used for a reduce clause with the nowait argument.
3278 */
3279 kmp_int32
3280 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3281                      size_t reduce_size, void *reduce_data,
3282                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3283                      kmp_critical_name *lck) {
3284 
3285   KMP_COUNT_BLOCK(REDUCE_nowait);
3286   int retval = 0;
3287   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3288 #if OMP_40_ENABLED
3289   kmp_info_t *th;
3290   kmp_team_t *team;
3291   int teams_swapped = 0, task_state;
3292 #endif
3293   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3294 
3295   // why do we need this initialization here at all?
  // A reduction clause cannot be used as a stand-alone directive.
3297 
3298   // do not call __kmp_serial_initialize(), it will be called by
3299   // __kmp_parallel_initialize() if needed
3300   // possible detection of false-positive race by the threadchecker ???
3301   if (!TCR_4(__kmp_init_parallel))
3302     __kmp_parallel_initialize();
3303 
3304 // check correctness of reduce block nesting
3305 #if KMP_USE_DYNAMIC_LOCK
3306   if (__kmp_env_consistency_check)
3307     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3308 #else
3309   if (__kmp_env_consistency_check)
3310     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3311 #endif
3312 
3313 #if OMP_40_ENABLED
3314   th = __kmp_thread_from_gtid(global_tid);
3315   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3316 #endif // OMP_40_ENABLED
3317 
  // the packed_reduction_method value will be reused by the __kmp_end_reduce*
  // function, so it must be kept in a variable.
  // The variable should be a construct-specific or thread-specific property,
  // not a team-specific one
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ there)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?)
  //     (and if both construct-specific and team-specific variables were
  //     shared, unnecessary extra syncs would be needed)
  // a thread-specific variable is better with regard to both issues above
  // (next construct and extra syncs);
  // the thread-specific "th_local.packed_reduction_method" field is used
  // currently.
  // Each thread executes the 'determine' and 'set' lines (there is no need to
  // restrict this to one thread; doing so would only add unnecessary syncs)
3333 
3334   packed_reduction_method = __kmp_determine_reduction_method(
3335       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3336   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3337 
3338   if (packed_reduction_method == critical_reduce_block) {
3339 
3340     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3341     retval = 1;
3342 
3343   } else if (packed_reduction_method == empty_reduce_block) {
3344 
3345     // usage: if team size == 1, no synchronization is required ( Intel
3346     // platforms only )
3347     retval = 1;
3348 
3349   } else if (packed_reduction_method == atomic_reduce_block) {
3350 
3351     retval = 2;
3352 
    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (this is not ideal: the checking block is closed by this 'pop' even
    //      though the atomic operation has not been executed yet; it executes
    //      slightly later, literally on the next instruction)
3359     if (__kmp_env_consistency_check)
3360       __kmp_pop_sync(global_tid, ct_reduce, loc);
3361 
3362   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3363                                    tree_reduce_block)) {
3364 
// AT: performance issue: a real barrier here
// AT:     (if the master goes slow, other threads are blocked here waiting for
// the master to come and release them)
// AT:     (it's not what a customer might expect when specifying the NOWAIT
// clause)
// AT:     (specifying NOWAIT won't result in improved performance; it will
// just be confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
// might be faster and more in line with the sense of NOWAIT
// AT: TO DO: do an epcc test and compare times
3374 
3375 // this barrier should be invisible to a customer and to the threading profile
3376 // tool (it's neither a terminating barrier nor customer's code, it's
3377 // used for an internal purpose)
3378 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events, so we set up the necessary frame/return
    // addresses.
3383     omp_frame_t *ompt_frame;
3384     if (ompt_enabled.enabled) {
3385       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3386       if (ompt_frame->enter_frame == NULL)
3387         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3388       OMPT_STORE_RETURN_ADDRESS(global_tid);
3389     }
3390 #endif
3391 #if USE_ITT_NOTIFY
3392     __kmp_threads[global_tid]->th.th_ident = loc;
3393 #endif
3394     retval =
3395         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3396                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3397     retval = (retval != 0) ? (0) : (1);
3398 #if OMPT_SUPPORT && OMPT_OPTIONAL
3399     if (ompt_enabled.enabled) {
3400       ompt_frame->enter_frame = NULL;
3401     }
3402 #endif
3403 
    // all other workers except the master should do this pop here
    //     (none of the other workers will get to __kmpc_end_reduce_nowait())
3406     if (__kmp_env_consistency_check) {
3407       if (retval == 0) {
3408         __kmp_pop_sync(global_tid, ct_reduce, loc);
3409       }
3410     }
3411 
3412   } else {
3413 
3414     // should never reach this block
3415     KMP_ASSERT(0); // "unexpected method"
3416   }
3417 #if OMP_40_ENABLED
3418   if (teams_swapped) {
3419     __kmp_restore_swapped_teams(th, team, task_state);
3420   }
3421 #endif
3422   KA_TRACE(
3423       10,
3424       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3425        global_tid, packed_reduction_method, retval));
3426 
3427   return retval;
3428 }
3429 
3430 /*!
3431 @ingroup SYNCHRONIZATION
3432 @param loc source location information
3433 @param global_tid global thread id.
3434 @param lck pointer to the unique lock data structure
3435 
3436 Finish the execution of a reduce nowait.
3437 */
3438 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3439                               kmp_critical_name *lck) {
3440 
3441   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3442 
3443   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3444 
3445   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3446 
3447   if (packed_reduction_method == critical_reduce_block) {
3448 
3449     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3450 
3451   } else if (packed_reduction_method == empty_reduce_block) {
3452 
3453     // usage: if team size == 1, no synchronization is required ( on Intel
3454     // platforms only )
3455 
3456   } else if (packed_reduction_method == atomic_reduce_block) {
3457 
3458     // neither master nor other workers should get here
3459     //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely;
    // after removal this value would be caught by the 'else' branch and assert
3462 
3463   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3464                                    tree_reduce_block)) {
3465 
3466     // only master gets here
3467 
3468   } else {
3469 
3470     // should never reach this block
3471     KMP_ASSERT(0); // "unexpected method"
3472   }
3473 
3474   if (__kmp_env_consistency_check)
3475     __kmp_pop_sync(global_tid, ct_reduce, loc);
3476 
3477   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3478                 global_tid, packed_reduction_method));
3479 
3480   return;
3481 }
3482 
3483 /* 2.a.ii. Reduce Block with a terminating barrier */
3484 
3485 /*!
3486 @ingroup SYNCHRONIZATION
3487 @param loc source location information
3488 @param global_tid global thread number
3489 @param num_vars number of items (variables) to be reduced
3490 @param reduce_size size of data in bytes to be reduced
3491 @param reduce_data pointer to data to be reduced
3492 @param reduce_func callback function providing reduction operation on two
3493 operands and returning result of reduction in lhs_data
3494 @param lck pointer to the unique lock data structure
3495 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3496 threads if atomic reduction needed
3497 
3498 A blocking reduce that includes an implicit barrier.
3499 */
3500 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3501                         size_t reduce_size, void *reduce_data,
3502                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3503                         kmp_critical_name *lck) {
3504   KMP_COUNT_BLOCK(REDUCE_wait);
3505   int retval = 0;
3506   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3507 #if OMP_40_ENABLED
3508   kmp_info_t *th;
3509   kmp_team_t *team;
3510   int teams_swapped = 0, task_state;
3511 #endif
3512 
3513   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3514 
3515   // why do we need this initialization here at all?
  // A reduction clause cannot be used as a stand-alone directive.
3517 
3518   // do not call __kmp_serial_initialize(), it will be called by
3519   // __kmp_parallel_initialize() if needed
3520   // possible detection of false-positive race by the threadchecker ???
3521   if (!TCR_4(__kmp_init_parallel))
3522     __kmp_parallel_initialize();
3523 
3524 // check correctness of reduce block nesting
3525 #if KMP_USE_DYNAMIC_LOCK
3526   if (__kmp_env_consistency_check)
3527     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3528 #else
3529   if (__kmp_env_consistency_check)
3530     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3531 #endif
3532 
3533 #if OMP_40_ENABLED
3534   th = __kmp_thread_from_gtid(global_tid);
3535   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3536 #endif // OMP_40_ENABLED
3537 
3538   packed_reduction_method = __kmp_determine_reduction_method(
3539       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3540   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3541 
3542   if (packed_reduction_method == critical_reduce_block) {
3543 
3544     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3545     retval = 1;
3546 
3547   } else if (packed_reduction_method == empty_reduce_block) {
3548 
3549     // usage: if team size == 1, no synchronization is required ( Intel
3550     // platforms only )
3551     retval = 1;
3552 
3553   } else if (packed_reduction_method == atomic_reduce_block) {
3554 
3555     retval = 2;
3556 
3557   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3558                                    tree_reduce_block)) {
3559 
3560 // case tree_reduce_block:
3561 // this barrier should be visible to a customer and to the threading profile
3562 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3563 #if OMPT_SUPPORT
3564     omp_frame_t *ompt_frame;
3565     if (ompt_enabled.enabled) {
3566       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3567       if (ompt_frame->enter_frame == NULL)
3568         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3569       OMPT_STORE_RETURN_ADDRESS(global_tid);
3570     }
3571 #endif
3572 #if USE_ITT_NOTIFY
3573     __kmp_threads[global_tid]->th.th_ident =
3574         loc; // needed for correct notification of frames
3575 #endif
3576     retval =
3577         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3578                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3579     retval = (retval != 0) ? (0) : (1);
3580 #if OMPT_SUPPORT && OMPT_OPTIONAL
3581     if (ompt_enabled.enabled) {
3582       ompt_frame->enter_frame = NULL;
3583     }
3584 #endif
3585 
    // all other workers except the master should do this pop here
    // (none of the workers except the master will enter __kmpc_end_reduce())
3588     if (__kmp_env_consistency_check) {
3589       if (retval == 0) { // 0: all other workers; 1: master
3590         __kmp_pop_sync(global_tid, ct_reduce, loc);
3591       }
3592     }
3593 
3594   } else {
3595 
3596     // should never reach this block
3597     KMP_ASSERT(0); // "unexpected method"
3598   }
3599 #if OMP_40_ENABLED
3600   if (teams_swapped) {
3601     __kmp_restore_swapped_teams(th, team, task_state);
3602   }
3603 #endif
3604 
3605   KA_TRACE(10,
3606            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3607             global_tid, packed_reduction_method, retval));
3608 
3609   return retval;
3610 }
3611 
3612 /*!
3613 @ingroup SYNCHRONIZATION
3614 @param loc source location information
3615 @param global_tid global thread id.
3616 @param lck pointer to the unique lock data structure
3617 
3618 Finish the execution of a blocking reduce.
3619 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3620 start function.
3621 */
3622 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3623                        kmp_critical_name *lck) {
3624 
3625   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3626 #if OMP_40_ENABLED
3627   kmp_info_t *th;
3628   kmp_team_t *team;
3629   int teams_swapped = 0, task_state;
3630 #endif
3631 
3632   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3633 
3634 #if OMP_40_ENABLED
3635   th = __kmp_thread_from_gtid(global_tid);
3636   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3637 #endif // OMP_40_ENABLED
3638 
3639   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3640 
3641   // this barrier should be visible to a customer and to the threading profile
3642   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3643 
3644   if (packed_reduction_method == critical_reduce_block) {
3645 
3646     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3647 
3648 // TODO: implicit barrier: should be exposed
3649 #if OMPT_SUPPORT
3650     omp_frame_t *ompt_frame;
3651     if (ompt_enabled.enabled) {
3652       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3653       if (ompt_frame->enter_frame == NULL)
3654         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3655       OMPT_STORE_RETURN_ADDRESS(global_tid);
3656     }
3657 #endif
3658 #if USE_ITT_NOTIFY
3659     __kmp_threads[global_tid]->th.th_ident = loc;
3660 #endif
3661     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3662 #if OMPT_SUPPORT && OMPT_OPTIONAL
3663     if (ompt_enabled.enabled) {
3664       ompt_frame->enter_frame = NULL;
3665     }
3666 #endif
3667 
3668   } else if (packed_reduction_method == empty_reduce_block) {
3669 
3670 // usage: if team size==1, no synchronization is required (Intel platforms only)
3671 
3672 // TODO: implicit barrier: should be exposed
3673 #if OMPT_SUPPORT
3674     omp_frame_t *ompt_frame;
3675     if (ompt_enabled.enabled) {
3676       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3677       if (ompt_frame->enter_frame == NULL)
3678         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3679       OMPT_STORE_RETURN_ADDRESS(global_tid);
3680     }
3681 #endif
3682 #if USE_ITT_NOTIFY
3683     __kmp_threads[global_tid]->th.th_ident = loc;
3684 #endif
3685     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3686 #if OMPT_SUPPORT && OMPT_OPTIONAL
3687     if (ompt_enabled.enabled) {
3688       ompt_frame->enter_frame = NULL;
3689     }
3690 #endif
3691 
3692   } else if (packed_reduction_method == atomic_reduce_block) {
3693 
3694 #if OMPT_SUPPORT
3695     omp_frame_t *ompt_frame;
3696     if (ompt_enabled.enabled) {
3697       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3698       if (ompt_frame->enter_frame == NULL)
3699         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3700       OMPT_STORE_RETURN_ADDRESS(global_tid);
3701     }
3702 #endif
3703 // TODO: implicit barrier: should be exposed
3704 #if USE_ITT_NOTIFY
3705     __kmp_threads[global_tid]->th.th_ident = loc;
3706 #endif
3707     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3708 #if OMPT_SUPPORT && OMPT_OPTIONAL
3709     if (ompt_enabled.enabled) {
3710       ompt_frame->enter_frame = NULL;
3711     }
3712 #endif
3713 
3714   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3715                                    tree_reduce_block)) {
3716 
3717     // only master executes here (master releases all other workers)
3718     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3719                             global_tid);
3720 
3721   } else {
3722 
3723     // should never reach this block
3724     KMP_ASSERT(0); // "unexpected method"
3725   }
3726 #if OMP_40_ENABLED
3727   if (teams_swapped) {
3728     __kmp_restore_swapped_teams(th, team, task_state);
3729   }
3730 #endif
3731 
3732   if (__kmp_env_consistency_check)
3733     __kmp_pop_sync(global_tid, ct_reduce, loc);
3734 
3735   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3736                 global_tid, packed_reduction_method));
3737 
3738   return;
3739 }
3740 
3741 #undef __KMP_GET_REDUCTION_METHOD
3742 #undef __KMP_SET_REDUCTION_METHOD
3743 
3744 /* end of interface to fast scalable reduce routines */
3745 
3746 kmp_uint64 __kmpc_get_taskid() {
3747 
3748   kmp_int32 gtid;
3749   kmp_info_t *thread;
3750 
3751   gtid = __kmp_get_gtid();
3752   if (gtid < 0) {
3753     return 0;
3754   }
3755   thread = __kmp_thread_from_gtid(gtid);
3756   return thread->th.th_current_task->td_task_id;
3757 
3758 } // __kmpc_get_taskid
3759 
3760 kmp_uint64 __kmpc_get_parent_taskid() {
3761 
3762   kmp_int32 gtid;
3763   kmp_info_t *thread;
3764   kmp_taskdata_t *parent_task;
3765 
3766   gtid = __kmp_get_gtid();
3767   if (gtid < 0) {
3768     return 0;
3769   }
3770   thread = __kmp_thread_from_gtid(gtid);
3771   parent_task = thread->th.th_current_task->td_parent;
3772   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3773 
3774 } // __kmpc_get_parent_taskid
3775 
3776 #if OMP_45_ENABLED
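
/* Illustrative sketch (not part of the runtime): roughly how a compiler might
   lower a one-dimensional doacross loop onto the entry points below. The local
   names dims, sink and src are hypothetical placeholders; exact code
   generation is compiler-specific.

     // #pragma omp for ordered(1)
     // for (i = 2; i < 9; i += 2) {
     //   #pragma omp ordered depend(sink : i - 2)
     //   ... use results of iteration i - 2 ...
     //   #pragma omp ordered depend(source)
     // }

     struct kmp_dim dims;
     dims.lo = 2; dims.up = 8; dims.st = 2; // inclusive bounds (see above)
     __kmpc_doacross_init(loc, gtid, 1, &dims);
     // then, for each iteration i assigned to this thread:
     kmp_int64 sink = i - 2;
     __kmpc_doacross_wait(loc, gtid, &sink); // wait for depend(sink : i - 2)
     // ... loop body ...
     kmp_int64 src = i;
     __kmpc_doacross_post(loc, gtid, &src); // signal depend(source)
     // and once the loop is finished:
     __kmpc_doacross_fini(loc, gtid);
*/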
3777 /*!
3778 @ingroup WORK_SHARING
3779 @param loc  source location information.
3780 @param gtid  global thread number.
3781 @param num_dims  number of associated doacross loops.
3782 @param dims  info on loops bounds.
3783 
3784 Initialize doacross loop information.
Expect the compiler to send us inclusive bounds,
3786 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3787 */
3788 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3789                           const struct kmp_dim *dims) {
3790   int j, idx;
3791   kmp_int64 last, trace_count;
3792   kmp_info_t *th = __kmp_threads[gtid];
3793   kmp_team_t *team = th->th.th_team;
3794   kmp_uint32 *flags;
3795   kmp_disp_t *pr_buf = th->th.th_dispatch;
3796   dispatch_shared_info_t *sh_buf;
3797 
3798   KA_TRACE(
3799       20,
3800       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3801        gtid, num_dims, !team->t.t_serialized));
3802   KMP_DEBUG_ASSERT(dims != NULL);
3803   KMP_DEBUG_ASSERT(num_dims > 0);
3804 
3805   if (team->t.t_serialized) {
3806     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3807     return; // no dependencies if team is serialized
3808   }
3809   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3810   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3811   // the next loop
3812   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3813 
3814   // Save bounds info into allocated private buffer
3815   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3816   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3817       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3818   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
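  // Layout of th_doacross_info (kmp_int64 entries), as filled in below:
  //   [0]                  number of dimensions
  //   [1]                  address of sh_buf->doacross_num_done
  //   [2], [3], [4]        dims[0].lo, dims[0].up, dims[0].st
  //   [4*j+1] .. [4*j+4]   range_length, lo, up, st of dims[j] for j >= 1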
3819   pr_buf->th_doacross_info[0] =
3820       (kmp_int64)num_dims; // first element is number of dimensions
  // Also save the address of num_done so that it can be accessed later without
  // knowing the buffer index
3823   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3824   pr_buf->th_doacross_info[2] = dims[0].lo;
3825   pr_buf->th_doacross_info[3] = dims[0].up;
3826   pr_buf->th_doacross_info[4] = dims[0].st;
3827   last = 5;
3828   for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // keeps the range of each dimension except dims[0]
3831     if (dims[j].st == 1) { // most common case
3832       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3833       range_length = dims[j].up - dims[j].lo + 1;
3834     } else {
3835       if (dims[j].st > 0) {
3836         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3837         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3838       } else { // negative increment
3839         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3840         range_length =
3841             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3842       }
3843     }
3844     pr_buf->th_doacross_info[last++] = range_length;
3845     pr_buf->th_doacross_info[last++] = dims[j].lo;
3846     pr_buf->th_doacross_info[last++] = dims[j].up;
3847     pr_buf->th_doacross_info[last++] = dims[j].st;
3848   }
3849 
3850   // Compute total trip count.
3851   // Start with range of dims[0] which we don't need to keep in the buffer.
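  // e.g. for(i=2;i<9;i+=2) gives lo=2, up=8, st=2 and (8 - 2) / 2 + 1 = 4
  // iterations.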
3852   if (dims[0].st == 1) { // most common case
3853     trace_count = dims[0].up - dims[0].lo + 1;
3854   } else if (dims[0].st > 0) {
3855     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3856     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3857   } else { // negative increment
3858     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3859     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3860   }
3861   for (j = 1; j < num_dims; ++j) {
3862     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3863   }
3864   KMP_DEBUG_ASSERT(trace_count > 0);
3865 
  // Check that the shared buffer is not still occupied by another loop (the
  // one with index idx - __kmp_dispatch_num_buffers)
3868   if (idx != sh_buf->doacross_buf_idx) {
3869     // Shared buffer is occupied, wait for it to be free
3870     __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3871                        __kmp_eq_4, NULL);
3872   }
3873 #if KMP_32_BIT_ARCH
3874   // Check if we are the first thread. After the CAS the first thread gets 0,
3875   // others get 1 if initialization is in progress, allocated pointer otherwise.
3876   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3877   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3878       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3879 #else
3880   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3881       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3882 #endif
3883   if (flags == NULL) {
3884     // we are the first thread, allocate the array of flags
3885     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3886     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3887     KMP_MB();
3888     sh_buf->doacross_flags = flags;
3889   } else if (flags == (kmp_uint32 *)1) {
3890 #if KMP_32_BIT_ARCH
3891     // initialization is still in progress, need to wait
3892     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3893 #else
3894     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3895 #endif
3896       KMP_YIELD(TRUE);
3897     KMP_MB();
3898   } else {
3899     KMP_MB();
3900   }
3901   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3902   pr_buf->th_doacross_flags =
3903       sh_buf->doacross_flags; // save private copy in order to not
3904   // touch shared buffer on each iteration
3905   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3906 }
3907 
3908 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3909   kmp_int32 shft, num_dims, i;
3910   kmp_uint32 flag;
3911   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3912   kmp_info_t *th = __kmp_threads[gtid];
3913   kmp_team_t *team = th->th.th_team;
3914   kmp_disp_t *pr_buf;
3915   kmp_int64 lo, up, st;
3916 
3917   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3918   if (team->t.t_serialized) {
3919     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3920     return; // no dependencies if team is serialized
3921   }
3922 
3923   // calculate sequential iteration number and check out-of-bounds condition
3924   pr_buf = th->th.th_dispatch;
3925   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3926   num_dims = pr_buf->th_doacross_info[0];
3927   lo = pr_buf->th_doacross_info[2];
3928   up = pr_buf->th_doacross_info[3];
3929   st = pr_buf->th_doacross_info[4];
3930   if (st == 1) { // most common case
3931     if (vec[0] < lo || vec[0] > up) {
3932       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3933                     "bounds [%lld,%lld]\n",
3934                     gtid, vec[0], lo, up));
3935       return;
3936     }
3937     iter_number = vec[0] - lo;
3938   } else if (st > 0) {
3939     if (vec[0] < lo || vec[0] > up) {
3940       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3941                     "bounds [%lld,%lld]\n",
3942                     gtid, vec[0], lo, up));
3943       return;
3944     }
3945     iter_number = (kmp_uint64)(vec[0] - lo) / st;
3946   } else { // negative increment
3947     if (vec[0] > lo || vec[0] < up) {
3948       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3949                     "bounds [%lld,%lld]\n",
3950                     gtid, vec[0], lo, up));
3951       return;
3952     }
3953     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3954   }
3955   for (i = 1; i < num_dims; ++i) {
3956     kmp_int64 iter, ln;
3957     kmp_int32 j = i * 4;
3958     ln = pr_buf->th_doacross_info[j + 1];
3959     lo = pr_buf->th_doacross_info[j + 2];
3960     up = pr_buf->th_doacross_info[j + 3];
3961     st = pr_buf->th_doacross_info[j + 4];
3962     if (st == 1) {
3963       if (vec[i] < lo || vec[i] > up) {
3964         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3965                       "bounds [%lld,%lld]\n",
3966                       gtid, vec[i], lo, up));
3967         return;
3968       }
3969       iter = vec[i] - lo;
3970     } else if (st > 0) {
3971       if (vec[i] < lo || vec[i] > up) {
3972         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3973                       "bounds [%lld,%lld]\n",
3974                       gtid, vec[i], lo, up));
3975         return;
3976       }
3977       iter = (kmp_uint64)(vec[i] - lo) / st;
3978     } else { // st < 0
3979       if (vec[i] > lo || vec[i] < up) {
3980         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3981                       "bounds [%lld,%lld]\n",
3982                       gtid, vec[i], lo, up));
3983         return;
3984       }
3985       iter = (kmp_uint64)(lo - vec[i]) / (-st);
3986     }
3987     iter_number = iter + ln * iter_number;
3988   }
3989   shft = iter_number % 32; // use 32-bit granularity
3990   iter_number >>= 5; // divided by 32
3991   flag = 1 << shft;
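  // e.g. for a 2-D nest with both strides 1, lower bounds 0 and an inner range
  // of 5, vec = {3, 2} linearizes to iter_number = 2 + 5 * 3 = 17, which maps
  // to bit 17 of flag word 0.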
3992   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
3993     KMP_YIELD(TRUE);
3994   }
3995   KMP_MB();
3996   KA_TRACE(20,
3997            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
3998             gtid, (iter_number << 5) + shft));
3999 }
4000 
4001 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4002   kmp_int32 shft, num_dims, i;
4003   kmp_uint32 flag;
4004   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4005   kmp_info_t *th = __kmp_threads[gtid];
4006   kmp_team_t *team = th->th.th_team;
4007   kmp_disp_t *pr_buf;
4008   kmp_int64 lo, st;
4009 
4010   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4011   if (team->t.t_serialized) {
4012     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4013     return; // no dependencies if team is serialized
4014   }
4015 
4016   // calculate sequential iteration number (same as in "wait" but no
4017   // out-of-bounds checks)
4018   pr_buf = th->th.th_dispatch;
4019   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4020   num_dims = pr_buf->th_doacross_info[0];
4021   lo = pr_buf->th_doacross_info[2];
4022   st = pr_buf->th_doacross_info[4];
4023   if (st == 1) { // most common case
4024     iter_number = vec[0] - lo;
4025   } else if (st > 0) {
4026     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4027   } else { // negative increment
4028     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4029   }
4030   for (i = 1; i < num_dims; ++i) {
4031     kmp_int64 iter, ln;
4032     kmp_int32 j = i * 4;
4033     ln = pr_buf->th_doacross_info[j + 1];
4034     lo = pr_buf->th_doacross_info[j + 2];
4035     st = pr_buf->th_doacross_info[j + 4];
4036     if (st == 1) {
4037       iter = vec[i] - lo;
4038     } else if (st > 0) {
4039       iter = (kmp_uint64)(vec[i] - lo) / st;
4040     } else { // st < 0
4041       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4042     }
4043     iter_number = iter + ln * iter_number;
4044   }
4045   shft = iter_number % 32; // use 32-bit granularity
4046   iter_number >>= 5; // divided by 32
4047   flag = 1 << shft;
4048   KMP_MB();
4049   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4050     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4051   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4052                 (iter_number << 5) + shft));
4053 }
4054 
4055 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4056   kmp_int32 num_done;
4057   kmp_info_t *th = __kmp_threads[gtid];
4058   kmp_team_t *team = th->th.th_team;
4059   kmp_disp_t *pr_buf = th->th.th_dispatch;
4060 
4061   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4062   if (team->t.t_serialized) {
4063     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4064     return; // nothing to do
4065   }
4066   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4067   if (num_done == th->th.th_team_nproc) {
4068     // we are the last thread, need to free shared resources
4069     int idx = pr_buf->th_doacross_buf_idx - 1;
4070     dispatch_shared_info_t *sh_buf =
4071         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4072     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4073                      (kmp_int64)&sh_buf->doacross_num_done);
4074     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4075     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4076     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4077     sh_buf->doacross_flags = NULL;
4078     sh_buf->doacross_num_done = 0;
4079     sh_buf->doacross_buf_idx +=
4080         __kmp_dispatch_num_buffers; // free buffer for future re-use
4081   }
4082   // free private resources (need to keep buffer index forever)
4083   pr_buf->th_doacross_flags = NULL;
4084   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4085   pr_buf->th_doacross_info = NULL;
4086   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4087 }
4088 #endif
4089 
4090 #if OMP_50_ENABLED
4091 int __kmpc_get_target_offload(void) {
4092   if (!__kmp_init_serial) {
4093     __kmp_serial_initialize();
4094   }
4095   return __kmp_target_offload;
4096 }
4097 #endif // OMP_50_ENABLED
4098 
4099 // end of file //
4100