1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #define __KMP_IMP
15 #include "omp.h" /* extern "C" declarations of user-visible routines */
16 #include "kmp.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_itt.h"
20 #include "kmp_lock.h"
21 #include "kmp_stats.h"
22 
23 #if OMPT_SUPPORT
24 #include "ompt-specific.h"
25 #endif
26 
27 #define MAX_MESSAGE 512
28 
29 // flags will be used in future, e.g. to implement openmp_strict library
30 // restrictions
31 
32 /*!
33  * @ingroup STARTUP_SHUTDOWN
34  * @param loc   in   source location information
35  * @param flags in   for future use (currently ignored)
36  *
37  * Initialize the runtime library. This call is optional; if it is not made then
38  * it will be implicitly called by attempts to use other library functions.
39  */
40 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
41   // By default __kmpc_begin() is no-op.
42   char *env;
43   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
44       __kmp_str_match_true(env)) {
45     __kmp_middle_initialize();
46     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
47   } else if (__kmp_ignore_mppbeg() == FALSE) {
48     // By default __kmp_ignore_mppbeg() returns TRUE.
49     __kmp_internal_begin();
50     KC_TRACE(10, ("__kmpc_begin: called\n"));
51   }
52 }
53 
54 /*!
55  * @ingroup STARTUP_SHUTDOWN
56  * @param loc source location information
57  *
Shut down the runtime library. This is also optional, and even if called it will
59  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
60  * zero.
61  */
62 void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister
  // this root (which can cause the library to shut down).
68   if (__kmp_ignore_mppend() == FALSE) {
69     KC_TRACE(10, ("__kmpc_end: called\n"));
70     KA_TRACE(30, ("__kmpc_end\n"));
71 
72     __kmp_internal_end_thread(-1);
73   }
74 }
75 
76 /*!
77 @ingroup THREAD_STATES
78 @param loc Source location information.
79 @return The global thread index of the active thread.
80 
81 This function can be called in any context.
82 
If the runtime has only been entered at the outermost level from a
84 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
85 that which would be returned by omp_get_thread_num() in the outermost
86 active parallel construct. (Or zero if there is no active parallel
87 construct, since the master thread is necessarily thread zero).
88 
89 If multiple non-OpenMP threads all enter an OpenMP construct then this
90 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
92 OpenMP thread ids returned by omp_get_thread_num()).
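
As an illustrative sketch of how compiler-generated code typically obtains this
identifier (the exact call sequence is compiler-specific):
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
@endcode
The returned gtid can then be passed to the other __kmpc_* entry points that
take a global thread number.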
93 */
94 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
95   kmp_int32 gtid = __kmp_entry_gtid();
96 
97   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
98 
99   return gtid;
100 }
101 
102 /*!
103 @ingroup THREAD_STATES
104 @param loc Source location information.
105 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
106 
107 This function can be called in any context.
108 It returns the total number of threads under the control of the OpenMP runtime.
109 That is not a number that can be determined by any OpenMP standard calls, since
110 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, since the runtime maintains
underlying threads even when they are not active (because the cost of creating
and destroying OS threads is high), this call counts all such threads whether
or not they are currently doing work.
115 */
116 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
117   KC_TRACE(10,
118            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
119 
120   return TCR_4(__kmp_all_nth);
121 }
122 
123 /*!
124 @ingroup THREAD_STATES
125 @param loc Source location information.
126 @return The thread number of the calling thread in the innermost active parallel
127 construct.
128 */
129 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
130   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
131   return __kmp_tid_from_gtid(__kmp_entry_gtid());
132 }
133 
134 /*!
135 @ingroup THREAD_STATES
136 @param loc Source location information.
137 @return The number of threads in the innermost active parallel construct.
138 */
139 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
140   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
141 
142   return __kmp_entry_thread()->th.th_team->t.t_nproc;
143 }
144 
145 /*!
146  * @ingroup DEPRECATED
147  * @param loc location description
148  *
149  * This function need not be called. It always returns TRUE.
150  */
151 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
152 #ifndef KMP_DEBUG
153 
154   return TRUE;
155 
156 #else
157 
158   const char *semi2;
159   const char *semi3;
160   int line_no;
161 
162   if (__kmp_par_range == 0) {
163     return TRUE;
164   }
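  // loc->psource is expected to be a string of semicolon-separated fields,
  // roughly ";file;routine;line;..." (see the description of ident_t in
  // kmp.h); the code below walks those separators to pick out the file name,
  // routine name and line number used for KMP_PAR_RANGE filtering.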
165   semi2 = loc->psource;
166   if (semi2 == NULL) {
167     return TRUE;
168   }
169   semi2 = strchr(semi2, ';');
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2 + 1, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   if (__kmp_par_range_filename[0]) {
178     const char *name = semi2 - 1;
179     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
180       name--;
181     }
182     if ((*name == '/') || (*name == ';')) {
183       name++;
184     }
185     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
186       return __kmp_par_range < 0;
187     }
188   }
189   semi3 = strchr(semi2 + 1, ';');
190   if (__kmp_par_range_routine[0]) {
191     if ((semi3 != NULL) && (semi3 > semi2) &&
192         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
193       return __kmp_par_range < 0;
194     }
195   }
196   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
197     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
198       return __kmp_par_range > 0;
199     }
200     return __kmp_par_range < 0;
201   }
202   return TRUE;
203 
204 #endif /* KMP_DEBUG */
205 }
206 
207 /*!
208 @ingroup THREAD_STATES
209 @param loc Source location information.
210 @return 1 if this thread is executing inside an active parallel region, zero if
211 not.
212 */
213 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
214   return __kmp_entry_thread()->th.th_root->r.r_active;
215 }
216 
217 /*!
218 @ingroup PARALLEL
219 @param loc source location information
220 @param global_tid global thread number
221 @param num_threads number of threads requested for this parallel construct
222 
223 Set the number of threads to be used by the next fork spawned by this thread.
224 This call is only required if the parallel construct has a `num_threads` clause.
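
As an illustrative sketch (the exact lowering is compiler-specific), a
construct such as
@code
#pragma omp parallel num_threads(4)
  work();
@endcode
is typically compiled into a call to __kmpc_push_num_threads(&loc, gtid, 4)
immediately followed by the __kmpc_fork_call() for the outlined region.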
225 */
226 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
227                              kmp_int32 num_threads) {
228   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
229                 global_tid, num_threads));
230 
231   __kmp_push_num_threads(loc, global_tid, num_threads);
232 }
233 
234 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
235   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
236 
237   /* the num_threads are automatically popped */
238 }
239 
240 #if OMP_40_ENABLED
241 
242 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
243                            kmp_int32 proc_bind) {
244   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
245                 proc_bind));
246 
247   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
248 }
249 
250 #endif /* OMP_40_ENABLED */
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
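
As an illustrative sketch (outlining details are compiler-specific), a region
such as
@code
#pragma omp parallel shared(a)
  a[omp_get_thread_num()] = 1;
@endcode
is typically compiled into an outlined microtask roughly like
@code
void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *a) {
  a[omp_get_thread_num()] = 1;
}
@endcode
and then launched with __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, a).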
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
  // maybe saving thr_state is enough here
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     omp_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       kmp_team_t *parent_team = master_th->th.th_team;
292       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
293       if (lwt)
294         ompt_frame = &(lwt->ompt_task_info.frame);
295       else {
296         int tid = __kmp_tid_from_gtid(gtid);
297         ompt_frame = &(
298             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
299       }
300       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
301       OMPT_STORE_RETURN_ADDRESS(gtid);
302     }
303 #endif
304 
305 #if INCLUDE_SSC_MARKS
306     SSC_MARK_FORKING();
307 #endif
308     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
309                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
310                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
311 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
312 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
313                     &ap
314 #else
315                     ap
316 #endif
317                     );
318 #if INCLUDE_SSC_MARKS
319     SSC_MARK_JOINING();
320 #endif
321     __kmp_join_call(loc, gtid
322 #if OMPT_SUPPORT
323                     ,
324                     fork_context_intel
325 #endif
326                     );
327 
328     va_end(ap);
329   }
330 
331 #if KMP_STATS_ENABLED
332   if (previous_state == stats_state_e::SERIAL_REGION) {
333     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
334   } else {
335     KMP_POP_PARTITIONED_TIMER();
336   }
337 #endif // KMP_STATS_ENABLED
338 }
339 
340 #if OMP_40_ENABLED
341 /*!
342 @ingroup PARALLEL
343 @param loc source location information
344 @param global_tid global thread number
345 @param num_teams number of teams requested for the teams construct
346 @param num_threads number of threads per team requested for the teams construct
347 
348 Set the number of teams to be used by the teams construct.
349 This call is only required if the teams construct has a `num_teams` clause
350 or a `thread_limit` clause (or both).
351 */
352 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
353                            kmp_int32 num_teams, kmp_int32 num_threads) {
354   KA_TRACE(20,
355            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
356             global_tid, num_teams, num_threads));
357 
358   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
359 }
360 
361 /*!
362 @ingroup PARALLEL
363 @param loc  source location information
364 @param argc  total number of arguments in the ellipsis
365 @param microtask  pointer to callback routine consisting of outlined teams
366 construct
367 @param ...  pointers to shared variables that aren't global
368 
369 Do the actual fork and call the microtask in the relevant number of threads.
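
For example (an illustrative sketch only), a construct such as
@code
#pragma omp teams num_teams(2) thread_limit(8)
  team_work();
@endcode
is typically lowered to __kmpc_push_num_teams(&loc, gtid, 2, 8) followed by a
call to __kmpc_fork_teams() with the outlined teams region as the microtask.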
370 */
371 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
372                        ...) {
373   int gtid = __kmp_entry_gtid();
374   kmp_info_t *this_thr = __kmp_threads[gtid];
375   va_list ap;
376   va_start(ap, microtask);
377 
378   KMP_COUNT_BLOCK(OMP_TEAMS);
379 
380   // remember teams entry point and nesting level
381   this_thr->th.th_teams_microtask = microtask;
382   this_thr->th.th_teams_level =
383       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
384 
385 #if OMPT_SUPPORT
386   kmp_team_t *parent_team = this_thr->th.th_team;
387   int tid = __kmp_tid_from_gtid(gtid);
388   if (ompt_enabled.enabled) {
389     parent_team->t.t_implicit_task_taskdata[tid]
390         .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
391   }
392   OMPT_STORE_RETURN_ADDRESS(gtid);
393 #endif
394 
  // Check if __kmpc_push_num_teams was called; if not, set the default number
  // of teams.
397   if (this_thr->th.th_teams_size.nteams == 0) {
398     __kmp_push_num_teams(loc, gtid, 0, 0);
399   }
400   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
401   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
402   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
403 
404   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
405                   VOLATILE_CAST(microtask_t)
406                       __kmp_teams_master, // "wrapped" task
407                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
408 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
409                   &ap
410 #else
411                   ap
412 #endif
413                   );
414   __kmp_join_call(loc, gtid
415 #if OMPT_SUPPORT
416                   ,
417                   fork_context_intel
418 #endif
419                   );
420 
421   this_thr->th.th_teams_microtask = NULL;
422   this_thr->th.th_teams_level = 0;
423   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
424   va_end(ap);
425 }
426 #endif /* OMP_40_ENABLED */
427 
428 // I don't think this function should ever have been exported.
429 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
// OpenMP code ever called it, but it's been exported from the RTL for so
431 // long that I'm afraid to remove the definition.
432 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
433 
434 /*!
435 @ingroup PARALLEL
436 @param loc  source location information
437 @param global_tid  global thread number
438 
439 Enter a serialized parallel construct. This interface is used to handle a
440 conditional parallel region, like this,
441 @code
442 #pragma omp parallel if (condition)
443 @endcode
444 when the condition is false.
445 */
446 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
447 // The implementation is now in kmp_runtime.cpp so that it can share static
448 // functions with kmp_fork_call since the tasks to be done are similar in
449 // each case.
450 #if OMPT_SUPPORT
451   OMPT_STORE_RETURN_ADDRESS(global_tid);
452 #endif
453   __kmp_serialized_parallel(loc, global_tid);
454 }
455 
456 /*!
457 @ingroup PARALLEL
458 @param loc  source location information
459 @param global_tid  global thread number
460 
461 Leave a serialized parallel construct.
462 */
463 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
464   kmp_internal_control_t *top;
465   kmp_info_t *this_thr;
466   kmp_team_t *serial_team;
467 
468   KC_TRACE(10,
469            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
470 
471   /* skip all this code for autopar serialized loops since it results in
472      unacceptable overhead */
473   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
474     return;
475 
476   // Not autopar code
477   if (!TCR_4(__kmp_init_parallel))
478     __kmp_parallel_initialize();
479 
480   this_thr = __kmp_threads[global_tid];
481   serial_team = this_thr->th.th_serial_team;
482 
483 #if OMP_45_ENABLED
484   kmp_task_team_t *task_team = this_thr->th.th_task_team;
485 
486   // we need to wait for the proxy tasks before finishing the thread
487   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
488     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
489 #endif
490 
491   KMP_MB();
492   KMP_DEBUG_ASSERT(serial_team);
493   KMP_ASSERT(serial_team->t.t_serialized);
494   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
495   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
496   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
497   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
498 
499 #if OMPT_SUPPORT
500   if (ompt_enabled.enabled &&
501       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
502     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL;
503     if (ompt_enabled.ompt_callback_implicit_task) {
504       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
505           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
506           OMPT_CUR_TASK_INFO(this_thr)->thread_num);
507     }
508 
    // Reset/clear the task id only after unlinking the task.
510     ompt_data_t *parent_task_data;
511     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
512 
513     if (ompt_enabled.ompt_callback_parallel_end) {
514       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
515           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
516           ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
517     }
518     __ompt_lw_taskteam_unlink(this_thr);
519     this_thr->th.ompt_thread_info.state = omp_state_overhead;
520   }
521 #endif
522 
523   /* If necessary, pop the internal control stack values and replace the team
524    * values */
525   top = serial_team->t.t_control_stack_top;
526   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
527     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
528     serial_team->t.t_control_stack_top = top->next;
529     __kmp_free(top);
530   }
531 
532   // if( serial_team -> t.t_serialized > 1 )
533   serial_team->t.t_level--;
534 
535   /* pop dispatch buffers stack */
536   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
537   {
538     dispatch_private_info_t *disp_buffer =
539         serial_team->t.t_dispatch->th_disp_buffer;
540     serial_team->t.t_dispatch->th_disp_buffer =
541         serial_team->t.t_dispatch->th_disp_buffer->next;
542     __kmp_free(disp_buffer);
543   }
544 #if OMP_50_ENABLED
545   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
546 #endif
547 
548   --serial_team->t.t_serialized;
549   if (serial_team->t.t_serialized == 0) {
550 
551 /* return to the parallel section */
552 
553 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
554     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
555       __kmp_clear_x87_fpu_status_word();
556       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
557       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
558     }
559 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
560 
561     this_thr->th.th_team = serial_team->t.t_parent;
562     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
563 
564     /* restore values cached in the thread */
565     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
566     this_thr->th.th_team_master =
567         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
568     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
569 
570     /* TODO the below shouldn't need to be adjusted for serialized teams */
571     this_thr->th.th_dispatch =
572         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
573 
574     __kmp_pop_current_task_from_thread(this_thr);
575 
576     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
577     this_thr->th.th_current_task->td_flags.executing = 1;
578 
579     if (__kmp_tasking_mode != tskm_immediate_exec) {
580       // Copy the task team from the new child / old parent team to the thread.
581       this_thr->th.th_task_team =
582           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
583       KA_TRACE(20,
584                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
585                 "team %p\n",
586                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
587     }
588   } else {
589     if (__kmp_tasking_mode != tskm_immediate_exec) {
590       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
591                     "depth of serial team %p to %d\n",
592                     global_tid, serial_team, serial_team->t.t_serialized));
593     }
594   }
595 
596   if (__kmp_env_consistency_check)
597     __kmp_pop_parallel(global_tid, NULL);
598 #if OMPT_SUPPORT
599   if (ompt_enabled.enabled)
600     this_thr->th.ompt_thread_info.state =
601         ((this_thr->th.th_team_serialized) ? omp_state_work_serial
602                                            : omp_state_work_parallel);
603 #endif
604 }
605 
606 /*!
607 @ingroup SYNCHRONIZATION
608 @param loc  source location information.
609 
610 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
611 depending on the memory ordering convention obeyed by the compiler
612 even that may not be necessary).
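
A typical place the compiler emits this call (shown purely as an illustration)
is a spin-wait on a shared flag:
@code
while (!flag) {
#pragma omp flush(flag)
}
@endcode
where each iteration of the loop results in a call to __kmpc_flush().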
613 */
614 void __kmpc_flush(ident_t *loc) {
615   KC_TRACE(10, ("__kmpc_flush: called\n"));
616 
  /* need an explicit __mf() here since the library uses volatile accesses */
618   KMP_MB(); /* Flush all pending memory write invalidates.  */
619 
620 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
621 #if KMP_MIC
622 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
623 // We shouldn't need it, though, since the ABI rules require that
624 // * If the compiler generates NGO stores it also generates the fence
625 // * If users hand-code NGO stores they should insert the fence
626 // therefore no incomplete unordered stores should be visible.
627 #else
628   // C74404
629   // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed here (mfence needed).
  // Probably the non-temporal load instruction movntdqa should also be
  // addressed.
633   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
634   if (!__kmp_cpuinfo.initialized) {
635     __kmp_query_cpuid(&__kmp_cpuinfo);
636   }
637   if (!__kmp_cpuinfo.sse2) {
638     // CPU cannot execute SSE2 instructions.
639   } else {
640 #if KMP_COMPILER_ICC
641     _mm_mfence();
642 #elif KMP_COMPILER_MSVC
643     MemoryBarrier();
644 #else
645     __sync_synchronize();
646 #endif // KMP_COMPILER_ICC
647   }
648 #endif // KMP_MIC
649 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here; move along.
651 #elif KMP_ARCH_PPC64
652 // Nothing needed here (we have a real MB above).
653 #if KMP_OS_CNK
654   // The flushing thread needs to yield here; this prevents a
655   // busy-waiting thread from saturating the pipeline. flush is
656   // often used in loops like this:
657   // while (!flag) {
658   //   #pragma omp flush(flag)
659   // }
660   // and adding the yield here is good for at least a 10x speedup
661   // when running >2 threads per core (on the NAS LU benchmark).
662   __kmp_yield(TRUE);
663 #endif
664 #else
665 #error Unknown or unsupported architecture
666 #endif
667 
668 #if OMPT_SUPPORT && OMPT_OPTIONAL
669   if (ompt_enabled.ompt_callback_flush) {
670     ompt_callbacks.ompt_callback(ompt_callback_flush)(
671         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
672   }
673 #endif
674 }
675 
676 /* -------------------------------------------------------------------------- */
677 /*!
678 @ingroup SYNCHRONIZATION
679 @param loc source location information
680 @param global_tid thread id.
681 
682 Execute a barrier.
683 */
684 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
685   KMP_COUNT_BLOCK(OMP_BARRIER);
686   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
687 
688   if (!TCR_4(__kmp_init_parallel))
689     __kmp_parallel_initialize();
690 
691   if (__kmp_env_consistency_check) {
692     if (loc == 0) {
693       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
694     }
695 
696     __kmp_check_barrier(global_tid, ct_barrier, loc);
697   }
698 
699 #if OMPT_SUPPORT
700   omp_frame_t *ompt_frame;
701   if (ompt_enabled.enabled) {
702     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
703     if (ompt_frame->enter_frame == NULL)
704       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
705     OMPT_STORE_RETURN_ADDRESS(global_tid);
706   }
707 #endif
708   __kmp_threads[global_tid]->th.th_ident = loc;
709   // TODO: explicit barrier_wait_id:
  //   this function is called when a 'barrier' directive is present or at the
  //   implicit barrier at the end of a worksharing construct.
712   // 1) better to add a per-thread barrier counter to a thread data structure
713   // 2) set to 0 when a new team is created
714   // 4) no sync is required
715 
716   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
717 #if OMPT_SUPPORT && OMPT_OPTIONAL
718   if (ompt_enabled.enabled) {
719     ompt_frame->enter_frame = NULL;
720   }
721 #endif
722 }
723 
724 /* The BARRIER for a MASTER section is always explicit   */
725 /*!
726 @ingroup WORK_SHARING
727 @param loc  source location information.
@param global_tid  global thread number.
729 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
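
A <tt>master</tt> construct is typically lowered (illustrative sketch) as
@code
if (__kmpc_master(&loc, gtid)) {
  // body of the master region
  __kmpc_end_master(&loc, gtid);
}
@endcode
so that only the thread for which this call returns 1 executes the body and
the matching __kmpc_end_master().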
730 */
731 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
732   int status = 0;
733 
734   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
735 
736   if (!TCR_4(__kmp_init_parallel))
737     __kmp_parallel_initialize();
738 
739   if (KMP_MASTER_GTID(global_tid)) {
740     KMP_COUNT_BLOCK(OMP_MASTER);
741     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
742     status = 1;
743   }
744 
745 #if OMPT_SUPPORT && OMPT_OPTIONAL
746   if (status) {
747     if (ompt_enabled.ompt_callback_master) {
748       kmp_info_t *this_thr = __kmp_threads[global_tid];
749       kmp_team_t *team = this_thr->th.th_team;
750 
751       int tid = __kmp_tid_from_gtid(global_tid);
752       ompt_callbacks.ompt_callback(ompt_callback_master)(
753           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
754           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
755           OMPT_GET_RETURN_ADDRESS(0));
756     }
757   }
758 #endif
759 
760   if (__kmp_env_consistency_check) {
761 #if KMP_USE_DYNAMIC_LOCK
762     if (status)
763       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
764     else
765       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
766 #else
767     if (status)
768       __kmp_push_sync(global_tid, ct_master, loc, NULL);
769     else
770       __kmp_check_sync(global_tid, ct_master, loc, NULL);
771 #endif
772   }
773 
774   return status;
775 }
776 
777 /*!
778 @ingroup WORK_SHARING
779 @param loc  source location information.
@param global_tid  global thread number.
781 
782 Mark the end of a <tt>master</tt> region. This should only be called by the
783 thread that executes the <tt>master</tt> region.
784 */
785 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
786   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
787 
788   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
789   KMP_POP_PARTITIONED_TIMER();
790 
791 #if OMPT_SUPPORT && OMPT_OPTIONAL
792   kmp_info_t *this_thr = __kmp_threads[global_tid];
793   kmp_team_t *team = this_thr->th.th_team;
794   if (ompt_enabled.ompt_callback_master) {
795     int tid = __kmp_tid_from_gtid(global_tid);
796     ompt_callbacks.ompt_callback(ompt_callback_master)(
797         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
798         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
799         OMPT_GET_RETURN_ADDRESS(0));
800   }
801 #endif
802 
803   if (__kmp_env_consistency_check) {
804     if (global_tid < 0)
805       KMP_WARNING(ThreadIdentInvalid);
806 
807     if (KMP_MASTER_GTID(global_tid))
808       __kmp_pop_sync(global_tid, ct_master, loc);
809   }
810 }
811 
812 /*!
813 @ingroup WORK_SHARING
814 @param loc  source location information.
815 @param gtid  global thread number.
816 
817 Start execution of an <tt>ordered</tt> construct.
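
Inside a loop with an <tt>ordered</tt> clause, the body of the
<tt>ordered</tt> construct is typically bracketed (illustrative sketch) as
@code
__kmpc_ordered(&loc, gtid);
// body of the ordered region, executed in iteration order
__kmpc_end_ordered(&loc, gtid);
@endcode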
818 */
819 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
820   int cid = 0;
821   kmp_info_t *th;
822   KMP_DEBUG_ASSERT(__kmp_init_serial);
823 
824   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
825 
826   if (!TCR_4(__kmp_init_parallel))
827     __kmp_parallel_initialize();
828 
829 #if USE_ITT_BUILD
830   __kmp_itt_ordered_prep(gtid);
831 // TODO: ordered_wait_id
832 #endif /* USE_ITT_BUILD */
833 
834   th = __kmp_threads[gtid];
835 
836 #if OMPT_SUPPORT && OMPT_OPTIONAL
837   kmp_team_t *team;
838   omp_wait_id_t lck;
839   void *codeptr_ra;
840   if (ompt_enabled.enabled) {
841     OMPT_STORE_RETURN_ADDRESS(gtid);
842     team = __kmp_team_from_gtid(gtid);
843     lck = (omp_wait_id_t)&team->t.t_ordered.dt.t_value;
844     /* OMPT state update */
845     th->th.ompt_thread_info.wait_id = lck;
846     th->th.ompt_thread_info.state = omp_state_wait_ordered;
847 
848     /* OMPT event callback */
849     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
850     if (ompt_enabled.ompt_callback_mutex_acquire) {
851       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
852           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
853           (omp_wait_id_t)lck, codeptr_ra);
854     }
855   }
856 #endif
857 
858   if (th->th.th_dispatch->th_deo_fcn != 0)
859     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
860   else
861     __kmp_parallel_deo(&gtid, &cid, loc);
862 
863 #if OMPT_SUPPORT && OMPT_OPTIONAL
864   if (ompt_enabled.enabled) {
865     /* OMPT state update */
866     th->th.ompt_thread_info.state = omp_state_work_parallel;
867     th->th.ompt_thread_info.wait_id = 0;
868 
869     /* OMPT event callback */
870     if (ompt_enabled.ompt_callback_mutex_acquired) {
871       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
872           ompt_mutex_ordered, (omp_wait_id_t)lck, codeptr_ra);
873     }
874   }
875 #endif
876 
877 #if USE_ITT_BUILD
878   __kmp_itt_ordered_start(gtid);
879 #endif /* USE_ITT_BUILD */
880 }
881 
882 /*!
883 @ingroup WORK_SHARING
884 @param loc  source location information.
885 @param gtid  global thread number.
886 
887 End execution of an <tt>ordered</tt> construct.
888 */
889 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
890   int cid = 0;
891   kmp_info_t *th;
892 
893   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
894 
895 #if USE_ITT_BUILD
896   __kmp_itt_ordered_end(gtid);
897 // TODO: ordered_wait_id
898 #endif /* USE_ITT_BUILD */
899 
900   th = __kmp_threads[gtid];
901 
902   if (th->th.th_dispatch->th_dxo_fcn != 0)
903     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
904   else
905     __kmp_parallel_dxo(&gtid, &cid, loc);
906 
907 #if OMPT_SUPPORT && OMPT_OPTIONAL
908   OMPT_STORE_RETURN_ADDRESS(gtid);
909   if (ompt_enabled.ompt_callback_mutex_released) {
910     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
911         ompt_mutex_ordered,
912         (omp_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
913         OMPT_LOAD_RETURN_ADDRESS(gtid));
914   }
915 #endif
916 }
917 
918 #if KMP_USE_DYNAMIC_LOCK
919 
920 static __forceinline void
921 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
922                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
923   // Pointer to the allocated indirect lock is written to crit, while indexing
924   // is ignored.
925   void *idx;
926   kmp_indirect_lock_t **lck;
927   lck = (kmp_indirect_lock_t **)crit;
928   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
929   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
930   KMP_SET_I_LOCK_LOCATION(ilk, loc);
931   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
932   KA_TRACE(20,
933            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
934 #if USE_ITT_BUILD
935   __kmp_itt_critical_creating(ilk->lock, loc);
936 #endif
937   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
938   if (status == 0) {
939 #if USE_ITT_BUILD
940     __kmp_itt_critical_destroyed(ilk->lock);
941 #endif
942     // We don't really need to destroy the unclaimed lock here since it will be
943     // cleaned up at program exit.
944     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
945   }
946   KMP_DEBUG_ASSERT(*lck != NULL);
947 }
948 
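// Note on the tas (test-and-set) fast paths below: lk.poll holds
// KMP_LOCK_FREE(tas) when the lock is free and KMP_LOCK_BUSY(gtid + 1, tas)
// while it is held by gtid; acquisition spins with backoff and yields whenever
// the system looks oversubscribed (__kmp_nth above the available processor
// count).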
949 // Fast-path acquire tas lock
950 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
951   {                                                                            \
952     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
953     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
954     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
955     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
956         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
957       kmp_uint32 spins;                                                        \
958       KMP_FSYNC_PREPARE(l);                                                    \
959       KMP_INIT_YIELD(spins);                                                   \
960       if (TCR_4(__kmp_nth) >                                                   \
961           (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
962         KMP_YIELD(TRUE);                                                       \
963       } else {                                                                 \
964         KMP_YIELD_SPIN(spins);                                                 \
965       }                                                                        \
966       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
967       while (                                                                  \
968           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
969           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {  \
970         __kmp_spin_backoff(&backoff);                                          \
971         if (TCR_4(__kmp_nth) >                                                 \
972             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
973           KMP_YIELD(TRUE);                                                     \
974         } else {                                                               \
975           KMP_YIELD_SPIN(spins);                                               \
976         }                                                                      \
977       }                                                                        \
978     }                                                                          \
979     KMP_FSYNC_ACQUIRED(l);                                                     \
980   }
981 
982 // Fast-path test tas lock
983 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
984   {                                                                            \
985     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
986     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
987     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
988     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
989          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
990   }
991 
992 // Fast-path release tas lock
993 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
994   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
995 
996 #if KMP_USE_FUTEX
997 
998 #include <sys/syscall.h>
999 #include <unistd.h>
1000 #ifndef FUTEX_WAIT
1001 #define FUTEX_WAIT 0
1002 #endif
1003 #ifndef FUTEX_WAKE
1004 #define FUTEX_WAKE 1
1005 #endif
1006 
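// Note on the encoding used by the futex fast paths below: lk.poll holds
// KMP_LOCK_FREE(futex) when the lock is free and KMP_LOCK_BUSY(gtid_code,
// futex) while it is held, where gtid_code is (gtid + 1) << 1; the low bit of
// the stripped value is set when the holder may have sleeping waiters, which
// tells the releasing thread to issue a FUTEX_WAKE.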
1007 // Fast-path acquire futex lock
1008 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1009   {                                                                            \
1010     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1011     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1012     KMP_MB();                                                                  \
1013     KMP_FSYNC_PREPARE(ftx);                                                    \
1014     kmp_int32 poll_val;                                                        \
1015     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1016                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1017                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1018       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1019       if (!cond) {                                                             \
1020         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1021                                          poll_val |                            \
1022                                              KMP_LOCK_BUSY(1, futex))) {       \
1023           continue;                                                            \
1024         }                                                                      \
1025         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1026       }                                                                        \
1027       kmp_int32 rc;                                                            \
1028       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1029                         NULL, NULL, 0)) != 0) {                                \
1030         continue;                                                              \
1031       }                                                                        \
1032       gtid_code |= 1;                                                          \
1033     }                                                                          \
1034     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1035   }
1036 
1037 // Fast-path test futex lock
1038 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1039   {                                                                            \
1040     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1041     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1042                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1043       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1044       rc = TRUE;                                                               \
1045     } else {                                                                   \
1046       rc = FALSE;                                                              \
1047     }                                                                          \
1048   }
1049 
1050 // Fast-path release futex lock
1051 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1052   {                                                                            \
1053     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1054     KMP_MB();                                                                  \
1055     KMP_FSYNC_RELEASING(ftx);                                                  \
1056     kmp_int32 poll_val =                                                       \
1057         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1058     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1059       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1060               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1061     }                                                                          \
1062     KMP_MB();                                                                  \
1063     KMP_YIELD(TCR_4(__kmp_nth) >                                               \
1064               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
1065   }
1066 
1067 #endif // KMP_USE_FUTEX
1068 
1069 #else // KMP_USE_DYNAMIC_LOCK
1070 
1071 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1072                                                       ident_t const *loc,
1073                                                       kmp_int32 gtid) {
1074   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1075 
1076   // Because of the double-check, the following load doesn't need to be volatile
1077   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1078 
1079   if (lck == NULL) {
1080     void *idx;
1081 
1082     // Allocate & initialize the lock.
1083     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1084     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1085     __kmp_init_user_lock_with_checks(lck);
1086     __kmp_set_user_lock_location(lck, loc);
1087 #if USE_ITT_BUILD
1088     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first use of
// the underlying lock. It is the only place where we can guarantee it. There
// is a chance the lock will be destroyed without ever being used, but that is
// not a problem, because this is not a real event seen by the user but rather
// a way of setting a name for the object (lock). See more details in
// kmp_itt.h.
1094 #endif /* USE_ITT_BUILD */
1095 
1096     // Use a cmpxchg instruction to slam the start of the critical section with
1097     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1098     // and use the lock that the other thread allocated.
1099     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1100 
1101     if (status == 0) {
1102 // Deallocate the lock and reload the value.
1103 #if USE_ITT_BUILD
1104       __kmp_itt_critical_destroyed(lck);
1105 // Let ITT know the lock is destroyed and the same memory location may be reused
1106 // for another purpose.
1107 #endif /* USE_ITT_BUILD */
1108       __kmp_destroy_user_lock_with_checks(lck);
1109       __kmp_user_lock_free(&idx, gtid, lck);
1110       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1111       KMP_DEBUG_ASSERT(lck != NULL);
1112     }
1113   }
1114   return lck;
1115 }
1116 
1117 #endif // KMP_USE_DYNAMIC_LOCK
1118 
1119 /*!
1120 @ingroup WORK_SHARING
1121 @param loc  source location information.
@param global_tid  global thread number.
1123 @param crit identity of the critical section. This could be a pointer to a lock
1124 associated with the critical section, or some other suitably unique value.
1125 
1126 Enter code protected by a `critical` construct.
1127 This function blocks until the executing thread can enter the critical section.
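
A named critical construct is typically lowered (illustrative sketch) using a
zero-initialized lock word that is shared by every use of the same name:
@code
static kmp_critical_name lock_for_name;

__kmpc_critical(&loc, gtid, &lock_for_name);
// body of the critical region
__kmpc_end_critical(&loc, gtid, &lock_for_name);
@endcode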
1128 */
1129 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1130                      kmp_critical_name *crit) {
1131 #if KMP_USE_DYNAMIC_LOCK
1132 #if OMPT_SUPPORT && OMPT_OPTIONAL
1133   OMPT_STORE_RETURN_ADDRESS(global_tid);
1134 #endif // OMPT_SUPPORT
1135   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1136 #else
1137   KMP_COUNT_BLOCK(OMP_CRITICAL);
1138 #if OMPT_SUPPORT && OMPT_OPTIONAL
1139   omp_state_t prev_state = omp_state_undefined;
1140   ompt_thread_info_t ti;
1141 #endif
1142   kmp_user_lock_p lck;
1143 
1144   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1145 
1146   // TODO: add THR_OVHD_STATE
1147 
1148   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1149   KMP_CHECK_USER_LOCK_INIT();
1150 
1151   if ((__kmp_user_lock_kind == lk_tas) &&
1152       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1153     lck = (kmp_user_lock_p)crit;
1154   }
1155 #if KMP_USE_FUTEX
1156   else if ((__kmp_user_lock_kind == lk_futex) &&
1157            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1158     lck = (kmp_user_lock_p)crit;
1159   }
1160 #endif
1161   else { // ticket, queuing or drdpa
1162     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1163   }
1164 
1165   if (__kmp_env_consistency_check)
1166     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1167 
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team.
// Also, even if we are the uber thread, we still have to acquire the lock,
// as we have to contend with sibling threads.
1172 
1173 #if USE_ITT_BUILD
1174   __kmp_itt_critical_acquiring(lck);
1175 #endif /* USE_ITT_BUILD */
1176 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1178   void *codeptr_ra = NULL;
1179   if (ompt_enabled.enabled) {
1180     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1181     /* OMPT state update */
1182     prev_state = ti.state;
1183     ti.wait_id = (omp_wait_id_t)lck;
1184     ti.state = omp_state_wait_critical;
1185 
1186     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1188     if (ompt_enabled.ompt_callback_mutex_acquire) {
1189       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1190           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1191           (omp_wait_id_t)crit, codeptr_ra);
1192     }
1193   }
1194 #endif
  // The value of 'crit' is suitable for use as the critical_id of the critical
  // section directive.
1197   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1198 
1199 #if USE_ITT_BUILD
1200   __kmp_itt_critical_acquired(lck);
1201 #endif /* USE_ITT_BUILD */
1202 #if OMPT_SUPPORT && OMPT_OPTIONAL
1203   if (ompt_enabled.enabled) {
1204     /* OMPT state update */
1205     ti.state = prev_state;
1206     ti.wait_id = 0;
1207 
1208     /* OMPT event callback */
1209     if (ompt_enabled.ompt_callback_mutex_acquired) {
1210       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1211           ompt_mutex_critical, (omp_wait_id_t)crit, codeptr_ra);
1212     }
1213   }
1214 #endif
1215   KMP_POP_PARTITIONED_TIMER();
1216 
1217   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1218   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1219 #endif // KMP_USE_DYNAMIC_LOCK
1220 }
1221 
1222 #if KMP_USE_DYNAMIC_LOCK
1223 
1224 // Converts the given hint to an internal lock implementation
1225 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1226 #if KMP_USE_TSX
1227 #define KMP_TSX_LOCK(seq) lockseq_##seq
1228 #else
1229 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1230 #endif
1231 
1232 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1233 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1234 #else
1235 #define KMP_CPUINFO_RTM 0
1236 #endif
1237 
1238   // Hints that do not require further logic
1239   if (hint & kmp_lock_hint_hle)
1240     return KMP_TSX_LOCK(hle);
1241   if (hint & kmp_lock_hint_rtm)
1242     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1243   if (hint & kmp_lock_hint_adaptive)
1244     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1245 
1246   // Rule out conflicting hints first by returning the default lock
1247   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1248     return __kmp_user_lock_seq;
1249   if ((hint & omp_lock_hint_speculative) &&
1250       (hint & omp_lock_hint_nonspeculative))
1251     return __kmp_user_lock_seq;
1252 
1253   // Do not even consider speculation when it appears to be contended
1254   if (hint & omp_lock_hint_contended)
1255     return lockseq_queuing;
1256 
1257   // Uncontended lock without speculation
1258   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1259     return lockseq_tas;
1260 
1261   // HLE lock for speculation
1262   if (hint & omp_lock_hint_speculative)
1263     return KMP_TSX_LOCK(hle);
1264 
1265   return __kmp_user_lock_seq;
1266 }
1267 
1268 #if OMPT_SUPPORT && OMPT_OPTIONAL
1269 #if KMP_USE_DYNAMIC_LOCK
1270 static kmp_mutex_impl_t
1271 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1272   if (user_lock) {
1273     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1274     case 0:
1275       break;
1276 #if KMP_USE_FUTEX
1277     case locktag_futex:
1278       return kmp_mutex_impl_queuing;
1279 #endif
1280     case locktag_tas:
1281       return kmp_mutex_impl_spin;
1282 #if KMP_USE_TSX
1283     case locktag_hle:
1284       return kmp_mutex_impl_speculative;
1285 #endif
1286     default:
1287       return ompt_mutex_impl_unknown;
1288     }
1289     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1290   }
1291   KMP_ASSERT(ilock);
1292   switch (ilock->type) {
1293 #if KMP_USE_TSX
1294   case locktag_adaptive:
1295   case locktag_rtm:
1296     return kmp_mutex_impl_speculative;
1297 #endif
1298   case locktag_nested_tas:
1299     return kmp_mutex_impl_spin;
1300 #if KMP_USE_FUTEX
1301   case locktag_nested_futex:
1302 #endif
1303   case locktag_ticket:
1304   case locktag_queuing:
1305   case locktag_drdpa:
1306   case locktag_nested_ticket:
1307   case locktag_nested_queuing:
1308   case locktag_nested_drdpa:
1309     return kmp_mutex_impl_queuing;
1310   default:
1311     return ompt_mutex_impl_unknown;
1312   }
1313 }
1314 #else
1315 // For locks without dynamic binding
1316 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1317   switch (__kmp_user_lock_kind) {
1318   case lk_tas:
1319     return kmp_mutex_impl_spin;
1320 #if KMP_USE_FUTEX
1321   case lk_futex:
1322 #endif
1323   case lk_ticket:
1324   case lk_queuing:
1325   case lk_drdpa:
1326     return kmp_mutex_impl_queuing;
1327 #if KMP_USE_TSX
1328   case lk_hle:
1329   case lk_rtm:
1330   case lk_adaptive:
1331     return kmp_mutex_impl_speculative;
1332 #endif
1333   default:
1334     return ompt_mutex_impl_unknown;
1335   }
1336 }
1337 #endif // KMP_USE_DYNAMIC_LOCK
1338 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1339 
1340 /*!
1341 @ingroup WORK_SHARING
1342 @param loc  source location information.
1343 @param global_tid  global thread number.
1344 @param crit identity of the critical section. This could be a pointer to a lock
1345 associated with the critical section, or some other suitably unique value.
1346 @param hint the lock hint.
1347 
1348 Enter code protected by a `critical` construct with a hint. The hint value is
1349 used to suggest a lock implementation. This function blocks until the executing
1350 thread can enter the critical section unless the hint suggests use of
1351 speculative execution and the hardware supports it.
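
For example (illustrative sketch), a construct written as
@code
#pragma omp critical (name) hint(omp_lock_hint_speculative)
  update();
@endcode
can be lowered to __kmpc_critical_with_hint(&loc, gtid, &lock_for_name,
omp_lock_hint_speculative), letting the runtime select, for instance, an
HLE-based lock when the hardware supports speculation.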
1352 */
1353 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1354                                kmp_critical_name *crit, uint32_t hint) {
1355   KMP_COUNT_BLOCK(OMP_CRITICAL);
1356   kmp_user_lock_p lck;
1357 #if OMPT_SUPPORT && OMPT_OPTIONAL
1358   omp_state_t prev_state = omp_state_undefined;
1359   ompt_thread_info_t ti;
1360   // This is the case, if called from __kmpc_critical:
1361   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1362   if (!codeptr)
1363     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1364 #endif
1365 
1366   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1367 
1368   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1369   // Check if it is initialized.
1370   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1371   if (*lk == 0) {
1372     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1373     if (KMP_IS_D_LOCK(lckseq)) {
1374       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1375                                   KMP_GET_D_TAG(lckseq));
1376     } else {
1377       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1378     }
1379   }
1380   // Branch for accessing the actual lock object and set operation. This
1381   // branching is inevitable since this lock initialization does not follow the
1382   // normal dispatch path (lock table is not used).
1383   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1384     lck = (kmp_user_lock_p)lk;
1385     if (__kmp_env_consistency_check) {
1386       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1387                       __kmp_map_hint_to_lock(hint));
1388     }
1389 #if USE_ITT_BUILD
1390     __kmp_itt_critical_acquiring(lck);
1391 #endif
1392 #if OMPT_SUPPORT && OMPT_OPTIONAL
1393     if (ompt_enabled.enabled) {
1394       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1395       /* OMPT state update */
1396       prev_state = ti.state;
1397       ti.wait_id = (omp_wait_id_t)lck;
1398       ti.state = omp_state_wait_critical;
1399 
1400       /* OMPT event callback */
1401       if (ompt_enabled.ompt_callback_mutex_acquire) {
1402         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1403             ompt_mutex_critical, (unsigned int)hint,
1404             __ompt_get_mutex_impl_type(crit), (omp_wait_id_t)crit, codeptr);
1405       }
1406     }
1407 #endif
1408 #if KMP_USE_INLINED_TAS
1409     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1410       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1411     } else
1412 #elif KMP_USE_INLINED_FUTEX
1413     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1414       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1415     } else
1416 #endif
1417     {
1418       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1419     }
1420   } else {
1421     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1422     lck = ilk->lock;
1423     if (__kmp_env_consistency_check) {
1424       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1425                       __kmp_map_hint_to_lock(hint));
1426     }
1427 #if USE_ITT_BUILD
1428     __kmp_itt_critical_acquiring(lck);
1429 #endif
1430 #if OMPT_SUPPORT && OMPT_OPTIONAL
1431     if (ompt_enabled.enabled) {
1432       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1433       /* OMPT state update */
1434       prev_state = ti.state;
1435       ti.wait_id = (omp_wait_id_t)lck;
1436       ti.state = omp_state_wait_critical;
1437 
1438       /* OMPT event callback */
1439       if (ompt_enabled.ompt_callback_mutex_acquire) {
1440         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1441             ompt_mutex_critical, (unsigned int)hint,
1442             __ompt_get_mutex_impl_type(0, ilk), (omp_wait_id_t)crit, codeptr);
1443       }
1444     }
1445 #endif
1446     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1447   }
1448   KMP_POP_PARTITIONED_TIMER();
1449 
1450 #if USE_ITT_BUILD
1451   __kmp_itt_critical_acquired(lck);
1452 #endif /* USE_ITT_BUILD */
1453 #if OMPT_SUPPORT && OMPT_OPTIONAL
1454   if (ompt_enabled.enabled) {
1455     /* OMPT state update */
1456     ti.state = prev_state;
1457     ti.wait_id = 0;
1458 
1459     /* OMPT event callback */
1460     if (ompt_enabled.ompt_callback_mutex_acquired) {
1461       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1462           ompt_mutex_critical, (omp_wait_id_t)crit, codeptr);
1463     }
1464   }
1465 #endif
1466 
1467   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1468   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1469 } // __kmpc_critical_with_hint
1470 
1471 #endif // KMP_USE_DYNAMIC_LOCK
1472 
1473 /*!
1474 @ingroup WORK_SHARING
1475 @param loc  source location information.
@param global_tid  global thread number.
1477 @param crit identity of the critical section. This could be a pointer to a lock
1478 associated with the critical section, or some other suitably unique value.
1479 
1480 Leave a critical section, releasing any lock that was held during its execution.
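
A typical compiler-generated pairing is sketched below (illustrative only; the
lock word, location, and thread id arguments are supplied by the compiler):
@code
__kmpc_critical(&loc, global_tid, &crit);     // enter the critical region
// ... user code protected by the critical section ...
__kmpc_end_critical(&loc, global_tid, &crit); // leave and release the lock
@endcode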
1481 */
1482 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1483                          kmp_critical_name *crit) {
1484   kmp_user_lock_p lck;
1485 
1486   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1487 
1488 #if KMP_USE_DYNAMIC_LOCK
1489   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1490     lck = (kmp_user_lock_p)crit;
1491     KMP_ASSERT(lck != NULL);
1492     if (__kmp_env_consistency_check) {
1493       __kmp_pop_sync(global_tid, ct_critical, loc);
1494     }
1495 #if USE_ITT_BUILD
1496     __kmp_itt_critical_releasing(lck);
1497 #endif
1498 #if KMP_USE_INLINED_TAS
1499     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1500       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1501     } else
1502 #elif KMP_USE_INLINED_FUTEX
1503     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1504       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1505     } else
1506 #endif
1507     {
1508       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1509     }
1510   } else {
1511     kmp_indirect_lock_t *ilk =
1512         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1513     KMP_ASSERT(ilk != NULL);
1514     lck = ilk->lock;
1515     if (__kmp_env_consistency_check) {
1516       __kmp_pop_sync(global_tid, ct_critical, loc);
1517     }
1518 #if USE_ITT_BUILD
1519     __kmp_itt_critical_releasing(lck);
1520 #endif
1521     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1522   }
1523 
1524 #else // KMP_USE_DYNAMIC_LOCK
1525 
1526   if ((__kmp_user_lock_kind == lk_tas) &&
1527       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1528     lck = (kmp_user_lock_p)crit;
1529   }
1530 #if KMP_USE_FUTEX
1531   else if ((__kmp_user_lock_kind == lk_futex) &&
1532            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1533     lck = (kmp_user_lock_p)crit;
1534   }
1535 #endif
1536   else { // ticket, queuing or drdpa
1537     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1538   }
1539 
1540   KMP_ASSERT(lck != NULL);
1541 
1542   if (__kmp_env_consistency_check)
1543     __kmp_pop_sync(global_tid, ct_critical, loc);
1544 
1545 #if USE_ITT_BUILD
1546   __kmp_itt_critical_releasing(lck);
1547 #endif /* USE_ITT_BUILD */
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1550   __kmp_release_user_lock_with_checks(lck, global_tid);
1551 
1552 #endif // KMP_USE_DYNAMIC_LOCK
1553 
1554 #if OMPT_SUPPORT && OMPT_OPTIONAL
1555   /* OMPT release event triggers after lock is released; place here to trigger
1556    * for all #if branches */
1557   OMPT_STORE_RETURN_ADDRESS(global_tid);
1558   if (ompt_enabled.ompt_callback_mutex_released) {
1559     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1560         ompt_mutex_critical, (omp_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
1561   }
1562 #endif
1563 
1564   KMP_POP_PARTITIONED_TIMER();
1565   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1566 }
1567 
1568 /*!
1569 @ingroup SYNCHRONIZATION
1570 @param loc source location information
1571 @param global_tid thread id.
1572 @return one if the thread should execute the master block, zero otherwise
1573 
1574 Start execution of a combined barrier and master. The barrier is executed inside
1575 this function.
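
An illustrative lowering of a combined barrier/master region (a sketch only;
the actual arguments are emitted by the compiler):
@code
if (__kmpc_barrier_master(&loc, global_tid)) {
  // ... code executed by the master thread only ...
  __kmpc_end_barrier_master(&loc, global_tid);
}
@endcode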
1576 */
1577 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1578   int status;
1579 
1580   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1581 
1582   if (!TCR_4(__kmp_init_parallel))
1583     __kmp_parallel_initialize();
1584 
1585   if (__kmp_env_consistency_check)
1586     __kmp_check_barrier(global_tid, ct_barrier, loc);
1587 
1588 #if OMPT_SUPPORT
1589   omp_frame_t *ompt_frame;
1590   if (ompt_enabled.enabled) {
1591     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1592     if (ompt_frame->enter_frame == NULL)
1593       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1594     OMPT_STORE_RETURN_ADDRESS(global_tid);
1595   }
1596 #endif
1597 #if USE_ITT_NOTIFY
1598   __kmp_threads[global_tid]->th.th_ident = loc;
1599 #endif
1600   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1601 #if OMPT_SUPPORT && OMPT_OPTIONAL
1602   if (ompt_enabled.enabled) {
1603     ompt_frame->enter_frame = NULL;
1604   }
1605 #endif
1606 
1607   return (status != 0) ? 0 : 1;
1608 }
1609 
1610 /*!
1611 @ingroup SYNCHRONIZATION
1612 @param loc source location information
1613 @param global_tid thread id.
1614 
1615 Complete the execution of a combined barrier and master. This function should
1616 only be called at the completion of the <tt>master</tt> code. Other threads will
1617 still be waiting at the barrier and this call releases them.
1618 */
1619 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1620   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1621 
1622   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1623 }
1624 
1625 /*!
1626 @ingroup SYNCHRONIZATION
1627 @param loc source location information
1628 @param global_tid thread id.
1629 @return one if the thread should execute the master block, zero otherwise
1630 
1631 Start execution of a combined barrier and master(nowait) construct.
1632 The barrier is executed inside this function.
There is no equivalent "end" function, since the other threads do not wait for
the master code to finish: they continue immediately after the barrier (nowait
semantics), so there is nothing to release at the end of the master block.
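
An illustrative lowering (sketch only):
@code
if (__kmpc_barrier_master_nowait(&loc, global_tid)) {
  // ... master-only code; the other threads have already moved on ...
}
@endcode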
1634 */
1635 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1636   kmp_int32 ret;
1637 
1638   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1639 
1640   if (!TCR_4(__kmp_init_parallel))
1641     __kmp_parallel_initialize();
1642 
1643   if (__kmp_env_consistency_check) {
1644     if (loc == 0) {
1645       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1646     }
1647     __kmp_check_barrier(global_tid, ct_barrier, loc);
1648   }
1649 
1650 #if OMPT_SUPPORT
1651   omp_frame_t *ompt_frame;
1652   if (ompt_enabled.enabled) {
1653     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1654     if (ompt_frame->enter_frame == NULL)
1655       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1656     OMPT_STORE_RETURN_ADDRESS(global_tid);
1657   }
1658 #endif
1659 #if USE_ITT_NOTIFY
1660   __kmp_threads[global_tid]->th.th_ident = loc;
1661 #endif
1662   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1663 #if OMPT_SUPPORT && OMPT_OPTIONAL
1664   if (ompt_enabled.enabled) {
1665     ompt_frame->enter_frame = NULL;
1666   }
1667 #endif
1668 
1669   ret = __kmpc_master(loc, global_tid);
1670 
1671   if (__kmp_env_consistency_check) {
1672     /*  there's no __kmpc_end_master called; so the (stats) */
1673     /*  actions of __kmpc_end_master are done here          */
1674 
1675     if (global_tid < 0) {
1676       KMP_WARNING(ThreadIdentInvalid);
1677     }
1678     if (ret) {
1679       /* only one thread should do the pop since only */
1680       /* one did the push (see __kmpc_master())       */
1681 
1682       __kmp_pop_sync(global_tid, ct_master, loc);
1683     }
1684   }
1685 
1686   return (ret);
1687 }
1688 
1689 /* The BARRIER for a SINGLE process section is always explicit   */
1690 /*!
1691 @ingroup WORK_SHARING
1692 @param loc  source location information
1693 @param global_tid  global thread number
1694 @return One if this thread should execute the single construct, zero otherwise.
1695 
1696 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
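
An illustrative lowering of a <tt>single</tt> construct (sketch only; the
trailing barrier is emitted by the compiler unless the construct has a nowait
clause):
@code
if (__kmpc_single(&loc, global_tid)) {
  // ... code executed by exactly one thread ...
  __kmpc_end_single(&loc, global_tid);
}
__kmpc_barrier(&loc, global_tid); // omitted when nowait is specified
@endcode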
1699 */
1700 
1701 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1702   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1703 
1704   if (rc) {
1705     // We are going to execute the single statement, so we should count it.
1706     KMP_COUNT_BLOCK(OMP_SINGLE);
1707     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1708   }
1709 
1710 #if OMPT_SUPPORT && OMPT_OPTIONAL
1711   kmp_info_t *this_thr = __kmp_threads[global_tid];
1712   kmp_team_t *team = this_thr->th.th_team;
1713   int tid = __kmp_tid_from_gtid(global_tid);
1714 
1715   if (ompt_enabled.enabled) {
1716     if (rc) {
1717       if (ompt_enabled.ompt_callback_work) {
1718         ompt_callbacks.ompt_callback(ompt_callback_work)(
1719             ompt_work_single_executor, ompt_scope_begin,
1720             &(team->t.ompt_team_info.parallel_data),
1721             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1722             1, OMPT_GET_RETURN_ADDRESS(0));
1723       }
1724     } else {
1725       if (ompt_enabled.ompt_callback_work) {
1726         ompt_callbacks.ompt_callback(ompt_callback_work)(
1727             ompt_work_single_other, ompt_scope_begin,
1728             &(team->t.ompt_team_info.parallel_data),
1729             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1730             1, OMPT_GET_RETURN_ADDRESS(0));
1731         ompt_callbacks.ompt_callback(ompt_callback_work)(
1732             ompt_work_single_other, ompt_scope_end,
1733             &(team->t.ompt_team_info.parallel_data),
1734             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1735             1, OMPT_GET_RETURN_ADDRESS(0));
1736       }
1737     }
1738   }
1739 #endif
1740 
1741   return rc;
1742 }
1743 
1744 /*!
1745 @ingroup WORK_SHARING
1746 @param loc  source location information
1747 @param global_tid  global thread number
1748 
1749 Mark the end of a <tt>single</tt> construct.  This function should
1750 only be called by the thread that executed the block of code protected
1751 by the `single` construct.
1752 */
1753 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1754   __kmp_exit_single(global_tid);
1755   KMP_POP_PARTITIONED_TIMER();
1756 
1757 #if OMPT_SUPPORT && OMPT_OPTIONAL
1758   kmp_info_t *this_thr = __kmp_threads[global_tid];
1759   kmp_team_t *team = this_thr->th.th_team;
1760   int tid = __kmp_tid_from_gtid(global_tid);
1761 
1762   if (ompt_enabled.ompt_callback_work) {
1763     ompt_callbacks.ompt_callback(ompt_callback_work)(
1764         ompt_work_single_executor, ompt_scope_end,
1765         &(team->t.ompt_team_info.parallel_data),
1766         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1767         OMPT_GET_RETURN_ADDRESS(0));
1768   }
1769 #endif
1770 }
1771 
1772 /*!
1773 @ingroup WORK_SHARING
1774 @param loc Source location
1775 @param global_tid Global thread id
1776 
1777 Mark the end of a statically scheduled loop.
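
An illustrative pairing with the static-init entry point (rough sketch; see the
__kmpc_for_static_init_4 definition for the exact argument list):
@code
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1, 0);
for (i = lower; i <= upper; ++i) {
  // ... loop body ...
}
__kmpc_for_static_fini(&loc, gtid);
@endcode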
1778 */
1779 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1780   KMP_POP_PARTITIONED_TIMER();
1781   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1782 
1783 #if OMPT_SUPPORT && OMPT_OPTIONAL
1784   if (ompt_enabled.ompt_callback_work) {
1785     ompt_work_t ompt_work_type = ompt_work_loop;
1786     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1787     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1788     // Determine workshare type
1789     if (loc != NULL) {
1790       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1791         ompt_work_type = ompt_work_loop;
1792       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1793         ompt_work_type = ompt_work_sections;
1794       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1795         ompt_work_type = ompt_work_distribute;
1796       } else {
1797         // use default set above.
1798         // a warning about this case is provided in __kmpc_for_static_init
1799       }
1800       KMP_DEBUG_ASSERT(ompt_work_type);
1801     }
1802     ompt_callbacks.ompt_callback(ompt_callback_work)(
1803         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1804         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1805   }
1806 #endif
1807   if (__kmp_env_consistency_check)
1808     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1809 }
1810 
1811 // User routines which take C-style arguments (call by value)
1812 // different from the Fortran equivalent routines
1813 
1814 void ompc_set_num_threads(int arg) {
1815   // !!!!! TODO: check the per-task binding
1816   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1817 }
1818 
1819 void ompc_set_dynamic(int flag) {
1820   kmp_info_t *thread;
1821 
1822   /* For the thread-private implementation of the internal controls */
1823   thread = __kmp_entry_thread();
1824 
1825   __kmp_save_internal_controls(thread);
1826 
1827   set__dynamic(thread, flag ? TRUE : FALSE);
1828 }
1829 
1830 void ompc_set_nested(int flag) {
1831   kmp_info_t *thread;
1832 
1833   /* For the thread-private internal controls implementation */
1834   thread = __kmp_entry_thread();
1835 
1836   __kmp_save_internal_controls(thread);
1837 
1838   set__nested(thread, flag ? TRUE : FALSE);
1839 }
1840 
1841 void ompc_set_max_active_levels(int max_active_levels) {
1842   /* TO DO */
1843   /* we want per-task implementation of this internal control */
1844 
1845   /* For the per-thread internal controls implementation */
1846   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1847 }
1848 
1849 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1850   // !!!!! TODO: check the per-task binding
1851   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1852 }
1853 
1854 int ompc_get_ancestor_thread_num(int level) {
1855   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1856 }
1857 
1858 int ompc_get_team_size(int level) {
1859   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1860 }
1861 
1862 void kmpc_set_stacksize(int arg) {
1863   // __kmp_aux_set_stacksize initializes the library if needed
1864   __kmp_aux_set_stacksize(arg);
1865 }
1866 
1867 void kmpc_set_stacksize_s(size_t arg) {
1868   // __kmp_aux_set_stacksize initializes the library if needed
1869   __kmp_aux_set_stacksize(arg);
1870 }
1871 
1872 void kmpc_set_blocktime(int arg) {
1873   int gtid, tid;
1874   kmp_info_t *thread;
1875 
1876   gtid = __kmp_entry_gtid();
1877   tid = __kmp_tid_from_gtid(gtid);
1878   thread = __kmp_thread_from_gtid(gtid);
1879 
1880   __kmp_aux_set_blocktime(arg, thread, tid);
1881 }
1882 
1883 void kmpc_set_library(int arg) {
1884   // __kmp_user_set_library initializes the library if needed
1885   __kmp_user_set_library((enum library_type)arg);
1886 }
1887 
1888 void kmpc_set_defaults(char const *str) {
1889   // __kmp_aux_set_defaults initializes the library if needed
1890   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1891 }
1892 
1893 void kmpc_set_disp_num_buffers(int arg) {
1894   // ignore after initialization because some teams have already
1895   // allocated dispatch buffers
1896   if (__kmp_init_serial == 0 && arg > 0)
1897     __kmp_dispatch_num_buffers = arg;
1898 }
1899 
1900 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1901 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1902   return -1;
1903 #else
1904   if (!TCR_4(__kmp_init_middle)) {
1905     __kmp_middle_initialize();
1906   }
1907   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1908 #endif
1909 }
1910 
1911 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1912 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1913   return -1;
1914 #else
1915   if (!TCR_4(__kmp_init_middle)) {
1916     __kmp_middle_initialize();
1917   }
1918   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1919 #endif
1920 }
1921 
1922 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1923 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1924   return -1;
1925 #else
1926   if (!TCR_4(__kmp_init_middle)) {
1927     __kmp_middle_initialize();
1928   }
1929   return __kmp_aux_get_affinity_mask_proc(proc, mask);
1930 #endif
1931 }
1932 
1933 /* -------------------------------------------------------------------------- */
1934 /*!
1935 @ingroup THREADPRIVATE
1936 @param loc       source location information
1937 @param gtid      global thread number
1938 @param cpy_size  size of the cpy_data buffer
1939 @param cpy_data  pointer to data to be copied
1940 @param cpy_func  helper function to call for copying data
1941 @param didit     flag variable: 1=single thread; 0=not single thread
1942 
1943 __kmpc_copyprivate implements the interface for the private data broadcast
1944 needed for the copyprivate clause associated with a single region in an
1945 OpenMP<sup>*</sup> program (both C and Fortran).
1946 All threads participating in the parallel region call this routine.
1947 One of the threads (called the single thread) should have the <tt>didit</tt>
1948 variable set to 1 and all other threads should have that variable set to 0.
1949 All threads pass a pointer to a data buffer (cpy_data) that they have built.
1950 
1951 The OpenMP specification forbids the use of nowait on the single region when a
1952 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
1953 barrier internally to avoid race conditions, so the code generation for the
1954 single region should avoid generating a barrier after the call to @ref
1955 __kmpc_copyprivate.
1956 
1957 The <tt>gtid</tt> parameter is the global thread id for the current thread.
1958 The <tt>loc</tt> parameter is a pointer to source location information.
1959 
Internal implementation: The single thread first copies its descriptor address
(cpy_data) to a team-private location; after a barrier, each of the other
threads calls the function pointed to by cpy_func, passing its own cpy_data
buffer as the destination and the single thread's buffer as the source;
cpy_func carries out the actual copy.
1964 
1965 The cpy_func routine used for the copy and the contents of the data area defined
1966 by cpy_data and cpy_size may be built in any fashion that will allow the copy
1967 to be done. For instance, the cpy_data buffer can hold the actual data to be
1968 copied or it may hold a list of pointers to the data. The cpy_func routine must
1969 interpret the cpy_data buffer appropriately.
1970 
1971 The interface to cpy_func is as follows:
1972 @code
1973 void cpy_func( void *destination, void *source )
1974 @endcode
1975 where void *destination is the cpy_data pointer for the thread being copied to
1976 and void *source is the cpy_data pointer for the thread being copied from.
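
A minimal illustrative use, assuming each thread's cpy_data points directly at
an int it wants broadcast (the helper name is hypothetical):
@code
static void broadcast_int(void *dst, void *src) { *(int *)dst = *(int *)src; }
// ...
__kmpc_copyprivate(&loc, gtid, sizeof(int), &my_copy, broadcast_int, didit);
@endcode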
1977 */
1978 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
1979                         void *cpy_data, void (*cpy_func)(void *, void *),
1980                         kmp_int32 didit) {
1981   void **data_ptr;
1982 
1983   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
1984 
1985   KMP_MB();
1986 
1987   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
1988 
1989   if (__kmp_env_consistency_check) {
1990     if (loc == 0) {
1991       KMP_WARNING(ConstructIdentInvalid);
1992     }
1993   }
1994 
1995   // ToDo: Optimize the following two barriers into some kind of split barrier
1996 
1997   if (didit)
1998     *data_ptr = cpy_data;
1999 
2000 #if OMPT_SUPPORT
2001   omp_frame_t *ompt_frame;
2002   if (ompt_enabled.enabled) {
2003     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2004     if (ompt_frame->enter_frame == NULL)
2005       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
2006     OMPT_STORE_RETURN_ADDRESS(gtid);
2007   }
2008 #endif
2009 /* This barrier is not a barrier region boundary */
2010 #if USE_ITT_NOTIFY
2011   __kmp_threads[gtid]->th.th_ident = loc;
2012 #endif
2013   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2014 
2015   if (!didit)
2016     (*cpy_func)(cpy_data, *data_ptr);
2017 
2018 // Consider next barrier a user-visible barrier for barrier region boundaries
2019 // Nesting checks are already handled by the single construct checks
2020 
2021 #if OMPT_SUPPORT
2022   if (ompt_enabled.enabled) {
2023     OMPT_STORE_RETURN_ADDRESS(gtid);
2024   }
2025 #endif
2026 #if USE_ITT_NOTIFY
2027   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2028 // tasks can overwrite the location)
2029 #endif
2030   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2031 #if OMPT_SUPPORT && OMPT_OPTIONAL
2032   if (ompt_enabled.enabled) {
2033     ompt_frame->enter_frame = NULL;
2034   }
2035 #endif
2036 }
2037 
2038 /* -------------------------------------------------------------------------- */
2039 
2040 #define INIT_LOCK __kmp_init_user_lock_with_checks
2041 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2042 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2043 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2044 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2045 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2046   __kmp_acquire_nested_user_lock_with_checks_timed
2047 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2048 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2049 #define TEST_LOCK __kmp_test_user_lock_with_checks
2050 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2051 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2052 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2053 
2054 // TODO: Make check abort messages use location info & pass it into
2055 // with_checks routines
2056 
2057 #if KMP_USE_DYNAMIC_LOCK
2058 
2059 // internal lock initializer
2060 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2061                                                     kmp_dyna_lockseq_t seq) {
2062   if (KMP_IS_D_LOCK(seq)) {
2063     KMP_INIT_D_LOCK(lock, seq);
2064 #if USE_ITT_BUILD
2065     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2066 #endif
2067   } else {
2068     KMP_INIT_I_LOCK(lock, seq);
2069 #if USE_ITT_BUILD
2070     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2071     __kmp_itt_lock_creating(ilk->lock, loc);
2072 #endif
2073   }
2074 }
2075 
2076 // internal nest lock initializer
2077 static __forceinline void
2078 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2079                                kmp_dyna_lockseq_t seq) {
2080 #if KMP_USE_TSX
2081   // Don't have nested lock implementation for speculative locks
2082   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2083     seq = __kmp_user_lock_seq;
2084 #endif
2085   switch (seq) {
2086   case lockseq_tas:
2087     seq = lockseq_nested_tas;
2088     break;
2089 #if KMP_USE_FUTEX
2090   case lockseq_futex:
2091     seq = lockseq_nested_futex;
2092     break;
2093 #endif
2094   case lockseq_ticket:
2095     seq = lockseq_nested_ticket;
2096     break;
2097   case lockseq_queuing:
2098     seq = lockseq_nested_queuing;
2099     break;
2100   case lockseq_drdpa:
2101     seq = lockseq_nested_drdpa;
2102     break;
2103   default:
2104     seq = lockseq_nested_queuing;
2105   }
2106   KMP_INIT_I_LOCK(lock, seq);
2107 #if USE_ITT_BUILD
2108   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2109   __kmp_itt_lock_creating(ilk->lock, loc);
2110 #endif
2111 }
2112 
2113 /* initialize the lock with a hint */
2114 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2115                                 uintptr_t hint) {
2116   KMP_DEBUG_ASSERT(__kmp_init_serial);
2117   if (__kmp_env_consistency_check && user_lock == NULL) {
2118     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2119   }
2120 
2121   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2122 
2123 #if OMPT_SUPPORT && OMPT_OPTIONAL
2124   // This is the case, if called from omp_init_lock_with_hint:
2125   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2126   if (!codeptr)
2127     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2128   if (ompt_enabled.ompt_callback_lock_init) {
2129     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2130         ompt_mutex_lock, (omp_lock_hint_t)hint,
2131         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2132         codeptr);
2133   }
2134 #endif
2135 }
2136 
2137 /* initialize the lock with a hint */
2138 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2139                                      void **user_lock, uintptr_t hint) {
2140   KMP_DEBUG_ASSERT(__kmp_init_serial);
2141   if (__kmp_env_consistency_check && user_lock == NULL) {
2142     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2143   }
2144 
2145   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2146 
2147 #if OMPT_SUPPORT && OMPT_OPTIONAL
2148   // This is the case, if called from omp_init_lock_with_hint:
2149   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2150   if (!codeptr)
2151     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2152   if (ompt_enabled.ompt_callback_lock_init) {
2153     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2154         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2155         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2156         codeptr);
2157   }
2158 #endif
2159 }
2160 
2161 #endif // KMP_USE_DYNAMIC_LOCK
2162 
2163 /* initialize the lock */
2164 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2165 #if KMP_USE_DYNAMIC_LOCK
2166 
2167   KMP_DEBUG_ASSERT(__kmp_init_serial);
2168   if (__kmp_env_consistency_check && user_lock == NULL) {
2169     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2170   }
2171   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2172 
2173 #if OMPT_SUPPORT && OMPT_OPTIONAL
2174   // This is the case, if called from omp_init_lock_with_hint:
2175   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2176   if (!codeptr)
2177     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2178   if (ompt_enabled.ompt_callback_lock_init) {
2179     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2180         ompt_mutex_lock, omp_lock_hint_none,
2181         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2182         codeptr);
2183   }
2184 #endif
2185 
2186 #else // KMP_USE_DYNAMIC_LOCK
2187 
2188   static char const *const func = "omp_init_lock";
2189   kmp_user_lock_p lck;
2190   KMP_DEBUG_ASSERT(__kmp_init_serial);
2191 
2192   if (__kmp_env_consistency_check) {
2193     if (user_lock == NULL) {
2194       KMP_FATAL(LockIsUninitialized, func);
2195     }
2196   }
2197 
2198   KMP_CHECK_USER_LOCK_INIT();
2199 
2200   if ((__kmp_user_lock_kind == lk_tas) &&
2201       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2202     lck = (kmp_user_lock_p)user_lock;
2203   }
2204 #if KMP_USE_FUTEX
2205   else if ((__kmp_user_lock_kind == lk_futex) &&
2206            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2207     lck = (kmp_user_lock_p)user_lock;
2208   }
2209 #endif
2210   else {
2211     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2212   }
2213   INIT_LOCK(lck);
2214   __kmp_set_user_lock_location(lck, loc);
2215 
2216 #if OMPT_SUPPORT && OMPT_OPTIONAL
2217   // This is the case, if called from omp_init_lock_with_hint:
2218   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2219   if (!codeptr)
2220     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2221   if (ompt_enabled.ompt_callback_lock_init) {
2222     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2223         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2224         (omp_wait_id_t)user_lock, codeptr);
2225   }
2226 #endif
2227 
2228 #if USE_ITT_BUILD
2229   __kmp_itt_lock_creating(lck);
2230 #endif /* USE_ITT_BUILD */
2231 
2232 #endif // KMP_USE_DYNAMIC_LOCK
2233 } // __kmpc_init_lock
2234 
2235 /* initialize the lock */
2236 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2237 #if KMP_USE_DYNAMIC_LOCK
2238 
2239   KMP_DEBUG_ASSERT(__kmp_init_serial);
2240   if (__kmp_env_consistency_check && user_lock == NULL) {
2241     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2242   }
2243   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2244 
2245 #if OMPT_SUPPORT && OMPT_OPTIONAL
2246   // This is the case, if called from omp_init_lock_with_hint:
2247   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2248   if (!codeptr)
2249     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2250   if (ompt_enabled.ompt_callback_lock_init) {
2251     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2252         ompt_mutex_nest_lock, omp_lock_hint_none,
2253         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2254         codeptr);
2255   }
2256 #endif
2257 
2258 #else // KMP_USE_DYNAMIC_LOCK
2259 
2260   static char const *const func = "omp_init_nest_lock";
2261   kmp_user_lock_p lck;
2262   KMP_DEBUG_ASSERT(__kmp_init_serial);
2263 
2264   if (__kmp_env_consistency_check) {
2265     if (user_lock == NULL) {
2266       KMP_FATAL(LockIsUninitialized, func);
2267     }
2268   }
2269 
2270   KMP_CHECK_USER_LOCK_INIT();
2271 
2272   if ((__kmp_user_lock_kind == lk_tas) &&
2273       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2274        OMP_NEST_LOCK_T_SIZE)) {
2275     lck = (kmp_user_lock_p)user_lock;
2276   }
2277 #if KMP_USE_FUTEX
2278   else if ((__kmp_user_lock_kind == lk_futex) &&
2279            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2280             OMP_NEST_LOCK_T_SIZE)) {
2281     lck = (kmp_user_lock_p)user_lock;
2282   }
2283 #endif
2284   else {
2285     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2286   }
2287 
2288   INIT_NESTED_LOCK(lck);
2289   __kmp_set_user_lock_location(lck, loc);
2290 
2291 #if OMPT_SUPPORT && OMPT_OPTIONAL
2292   // This is the case, if called from omp_init_lock_with_hint:
2293   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2294   if (!codeptr)
2295     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2296   if (ompt_enabled.ompt_callback_lock_init) {
2297     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2298         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2299         (omp_wait_id_t)user_lock, codeptr);
2300   }
2301 #endif
2302 
2303 #if USE_ITT_BUILD
2304   __kmp_itt_lock_creating(lck);
2305 #endif /* USE_ITT_BUILD */
2306 
2307 #endif // KMP_USE_DYNAMIC_LOCK
2308 } // __kmpc_init_nest_lock
2309 
2310 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2311 #if KMP_USE_DYNAMIC_LOCK
2312 
2313 #if USE_ITT_BUILD
2314   kmp_user_lock_p lck;
2315   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2316     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2317   } else {
2318     lck = (kmp_user_lock_p)user_lock;
2319   }
2320   __kmp_itt_lock_destroyed(lck);
2321 #endif
2322 #if OMPT_SUPPORT && OMPT_OPTIONAL
2323   // This is the case, if called from omp_init_lock_with_hint:
2324   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2325   if (!codeptr)
2326     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2327   if (ompt_enabled.ompt_callback_lock_destroy) {
2328     kmp_user_lock_p lck;
2329     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2330       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2331     } else {
2332       lck = (kmp_user_lock_p)user_lock;
2333     }
2334     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2335         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2336   }
2337 #endif
2338   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2339 #else
2340   kmp_user_lock_p lck;
2341 
2342   if ((__kmp_user_lock_kind == lk_tas) &&
2343       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2344     lck = (kmp_user_lock_p)user_lock;
2345   }
2346 #if KMP_USE_FUTEX
2347   else if ((__kmp_user_lock_kind == lk_futex) &&
2348            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2349     lck = (kmp_user_lock_p)user_lock;
2350   }
2351 #endif
2352   else {
2353     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2354   }
2355 
2356 #if OMPT_SUPPORT && OMPT_OPTIONAL
2357   // This is the case, if called from omp_init_lock_with_hint:
2358   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2359   if (!codeptr)
2360     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2361   if (ompt_enabled.ompt_callback_lock_destroy) {
2362     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2363         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2364   }
2365 #endif
2366 
2367 #if USE_ITT_BUILD
2368   __kmp_itt_lock_destroyed(lck);
2369 #endif /* USE_ITT_BUILD */
2370   DESTROY_LOCK(lck);
2371 
2372   if ((__kmp_user_lock_kind == lk_tas) &&
2373       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2374     ;
2375   }
2376 #if KMP_USE_FUTEX
2377   else if ((__kmp_user_lock_kind == lk_futex) &&
2378            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2379     ;
2380   }
2381 #endif
2382   else {
2383     __kmp_user_lock_free(user_lock, gtid, lck);
2384   }
2385 #endif // KMP_USE_DYNAMIC_LOCK
2386 } // __kmpc_destroy_lock
2387 
2388 /* destroy the lock */
2389 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2390 #if KMP_USE_DYNAMIC_LOCK
2391 
2392 #if USE_ITT_BUILD
2393   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2394   __kmp_itt_lock_destroyed(ilk->lock);
2395 #endif
2396 #if OMPT_SUPPORT && OMPT_OPTIONAL
2397   // This is the case, if called from omp_init_lock_with_hint:
2398   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2399   if (!codeptr)
2400     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2401   if (ompt_enabled.ompt_callback_lock_destroy) {
2402     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2403         ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2404   }
2405 #endif
2406   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2407 
2408 #else // KMP_USE_DYNAMIC_LOCK
2409 
2410   kmp_user_lock_p lck;
2411 
2412   if ((__kmp_user_lock_kind == lk_tas) &&
2413       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2414        OMP_NEST_LOCK_T_SIZE)) {
2415     lck = (kmp_user_lock_p)user_lock;
2416   }
2417 #if KMP_USE_FUTEX
2418   else if ((__kmp_user_lock_kind == lk_futex) &&
2419            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2420             OMP_NEST_LOCK_T_SIZE)) {
2421     lck = (kmp_user_lock_p)user_lock;
2422   }
2423 #endif
2424   else {
2425     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2426   }
2427 
2428 #if OMPT_SUPPORT && OMPT_OPTIONAL
2429   // This is the case, if called from omp_init_lock_with_hint:
2430   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2431   if (!codeptr)
2432     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2433   if (ompt_enabled.ompt_callback_lock_destroy) {
2434     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2435         ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2436   }
2437 #endif
2438 
2439 #if USE_ITT_BUILD
2440   __kmp_itt_lock_destroyed(lck);
2441 #endif /* USE_ITT_BUILD */
2442 
2443   DESTROY_NESTED_LOCK(lck);
2444 
2445   if ((__kmp_user_lock_kind == lk_tas) &&
2446       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2447        OMP_NEST_LOCK_T_SIZE)) {
2448     ;
2449   }
2450 #if KMP_USE_FUTEX
2451   else if ((__kmp_user_lock_kind == lk_futex) &&
2452            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2453             OMP_NEST_LOCK_T_SIZE)) {
2454     ;
2455   }
2456 #endif
2457   else {
2458     __kmp_user_lock_free(user_lock, gtid, lck);
2459   }
2460 #endif // KMP_USE_DYNAMIC_LOCK
2461 } // __kmpc_destroy_nest_lock
2462 
2463 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2464   KMP_COUNT_BLOCK(OMP_set_lock);
2465 #if KMP_USE_DYNAMIC_LOCK
2466   int tag = KMP_EXTRACT_D_TAG(user_lock);
2467 #if USE_ITT_BUILD
  // The itt function will get to the right lock object.
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2471 #endif
2472 #if OMPT_SUPPORT && OMPT_OPTIONAL
2473   // This is the case, if called from omp_init_lock_with_hint:
2474   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2475   if (!codeptr)
2476     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2477   if (ompt_enabled.ompt_callback_mutex_acquire) {
2478     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2479         ompt_mutex_lock, omp_lock_hint_none,
2480         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2481         codeptr);
2482   }
2483 #endif
2484 #if KMP_USE_INLINED_TAS
2485   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2486     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2487   } else
2488 #elif KMP_USE_INLINED_FUTEX
2489   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2490     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2491   } else
2492 #endif
2493   {
2494     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2495   }
2496 #if USE_ITT_BUILD
2497   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2498 #endif
2499 #if OMPT_SUPPORT && OMPT_OPTIONAL
2500   if (ompt_enabled.ompt_callback_mutex_acquired) {
2501     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2502         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2503   }
2504 #endif
2505 
2506 #else // KMP_USE_DYNAMIC_LOCK
2507 
2508   kmp_user_lock_p lck;
2509 
2510   if ((__kmp_user_lock_kind == lk_tas) &&
2511       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2512     lck = (kmp_user_lock_p)user_lock;
2513   }
2514 #if KMP_USE_FUTEX
2515   else if ((__kmp_user_lock_kind == lk_futex) &&
2516            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2517     lck = (kmp_user_lock_p)user_lock;
2518   }
2519 #endif
2520   else {
2521     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2522   }
2523 
2524 #if USE_ITT_BUILD
2525   __kmp_itt_lock_acquiring(lck);
2526 #endif /* USE_ITT_BUILD */
2527 #if OMPT_SUPPORT && OMPT_OPTIONAL
2528   // This is the case, if called from omp_init_lock_with_hint:
2529   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2530   if (!codeptr)
2531     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2532   if (ompt_enabled.ompt_callback_mutex_acquire) {
2533     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2534         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2535         (omp_wait_id_t)lck, codeptr);
2536   }
2537 #endif
2538 
2539   ACQUIRE_LOCK(lck, gtid);
2540 
2541 #if USE_ITT_BUILD
2542   __kmp_itt_lock_acquired(lck);
2543 #endif /* USE_ITT_BUILD */
2544 
2545 #if OMPT_SUPPORT && OMPT_OPTIONAL
2546   if (ompt_enabled.ompt_callback_mutex_acquired) {
2547     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2548         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2549   }
2550 #endif
2551 
2552 #endif // KMP_USE_DYNAMIC_LOCK
2553 }
2554 
2555 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2556 #if KMP_USE_DYNAMIC_LOCK
2557 
2558 #if USE_ITT_BUILD
2559   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2560 #endif
2561 #if OMPT_SUPPORT && OMPT_OPTIONAL
2562   // This is the case, if called from omp_init_lock_with_hint:
2563   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2564   if (!codeptr)
2565     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2566   if (ompt_enabled.enabled) {
2567     if (ompt_enabled.ompt_callback_mutex_acquire) {
2568       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2569           ompt_mutex_nest_lock, omp_lock_hint_none,
2570           __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2571           codeptr);
2572     }
2573   }
2574 #endif
2575   int acquire_status =
2576       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2577   (void) acquire_status;
2578 #if USE_ITT_BUILD
2579   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2580 #endif
2581 
2582 #if OMPT_SUPPORT && OMPT_OPTIONAL
2583   if (ompt_enabled.enabled) {
2584     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2585       if (ompt_enabled.ompt_callback_mutex_acquired) {
2586         // lock_first
2587         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2588             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2589       }
2590     } else {
2591       if (ompt_enabled.ompt_callback_nest_lock) {
2592         // lock_next
2593         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2594             ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
2595       }
2596     }
2597   }
2598 #endif
2599 
2600 #else // KMP_USE_DYNAMIC_LOCK
2601   int acquire_status;
2602   kmp_user_lock_p lck;
2603 
2604   if ((__kmp_user_lock_kind == lk_tas) &&
2605       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2606        OMP_NEST_LOCK_T_SIZE)) {
2607     lck = (kmp_user_lock_p)user_lock;
2608   }
2609 #if KMP_USE_FUTEX
2610   else if ((__kmp_user_lock_kind == lk_futex) &&
2611            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2612             OMP_NEST_LOCK_T_SIZE)) {
2613     lck = (kmp_user_lock_p)user_lock;
2614   }
2615 #endif
2616   else {
2617     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2618   }
2619 
2620 #if USE_ITT_BUILD
2621   __kmp_itt_lock_acquiring(lck);
2622 #endif /* USE_ITT_BUILD */
2623 #if OMPT_SUPPORT && OMPT_OPTIONAL
2624   // This is the case, if called from omp_init_lock_with_hint:
2625   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2626   if (!codeptr)
2627     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2628   if (ompt_enabled.enabled) {
2629     if (ompt_enabled.ompt_callback_mutex_acquire) {
2630       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2631           ompt_mutex_nest_lock, omp_lock_hint_none,
2632           __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
2633     }
2634   }
2635 #endif
2636 
2637   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2638 
2639 #if USE_ITT_BUILD
2640   __kmp_itt_lock_acquired(lck);
2641 #endif /* USE_ITT_BUILD */
2642 
2643 #if OMPT_SUPPORT && OMPT_OPTIONAL
2644   if (ompt_enabled.enabled) {
2645     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2646       if (ompt_enabled.ompt_callback_mutex_acquired) {
2647         // lock_first
2648         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2649             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2650       }
2651     } else {
2652       if (ompt_enabled.ompt_callback_nest_lock) {
2653         // lock_next
2654         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2655             ompt_scope_begin, (omp_wait_id_t)lck, codeptr);
2656       }
2657     }
2658   }
2659 #endif
2660 
2661 #endif // KMP_USE_DYNAMIC_LOCK
2662 }
2663 
2664 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2665 #if KMP_USE_DYNAMIC_LOCK
2666 
2667   int tag = KMP_EXTRACT_D_TAG(user_lock);
2668 #if USE_ITT_BUILD
2669   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2670 #endif
2671 #if KMP_USE_INLINED_TAS
2672   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2673     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2674   } else
2675 #elif KMP_USE_INLINED_FUTEX
2676   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2677     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2678   } else
2679 #endif
2680   {
2681     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2682   }
2683 
2684 #if OMPT_SUPPORT && OMPT_OPTIONAL
2685   // This is the case, if called from omp_init_lock_with_hint:
2686   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2687   if (!codeptr)
2688     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2689   if (ompt_enabled.ompt_callback_mutex_released) {
2690     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2691         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2692   }
2693 #endif
2694 
2695 #else // KMP_USE_DYNAMIC_LOCK
2696 
2697   kmp_user_lock_p lck;
2698 
2699   /* Can't use serial interval since not block structured */
2700   /* release the lock */
2701 
2702   if ((__kmp_user_lock_kind == lk_tas) &&
2703       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2704 #if KMP_OS_LINUX &&                                                            \
2705     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2706 // "fast" path implemented to fix customer performance issue
2707 #if USE_ITT_BUILD
2708     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2709 #endif /* USE_ITT_BUILD */
2710     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2711     KMP_MB();
2712 
2713 #if OMPT_SUPPORT && OMPT_OPTIONAL
2714     // This is the case, if called from omp_init_lock_with_hint:
2715     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2716     if (!codeptr)
2717       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2718     if (ompt_enabled.ompt_callback_mutex_released) {
2719       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2720           ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2721     }
2722 #endif
2723 
2724     return;
2725 #else
2726     lck = (kmp_user_lock_p)user_lock;
2727 #endif
2728   }
2729 #if KMP_USE_FUTEX
2730   else if ((__kmp_user_lock_kind == lk_futex) &&
2731            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2732     lck = (kmp_user_lock_p)user_lock;
2733   }
2734 #endif
2735   else {
2736     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2737   }
2738 
2739 #if USE_ITT_BUILD
2740   __kmp_itt_lock_releasing(lck);
2741 #endif /* USE_ITT_BUILD */
2742 
2743   RELEASE_LOCK(lck, gtid);
2744 
2745 #if OMPT_SUPPORT && OMPT_OPTIONAL
2746   // This is the case, if called from omp_init_lock_with_hint:
2747   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2748   if (!codeptr)
2749     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2750   if (ompt_enabled.ompt_callback_mutex_released) {
2751     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2752         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2753   }
2754 #endif
2755 
2756 #endif // KMP_USE_DYNAMIC_LOCK
2757 }
2758 
2759 /* release the lock */
2760 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2761 #if KMP_USE_DYNAMIC_LOCK
2762 
2763 #if USE_ITT_BUILD
2764   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2765 #endif
2766   int release_status =
2767       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2768   (void) release_status;
2769 
2770 #if OMPT_SUPPORT && OMPT_OPTIONAL
2771   // This is the case, if called from omp_init_lock_with_hint:
2772   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2773   if (!codeptr)
2774     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2775   if (ompt_enabled.enabled) {
2776     if (release_status == KMP_LOCK_RELEASED) {
2777       if (ompt_enabled.ompt_callback_mutex_released) {
2778         // release_lock_last
2779         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2780             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2781       }
2782     } else if (ompt_enabled.ompt_callback_nest_lock) {
2783       // release_lock_prev
2784       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2785           ompt_scope_end, (omp_wait_id_t)user_lock, codeptr);
2786     }
2787   }
2788 #endif
2789 
2790 #else // KMP_USE_DYNAMIC_LOCK
2791 
2792   kmp_user_lock_p lck;
2793 
2794   /* Can't use serial interval since not block structured */
2795 
2796   if ((__kmp_user_lock_kind == lk_tas) &&
2797       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2798        OMP_NEST_LOCK_T_SIZE)) {
2799 #if KMP_OS_LINUX &&                                                            \
2800     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2801     // "fast" path implemented to fix customer performance issue
2802     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2803 #if USE_ITT_BUILD
2804     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2805 #endif /* USE_ITT_BUILD */
2806 
2807 #if OMPT_SUPPORT && OMPT_OPTIONAL
2808     int release_status = KMP_LOCK_STILL_HELD;
2809 #endif
2810 
2811     if (--(tl->lk.depth_locked) == 0) {
2812       TCW_4(tl->lk.poll, 0);
2813 #if OMPT_SUPPORT && OMPT_OPTIONAL
2814       release_status = KMP_LOCK_RELEASED;
2815 #endif
2816     }
2817     KMP_MB();
2818 
2819 #if OMPT_SUPPORT && OMPT_OPTIONAL
2820     // This is the case, if called from omp_init_lock_with_hint:
2821     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2822     if (!codeptr)
2823       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2824     if (ompt_enabled.enabled) {
2825       if (release_status == KMP_LOCK_RELEASED) {
2826         if (ompt_enabled.ompt_callback_mutex_released) {
2827           // release_lock_last
2828           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2829               ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2830         }
2831       } else if (ompt_enabled.ompt_callback_nest_lock) {
2832         // release_lock_previous
2833         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (omp_wait_id_t)lck, codeptr);
2835       }
2836     }
2837 #endif
2838 
2839     return;
2840 #else
2841     lck = (kmp_user_lock_p)user_lock;
2842 #endif
2843   }
2844 #if KMP_USE_FUTEX
2845   else if ((__kmp_user_lock_kind == lk_futex) &&
2846            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2847             OMP_NEST_LOCK_T_SIZE)) {
2848     lck = (kmp_user_lock_p)user_lock;
2849   }
2850 #endif
2851   else {
2852     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2853   }
2854 
2855 #if USE_ITT_BUILD
2856   __kmp_itt_lock_releasing(lck);
2857 #endif /* USE_ITT_BUILD */
2858 
2859   int release_status;
2860   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2861 #if OMPT_SUPPORT && OMPT_OPTIONAL
2862   // This is the case, if called from omp_init_lock_with_hint:
2863   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2864   if (!codeptr)
2865     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2866   if (ompt_enabled.enabled) {
2867     if (release_status == KMP_LOCK_RELEASED) {
2868       if (ompt_enabled.ompt_callback_mutex_released) {
2869         // release_lock_last
2870         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2871             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2872       }
2873     } else if (ompt_enabled.ompt_callback_nest_lock) {
2874       // release_lock_previous
2875       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (omp_wait_id_t)lck, codeptr);
2877     }
2878   }
2879 #endif
2880 
2881 #endif // KMP_USE_DYNAMIC_LOCK
2882 }
2883 
2884 /* try to acquire the lock */
2885 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2886   KMP_COUNT_BLOCK(OMP_test_lock);
2887 
2888 #if KMP_USE_DYNAMIC_LOCK
2889   int rc;
2890   int tag = KMP_EXTRACT_D_TAG(user_lock);
2891 #if USE_ITT_BUILD
2892   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2893 #endif
2894 #if OMPT_SUPPORT && OMPT_OPTIONAL
2895   // This is the case, if called from omp_init_lock_with_hint:
2896   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2897   if (!codeptr)
2898     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2899   if (ompt_enabled.ompt_callback_mutex_acquire) {
2900     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2901         ompt_mutex_lock, omp_lock_hint_none,
2902         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2903         codeptr);
2904   }
2905 #endif
2906 #if KMP_USE_INLINED_TAS
2907   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2908     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2909   } else
2910 #elif KMP_USE_INLINED_FUTEX
2911   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2912     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2913   } else
2914 #endif
2915   {
2916     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2917   }
2918   if (rc) {
2919 #if USE_ITT_BUILD
2920     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2921 #endif
2922 #if OMPT_SUPPORT && OMPT_OPTIONAL
2923     if (ompt_enabled.ompt_callback_mutex_acquired) {
2924       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2925           ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2926     }
2927 #endif
2928     return FTN_TRUE;
2929   } else {
2930 #if USE_ITT_BUILD
2931     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2932 #endif
2933     return FTN_FALSE;
2934   }
2935 
2936 #else // KMP_USE_DYNAMIC_LOCK
2937 
2938   kmp_user_lock_p lck;
2939   int rc;
2940 
2941   if ((__kmp_user_lock_kind == lk_tas) &&
2942       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2943     lck = (kmp_user_lock_p)user_lock;
2944   }
2945 #if KMP_USE_FUTEX
2946   else if ((__kmp_user_lock_kind == lk_futex) &&
2947            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2948     lck = (kmp_user_lock_p)user_lock;
2949   }
2950 #endif
2951   else {
2952     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
2953   }
2954 
2955 #if USE_ITT_BUILD
2956   __kmp_itt_lock_acquiring(lck);
2957 #endif /* USE_ITT_BUILD */
2958 #if OMPT_SUPPORT && OMPT_OPTIONAL
2959   // This is the case, if called from omp_init_lock_with_hint:
2960   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2961   if (!codeptr)
2962     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2963   if (ompt_enabled.ompt_callback_mutex_acquire) {
2964     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2965         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2966         (omp_wait_id_t)lck, codeptr);
2967   }
2968 #endif
2969 
2970   rc = TEST_LOCK(lck, gtid);
2971 #if USE_ITT_BUILD
2972   if (rc) {
2973     __kmp_itt_lock_acquired(lck);
2974   } else {
2975     __kmp_itt_lock_cancelled(lck);
2976   }
2977 #endif /* USE_ITT_BUILD */
2978 #if OMPT_SUPPORT && OMPT_OPTIONAL
2979   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
2980     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2981         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2982   }
2983 #endif
2984 
2985   return (rc ? FTN_TRUE : FTN_FALSE);
2986 
2987 /* Can't use serial interval since not block structured */
2988 
2989 #endif // KMP_USE_DYNAMIC_LOCK
2990 }
2991 
2992 /* try to acquire the lock */
2993 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2994 #if KMP_USE_DYNAMIC_LOCK
2995   int rc;
2996 #if USE_ITT_BUILD
2997   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2998 #endif
2999 #if OMPT_SUPPORT && OMPT_OPTIONAL
3000   // This is the case, if called from omp_init_lock_with_hint:
3001   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3002   if (!codeptr)
3003     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3004   if (ompt_enabled.ompt_callback_mutex_acquire) {
3005     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3006         ompt_mutex_nest_lock, omp_lock_hint_none,
3007         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
3008         codeptr);
3009   }
3010 #endif
3011   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3012 #if USE_ITT_BUILD
3013   if (rc) {
3014     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3015   } else {
3016     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3017   }
3018 #endif
3019 #if OMPT_SUPPORT && OMPT_OPTIONAL
3020   if (ompt_enabled.enabled && rc) {
3021     if (rc == 1) {
3022       if (ompt_enabled.ompt_callback_mutex_acquired) {
3023         // lock_first
3024         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3025             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
3026       }
3027     } else {
3028       if (ompt_enabled.ompt_callback_nest_lock) {
3029         // lock_next
3030         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3031             ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
3032       }
3033     }
3034   }
3035 #endif
3036   return rc;
3037 
3038 #else // KMP_USE_DYNAMIC_LOCK
3039 
3040   kmp_user_lock_p lck;
3041   int rc;
3042 
3043   if ((__kmp_user_lock_kind == lk_tas) &&
3044       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3045        OMP_NEST_LOCK_T_SIZE)) {
3046     lck = (kmp_user_lock_p)user_lock;
3047   }
3048 #if KMP_USE_FUTEX
3049   else if ((__kmp_user_lock_kind == lk_futex) &&
3050            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3051             OMP_NEST_LOCK_T_SIZE)) {
3052     lck = (kmp_user_lock_p)user_lock;
3053   }
3054 #endif
3055   else {
3056     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3057   }
3058 
3059 #if USE_ITT_BUILD
3060   __kmp_itt_lock_acquiring(lck);
3061 #endif /* USE_ITT_BUILD */
3062 
3063 #if OMPT_SUPPORT && OMPT_OPTIONAL
3064   // This is the case if called from omp_init_lock_with_hint:
3065   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3066   if (!codeptr)
3067     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3068   if (ompt_enabled.enabled &&
3069       ompt_enabled.ompt_callback_mutex_acquire) {
3070     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3071         ompt_mutex_nest_lock, omp_lock_hint_none,
3072         __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
3073   }
3074 #endif
3075 
3076   rc = TEST_NESTED_LOCK(lck, gtid);
3077 #if USE_ITT_BUILD
3078   if (rc) {
3079     __kmp_itt_lock_acquired(lck);
3080   } else {
3081     __kmp_itt_lock_cancelled(lck);
3082   }
3083 #endif /* USE_ITT_BUILD */
3084 #if OMPT_SUPPORT && OMPT_OPTIONAL
3085   if (ompt_enabled.enabled && rc) {
3086     if (rc == 1) {
3087       if (ompt_enabled.ompt_callback_mutex_acquired) {
3088         // lock_first
3089         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3090             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
3091       }
3092     } else {
3093       if (ompt_enabled.ompt_callback_nest_lock) {
3094         // lock_next
3095         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3096             ompt_scope_begin, (omp_wait_id_t)lck, codeptr);
3097       }
3098     }
3099   }
3100 #endif
3101   return rc;
3102 
3103 /* Can't use serial interval since not block structured */
3104 
3105 #endif // KMP_USE_DYNAMIC_LOCK
3106 }
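
/* Caller-side sketch (illustrative only; none of the names below are defined
   in this file). The value returned above surfaces to the user as the result
   of omp_test_nest_lock(): 0 when the lock is not acquired, otherwise the new
   nesting count (1 on first acquisition, >1 when the owner re-acquires), which
   is exactly what the rc == 1 branches above distinguish.

     omp_nest_lock_t l;
     omp_init_nest_lock(&l);
     int depth = omp_test_nest_lock(&l); // 1: first acquisition succeeded
     depth = omp_test_nest_lock(&l);     // 2: nested re-acquisition by owner
     omp_unset_nest_lock(&l);
     omp_unset_nest_lock(&l);
     omp_destroy_nest_lock(&l);
*/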
3107 
3108 // Interface to fast scalable reduce methods routines
3109 
3110 // keep the selected method in a thread local structure for cross-function
3111 // usage: will be used in __kmpc_end_reduce* functions;
3112 // another solution: to re-determine the method one more time in
3113 // __kmpc_end_reduce* functions (new prototype required then)
3114 // AT: which solution is better?
3115 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3116   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3117 
3118 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3119   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3120 
3121 // description of the packed_reduction_method variable: look at the macros in
3122 // kmp.h
3123 
3124 // used in a critical section reduce block
3125 static __forceinline void
3126 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3127                                           kmp_critical_name *crit) {
3128 
3129   // this lock was visible to a customer and to the threading profile tool as a
3130   // serial overhead span (although it's used for an internal purpose only)
3131   //            why was it visible in previous implementation?
3132   //            should we keep it visible in new reduce block?
3133   kmp_user_lock_p lck;
3134 
3135 #if KMP_USE_DYNAMIC_LOCK
3136 
3137   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3138   // Check if it is initialized.
3139   if (*lk == 0) {
3140     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3141       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3142                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3143     } else {
3144       __kmp_init_indirect_csptr(crit, loc, global_tid,
3145                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3146     }
3147   }
3148   // Branch for accessing the actual lock object and set operation. This
3149   // branching is inevitable since this lock initialization does not follow the
3150   // normal dispatch path (lock table is not used).
3151   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3152     lck = (kmp_user_lock_p)lk;
3153     KMP_DEBUG_ASSERT(lck != NULL);
3154     if (__kmp_env_consistency_check) {
3155       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3156     }
3157     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3158   } else {
3159     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3160     lck = ilk->lock;
3161     KMP_DEBUG_ASSERT(lck != NULL);
3162     if (__kmp_env_consistency_check) {
3163       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3164     }
3165     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3166   }
3167 
3168 #else // KMP_USE_DYNAMIC_LOCK
3169 
3170   // We know that the fast reduction code is only emitted by Intel compilers
3171   // with 32 byte critical sections. If there isn't enough space, then we
3172   // have to use a pointer.
3173   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3174     lck = (kmp_user_lock_p)crit;
3175   } else {
3176     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3177   }
3178   KMP_DEBUG_ASSERT(lck != NULL);
3179 
3180   if (__kmp_env_consistency_check)
3181     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3182 
3183   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3184 
3185 #endif // KMP_USE_DYNAMIC_LOCK
3186 }
3187 
3188 // used in a critical section reduce block
3189 static __forceinline void
3190 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3191                                         kmp_critical_name *crit) {
3192 
3193   kmp_user_lock_p lck;
3194 
3195 #if KMP_USE_DYNAMIC_LOCK
3196 
3197   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3198     lck = (kmp_user_lock_p)crit;
3199     if (__kmp_env_consistency_check)
3200       __kmp_pop_sync(global_tid, ct_critical, loc);
3201     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3202   } else {
3203     kmp_indirect_lock_t *ilk =
3204         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3205     if (__kmp_env_consistency_check)
3206       __kmp_pop_sync(global_tid, ct_critical, loc);
3207     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3208   }
3209 
3210 #else // KMP_USE_DYNAMIC_LOCK
3211 
3212   // We know that the fast reduction code is only emitted by Intel compilers
3213   // with 32 byte critical sections. If there isn't enough space, then we have
3214   // to use a pointer.
3215   if (__kmp_base_user_lock_size > 32) {
3216     lck = *((kmp_user_lock_p *)crit);
3217     KMP_ASSERT(lck != NULL);
3218   } else {
3219     lck = (kmp_user_lock_p)crit;
3220   }
3221 
3222   if (__kmp_env_consistency_check)
3223     __kmp_pop_sync(global_tid, ct_critical, loc);
3224 
3225   __kmp_release_user_lock_with_checks(lck, global_tid);
3226 
3227 #endif // KMP_USE_DYNAMIC_LOCK
3228 } // __kmp_end_critical_section_reduce_block
3229 
3230 #if OMP_40_ENABLED
3231 static __forceinline int
3232 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3233                                      int *task_state) {
3234   kmp_team_t *team;
3235 
3236   // Check if we are inside the teams construct?
3237   if (th->th.th_teams_microtask) {
3238     *team_p = team = th->th.th_team;
3239     if (team->t.t_level == th->th.th_teams_level) {
3240       // This is reduction at teams construct.
3241       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3242       // Let's swap teams temporarily for the reduction.
3243       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3244       th->th.th_team = team->t.t_parent;
3245       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3246       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3247       *task_state = th->th.th_task_state;
3248       th->th.th_task_state = 0;
3249 
3250       return 1;
3251     }
3252   }
3253   return 0;
3254 }
3255 
3256 static __forceinline void
3257 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3258   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3259   th->th.th_info.ds.ds_tid = 0;
3260   th->th.th_team = team;
3261   th->th.th_team_nproc = team->t.t_nproc;
3262   th->th.th_task_team = team->t.t_task_team[task_state];
3263   th->th.th_task_state = task_state;
3264 }
3265 #endif
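
/* Intended pairing of the two helpers above, as used by the __kmpc_reduce*
   entry points below (sketch):

     kmp_info_t *th = __kmp_thread_from_gtid(global_tid);
     kmp_team_t *team;
     int task_state;
     int teams_swapped =
         __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
     // ... perform the reduction against the parent team ...
     if (teams_swapped)
       __kmp_restore_swapped_teams(th, team, task_state);
*/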
3266 
3267 /* 2.a.i. Reduce Block without a terminating barrier */
3268 /*!
3269 @ingroup SYNCHRONIZATION
3270 @param loc source location information
3271 @param global_tid global thread number
3272 @param num_vars number of items (variables) to be reduced
3273 @param reduce_size size of data in bytes to be reduced
3274 @param reduce_data pointer to data to be reduced
3275 @param reduce_func callback function providing reduction operation on two
3276 operands and returning result of reduction in lhs_data
3277 @param lck pointer to the unique lock data structure
3278 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3279 threads if atomic reduction needed
3280 
3281 The nowait version is used for a reduce clause with the nowait argument.
3282 */
3283 kmp_int32
3284 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3285                      size_t reduce_size, void *reduce_data,
3286                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3287                      kmp_critical_name *lck) {
3288 
3289   KMP_COUNT_BLOCK(REDUCE_nowait);
3290   int retval = 0;
3291   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3292 #if OMP_40_ENABLED
3293   kmp_info_t *th;
3294   kmp_team_t *team;
3295   int teams_swapped = 0, task_state;
3296 #endif
3297   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3298 
3299   // why do we need this initialization here at all?
3300   // Reduction clause can not be used as a stand-alone directive.
3301 
3302   // do not call __kmp_serial_initialize(), it will be called by
3303   // __kmp_parallel_initialize() if needed
3304   // possible detection of false-positive race by the threadchecker ???
3305   if (!TCR_4(__kmp_init_parallel))
3306     __kmp_parallel_initialize();
3307 
3308 // check correctness of reduce block nesting
3309 #if KMP_USE_DYNAMIC_LOCK
3310   if (__kmp_env_consistency_check)
3311     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3312 #else
3313   if (__kmp_env_consistency_check)
3314     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3315 #endif
3316 
3317 #if OMP_40_ENABLED
3318   th = __kmp_thread_from_gtid(global_tid);
3319   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3320 #endif // OMP_40_ENABLED
3321 
3322   // The packed_reduction_method value will be reused by the __kmp_end_reduce*
3323   // functions, so it should be kept in a variable.
3324   // The variable should be either a construct-specific or a thread-specific
3325   // property, not a team-specific one
3326   //     (a thread can reach the next reduce block on the next construct, and
3327   //     the reduce method may differ on that construct).
3328   // An ident_t "loc" parameter could serve as a construct-specific property
3329   // (but what if loc == 0?)
3330   //     (and if both construct-specific and team-specific variables were
3331   //     shared, unnecessary extra syncs would be needed).
3332   // A thread-specific variable is better with regard to the two issues above
3333   // (next construct and extra syncs).
3334   // A thread-specific "th_local.reduction_method" variable is used currently.
3335   // Each thread executes the 'determine' and 'set' lines itself (no need to
3336   // run them on one thread only, which would require unnecessary extra syncs).
3337 
3338   packed_reduction_method = __kmp_determine_reduction_method(
3339       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3340   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3341 
3342   if (packed_reduction_method == critical_reduce_block) {
3343 
3344     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3345     retval = 1;
3346 
3347   } else if (packed_reduction_method == empty_reduce_block) {
3348 
3349     // usage: if team size == 1, no synchronization is required ( Intel
3350     // platforms only )
3351     retval = 1;
3352 
3353   } else if (packed_reduction_method == atomic_reduce_block) {
3354 
3355     retval = 2;
3356 
3357     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3358     // won't be called by the code gen)
3359     //     (this is not ideal, because the checking block is closed by this
3360     //     'pop' although the atomic operation has not been executed yet;
3361     //      it will execute slightly later, literally on the next
3362     //      instruction)
3363     if (__kmp_env_consistency_check)
3364       __kmp_pop_sync(global_tid, ct_reduce, loc);
3365 
3366   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3367                                    tree_reduce_block)) {
3368 
3369 // AT: performance issue: a real barrier here
3370 // AT:     (if master goes slow, other threads are blocked here waiting for the
3371 // master to come and release them)
3372 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3373 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3374 // be confusing to a customer)
3375 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3376 // might go faster and be more in line with sense of NOWAIT
3377 // AT: TO DO: do epcc test and compare times
3378 
3379 // this barrier should be invisible to a customer and to the threading profile
3380 // tool (it's neither a terminating barrier nor customer's code, it's
3381 // used for an internal purpose)
3382 #if OMPT_SUPPORT
3383     // JP: can this barrier potentially lead to task scheduling?
3384     // JP: as long as there is a barrier in the implementation, OMPT should and
3385     // will provide the barrier events,
3386     //         so we set up the necessary frame/return addresses.
3387     omp_frame_t *ompt_frame;
3388     if (ompt_enabled.enabled) {
3389       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3390       if (ompt_frame->enter_frame == NULL)
3391         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3392       OMPT_STORE_RETURN_ADDRESS(global_tid);
3393     }
3394 #endif
3395 #if USE_ITT_NOTIFY
3396     __kmp_threads[global_tid]->th.th_ident = loc;
3397 #endif
3398     retval =
3399         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3400                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3401     retval = (retval != 0) ? (0) : (1);
3402 #if OMPT_SUPPORT && OMPT_OPTIONAL
3403     if (ompt_enabled.enabled) {
3404       ompt_frame->enter_frame = NULL;
3405     }
3406 #endif
3407 
3408     // all other workers except master should do this pop here
3409     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3410     if (__kmp_env_consistency_check) {
3411       if (retval == 0) {
3412         __kmp_pop_sync(global_tid, ct_reduce, loc);
3413       }
3414     }
3415 
3416   } else {
3417 
3418     // should never reach this block
3419     KMP_ASSERT(0); // "unexpected method"
3420   }
3421 #if OMP_40_ENABLED
3422   if (teams_swapped) {
3423     __kmp_restore_swapped_teams(th, team, task_state);
3424   }
3425 #endif
3426   KA_TRACE(
3427       10,
3428       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3429        global_tid, packed_reduction_method, retval));
3430 
3431   return retval;
3432 }
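
/* A hedged sketch of the call pattern the code generator is expected to emit
   around __kmpc_reduce_nowait() for a reduction with the nowait clause. The
   user-level names (shared_sum, local_sum, my_reduce_func) are illustrative
   and not defined in this file; the switch arms mirror the documented return
   values (1: combine and call the end function, 2: combine atomically with no
   end call, as noted in the atomic branch above, 0: nothing left to do).

     static kmp_critical_name reduction_crit; // normally a compiler-emitted global
     switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(local_sum), &local_sum,
                                  my_reduce_func, &reduction_crit)) {
     case 1: // combine the partial result and close the reduce block
       shared_sum += local_sum;
       __kmpc_end_reduce_nowait(loc, gtid, &reduction_crit);
       break;
     case 2: // combine with an atomic update; no end call is generated
       break;
     default: // 0: other workers, the combined value is already produced
       break;
     }
*/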
3433 
3434 /*!
3435 @ingroup SYNCHRONIZATION
3436 @param loc source location information
3437 @param global_tid global thread id.
3438 @param lck pointer to the unique lock data structure
3439 
3440 Finish the execution of a reduce nowait.
3441 */
3442 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3443                               kmp_critical_name *lck) {
3444 
3445   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3446 
3447   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3448 
3449   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3450 
3451   if (packed_reduction_method == critical_reduce_block) {
3452 
3453     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3454 
3455   } else if (packed_reduction_method == empty_reduce_block) {
3456 
3457     // usage: if team size == 1, no synchronization is required ( on Intel
3458     // platforms only )
3459 
3460   } else if (packed_reduction_method == atomic_reduce_block) {
3461 
3462     // neither master nor other workers should get here
3463     //     (code gen does not generate this call in case 2: atomic reduce block)
3464     // actually it would be better to remove this else-if entirely;
3465     // after removal this value would be caught by the 'else' branch and assert
3466 
3467   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3468                                    tree_reduce_block)) {
3469 
3470     // only master gets here
3471 
3472   } else {
3473 
3474     // should never reach this block
3475     KMP_ASSERT(0); // "unexpected method"
3476   }
3477 
3478   if (__kmp_env_consistency_check)
3479     __kmp_pop_sync(global_tid, ct_reduce, loc);
3480 
3481   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3482                 global_tid, packed_reduction_method));
3483 
3484   return;
3485 }
3486 
3487 /* 2.a.ii. Reduce Block with a terminating barrier */
3488 
3489 /*!
3490 @ingroup SYNCHRONIZATION
3491 @param loc source location information
3492 @param global_tid global thread number
3493 @param num_vars number of items (variables) to be reduced
3494 @param reduce_size size of data in bytes to be reduced
3495 @param reduce_data pointer to data to be reduced
3496 @param reduce_func callback function providing reduction operation on two
3497 operands and returning result of reduction in lhs_data
3498 @param lck pointer to the unique lock data structure
3499 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3500 threads if atomic reduction needed
3501 
3502 A blocking reduce that includes an implicit barrier.
3503 */
3504 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3505                         size_t reduce_size, void *reduce_data,
3506                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3507                         kmp_critical_name *lck) {
3508   KMP_COUNT_BLOCK(REDUCE_wait);
3509   int retval = 0;
3510   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3511 #if OMP_40_ENABLED
3512   kmp_info_t *th;
3513   kmp_team_t *team;
3514   int teams_swapped = 0, task_state;
3515 #endif
3516 
3517   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3518 
3519   // why do we need this initialization here at all?
3520   // Reduction clause can not be a stand-alone directive.
3521 
3522   // do not call __kmp_serial_initialize(), it will be called by
3523   // __kmp_parallel_initialize() if needed
3524   // possible detection of false-positive race by the threadchecker ???
3525   if (!TCR_4(__kmp_init_parallel))
3526     __kmp_parallel_initialize();
3527 
3528 // check correctness of reduce block nesting
3529 #if KMP_USE_DYNAMIC_LOCK
3530   if (__kmp_env_consistency_check)
3531     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3532 #else
3533   if (__kmp_env_consistency_check)
3534     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3535 #endif
3536 
3537 #if OMP_40_ENABLED
3538   th = __kmp_thread_from_gtid(global_tid);
3539   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3540 #endif // OMP_40_ENABLED
3541 
3542   packed_reduction_method = __kmp_determine_reduction_method(
3543       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3544   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3545 
3546   if (packed_reduction_method == critical_reduce_block) {
3547 
3548     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3549     retval = 1;
3550 
3551   } else if (packed_reduction_method == empty_reduce_block) {
3552 
3553     // usage: if team size == 1, no synchronization is required ( Intel
3554     // platforms only )
3555     retval = 1;
3556 
3557   } else if (packed_reduction_method == atomic_reduce_block) {
3558 
3559     retval = 2;
3560 
3561   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3562                                    tree_reduce_block)) {
3563 
3564 // case tree_reduce_block:
3565 // this barrier should be visible to a customer and to the threading profile
3566 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3567 #if OMPT_SUPPORT
3568     omp_frame_t *ompt_frame;
3569     if (ompt_enabled.enabled) {
3570       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3571       if (ompt_frame->enter_frame == NULL)
3572         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3573       OMPT_STORE_RETURN_ADDRESS(global_tid);
3574     }
3575 #endif
3576 #if USE_ITT_NOTIFY
3577     __kmp_threads[global_tid]->th.th_ident =
3578         loc; // needed for correct notification of frames
3579 #endif
3580     retval =
3581         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3582                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3583     retval = (retval != 0) ? (0) : (1);
3584 #if OMPT_SUPPORT && OMPT_OPTIONAL
3585     if (ompt_enabled.enabled) {
3586       ompt_frame->enter_frame = NULL;
3587     }
3588 #endif
3589 
3590     // all other workers except master should do this pop here
3591     // ( none of other workers except master will enter __kmpc_end_reduce() )
3592     if (__kmp_env_consistency_check) {
3593       if (retval == 0) { // 0: all other workers; 1: master
3594         __kmp_pop_sync(global_tid, ct_reduce, loc);
3595       }
3596     }
3597 
3598   } else {
3599 
3600     // should never reach this block
3601     KMP_ASSERT(0); // "unexpected method"
3602   }
3603 #if OMP_40_ENABLED
3604   if (teams_swapped) {
3605     __kmp_restore_swapped_teams(th, team, task_state);
3606   }
3607 #endif
3608 
3609   KA_TRACE(10,
3610            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3611             global_tid, packed_reduction_method, retval));
3612 
3613   return retval;
3614 }
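
/* For the blocking form the sketch differs only in the end call: threads that
   receive 1 or 2 combine their value and then call __kmpc_end_reduce(), which
   provides the terminating barrier for the critical and atomic paths, while
   workers that receive 0 were already held at the tree-reduction barrier
   inside __kmpc_reduce() and, per the comments above, do not reach
   __kmpc_end_reduce(). Same illustrative names as in the nowait sketch:

     switch (__kmpc_reduce(loc, gtid, 1, sizeof(local_sum), &local_sum,
                           my_reduce_func, &reduction_crit)) {
     case 1:
       shared_sum += local_sum;
       __kmpc_end_reduce(loc, gtid, &reduction_crit);
       break;
     case 2: // atomic combine, then the terminating barrier in the end call
       __kmpc_end_reduce(loc, gtid, &reduction_crit);
       break;
     default: // 0: released by the master from within __kmpc_end_reduce()
       break;
     }
*/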
3615 
3616 /*!
3617 @ingroup SYNCHRONIZATION
3618 @param loc source location information
3619 @param global_tid global thread id.
3620 @param lck pointer to the unique lock data structure
3621 
3622 Finish the execution of a blocking reduce.
3623 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3624 start function.
3625 */
3626 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3627                        kmp_critical_name *lck) {
3628 
3629   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3630 #if OMP_40_ENABLED
3631   kmp_info_t *th;
3632   kmp_team_t *team;
3633   int teams_swapped = 0, task_state;
3634 #endif
3635 
3636   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3637 
3638 #if OMP_40_ENABLED
3639   th = __kmp_thread_from_gtid(global_tid);
3640   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3641 #endif // OMP_40_ENABLED
3642 
3643   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3644 
3645   // this barrier should be visible to a customer and to the threading profile
3646   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3647 
3648   if (packed_reduction_method == critical_reduce_block) {
3649 
3650     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3651 
3652 // TODO: implicit barrier: should be exposed
3653 #if OMPT_SUPPORT
3654     omp_frame_t *ompt_frame;
3655     if (ompt_enabled.enabled) {
3656       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3657       if (ompt_frame->enter_frame == NULL)
3658         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3659       OMPT_STORE_RETURN_ADDRESS(global_tid);
3660     }
3661 #endif
3662 #if USE_ITT_NOTIFY
3663     __kmp_threads[global_tid]->th.th_ident = loc;
3664 #endif
3665     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3666 #if OMPT_SUPPORT && OMPT_OPTIONAL
3667     if (ompt_enabled.enabled) {
3668       ompt_frame->enter_frame = NULL;
3669     }
3670 #endif
3671 
3672   } else if (packed_reduction_method == empty_reduce_block) {
3673 
3674 // usage: if team size==1, no synchronization is required (Intel platforms only)
3675 
3676 // TODO: implicit barrier: should be exposed
3677 #if OMPT_SUPPORT
3678     omp_frame_t *ompt_frame;
3679     if (ompt_enabled.enabled) {
3680       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3681       if (ompt_frame->enter_frame == NULL)
3682         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3683       OMPT_STORE_RETURN_ADDRESS(global_tid);
3684     }
3685 #endif
3686 #if USE_ITT_NOTIFY
3687     __kmp_threads[global_tid]->th.th_ident = loc;
3688 #endif
3689     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3690 #if OMPT_SUPPORT && OMPT_OPTIONAL
3691     if (ompt_enabled.enabled) {
3692       ompt_frame->enter_frame = NULL;
3693     }
3694 #endif
3695 
3696   } else if (packed_reduction_method == atomic_reduce_block) {
3697 
3698 #if OMPT_SUPPORT
3699     omp_frame_t *ompt_frame;
3700     if (ompt_enabled.enabled) {
3701       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3702       if (ompt_frame->enter_frame == NULL)
3703         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3704       OMPT_STORE_RETURN_ADDRESS(global_tid);
3705     }
3706 #endif
3707 // TODO: implicit barrier: should be exposed
3708 #if USE_ITT_NOTIFY
3709     __kmp_threads[global_tid]->th.th_ident = loc;
3710 #endif
3711     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3712 #if OMPT_SUPPORT && OMPT_OPTIONAL
3713     if (ompt_enabled.enabled) {
3714       ompt_frame->enter_frame = NULL;
3715     }
3716 #endif
3717 
3718   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3719                                    tree_reduce_block)) {
3720 
3721     // only master executes here (master releases all other workers)
3722     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3723                             global_tid);
3724 
3725   } else {
3726 
3727     // should never reach this block
3728     KMP_ASSERT(0); // "unexpected method"
3729   }
3730 #if OMP_40_ENABLED
3731   if (teams_swapped) {
3732     __kmp_restore_swapped_teams(th, team, task_state);
3733   }
3734 #endif
3735 
3736   if (__kmp_env_consistency_check)
3737     __kmp_pop_sync(global_tid, ct_reduce, loc);
3738 
3739   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3740                 global_tid, packed_reduction_method));
3741 
3742   return;
3743 }
3744 
3745 #undef __KMP_GET_REDUCTION_METHOD
3746 #undef __KMP_SET_REDUCTION_METHOD
3747 
3748 /* end of interface to fast scalable reduce routines */
3749 
3750 kmp_uint64 __kmpc_get_taskid() {
3751 
3752   kmp_int32 gtid;
3753   kmp_info_t *thread;
3754 
3755   gtid = __kmp_get_gtid();
3756   if (gtid < 0) {
3757     return 0;
3758   }
3759   thread = __kmp_thread_from_gtid(gtid);
3760   return thread->th.th_current_task->td_task_id;
3761 
3762 } // __kmpc_get_taskid
3763 
3764 kmp_uint64 __kmpc_get_parent_taskid() {
3765 
3766   kmp_int32 gtid;
3767   kmp_info_t *thread;
3768   kmp_taskdata_t *parent_task;
3769 
3770   gtid = __kmp_get_gtid();
3771   if (gtid < 0) {
3772     return 0;
3773   }
3774   thread = __kmp_thread_from_gtid(gtid);
3775   parent_task = thread->th.th_current_task->td_parent;
3776   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3777 
3778 } // __kmpc_get_parent_taskid
3779 
3780 #if OMP_45_ENABLED
3781 /*!
3782 @ingroup WORK_SHARING
3783 @param loc  source location information.
3784 @param gtid  global thread number.
3785 @param num_dims  number of associated doacross loops.
3786 @param dims  info on loops bounds.
3787 
3788 Initialize doacross loop information.
3789 The compiler is expected to pass us inclusive bounds,
3790 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3791 */
3792 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3793                           const struct kmp_dim *dims) {
3794   int j, idx;
3795   kmp_int64 last, trace_count;
3796   kmp_info_t *th = __kmp_threads[gtid];
3797   kmp_team_t *team = th->th.th_team;
3798   kmp_uint32 *flags;
3799   kmp_disp_t *pr_buf = th->th.th_dispatch;
3800   dispatch_shared_info_t *sh_buf;
3801 
3802   KA_TRACE(
3803       20,
3804       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3805        gtid, num_dims, !team->t.t_serialized));
3806   KMP_DEBUG_ASSERT(dims != NULL);
3807   KMP_DEBUG_ASSERT(num_dims > 0);
3808 
3809   if (team->t.t_serialized) {
3810     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3811     return; // no dependencies if team is serialized
3812   }
3813   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3814   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3815   // the next loop
3816   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3817 
3818   // Save bounds info into allocated private buffer
3819   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3820   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3821       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3822   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3823   pr_buf->th_doacross_info[0] =
3824       (kmp_int64)num_dims; // first element is number of dimensions
3825   // Save also address of num_done in order to access it later without knowing
3826   // the buffer index
3827   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3828   pr_buf->th_doacross_info[2] = dims[0].lo;
3829   pr_buf->th_doacross_info[3] = dims[0].up;
3830   pr_buf->th_doacross_info[4] = dims[0].st;
3831   last = 5;
3832   for (j = 1; j < num_dims; ++j) {
3833     kmp_int64
3834         range_length; // To keep ranges of all dimensions but the first dims[0]
3835     if (dims[j].st == 1) { // most common case
3836       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3837       range_length = dims[j].up - dims[j].lo + 1;
3838     } else {
3839       if (dims[j].st > 0) {
3840         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3841         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3842       } else { // negative increment
3843         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3844         range_length =
3845             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3846       }
3847     }
3848     pr_buf->th_doacross_info[last++] = range_length;
3849     pr_buf->th_doacross_info[last++] = dims[j].lo;
3850     pr_buf->th_doacross_info[last++] = dims[j].up;
3851     pr_buf->th_doacross_info[last++] = dims[j].st;
3852   }
3853 
3854   // Compute total trip count.
3855   // Start with range of dims[0] which we don't need to keep in the buffer.
3856   if (dims[0].st == 1) { // most common case
3857     trace_count = dims[0].up - dims[0].lo + 1;
3858   } else if (dims[0].st > 0) {
3859     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3860     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3861   } else { // negative increment
3862     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3863     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3864   }
3865   for (j = 1; j < num_dims; ++j) {
3866     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3867   }
3868   KMP_DEBUG_ASSERT(trace_count > 0);
3869 
3870   // Check that the shared buffer is not still occupied by another loop
3871   // (namely the one that used buffer index idx - __kmp_dispatch_num_buffers)
3872   if (idx != sh_buf->doacross_buf_idx) {
3873     // Shared buffer is occupied, wait for it to be free
3874     __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3875                        __kmp_eq_4, NULL);
3876   }
3877 #if KMP_32_BIT_ARCH
3878   // Check if we are the first thread. After the CAS the first thread gets 0,
3879   // others get 1 if initialization is in progress, allocated pointer otherwise.
3880   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3881   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3882       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3883 #else
3884   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3885       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3886 #endif
3887   if (flags == NULL) {
3888     // we are the first thread, allocate the array of flags
3889     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3890     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3891     KMP_MB();
3892     sh_buf->doacross_flags = flags;
3893   } else if (flags == (kmp_uint32 *)1) {
3894 #if KMP_32_BIT_ARCH
3895     // initialization is still in progress, need to wait
3896     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3897 #else
3898     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3899 #endif
3900       KMP_YIELD(TRUE);
3901     KMP_MB();
3902   } else {
3903     KMP_MB();
3904   }
3905   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3906   pr_buf->th_doacross_flags =
3907       sh_buf->doacross_flags; // save private copy in order to not
3908   // touch shared buffer on each iteration
3909   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3910 }
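
/* Layout of the private th_doacross_info buffer filled above, which the wait
   and post routines below index into (j >= 1):
     info[0]                  : num_dims
     info[1]                  : address of sh_buf->doacross_num_done
     info[2], info[3], info[4]: lo, up, st of dimension 0
     info[4*j+1 .. 4*j+4]     : range_length, lo, up, st of dimension j
   so a loop nest with num_dims dimensions occupies 4*num_dims+1 kmp_int64
   slots, e.g. 9 slots for a 2-dimensional nest.
*/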
3911 
3912 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3913   kmp_int32 shft, num_dims, i;
3914   kmp_uint32 flag;
3915   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3916   kmp_info_t *th = __kmp_threads[gtid];
3917   kmp_team_t *team = th->th.th_team;
3918   kmp_disp_t *pr_buf;
3919   kmp_int64 lo, up, st;
3920 
3921   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3922   if (team->t.t_serialized) {
3923     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3924     return; // no dependencies if team is serialized
3925   }
3926 
3927   // calculate sequential iteration number and check out-of-bounds condition
3928   pr_buf = th->th.th_dispatch;
3929   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3930   num_dims = pr_buf->th_doacross_info[0];
3931   lo = pr_buf->th_doacross_info[2];
3932   up = pr_buf->th_doacross_info[3];
3933   st = pr_buf->th_doacross_info[4];
3934   if (st == 1) { // most common case
3935     if (vec[0] < lo || vec[0] > up) {
3936       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3937                     "bounds [%lld,%lld]\n",
3938                     gtid, vec[0], lo, up));
3939       return;
3940     }
3941     iter_number = vec[0] - lo;
3942   } else if (st > 0) {
3943     if (vec[0] < lo || vec[0] > up) {
3944       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3945                     "bounds [%lld,%lld]\n",
3946                     gtid, vec[0], lo, up));
3947       return;
3948     }
3949     iter_number = (kmp_uint64)(vec[0] - lo) / st;
3950   } else { // negative increment
3951     if (vec[0] > lo || vec[0] < up) {
3952       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3953                     "bounds [%lld,%lld]\n",
3954                     gtid, vec[0], lo, up));
3955       return;
3956     }
3957     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3958   }
3959   for (i = 1; i < num_dims; ++i) {
3960     kmp_int64 iter, ln;
3961     kmp_int32 j = i * 4;
3962     ln = pr_buf->th_doacross_info[j + 1];
3963     lo = pr_buf->th_doacross_info[j + 2];
3964     up = pr_buf->th_doacross_info[j + 3];
3965     st = pr_buf->th_doacross_info[j + 4];
3966     if (st == 1) {
3967       if (vec[i] < lo || vec[i] > up) {
3968         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3969                       "bounds [%lld,%lld]\n",
3970                       gtid, vec[i], lo, up));
3971         return;
3972       }
3973       iter = vec[i] - lo;
3974     } else if (st > 0) {
3975       if (vec[i] < lo || vec[i] > up) {
3976         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3977                       "bounds [%lld,%lld]\n",
3978                       gtid, vec[i], lo, up));
3979         return;
3980       }
3981       iter = (kmp_uint64)(vec[i] - lo) / st;
3982     } else { // st < 0
3983       if (vec[i] > lo || vec[i] < up) {
3984         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3985                       "bounds [%lld,%lld]\n",
3986                       gtid, vec[i], lo, up));
3987         return;
3988       }
3989       iter = (kmp_uint64)(lo - vec[i]) / (-st);
3990     }
3991     iter_number = iter + ln * iter_number;
3992   }
3993   shft = iter_number % 32; // use 32-bit granularity
3994   iter_number >>= 5; // divided by 32
3995   flag = 1 << shft;
3996   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
3997     KMP_YIELD(TRUE);
3998   }
3999   KMP_MB();
4000   KA_TRACE(20,
4001            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4002             gtid, (iter_number << 5) + shft));
4003 }
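
/* Worked example of the linearization and flag indexing above (numbers are
   illustrative). For a 2-dimensional nest where dimension 1 has
   range_length ln = 40, and both dimensions have lo = 0 and st = 1, the
   vector {3, 7} gives
     iter_number = 7 + 40 * 3 = 127
   which maps to flags word 127 >> 5 = 3 and bit mask 1 << (127 % 32), i.e.
   1 << 31; the wait spins until __kmpc_doacross_post() for that iteration has
   set the same bit.
*/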
4004 
4005 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4006   kmp_int32 shft, num_dims, i;
4007   kmp_uint32 flag;
4008   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4009   kmp_info_t *th = __kmp_threads[gtid];
4010   kmp_team_t *team = th->th.th_team;
4011   kmp_disp_t *pr_buf;
4012   kmp_int64 lo, st;
4013 
4014   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4015   if (team->t.t_serialized) {
4016     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4017     return; // no dependencies if team is serialized
4018   }
4019 
4020   // calculate sequential iteration number (same as in "wait" but no
4021   // out-of-bounds checks)
4022   pr_buf = th->th.th_dispatch;
4023   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4024   num_dims = pr_buf->th_doacross_info[0];
4025   lo = pr_buf->th_doacross_info[2];
4026   st = pr_buf->th_doacross_info[4];
4027   if (st == 1) { // most common case
4028     iter_number = vec[0] - lo;
4029   } else if (st > 0) {
4030     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4031   } else { // negative increment
4032     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4033   }
4034   for (i = 1; i < num_dims; ++i) {
4035     kmp_int64 iter, ln;
4036     kmp_int32 j = i * 4;
4037     ln = pr_buf->th_doacross_info[j + 1];
4038     lo = pr_buf->th_doacross_info[j + 2];
4039     st = pr_buf->th_doacross_info[j + 4];
4040     if (st == 1) {
4041       iter = vec[i] - lo;
4042     } else if (st > 0) {
4043       iter = (kmp_uint64)(vec[i] - lo) / st;
4044     } else { // st < 0
4045       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4046     }
4047     iter_number = iter + ln * iter_number;
4048   }
4049   shft = iter_number % 32; // use 32-bit granularity
4050   iter_number >>= 5; // divided by 32
4051   flag = 1 << shft;
4052   KMP_MB();
4053   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4054     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4055   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4056                 (iter_number << 5) + shft));
4057 }
4058 
4059 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4060   kmp_int32 num_done;
4061   kmp_info_t *th = __kmp_threads[gtid];
4062   kmp_team_t *team = th->th.th_team;
4063   kmp_disp_t *pr_buf = th->th.th_dispatch;
4064 
4065   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4066   if (team->t.t_serialized) {
4067     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4068     return; // nothing to do
4069   }
4070   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4071   if (num_done == th->th.th_team_nproc) {
4072     // we are the last thread, need to free shared resources
4073     int idx = pr_buf->th_doacross_buf_idx - 1;
4074     dispatch_shared_info_t *sh_buf =
4075         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4076     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4077                      (kmp_int64)&sh_buf->doacross_num_done);
4078     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4079     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4080     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4081     sh_buf->doacross_flags = NULL;
4082     sh_buf->doacross_num_done = 0;
4083     sh_buf->doacross_buf_idx +=
4084         __kmp_dispatch_num_buffers; // free buffer for future re-use
4085   }
4086   // free private resources (need to keep buffer index forever)
4087   pr_buf->th_doacross_flags = NULL;
4088   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4089   pr_buf->th_doacross_info = NULL;
4090   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4091 }
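
/* Hedged sketch of the overall sequence a compiler might emit for a doacross
   loop such as
     #pragma omp for ordered(1)
     for (i = 2; i < 9; i += 2) {
       #pragma omp ordered depend(sink: i - 2)
       ...
       #pragma omp ordered depend(source)
     }
   Only the __kmpc_* entry points are from this file; the loop driver and
   variable names are illustrative.

     struct kmp_dim dims = {2, 8, 2}; // lo, up, st (inclusive bounds)
     __kmpc_doacross_init(loc, gtid, 1, &dims);
     for (each chunk iteration i assigned to this thread) {
       kmp_int64 sink_vec = i - 2;
       __kmpc_doacross_wait(loc, gtid, &sink_vec); // depend(sink: i - 2)
       // ... loop body ...
       kmp_int64 src_vec = i;
       __kmpc_doacross_post(loc, gtid, &src_vec);  // depend(source)
     }
     __kmpc_doacross_fini(loc, gtid);
*/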
4092 #endif
4093 
4094 #if OMP_50_ENABLED
4095 int __kmpc_get_target_offload(void) {
4096   if (!__kmp_init_serial) {
4097     __kmp_serial_initialize();
4098   }
4099   return __kmp_target_offload;
4100 }
4101 #endif // OMP_50_ENABLED
4102 
4103 // end of file //
4104