1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #define __KMP_IMP
15 #include "omp.h" /* extern "C" declarations of user-visible routines */
16 #include "kmp.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_itt.h"
20 #include "kmp_lock.h"
21 #include "kmp_stats.h"
22 
23 #if OMPT_SUPPORT
24 #include "ompt-specific.h"
25 #endif
26 
27 #define MAX_MESSAGE 512
28 
29 // flags will be used in future, e.g. to implement openmp_strict library
30 // restrictions
31 
32 /*!
33  * @ingroup STARTUP_SHUTDOWN
34  * @param loc   in   source location information
35  * @param flags in   for future use (currently ignored)
36  *
37  * Initialize the runtime library. This call is optional; if it is not made then
38  * it will be implicitly called by attempts to use other library functions.
39  */
40 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
41   // By default __kmpc_begin() is a no-op.
42   char *env;
43   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
44       __kmp_str_match_true(env)) {
45     __kmp_middle_initialize();
46     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
47   } else if (__kmp_ignore_mppbeg() == FALSE) {
48     // By default __kmp_ignore_mppbeg() returns TRUE.
49     __kmp_internal_begin();
50     KC_TRACE(10, ("__kmpc_begin: called\n"));
51   }
52 }
53 
54 /*!
55  * @ingroup STARTUP_SHUTDOWN
56  * @param loc source location information
57  *
58  * Shut down the runtime library. This is also optional, and even if called it
59  * will not do anything unless the `KMP_IGNORE_MPPEND` environment variable is
60  * set to zero.
61  */
62 void __kmpc_end(ident_t *loc) {
63   // By default, __kmp_ignore_mppend() returns TRUE, which makes the __kmpc_end()
64   // call a no-op. However, this can be overridden with the KMP_IGNORE_MPPEND
65   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
66   // returns FALSE and __kmpc_end() will unregister this root (it can cause
67   // library shut down).
68   if (__kmp_ignore_mppend() == FALSE) {
69     KC_TRACE(10, ("__kmpc_end: called\n"));
70     KA_TRACE(30, ("__kmpc_end\n"));
71 
72     __kmp_internal_end_thread(-1);
73   }
74 #if KMP_OS_WINDOWS && OMPT_SUPPORT
75   // Normal exit process on Windows does not allow worker threads of the final
76   // parallel region to finish reporting their events, so shutting down the
77   // library here fixes the issue at least for the cases where __kmpc_end() is
78   // placed properly.
79   if (ompt_enabled.enabled)
80     __kmp_internal_end_library(__kmp_gtid_get_specific());
81 #endif
82 }
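
/* Illustrative sketch (comment only, not compiled): a compiler that chooses to
   emit the optional begin/end calls might bracket the program as shown below.
   The "loc" initializer and its psource string are hypothetical examples of an
   ident_t; KMP_IDENT_KMPC is the usual flag for these entry points.

     static ident_t loc = {0, KMP_IDENT_KMPC, 0, 0, ";file;main;1;1;;"};

     int main() {
       __kmpc_begin(&loc, 0); // optional; the library self-initializes anyway
       // ... program, including OpenMP constructs ...
       __kmpc_end(&loc); // no-op unless KMP_IGNORE_MPPEND=0
       return 0;
     }
*/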
83 
84 /*!
85 @ingroup THREAD_STATES
86 @param loc Source location information.
87 @return The global thread index of the active thread.
88 
89 This function can be called in any context.
90 
91 If the runtime has only been entered at the outermost level from a
92 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
93 that which would be returned by omp_get_thread_num() in the outermost
94 active parallel construct. (Or zero if there is no active parallel
95 construct, since the master thread is necessarily thread zero).
96 
97 If multiple non-OpenMP threads all enter an OpenMP construct then this
98 will be a unique thread identifier among all the threads created by
99 the OpenMP runtime (but the value cannot be defined in terms of
100 OpenMP thread ids returned by omp_get_thread_num()).
101 */
102 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
103   kmp_int32 gtid = __kmp_entry_gtid();
104 
105   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
106 
107   return gtid;
108 }
109 
110 /*!
111 @ingroup THREAD_STATES
112 @param loc Source location information.
113 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
114 
115 This function can be called in any context.
116 It returns the total number of threads under the control of the OpenMP runtime.
117 That is not a number that can be determined by any OpenMP standard calls, since
118 the library may be called from more than one non-OpenMP thread, and this
119 reflects the total over all such calls. Similarly, because the runtime keeps
120 underlying threads alive even when they are not active (the cost of creating
121 and destroying OS threads is high), this call counts all such threads even if
122 they are not currently waiting for work.
123 */
124 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
125   KC_TRACE(10,
126            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
127 
128   return TCR_4(__kmp_all_nth);
129 }
130 
131 /*!
132 @ingroup THREAD_STATES
133 @param loc Source location information.
134 @return The thread number of the calling thread in the innermost active parallel
135 construct.
136 */
137 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
138   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
139   return __kmp_tid_from_gtid(__kmp_entry_gtid());
140 }
141 
142 /*!
143 @ingroup THREAD_STATES
144 @param loc Source location information.
145 @return The number of threads in the innermost active parallel construct.
146 */
147 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
148   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
149 
150   return __kmp_entry_thread()->th.th_team->t.t_nproc;
151 }
152 
153 /*!
154  * @ingroup DEPRECATED
155  * @param loc location description
156  *
157  * This function need not be called. It always returns TRUE.
158  */
159 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
160 #ifndef KMP_DEBUG
161 
162   return TRUE;
163 
164 #else
165 
166   const char *semi2;
167   const char *semi3;
168   int line_no;
169 
170   if (__kmp_par_range == 0) {
171     return TRUE;
172   }
173   semi2 = loc->psource;
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   semi2 = strchr(semi2 + 1, ';');
182   if (semi2 == NULL) {
183     return TRUE;
184   }
185   if (__kmp_par_range_filename[0]) {
186     const char *name = semi2 - 1;
187     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
188       name--;
189     }
190     if ((*name == '/') || (*name == ';')) {
191       name++;
192     }
193     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
194       return __kmp_par_range < 0;
195     }
196   }
197   semi3 = strchr(semi2 + 1, ';');
198   if (__kmp_par_range_routine[0]) {
199     if ((semi3 != NULL) && (semi3 > semi2) &&
200         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
201       return __kmp_par_range < 0;
202     }
203   }
204   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
205     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
206       return __kmp_par_range > 0;
207     }
208     return __kmp_par_range < 0;
209   }
210   return TRUE;
211 
212 #endif /* KMP_DEBUG */
213 }
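
/* Note on the parsing above (illustrative, based on the layout this code
   assumes): loc->psource is a semicolon-separated string, typically of the
   form

     ";path/to/file.c;routine_name;begin_line;end_line;;"

   so the filename lies between the first and second ';' (trimmed at the last
   '/'), the routine name between the second and third ';', and the line
   number used for the KMP_PAR_RANGE check is read from the field after the
   third ';'. The exact contents are produced by the compiler and may vary. */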
214 
215 /*!
216 @ingroup THREAD_STATES
217 @param loc Source location information.
218 @return 1 if this thread is executing inside an active parallel region, zero if
219 not.
220 */
221 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
222   return __kmp_entry_thread()->th.th_root->r.r_active;
223 }
224 
225 /*!
226 @ingroup PARALLEL
227 @param loc source location information
228 @param global_tid global thread number
229 @param num_threads number of threads requested for this parallel construct
230 
231 Set the number of threads to be used by the next fork spawned by this thread.
232 This call is only required if the parallel construct has a `num_threads` clause.
233 */
234 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
235                              kmp_int32 num_threads) {
236   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
237                 global_tid, num_threads));
238 
239   __kmp_push_num_threads(loc, global_tid, num_threads);
240 }
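
/* Illustrative sketch (comment only, not compiled): for a construct such as

     #pragma omp parallel num_threads(4)

   a compiler would typically emit the push immediately before the fork;
   "loc", "gtid" and "outlined_fn" are hypothetical names for the generated
   ident_t, the caller's global thread id and the outlined parallel body:

     __kmpc_push_num_threads(&loc, gtid, 4);
     __kmpc_fork_call(&loc, 0, (kmpc_micro)&outlined_fn);
*/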
241 
242 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
243   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
244 
245   /* the num_threads are automatically popped */
246 }
247 
248 #if OMP_40_ENABLED
249 
250 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
251                            kmp_int32 proc_bind) {
252   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
253                 proc_bind));
254 
255   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
256 }
257 
258 #endif /* OMP_40_ENABLED */
259 
260 /*!
261 @ingroup PARALLEL
262 @param loc  source location information
263 @param argc  total number of arguments in the ellipsis
264 @param microtask  pointer to callback routine consisting of outlined parallel
265 construct
266 @param ...  pointers to shared variables that aren't global
267 
268 Do the actual fork and call the microtask in the relevant number of threads.
269 */
270 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
271   int gtid = __kmp_entry_gtid();
272 
273 #if (KMP_STATS_ENABLED)
274   // If we were in a serial region, then stop the serial timer, record
275   // the event, and start parallel region timer
276   stats_state_e previous_state = KMP_GET_THREAD_STATE();
277   if (previous_state == stats_state_e::SERIAL_REGION) {
278     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
279   } else {
280     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
281   }
282   int inParallel = __kmpc_in_parallel(loc);
283   if (inParallel) {
284     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
285   } else {
286     KMP_COUNT_BLOCK(OMP_PARALLEL);
287   }
288 #endif
289 
290   // Maybe saving thr_state would be enough here.
291   {
292     va_list ap;
293     va_start(ap, microtask);
294 
295 #if OMPT_SUPPORT
296     omp_frame_t *ompt_frame;
297     if (ompt_enabled.enabled) {
298       kmp_info_t *master_th = __kmp_threads[gtid];
299       kmp_team_t *parent_team = master_th->th.th_team;
300       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
301       if (lwt)
302         ompt_frame = &(lwt->ompt_task_info.frame);
303       else {
304         int tid = __kmp_tid_from_gtid(gtid);
305         ompt_frame = &(
306             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
307       }
308       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
309       OMPT_STORE_RETURN_ADDRESS(gtid);
310     }
311 #endif
312 
313 #if INCLUDE_SSC_MARKS
314     SSC_MARK_FORKING();
315 #endif
316     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
317                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
318                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
319 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
320 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
321                     &ap
322 #else
323                     ap
324 #endif
325                     );
326 #if INCLUDE_SSC_MARKS
327     SSC_MARK_JOINING();
328 #endif
329     __kmp_join_call(loc, gtid
330 #if OMPT_SUPPORT
331                     ,
332                     fork_context_intel
333 #endif
334                     );
335 
336     va_end(ap);
337   }
338 
339 #if KMP_STATS_ENABLED
340   if (previous_state == stats_state_e::SERIAL_REGION) {
341     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
342   } else {
343     KMP_POP_PARTITIONED_TIMER();
344   }
345 #endif // KMP_STATS_ENABLED
346 }
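
/* Illustrative sketch (comment only, not compiled): a parallel region such as

     #pragma omp parallel shared(a, b)
     { work(&a, &b); }

   is typically lowered to an outlined microtask plus a fork call; the names
   below ("outlined_fn", "loc", "a", "b") are hypothetical and the exact shape
   of the outlined function is compiler-specific. The first two microtask
   parameters receive the global and bound thread ids, and argc (2 here) is
   the number of trailing shared-variable pointers:

     void outlined_fn(kmp_int32 *gtid, kmp_int32 *btid, int *a, int *b) {
       work(a, b);
     }
     ...
     __kmpc_fork_call(&loc, 2, (kmpc_micro)&outlined_fn, &a, &b);
*/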
347 
348 #if OMP_40_ENABLED
349 /*!
350 @ingroup PARALLEL
351 @param loc source location information
352 @param global_tid global thread number
353 @param num_teams number of teams requested for the teams construct
354 @param num_threads number of threads per team requested for the teams construct
355 
356 Set the number of teams to be used by the teams construct.
357 This call is only required if the teams construct has a `num_teams` clause
358 or a `thread_limit` clause (or both).
359 */
360 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
361                            kmp_int32 num_teams, kmp_int32 num_threads) {
362   KA_TRACE(20,
363            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
364             global_tid, num_teams, num_threads));
365 
366   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
367 }
368 
369 /*!
370 @ingroup PARALLEL
371 @param loc  source location information
372 @param argc  total number of arguments in the ellipsis
373 @param microtask  pointer to callback routine consisting of outlined teams
374 construct
375 @param ...  pointers to shared variables that aren't global
376 
377 Do the actual fork and call the microtask in the relevant number of threads.
378 */
379 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
380                        ...) {
381   int gtid = __kmp_entry_gtid();
382   kmp_info_t *this_thr = __kmp_threads[gtid];
383   va_list ap;
384   va_start(ap, microtask);
385 
386   KMP_COUNT_BLOCK(OMP_TEAMS);
387 
388   // remember teams entry point and nesting level
389   this_thr->th.th_teams_microtask = microtask;
390   this_thr->th.th_teams_level =
391       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
392 
393 #if OMPT_SUPPORT
394   kmp_team_t *parent_team = this_thr->th.th_team;
395   int tid = __kmp_tid_from_gtid(gtid);
396   if (ompt_enabled.enabled) {
397     parent_team->t.t_implicit_task_taskdata[tid]
398         .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
399   }
400   OMPT_STORE_RETURN_ADDRESS(gtid);
401 #endif
402 
403   // Check whether __kmpc_push_num_teams was called; if not, set the default
404   // number of teams.
405   if (this_thr->th.th_teams_size.nteams == 0) {
406     __kmp_push_num_teams(loc, gtid, 0, 0);
407   }
408   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
409   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
410   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
411 
412   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
413                   VOLATILE_CAST(microtask_t)
414                       __kmp_teams_master, // "wrapped" task
415                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
416 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
417                   &ap
418 #else
419                   ap
420 #endif
421                   );
422   __kmp_join_call(loc, gtid
423 #if OMPT_SUPPORT
424                   ,
425                   fork_context_intel
426 #endif
427                   );
428 
429   this_thr->th.th_teams_microtask = NULL;
430   this_thr->th.th_teams_level = 0;
431   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
432   va_end(ap);
433 }
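
/* Illustrative sketch (comment only, not compiled): a construct such as

     #pragma omp teams num_teams(4) thread_limit(8)
     { team_work(); }

   would typically be lowered to a push of the teams sizes followed by the
   teams fork; "loc", "gtid" and "outlined_teams_fn" are hypothetical names:

     __kmpc_push_num_teams(&loc, gtid, 4, 8);
     __kmpc_fork_teams(&loc, 0, (kmpc_micro)&outlined_teams_fn);
*/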
434 #endif /* OMP_40_ENABLED */
435 
436 // I don't think this function should ever have been exported.
437 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
438 // OpenMP code ever called it, but it's been exported from the RTL for so
439 // long that I'm afraid to remove the definition.
440 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
441 
442 /*!
443 @ingroup PARALLEL
444 @param loc  source location information
445 @param global_tid  global thread number
446 
447 Enter a serialized parallel construct. This interface is used to handle a
448 conditional parallel region, like this,
449 @code
450 #pragma omp parallel if (condition)
451 @endcode
452 when the condition is false.
453 */
454 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
455 // The implementation is now in kmp_runtime.cpp so that it can share static
456 // functions with kmp_fork_call since the tasks to be done are similar in
457 // each case.
458 #if OMPT_SUPPORT
459   OMPT_STORE_RETURN_ADDRESS(global_tid);
460 #endif
461   __kmp_serialized_parallel(loc, global_tid);
462 }
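
/* Illustrative sketch (comment only, not compiled): for

     #pragma omp parallel if (cond)
     { work(); }

   a compiler commonly emits a runtime branch; when the condition is false the
   outlined body runs on the encountering thread between the serialized
   entry/exit calls. "loc", "outlined_fn" and "cond" are hypothetical names:

     kmp_int32 gtid = __kmpc_global_thread_num(&loc);
     if (cond) {
       __kmpc_fork_call(&loc, 0, (kmpc_micro)&outlined_fn);
     } else {
       __kmpc_serialized_parallel(&loc, gtid);
       kmp_int32 zero = 0;
       outlined_fn(&gtid, &zero);
       __kmpc_end_serialized_parallel(&loc, gtid);
     }
*/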
463 
464 /*!
465 @ingroup PARALLEL
466 @param loc  source location information
467 @param global_tid  global thread number
468 
469 Leave a serialized parallel construct.
470 */
471 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
472   kmp_internal_control_t *top;
473   kmp_info_t *this_thr;
474   kmp_team_t *serial_team;
475 
476   KC_TRACE(10,
477            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
478 
479   /* skip all this code for autopar serialized loops since it results in
480      unacceptable overhead */
481   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
482     return;
483 
484   // Not autopar code
485   if (!TCR_4(__kmp_init_parallel))
486     __kmp_parallel_initialize();
487 
488   this_thr = __kmp_threads[global_tid];
489   serial_team = this_thr->th.th_serial_team;
490 
491 #if OMP_45_ENABLED
492   kmp_task_team_t *task_team = this_thr->th.th_task_team;
493 
494   // we need to wait for the proxy tasks before finishing the thread
495   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
496     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
497 #endif
498 
499   KMP_MB();
500   KMP_DEBUG_ASSERT(serial_team);
501   KMP_ASSERT(serial_team->t.t_serialized);
502   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
503   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
504   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
505   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
506 
507 #if OMPT_SUPPORT
508   if (ompt_enabled.enabled &&
509       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
510     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL;
511     if (ompt_enabled.ompt_callback_implicit_task) {
512       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
513           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
514           OMPT_CUR_TASK_INFO(this_thr)->thread_num);
515     }
516 
517     // reset/clear the task id only after unlinking the task
518     ompt_data_t *parent_task_data;
519     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
520 
521     if (ompt_enabled.ompt_callback_parallel_end) {
522       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
523           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
524           ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
525     }
526     __ompt_lw_taskteam_unlink(this_thr);
527     this_thr->th.ompt_thread_info.state = omp_state_overhead;
528   }
529 #endif
530 
531   /* If necessary, pop the internal control stack values and replace the team
532    * values */
533   top = serial_team->t.t_control_stack_top;
534   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
535     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
536     serial_team->t.t_control_stack_top = top->next;
537     __kmp_free(top);
538   }
539 
540   // if( serial_team -> t.t_serialized > 1 )
541   serial_team->t.t_level--;
542 
543   /* pop dispatch buffers stack */
544   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
545   {
546     dispatch_private_info_t *disp_buffer =
547         serial_team->t.t_dispatch->th_disp_buffer;
548     serial_team->t.t_dispatch->th_disp_buffer =
549         serial_team->t.t_dispatch->th_disp_buffer->next;
550     __kmp_free(disp_buffer);
551   }
552 #if OMP_50_ENABLED
553   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
554 #endif
555 
556   --serial_team->t.t_serialized;
557   if (serial_team->t.t_serialized == 0) {
558 
559 /* return to the parallel section */
560 
561 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
562     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
563       __kmp_clear_x87_fpu_status_word();
564       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
565       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
566     }
567 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
568 
569     this_thr->th.th_team = serial_team->t.t_parent;
570     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
571 
572     /* restore values cached in the thread */
573     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
574     this_thr->th.th_team_master =
575         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
576     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
577 
578     /* TODO the below shouldn't need to be adjusted for serialized teams */
579     this_thr->th.th_dispatch =
580         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
581 
582     __kmp_pop_current_task_from_thread(this_thr);
583 
584     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
585     this_thr->th.th_current_task->td_flags.executing = 1;
586 
587     if (__kmp_tasking_mode != tskm_immediate_exec) {
588       // Copy the task team from the new child / old parent team to the thread.
589       this_thr->th.th_task_team =
590           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
591       KA_TRACE(20,
592                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
593                 "team %p\n",
594                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
595     }
596   } else {
597     if (__kmp_tasking_mode != tskm_immediate_exec) {
598       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
599                     "depth of serial team %p to %d\n",
600                     global_tid, serial_team, serial_team->t.t_serialized));
601     }
602   }
603 
604   if (__kmp_env_consistency_check)
605     __kmp_pop_parallel(global_tid, NULL);
606 #if OMPT_SUPPORT
607   if (ompt_enabled.enabled)
608     this_thr->th.ompt_thread_info.state =
609         ((this_thr->th.th_team_serialized) ? omp_state_work_serial
610                                            : omp_state_work_parallel);
611 #endif
612 }
613 
614 /*!
615 @ingroup SYNCHRONIZATION
616 @param loc  source location information.
617 
618 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
619 depending on the memory ordering convention obeyed by the compiler
620 even that may not be necessary).
621 */
622 void __kmpc_flush(ident_t *loc) {
623   KC_TRACE(10, ("__kmpc_flush: called\n"));
624 
625   /* need explicit __mf() here since the library uses volatile instead */
626   KMP_MB(); /* Flush all pending memory write invalidates.  */
627 
628 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
629 #if KMP_MIC
630 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
631 // We shouldn't need it, though, since the ABI rules require that
632 // * If the compiler generates NGO stores it also generates the fence
633 // * If users hand-code NGO stores they should insert the fence
634 // therefore no incomplete unordered stores should be visible.
635 #else
636   // C74404
637   // This is to address non-temporal store instructions (sfence needed).
638   // The clflush instruction is also addressed (mfence needed).
639   // Probably the non-temporal load movntdqa instruction should also be
640   // addressed.
641   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
642   if (!__kmp_cpuinfo.initialized) {
643     __kmp_query_cpuid(&__kmp_cpuinfo);
644   }
645   if (!__kmp_cpuinfo.sse2) {
646     // CPU cannot execute SSE2 instructions.
647   } else {
648 #if KMP_COMPILER_ICC
649     _mm_mfence();
650 #elif KMP_COMPILER_MSVC
651     MemoryBarrier();
652 #else
653     __sync_synchronize();
654 #endif // KMP_COMPILER_ICC
655   }
656 #endif // KMP_MIC
657 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
658 // Nothing to see here; move along.
659 #elif KMP_ARCH_PPC64
660 // Nothing needed here (we have a real MB above).
661 #if KMP_OS_CNK
662   // The flushing thread needs to yield here; this prevents a
663   // busy-waiting thread from saturating the pipeline. flush is
664   // often used in loops like this:
665   // while (!flag) {
666   //   #pragma omp flush(flag)
667   // }
668   // and adding the yield here is good for at least a 10x speedup
669   // when running >2 threads per core (on the NAS LU benchmark).
670   __kmp_yield(TRUE);
671 #endif
672 #else
673 #error Unknown or unsupported architecture
674 #endif
675 
676 #if OMPT_SUPPORT && OMPT_OPTIONAL
677   if (ompt_enabled.ompt_callback_flush) {
678     ompt_callbacks.ompt_callback(ompt_callback_flush)(
679         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
680   }
681 #endif
682 }
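
/* Illustrative sketch (comment only, not compiled): a standalone

     #pragma omp flush

   is typically lowered to a single call, with "loc" a hypothetical ident_t:

     __kmpc_flush(&loc);
*/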
683 
684 /* -------------------------------------------------------------------------- */
685 /*!
686 @ingroup SYNCHRONIZATION
687 @param loc source location information
688 @param global_tid thread id.
689 
690 Execute a barrier.
691 */
692 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
693   KMP_COUNT_BLOCK(OMP_BARRIER);
694   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
695 
696   if (!TCR_4(__kmp_init_parallel))
697     __kmp_parallel_initialize();
698 
699   if (__kmp_env_consistency_check) {
700     if (loc == 0) {
701       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
702     }
703 
704     __kmp_check_barrier(global_tid, ct_barrier, loc);
705   }
706 
707 #if OMPT_SUPPORT
708   omp_frame_t *ompt_frame;
709   if (ompt_enabled.enabled) {
710     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
711     if (ompt_frame->enter_frame == NULL)
712       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
713     OMPT_STORE_RETURN_ADDRESS(global_tid);
714   }
715 #endif
716   __kmp_threads[global_tid]->th.th_ident = loc;
717   // TODO: explicit barrier_wait_id:
718   //   this function is called when the 'barrier' directive is present or at
719   //   the implicit barrier at the end of a worksharing construct.
720   // 1) better to add a per-thread barrier counter to a thread data structure
721   // 2) set to 0 when a new team is created
722   // 3) no sync is required
723 
724   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
725 #if OMPT_SUPPORT && OMPT_OPTIONAL
726   if (ompt_enabled.enabled) {
727     ompt_frame->enter_frame = NULL;
728   }
729 #endif
730 }
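
/* Illustrative sketch (comment only, not compiled): an explicit

     #pragma omp barrier

   is typically lowered to a single call; "loc" and "gtid" are hypothetical
   names, and the compiler usually reuses a global thread id it already has:

     __kmpc_barrier(&loc, gtid);
*/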
731 
732 /* The BARRIER for a MASTER section is always explicit   */
733 /*!
734 @ingroup WORK_SHARING
735 @param loc  source location information.
736 @param global_tid  global thread number.
737 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
738 */
739 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
740   int status = 0;
741 
742   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
743 
744   if (!TCR_4(__kmp_init_parallel))
745     __kmp_parallel_initialize();
746 
747   if (KMP_MASTER_GTID(global_tid)) {
748     KMP_COUNT_BLOCK(OMP_MASTER);
749     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
750     status = 1;
751   }
752 
753 #if OMPT_SUPPORT && OMPT_OPTIONAL
754   if (status) {
755     if (ompt_enabled.ompt_callback_master) {
756       kmp_info_t *this_thr = __kmp_threads[global_tid];
757       kmp_team_t *team = this_thr->th.th_team;
758 
759       int tid = __kmp_tid_from_gtid(global_tid);
760       ompt_callbacks.ompt_callback(ompt_callback_master)(
761           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
762           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
763           OMPT_GET_RETURN_ADDRESS(0));
764     }
765   }
766 #endif
767 
768   if (__kmp_env_consistency_check) {
769 #if KMP_USE_DYNAMIC_LOCK
770     if (status)
771       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
772     else
773       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
774 #else
775     if (status)
776       __kmp_push_sync(global_tid, ct_master, loc, NULL);
777     else
778       __kmp_check_sync(global_tid, ct_master, loc, NULL);
779 #endif
780   }
781 
782   return status;
783 }
784 
785 /*!
786 @ingroup WORK_SHARING
787 @param loc  source location information.
788 @param global_tid  global thread number.
789 
790 Mark the end of a <tt>master</tt> region. This should only be called by the
791 thread that executes the <tt>master</tt> region.
792 */
793 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
794   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
795 
796   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
797   KMP_POP_PARTITIONED_TIMER();
798 
799 #if OMPT_SUPPORT && OMPT_OPTIONAL
800   kmp_info_t *this_thr = __kmp_threads[global_tid];
801   kmp_team_t *team = this_thr->th.th_team;
802   if (ompt_enabled.ompt_callback_master) {
803     int tid = __kmp_tid_from_gtid(global_tid);
804     ompt_callbacks.ompt_callback(ompt_callback_master)(
805         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
806         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
807         OMPT_GET_RETURN_ADDRESS(0));
808   }
809 #endif
810 
811   if (__kmp_env_consistency_check) {
812     if (global_tid < 0)
813       KMP_WARNING(ThreadIdentInvalid);
814 
815     if (KMP_MASTER_GTID(global_tid))
816       __kmp_pop_sync(global_tid, ct_master, loc);
817   }
818 }
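
/* Illustrative sketch (comment only, not compiled): a master region

     #pragma omp master
     { master_work(); }

   is typically lowered so that only the thread for which __kmpc_master()
   returns 1 runs the body and then closes the region; "loc", "gtid" and
   "master_work" are hypothetical names:

     if (__kmpc_master(&loc, gtid)) {
       master_work();
       __kmpc_end_master(&loc, gtid);
     }
*/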
819 
820 /*!
821 @ingroup WORK_SHARING
822 @param loc  source location information.
823 @param gtid  global thread number.
824 
825 Start execution of an <tt>ordered</tt> construct.
826 */
827 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
828   int cid = 0;
829   kmp_info_t *th;
830   KMP_DEBUG_ASSERT(__kmp_init_serial);
831 
832   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
833 
834   if (!TCR_4(__kmp_init_parallel))
835     __kmp_parallel_initialize();
836 
837 #if USE_ITT_BUILD
838   __kmp_itt_ordered_prep(gtid);
839 // TODO: ordered_wait_id
840 #endif /* USE_ITT_BUILD */
841 
842   th = __kmp_threads[gtid];
843 
844 #if OMPT_SUPPORT && OMPT_OPTIONAL
845   kmp_team_t *team;
846   omp_wait_id_t lck;
847   void *codeptr_ra;
848   if (ompt_enabled.enabled) {
849     OMPT_STORE_RETURN_ADDRESS(gtid);
850     team = __kmp_team_from_gtid(gtid);
851     lck = (omp_wait_id_t)&team->t.t_ordered.dt.t_value;
852     /* OMPT state update */
853     th->th.ompt_thread_info.wait_id = lck;
854     th->th.ompt_thread_info.state = omp_state_wait_ordered;
855 
856     /* OMPT event callback */
857     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
858     if (ompt_enabled.ompt_callback_mutex_acquire) {
859       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
860           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
861           (omp_wait_id_t)lck, codeptr_ra);
862     }
863   }
864 #endif
865 
866   if (th->th.th_dispatch->th_deo_fcn != 0)
867     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
868   else
869     __kmp_parallel_deo(&gtid, &cid, loc);
870 
871 #if OMPT_SUPPORT && OMPT_OPTIONAL
872   if (ompt_enabled.enabled) {
873     /* OMPT state update */
874     th->th.ompt_thread_info.state = omp_state_work_parallel;
875     th->th.ompt_thread_info.wait_id = 0;
876 
877     /* OMPT event callback */
878     if (ompt_enabled.ompt_callback_mutex_acquired) {
879       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
880           ompt_mutex_ordered, (omp_wait_id_t)lck, codeptr_ra);
881     }
882   }
883 #endif
884 
885 #if USE_ITT_BUILD
886   __kmp_itt_ordered_start(gtid);
887 #endif /* USE_ITT_BUILD */
888 }
889 
890 /*!
891 @ingroup WORK_SHARING
892 @param loc  source location information.
893 @param gtid  global thread number.
894 
895 End execution of an <tt>ordered</tt> construct.
896 */
897 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
898   int cid = 0;
899   kmp_info_t *th;
900 
901   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
902 
903 #if USE_ITT_BUILD
904   __kmp_itt_ordered_end(gtid);
905 // TODO: ordered_wait_id
906 #endif /* USE_ITT_BUILD */
907 
908   th = __kmp_threads[gtid];
909 
910   if (th->th.th_dispatch->th_dxo_fcn != 0)
911     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
912   else
913     __kmp_parallel_dxo(&gtid, &cid, loc);
914 
915 #if OMPT_SUPPORT && OMPT_OPTIONAL
916   OMPT_STORE_RETURN_ADDRESS(gtid);
917   if (ompt_enabled.ompt_callback_mutex_released) {
918     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
919         ompt_mutex_ordered,
920         (omp_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
921         OMPT_LOAD_RETURN_ADDRESS(gtid));
922   }
923 #endif
924 }
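
/* Illustrative sketch (comment only, not compiled): an ordered block inside a
   loop that has an ordered clause,

     #pragma omp ordered
     { ordered_work(); }

   is typically bracketed by the two calls above; "loc", "gtid" and
   "ordered_work" are hypothetical names:

     __kmpc_ordered(&loc, gtid);
     ordered_work();
     __kmpc_end_ordered(&loc, gtid);
*/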
925 
926 #if KMP_USE_DYNAMIC_LOCK
927 
928 static __forceinline void
929 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
930                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
931   // A pointer to the allocated indirect lock is written to crit; the lock-table
932   // index returned in idx is ignored.
933   void *idx;
934   kmp_indirect_lock_t **lck;
935   lck = (kmp_indirect_lock_t **)crit;
936   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
937   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
938   KMP_SET_I_LOCK_LOCATION(ilk, loc);
939   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
940   KA_TRACE(20,
941            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
942 #if USE_ITT_BUILD
943   __kmp_itt_critical_creating(ilk->lock, loc);
944 #endif
945   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
946   if (status == 0) {
947 #if USE_ITT_BUILD
948     __kmp_itt_critical_destroyed(ilk->lock);
949 #endif
950     // We don't really need to destroy the unclaimed lock here since it will be
951     // cleaned up at program exit.
952     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
953   }
954   KMP_DEBUG_ASSERT(*lck != NULL);
955 }
956 
957 // Fast-path acquire tas lock
958 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
959   {                                                                            \
960     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
961     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
962     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
963     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
964         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
965       kmp_uint32 spins;                                                        \
966       KMP_FSYNC_PREPARE(l);                                                    \
967       KMP_INIT_YIELD(spins);                                                   \
968       if (TCR_4(__kmp_nth) >                                                   \
969           (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
970         KMP_YIELD(TRUE);                                                       \
971       } else {                                                                 \
972         KMP_YIELD_SPIN(spins);                                                 \
973       }                                                                        \
974       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
975       while (                                                                  \
976           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
977           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {  \
978         __kmp_spin_backoff(&backoff);                                          \
979         if (TCR_4(__kmp_nth) >                                                 \
980             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
981           KMP_YIELD(TRUE);                                                     \
982         } else {                                                               \
983           KMP_YIELD_SPIN(spins);                                               \
984         }                                                                      \
985       }                                                                        \
986     }                                                                          \
987     KMP_FSYNC_ACQUIRED(l);                                                     \
988   }
989 
990 // Fast-path test tas lock
991 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
992   {                                                                            \
993     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
994     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
995     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
996     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
997          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
998   }
999 
1000 // Fast-path release tas lock
1001 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1002   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1003 
1004 #if KMP_USE_FUTEX
1005 
1006 #include <sys/syscall.h>
1007 #include <unistd.h>
1008 #ifndef FUTEX_WAIT
1009 #define FUTEX_WAIT 0
1010 #endif
1011 #ifndef FUTEX_WAKE
1012 #define FUTEX_WAKE 1
1013 #endif
1014 
1015 // Fast-path acquire futex lock
1016 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1017   {                                                                            \
1018     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1019     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1020     KMP_MB();                                                                  \
1021     KMP_FSYNC_PREPARE(ftx);                                                    \
1022     kmp_int32 poll_val;                                                        \
1023     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1024                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1025                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1026       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1027       if (!cond) {                                                             \
1028         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1029                                          poll_val |                            \
1030                                              KMP_LOCK_BUSY(1, futex))) {       \
1031           continue;                                                            \
1032         }                                                                      \
1033         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1034       }                                                                        \
1035       kmp_int32 rc;                                                            \
1036       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1037                         NULL, NULL, 0)) != 0) {                                \
1038         continue;                                                              \
1039       }                                                                        \
1040       gtid_code |= 1;                                                          \
1041     }                                                                          \
1042     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1043   }
1044 
1045 // Fast-path test futex lock
1046 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1047   {                                                                            \
1048     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1049     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1050                                     KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
1051       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1052       rc = TRUE;                                                               \
1053     } else {                                                                   \
1054       rc = FALSE;                                                              \
1055     }                                                                          \
1056   }
1057 
1058 // Fast-path release futex lock
1059 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1060   {                                                                            \
1061     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1062     KMP_MB();                                                                  \
1063     KMP_FSYNC_RELEASING(ftx);                                                  \
1064     kmp_int32 poll_val =                                                       \
1065         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1066     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1067       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1068               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1069     }                                                                          \
1070     KMP_MB();                                                                  \
1071     KMP_YIELD(TCR_4(__kmp_nth) >                                               \
1072               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
1073   }
1074 
1075 #endif // KMP_USE_FUTEX
1076 
1077 #else // KMP_USE_DYNAMIC_LOCK
1078 
1079 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1080                                                       ident_t const *loc,
1081                                                       kmp_int32 gtid) {
1082   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1083 
1084   // Because of the double-check, the following load doesn't need to be volatile
1085   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1086 
1087   if (lck == NULL) {
1088     void *idx;
1089 
1090     // Allocate & initialize the lock.
1091     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1092     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1093     __kmp_init_user_lock_with_checks(lck);
1094     __kmp_set_user_lock_location(lck, loc);
1095 #if USE_ITT_BUILD
1096     __kmp_itt_critical_creating(lck);
1097 // __kmp_itt_critical_creating() should be called *before* the first usage
1098 // of the underlying lock. It is the only place where we can guarantee it.
1099 // There is a chance the lock will be destroyed without ever being used, but
1100 // that is not a problem, because this is not a real event seen by the user;
1101 // it merely sets a name for the object (lock). See more details in kmp_itt.h.
1102 #endif /* USE_ITT_BUILD */
1103 
1104     // Use a cmpxchg instruction to slam the start of the critical section with
1105     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1106     // and use the lock that the other thread allocated.
1107     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1108 
1109     if (status == 0) {
1110 // Deallocate the lock and reload the value.
1111 #if USE_ITT_BUILD
1112       __kmp_itt_critical_destroyed(lck);
1113 // Let ITT know the lock is destroyed and the same memory location may be reused
1114 // for another purpose.
1115 #endif /* USE_ITT_BUILD */
1116       __kmp_destroy_user_lock_with_checks(lck);
1117       __kmp_user_lock_free(&idx, gtid, lck);
1118       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1119       KMP_DEBUG_ASSERT(lck != NULL);
1120     }
1121   }
1122   return lck;
1123 }
1124 
1125 #endif // KMP_USE_DYNAMIC_LOCK
1126 
1127 /*!
1128 @ingroup WORK_SHARING
1129 @param loc  source location information.
1130 @param global_tid  global thread number.
1131 @param crit identity of the critical section. This could be a pointer to a lock
1132 associated with the critical section, or some other suitably unique value.
1133 
1134 Enter code protected by a `critical` construct.
1135 This function blocks until the executing thread can enter the critical section.
1136 */
1137 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1138                      kmp_critical_name *crit) {
1139 #if KMP_USE_DYNAMIC_LOCK
1140 #if OMPT_SUPPORT && OMPT_OPTIONAL
1141   OMPT_STORE_RETURN_ADDRESS(global_tid);
1142 #endif // OMPT_SUPPORT
1143   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1144 #else
1145   KMP_COUNT_BLOCK(OMP_CRITICAL);
1146 #if OMPT_SUPPORT && OMPT_OPTIONAL
1147   omp_state_t prev_state = omp_state_undefined;
1148   ompt_thread_info_t ti;
1149 #endif
1150   kmp_user_lock_p lck;
1151 
1152   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1153 
1154   // TODO: add THR_OVHD_STATE
1155 
1156   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1157   KMP_CHECK_USER_LOCK_INIT();
1158 
1159   if ((__kmp_user_lock_kind == lk_tas) &&
1160       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1161     lck = (kmp_user_lock_p)crit;
1162   }
1163 #if KMP_USE_FUTEX
1164   else if ((__kmp_user_lock_kind == lk_futex) &&
1165            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1166     lck = (kmp_user_lock_p)crit;
1167   }
1168 #endif
1169   else { // ticket, queuing or drdpa
1170     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1171   }
1172 
1173   if (__kmp_env_consistency_check)
1174     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1175 
1176 // Since the critical directive binds to all threads, not just the current
1177 // team, we have to check this even if we are in a serialized team.
1178 // Also, even if we are the uber thread, we still have to acquire the lock,
1179 // as we have to contend with sibling threads.
1180 
1181 #if USE_ITT_BUILD
1182   __kmp_itt_critical_acquiring(lck);
1183 #endif /* USE_ITT_BUILD */
1184 #if OMPT_SUPPORT && OMPT_OPTIONAL
1185   OMPT_STORE_RETURN_ADDRESS(global_tid);
1186   void *codeptr_ra = NULL;
1187   if (ompt_enabled.enabled) {
1188     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1189     /* OMPT state update */
1190     prev_state = ti.state;
1191     ti.wait_id = (omp_wait_id_t)lck;
1192     ti.state = omp_state_wait_critical;
1193 
1194     /* OMPT event callback */
1195     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1196     if (ompt_enabled.ompt_callback_mutex_acquire) {
1197       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1198           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1199           (omp_wait_id_t)crit, codeptr_ra);
1200     }
1201   }
1202 #endif
1203   // Value of 'crit' should be good for using as a critical_id of the critical
1204   // section directive.
1205   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1206 
1207 #if USE_ITT_BUILD
1208   __kmp_itt_critical_acquired(lck);
1209 #endif /* USE_ITT_BUILD */
1210 #if OMPT_SUPPORT && OMPT_OPTIONAL
1211   if (ompt_enabled.enabled) {
1212     /* OMPT state update */
1213     ti.state = prev_state;
1214     ti.wait_id = 0;
1215 
1216     /* OMPT event callback */
1217     if (ompt_enabled.ompt_callback_mutex_acquired) {
1218       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1219           ompt_mutex_critical, (omp_wait_id_t)crit, codeptr_ra);
1220     }
1221   }
1222 #endif
1223   KMP_POP_PARTITIONED_TIMER();
1224 
1225   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1226   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1227 #endif // KMP_USE_DYNAMIC_LOCK
1228 }
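
/* Illustrative sketch (comment only, not compiled): a named critical section

     #pragma omp critical (io)
     { guarded_work(); }

   is typically lowered against a statically allocated, zero-initialized
   kmp_critical_name that is shared by every occurrence of the same name;
   "loc", "gtid", "crit_io" and "guarded_work" are hypothetical names:

     static kmp_critical_name crit_io = {0};
     ...
     __kmpc_critical(&loc, gtid, &crit_io);
     guarded_work();
     __kmpc_end_critical(&loc, gtid, &crit_io);
*/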
1229 
1230 #if KMP_USE_DYNAMIC_LOCK
1231 
1232 // Converts the given hint to an internal lock implementation
1233 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1234 #if KMP_USE_TSX
1235 #define KMP_TSX_LOCK(seq) lockseq_##seq
1236 #else
1237 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1238 #endif
1239 
1240 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1241 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1242 #else
1243 #define KMP_CPUINFO_RTM 0
1244 #endif
1245 
1246   // Hints that do not require further logic
1247   if (hint & kmp_lock_hint_hle)
1248     return KMP_TSX_LOCK(hle);
1249   if (hint & kmp_lock_hint_rtm)
1250     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1251   if (hint & kmp_lock_hint_adaptive)
1252     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1253 
1254   // Rule out conflicting hints first by returning the default lock
1255   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1256     return __kmp_user_lock_seq;
1257   if ((hint & omp_lock_hint_speculative) &&
1258       (hint & omp_lock_hint_nonspeculative))
1259     return __kmp_user_lock_seq;
1260 
1261   // Do not even consider speculation when it appears to be contended
1262   if (hint & omp_lock_hint_contended)
1263     return lockseq_queuing;
1264 
1265   // Uncontended lock without speculation
1266   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1267     return lockseq_tas;
1268 
1269   // HLE lock for speculation
1270   if (hint & omp_lock_hint_speculative)
1271     return KMP_TSX_LOCK(hle);
1272 
1273   return __kmp_user_lock_seq;
1274 }
1275 
1276 #if OMPT_SUPPORT && OMPT_OPTIONAL
1277 #if KMP_USE_DYNAMIC_LOCK
1278 static kmp_mutex_impl_t
1279 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1280   if (user_lock) {
1281     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1282     case 0:
1283       break;
1284 #if KMP_USE_FUTEX
1285     case locktag_futex:
1286       return kmp_mutex_impl_queuing;
1287 #endif
1288     case locktag_tas:
1289       return kmp_mutex_impl_spin;
1290 #if KMP_USE_TSX
1291     case locktag_hle:
1292       return kmp_mutex_impl_speculative;
1293 #endif
1294     default:
1295       return ompt_mutex_impl_unknown;
1296     }
1297     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1298   }
1299   KMP_ASSERT(ilock);
1300   switch (ilock->type) {
1301 #if KMP_USE_TSX
1302   case locktag_adaptive:
1303   case locktag_rtm:
1304     return kmp_mutex_impl_speculative;
1305 #endif
1306   case locktag_nested_tas:
1307     return kmp_mutex_impl_spin;
1308 #if KMP_USE_FUTEX
1309   case locktag_nested_futex:
1310 #endif
1311   case locktag_ticket:
1312   case locktag_queuing:
1313   case locktag_drdpa:
1314   case locktag_nested_ticket:
1315   case locktag_nested_queuing:
1316   case locktag_nested_drdpa:
1317     return kmp_mutex_impl_queuing;
1318   default:
1319     return ompt_mutex_impl_unknown;
1320   }
1321 }
1322 #else
1323 // For locks without dynamic binding
1324 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1325   switch (__kmp_user_lock_kind) {
1326   case lk_tas:
1327     return kmp_mutex_impl_spin;
1328 #if KMP_USE_FUTEX
1329   case lk_futex:
1330 #endif
1331   case lk_ticket:
1332   case lk_queuing:
1333   case lk_drdpa:
1334     return kmp_mutex_impl_queuing;
1335 #if KMP_USE_TSX
1336   case lk_hle:
1337   case lk_rtm:
1338   case lk_adaptive:
1339     return kmp_mutex_impl_speculative;
1340 #endif
1341   default:
1342     return ompt_mutex_impl_unknown;
1343   }
1344 }
1345 #endif // KMP_USE_DYNAMIC_LOCK
1346 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1347 
1348 /*!
1349 @ingroup WORK_SHARING
1350 @param loc  source location information.
1351 @param global_tid  global thread number.
1352 @param crit identity of the critical section. This could be a pointer to a lock
1353 associated with the critical section, or some other suitably unique value.
1354 @param hint the lock hint.
1355 
1356 Enter code protected by a `critical` construct with a hint. The hint value is
1357 used to suggest a lock implementation. This function blocks until the executing
1358 thread can enter the critical section unless the hint suggests use of
1359 speculative execution and the hardware supports it.
1360 */
1361 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1362                                kmp_critical_name *crit, uint32_t hint) {
1363   KMP_COUNT_BLOCK(OMP_CRITICAL);
1364   kmp_user_lock_p lck;
1365 #if OMPT_SUPPORT && OMPT_OPTIONAL
1366   omp_state_t prev_state = omp_state_undefined;
1367   ompt_thread_info_t ti;
1368   // This is the case if called from __kmpc_critical:
1369   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1370   if (!codeptr)
1371     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1372 #endif
1373 
1374   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1375 
1376   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1377   // Check if it is initialized.
1378   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1379   if (*lk == 0) {
1380     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1381     if (KMP_IS_D_LOCK(lckseq)) {
1382       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1383                                   KMP_GET_D_TAG(lckseq));
1384     } else {
1385       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1386     }
1387   }
1388   // Branch for accessing the actual lock object and set operation. This
1389   // branching is inevitable since this lock initialization does not follow the
1390   // normal dispatch path (lock table is not used).
1391   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1392     lck = (kmp_user_lock_p)lk;
1393     if (__kmp_env_consistency_check) {
1394       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1395                       __kmp_map_hint_to_lock(hint));
1396     }
1397 #if USE_ITT_BUILD
1398     __kmp_itt_critical_acquiring(lck);
1399 #endif
1400 #if OMPT_SUPPORT && OMPT_OPTIONAL
1401     if (ompt_enabled.enabled) {
1402       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1403       /* OMPT state update */
1404       prev_state = ti.state;
1405       ti.wait_id = (omp_wait_id_t)lck;
1406       ti.state = omp_state_wait_critical;
1407 
1408       /* OMPT event callback */
1409       if (ompt_enabled.ompt_callback_mutex_acquire) {
1410         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1411             ompt_mutex_critical, (unsigned int)hint,
1412             __ompt_get_mutex_impl_type(crit), (omp_wait_id_t)crit, codeptr);
1413       }
1414     }
1415 #endif
1416 #if KMP_USE_INLINED_TAS
1417     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1418       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1419     } else
1420 #elif KMP_USE_INLINED_FUTEX
1421     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1422       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1423     } else
1424 #endif
1425     {
1426       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1427     }
1428   } else {
1429     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1430     lck = ilk->lock;
1431     if (__kmp_env_consistency_check) {
1432       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1433                       __kmp_map_hint_to_lock(hint));
1434     }
1435 #if USE_ITT_BUILD
1436     __kmp_itt_critical_acquiring(lck);
1437 #endif
1438 #if OMPT_SUPPORT && OMPT_OPTIONAL
1439     if (ompt_enabled.enabled) {
1440       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1441       /* OMPT state update */
1442       prev_state = ti.state;
1443       ti.wait_id = (omp_wait_id_t)lck;
1444       ti.state = omp_state_wait_critical;
1445 
1446       /* OMPT event callback */
1447       if (ompt_enabled.ompt_callback_mutex_acquire) {
1448         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1449             ompt_mutex_critical, (unsigned int)hint,
1450             __ompt_get_mutex_impl_type(0, ilk), (omp_wait_id_t)crit, codeptr);
1451       }
1452     }
1453 #endif
1454     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1455   }
1456   KMP_POP_PARTITIONED_TIMER();
1457 
1458 #if USE_ITT_BUILD
1459   __kmp_itt_critical_acquired(lck);
1460 #endif /* USE_ITT_BUILD */
1461 #if OMPT_SUPPORT && OMPT_OPTIONAL
1462   if (ompt_enabled.enabled) {
1463     /* OMPT state update */
1464     ti.state = prev_state;
1465     ti.wait_id = 0;
1466 
1467     /* OMPT event callback */
1468     if (ompt_enabled.ompt_callback_mutex_acquired) {
1469       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1470           ompt_mutex_critical, (omp_wait_id_t)crit, codeptr);
1471     }
1472   }
1473 #endif
1474 
1475   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1476   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1477 } // __kmpc_critical_with_hint
1478 
1479 #endif // KMP_USE_DYNAMIC_LOCK
1480 
1481 /*!
1482 @ingroup WORK_SHARING
1483 @param loc  source location information.
@param global_tid  global thread number.
1485 @param crit identity of the critical section. This could be a pointer to a lock
1486 associated with the critical section, or some other suitably unique value.
1487 
1488 Leave a critical section, releasing any lock that was held during its execution.
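
As an illustration only (not normative code generation), a compiler might
bracket the body of a <tt>critical</tt> construct as sketched below; the name
<tt>crit_name</tt> is hypothetical and stands for the compiler-generated,
zero-initialized kmp_critical_name object shared by all uses of the construct:
@code
static kmp_critical_name crit_name = {0};

__kmpc_critical(&loc, global_tid, &crit_name);
// ... code protected by the critical section ...
__kmpc_end_critical(&loc, global_tid, &crit_name);
@endcode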
1489 */
1490 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1491                          kmp_critical_name *crit) {
1492   kmp_user_lock_p lck;
1493 
1494   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1495 
1496 #if KMP_USE_DYNAMIC_LOCK
1497   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1498     lck = (kmp_user_lock_p)crit;
1499     KMP_ASSERT(lck != NULL);
1500     if (__kmp_env_consistency_check) {
1501       __kmp_pop_sync(global_tid, ct_critical, loc);
1502     }
1503 #if USE_ITT_BUILD
1504     __kmp_itt_critical_releasing(lck);
1505 #endif
1506 #if KMP_USE_INLINED_TAS
1507     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1508       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1509     } else
1510 #elif KMP_USE_INLINED_FUTEX
1511     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1512       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1513     } else
1514 #endif
1515     {
1516       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1517     }
1518   } else {
1519     kmp_indirect_lock_t *ilk =
1520         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1521     KMP_ASSERT(ilk != NULL);
1522     lck = ilk->lock;
1523     if (__kmp_env_consistency_check) {
1524       __kmp_pop_sync(global_tid, ct_critical, loc);
1525     }
1526 #if USE_ITT_BUILD
1527     __kmp_itt_critical_releasing(lck);
1528 #endif
1529     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1530   }
1531 
1532 #else // KMP_USE_DYNAMIC_LOCK
1533 
1534   if ((__kmp_user_lock_kind == lk_tas) &&
1535       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1536     lck = (kmp_user_lock_p)crit;
1537   }
1538 #if KMP_USE_FUTEX
1539   else if ((__kmp_user_lock_kind == lk_futex) &&
1540            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1541     lck = (kmp_user_lock_p)crit;
1542   }
1543 #endif
1544   else { // ticket, queuing or drdpa
1545     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1546   }
1547 
1548   KMP_ASSERT(lck != NULL);
1549 
1550   if (__kmp_env_consistency_check)
1551     __kmp_pop_sync(global_tid, ct_critical, loc);
1552 
1553 #if USE_ITT_BUILD
1554   __kmp_itt_critical_releasing(lck);
1555 #endif /* USE_ITT_BUILD */
  // The value of 'crit' can be used directly as the critical_id of the
  // critical section directive.
1558   __kmp_release_user_lock_with_checks(lck, global_tid);
1559 
1560 #endif // KMP_USE_DYNAMIC_LOCK
1561 
1562 #if OMPT_SUPPORT && OMPT_OPTIONAL
1563   /* OMPT release event triggers after lock is released; place here to trigger
1564    * for all #if branches */
1565   OMPT_STORE_RETURN_ADDRESS(global_tid);
1566   if (ompt_enabled.ompt_callback_mutex_released) {
1567     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1568         ompt_mutex_critical, (omp_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
1569   }
1570 #endif
1571 
1572   KMP_POP_PARTITIONED_TIMER();
1573   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1574 }
1575 
1576 /*!
1577 @ingroup SYNCHRONIZATION
1578 @param loc source location information
1579 @param global_tid thread id.
1580 @return one if the thread should execute the master block, zero otherwise
1581 
1582 Start execution of a combined barrier and master. The barrier is executed inside
1583 this function.
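
A minimal sketch of the pattern a compiler might emit (illustrative only; the
variables <tt>loc</tt> and <tt>gtid</tt> are assumed to be in scope):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // ... code executed only by the master thread ...
  __kmpc_end_barrier_master(&loc, gtid);
}
@endcode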
1584 */
1585 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1586   int status;
1587 
1588   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1589 
1590   if (!TCR_4(__kmp_init_parallel))
1591     __kmp_parallel_initialize();
1592 
1593   if (__kmp_env_consistency_check)
1594     __kmp_check_barrier(global_tid, ct_barrier, loc);
1595 
1596 #if OMPT_SUPPORT
1597   omp_frame_t *ompt_frame;
1598   if (ompt_enabled.enabled) {
1599     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1600     if (ompt_frame->enter_frame == NULL)
1601       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1602     OMPT_STORE_RETURN_ADDRESS(global_tid);
1603   }
1604 #endif
1605 #if USE_ITT_NOTIFY
1606   __kmp_threads[global_tid]->th.th_ident = loc;
1607 #endif
1608   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1609 #if OMPT_SUPPORT && OMPT_OPTIONAL
1610   if (ompt_enabled.enabled) {
1611     ompt_frame->enter_frame = NULL;
1612   }
1613 #endif
1614 
1615   return (status != 0) ? 0 : 1;
1616 }
1617 
1618 /*!
1619 @ingroup SYNCHRONIZATION
1620 @param loc source location information
1621 @param global_tid thread id.
1622 
1623 Complete the execution of a combined barrier and master. This function should
1624 only be called at the completion of the <tt>master</tt> code. Other threads will
1625 still be waiting at the barrier and this call releases them.
1626 */
1627 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1628   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1629 
1630   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1631 }
1632 
1633 /*!
1634 @ingroup SYNCHRONIZATION
1635 @param loc source location information
1636 @param global_tid thread id.
1637 @return one if the thread should execute the master block, zero otherwise
1638 
1639 Start execution of a combined barrier and master(nowait) construct.
1640 The barrier is executed inside this function.
There is no equivalent "end" function, since the construct needs no further
synchronization after the master block; the bookkeeping that an "end" call
would otherwise perform is done inside this function (see the sketch below).
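
A minimal sketch of the expected usage (illustrative only; <tt>loc</tt> and
<tt>gtid</tt> are assumed to be in scope):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // ... code executed only by the master thread; no matching "end" call ...
}
@endcode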
1642 */
1643 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1644   kmp_int32 ret;
1645 
1646   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1647 
1648   if (!TCR_4(__kmp_init_parallel))
1649     __kmp_parallel_initialize();
1650 
1651   if (__kmp_env_consistency_check) {
1652     if (loc == 0) {
1653       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1654     }
1655     __kmp_check_barrier(global_tid, ct_barrier, loc);
1656   }
1657 
1658 #if OMPT_SUPPORT
1659   omp_frame_t *ompt_frame;
1660   if (ompt_enabled.enabled) {
1661     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1662     if (ompt_frame->enter_frame == NULL)
1663       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1664     OMPT_STORE_RETURN_ADDRESS(global_tid);
1665   }
1666 #endif
1667 #if USE_ITT_NOTIFY
1668   __kmp_threads[global_tid]->th.th_ident = loc;
1669 #endif
1670   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1671 #if OMPT_SUPPORT && OMPT_OPTIONAL
1672   if (ompt_enabled.enabled) {
1673     ompt_frame->enter_frame = NULL;
1674   }
1675 #endif
1676 
1677   ret = __kmpc_master(loc, global_tid);
1678 
1679   if (__kmp_env_consistency_check) {
    /*  there is no __kmpc_end_master call, so the (stats)  */
    /*  actions of __kmpc_end_master are done here instead  */
1682 
1683     if (global_tid < 0) {
1684       KMP_WARNING(ThreadIdentInvalid);
1685     }
1686     if (ret) {
1687       /* only one thread should do the pop since only */
1688       /* one did the push (see __kmpc_master())       */
1689 
1690       __kmp_pop_sync(global_tid, ct_master, loc);
1691     }
1692   }
1693 
1694   return (ret);
1695 }
1696 
1697 /* The BARRIER for a SINGLE process section is always explicit   */
1698 /*!
1699 @ingroup WORK_SHARING
1700 @param loc  source location information
1701 @param global_tid  global thread number
1702 @return One if this thread should execute the single construct, zero otherwise.
1703 
1704 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
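
A minimal sketch of the calls a compiler might emit for a <tt>single</tt>
construct without a nowait clause (illustrative only; <tt>loc</tt> and
<tt>gtid</tt> are assumed to be in scope):
@code
if (__kmpc_single(&loc, gtid)) {
  // ... body of the single region, executed by one thread ...
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // explicit barrier; omitted if nowait was given
@endcode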
1707 */
1708 
1709 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1710   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1711 
1712   if (rc) {
1713     // We are going to execute the single statement, so we should count it.
1714     KMP_COUNT_BLOCK(OMP_SINGLE);
1715     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1716   }
1717 
1718 #if OMPT_SUPPORT && OMPT_OPTIONAL
1719   kmp_info_t *this_thr = __kmp_threads[global_tid];
1720   kmp_team_t *team = this_thr->th.th_team;
1721   int tid = __kmp_tid_from_gtid(global_tid);
1722 
1723   if (ompt_enabled.enabled) {
1724     if (rc) {
1725       if (ompt_enabled.ompt_callback_work) {
1726         ompt_callbacks.ompt_callback(ompt_callback_work)(
1727             ompt_work_single_executor, ompt_scope_begin,
1728             &(team->t.ompt_team_info.parallel_data),
1729             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1730             1, OMPT_GET_RETURN_ADDRESS(0));
1731       }
1732     } else {
1733       if (ompt_enabled.ompt_callback_work) {
1734         ompt_callbacks.ompt_callback(ompt_callback_work)(
1735             ompt_work_single_other, ompt_scope_begin,
1736             &(team->t.ompt_team_info.parallel_data),
1737             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1738             1, OMPT_GET_RETURN_ADDRESS(0));
1739         ompt_callbacks.ompt_callback(ompt_callback_work)(
1740             ompt_work_single_other, ompt_scope_end,
1741             &(team->t.ompt_team_info.parallel_data),
1742             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1743             1, OMPT_GET_RETURN_ADDRESS(0));
1744       }
1745     }
1746   }
1747 #endif
1748 
1749   return rc;
1750 }
1751 
1752 /*!
1753 @ingroup WORK_SHARING
1754 @param loc  source location information
1755 @param global_tid  global thread number
1756 
1757 Mark the end of a <tt>single</tt> construct.  This function should
1758 only be called by the thread that executed the block of code protected
1759 by the `single` construct.
1760 */
1761 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1762   __kmp_exit_single(global_tid);
1763   KMP_POP_PARTITIONED_TIMER();
1764 
1765 #if OMPT_SUPPORT && OMPT_OPTIONAL
1766   kmp_info_t *this_thr = __kmp_threads[global_tid];
1767   kmp_team_t *team = this_thr->th.th_team;
1768   int tid = __kmp_tid_from_gtid(global_tid);
1769 
1770   if (ompt_enabled.ompt_callback_work) {
1771     ompt_callbacks.ompt_callback(ompt_callback_work)(
1772         ompt_work_single_executor, ompt_scope_end,
1773         &(team->t.ompt_team_info.parallel_data),
1774         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1775         OMPT_GET_RETURN_ADDRESS(0));
1776   }
1777 #endif
1778 }
1779 
1780 /*!
1781 @ingroup WORK_SHARING
1782 @param loc Source location
1783 @param global_tid Global thread id
1784 
1785 Mark the end of a statically scheduled loop.
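
A minimal sketch of how this call pairs with a static-loop initializer for a
loop of 100 iterations (illustrative only; the use of the 32-bit variant
__kmpc_for_static_init_4 and the literal bounds are assumptions):
@code
kmp_int32 last, lb = 0, ub = 99, st = 1;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lb, &ub, &st, 1, 0);
for (kmp_int32 i = lb; i <= ub; ++i) {
  // ... loop body for this thread's chunk ...
}
__kmpc_for_static_fini(&loc, gtid);
@endcode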
1786 */
1787 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1788   KMP_POP_PARTITIONED_TIMER();
1789   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1790 
1791 #if OMPT_SUPPORT && OMPT_OPTIONAL
1792   if (ompt_enabled.ompt_callback_work) {
1793     ompt_work_t ompt_work_type = ompt_work_loop;
1794     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1795     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1796     // Determine workshare type
1797     if (loc != NULL) {
1798       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1799         ompt_work_type = ompt_work_loop;
1800       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1801         ompt_work_type = ompt_work_sections;
1802       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1803         ompt_work_type = ompt_work_distribute;
1804       } else {
1805         // use default set above.
1806         // a warning about this case is provided in __kmpc_for_static_init
1807       }
1808       KMP_DEBUG_ASSERT(ompt_work_type);
1809     }
1810     ompt_callbacks.ompt_callback(ompt_callback_work)(
1811         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1812         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1813   }
1814 #endif
1815   if (__kmp_env_consistency_check)
1816     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1817 }
1818 
// User routines that take C-style (call by value) arguments, unlike the
// corresponding Fortran routines
1821 
1822 void ompc_set_num_threads(int arg) {
1823   // !!!!! TODO: check the per-task binding
1824   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1825 }
1826 
1827 void ompc_set_dynamic(int flag) {
1828   kmp_info_t *thread;
1829 
1830   /* For the thread-private implementation of the internal controls */
1831   thread = __kmp_entry_thread();
1832 
1833   __kmp_save_internal_controls(thread);
1834 
1835   set__dynamic(thread, flag ? TRUE : FALSE);
1836 }
1837 
1838 void ompc_set_nested(int flag) {
1839   kmp_info_t *thread;
1840 
1841   /* For the thread-private internal controls implementation */
1842   thread = __kmp_entry_thread();
1843 
1844   __kmp_save_internal_controls(thread);
1845 
1846   set__nested(thread, flag ? TRUE : FALSE);
1847 }
1848 
1849 void ompc_set_max_active_levels(int max_active_levels) {
1850   /* TO DO */
1851   /* we want per-task implementation of this internal control */
1852 
1853   /* For the per-thread internal controls implementation */
1854   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1855 }
1856 
1857 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1858   // !!!!! TODO: check the per-task binding
1859   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1860 }
1861 
1862 int ompc_get_ancestor_thread_num(int level) {
1863   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1864 }
1865 
1866 int ompc_get_team_size(int level) {
1867   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1868 }
1869 
1870 void kmpc_set_stacksize(int arg) {
1871   // __kmp_aux_set_stacksize initializes the library if needed
1872   __kmp_aux_set_stacksize(arg);
1873 }
1874 
1875 void kmpc_set_stacksize_s(size_t arg) {
1876   // __kmp_aux_set_stacksize initializes the library if needed
1877   __kmp_aux_set_stacksize(arg);
1878 }
1879 
1880 void kmpc_set_blocktime(int arg) {
1881   int gtid, tid;
1882   kmp_info_t *thread;
1883 
1884   gtid = __kmp_entry_gtid();
1885   tid = __kmp_tid_from_gtid(gtid);
1886   thread = __kmp_thread_from_gtid(gtid);
1887 
1888   __kmp_aux_set_blocktime(arg, thread, tid);
1889 }
1890 
1891 void kmpc_set_library(int arg) {
1892   // __kmp_user_set_library initializes the library if needed
1893   __kmp_user_set_library((enum library_type)arg);
1894 }
1895 
1896 void kmpc_set_defaults(char const *str) {
1897   // __kmp_aux_set_defaults initializes the library if needed
1898   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1899 }
1900 
1901 void kmpc_set_disp_num_buffers(int arg) {
1902   // ignore after initialization because some teams have already
1903   // allocated dispatch buffers
1904   if (__kmp_init_serial == 0 && arg > 0)
1905     __kmp_dispatch_num_buffers = arg;
1906 }
1907 
1908 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1909 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1910   return -1;
1911 #else
1912   if (!TCR_4(__kmp_init_middle)) {
1913     __kmp_middle_initialize();
1914   }
1915   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1916 #endif
1917 }
1918 
1919 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1920 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1921   return -1;
1922 #else
1923   if (!TCR_4(__kmp_init_middle)) {
1924     __kmp_middle_initialize();
1925   }
1926   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1927 #endif
1928 }
1929 
1930 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1931 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1932   return -1;
1933 #else
1934   if (!TCR_4(__kmp_init_middle)) {
1935     __kmp_middle_initialize();
1936   }
1937   return __kmp_aux_get_affinity_mask_proc(proc, mask);
1938 #endif
1939 }
1940 
1941 /* -------------------------------------------------------------------------- */
1942 /*!
1943 @ingroup THREADPRIVATE
1944 @param loc       source location information
1945 @param gtid      global thread number
1946 @param cpy_size  size of the cpy_data buffer
1947 @param cpy_data  pointer to data to be copied
1948 @param cpy_func  helper function to call for copying data
1949 @param didit     flag variable: 1=single thread; 0=not single thread
1950 
1951 __kmpc_copyprivate implements the interface for the private data broadcast
1952 needed for the copyprivate clause associated with a single region in an
1953 OpenMP<sup>*</sup> program (both C and Fortran).
1954 All threads participating in the parallel region call this routine.
1955 One of the threads (called the single thread) should have the <tt>didit</tt>
1956 variable set to 1 and all other threads should have that variable set to 0.
1957 All threads pass a pointer to a data buffer (cpy_data) that they have built.
1958 
1959 The OpenMP specification forbids the use of nowait on the single region when a
1960 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
1961 barrier internally to avoid race conditions, so the code generation for the
1962 single region should avoid generating a barrier after the call to @ref
1963 __kmpc_copyprivate.
1964 
1965 The <tt>gtid</tt> parameter is the global thread id for the current thread.
1966 The <tt>loc</tt> parameter is a pointer to source location information.
1967 
1968 Internal implementation: The single thread will first copy its descriptor
1969 address (cpy_data) to a team-private location, then the other threads will each
1970 call the function pointed to by the parameter cpy_func, which carries out the
1971 copy by copying the data using the cpy_data buffer.
1972 
1973 The cpy_func routine used for the copy and the contents of the data area defined
1974 by cpy_data and cpy_size may be built in any fashion that will allow the copy
1975 to be done. For instance, the cpy_data buffer can hold the actual data to be
1976 copied or it may hold a list of pointers to the data. The cpy_func routine must
1977 interpret the cpy_data buffer appropriately.
1978 
1979 The interface to cpy_func is as follows:
1980 @code
1981 void cpy_func( void *destination, void *source )
1982 @endcode
1983 where void *destination is the cpy_data pointer for the thread being copied to
1984 and void *source is the cpy_data pointer for the thread being copied from.
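
A hedged sketch of a helper the compiler might generate when two variables, x
and y, appear in the copyprivate list (all names here are illustrative):
@code
typedef struct { int *x; double *y; } cpy_data_t; // one pointer per listed item

static void cpy_func(void *destination, void *source) {
  cpy_data_t *dst = (cpy_data_t *)destination;
  cpy_data_t *src = (cpy_data_t *)source;
  *dst->x = *src->x; // copy each item from the single thread's instance
  *dst->y = *src->y;
}
@endcode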
1985 */
1986 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
1987                         void *cpy_data, void (*cpy_func)(void *, void *),
1988                         kmp_int32 didit) {
1989   void **data_ptr;
1990 
1991   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
1992 
1993   KMP_MB();
1994 
1995   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
1996 
1997   if (__kmp_env_consistency_check) {
1998     if (loc == 0) {
1999       KMP_WARNING(ConstructIdentInvalid);
2000     }
2001   }
2002 
2003   // ToDo: Optimize the following two barriers into some kind of split barrier
2004 
2005   if (didit)
2006     *data_ptr = cpy_data;
2007 
2008 #if OMPT_SUPPORT
2009   omp_frame_t *ompt_frame;
2010   if (ompt_enabled.enabled) {
2011     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2012     if (ompt_frame->enter_frame == NULL)
2013       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
2014     OMPT_STORE_RETURN_ADDRESS(gtid);
2015   }
2016 #endif
2017 /* This barrier is not a barrier region boundary */
2018 #if USE_ITT_NOTIFY
2019   __kmp_threads[gtid]->th.th_ident = loc;
2020 #endif
2021   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2022 
2023   if (!didit)
2024     (*cpy_func)(cpy_data, *data_ptr);
2025 
2026 // Consider next barrier a user-visible barrier for barrier region boundaries
2027 // Nesting checks are already handled by the single construct checks
2028 
2029 #if OMPT_SUPPORT
2030   if (ompt_enabled.enabled) {
2031     OMPT_STORE_RETURN_ADDRESS(gtid);
2032   }
2033 #endif
2034 #if USE_ITT_NOTIFY
2035   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2036 // tasks can overwrite the location)
2037 #endif
2038   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2039 #if OMPT_SUPPORT && OMPT_OPTIONAL
2040   if (ompt_enabled.enabled) {
2041     ompt_frame->enter_frame = NULL;
2042   }
2043 #endif
2044 }
2045 
2046 /* -------------------------------------------------------------------------- */
2047 
2048 #define INIT_LOCK __kmp_init_user_lock_with_checks
2049 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2050 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2051 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2052 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2053 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2054   __kmp_acquire_nested_user_lock_with_checks_timed
2055 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2056 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2057 #define TEST_LOCK __kmp_test_user_lock_with_checks
2058 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2059 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2060 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2061 
2062 // TODO: Make check abort messages use location info & pass it into
2063 // with_checks routines
2064 
2065 #if KMP_USE_DYNAMIC_LOCK
2066 
2067 // internal lock initializer
2068 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2069                                                     kmp_dyna_lockseq_t seq) {
2070   if (KMP_IS_D_LOCK(seq)) {
2071     KMP_INIT_D_LOCK(lock, seq);
2072 #if USE_ITT_BUILD
2073     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2074 #endif
2075   } else {
2076     KMP_INIT_I_LOCK(lock, seq);
2077 #if USE_ITT_BUILD
2078     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2079     __kmp_itt_lock_creating(ilk->lock, loc);
2080 #endif
2081   }
2082 }
2083 
2084 // internal nest lock initializer
2085 static __forceinline void
2086 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2087                                kmp_dyna_lockseq_t seq) {
2088 #if KMP_USE_TSX
2089   // Don't have nested lock implementation for speculative locks
2090   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2091     seq = __kmp_user_lock_seq;
2092 #endif
2093   switch (seq) {
2094   case lockseq_tas:
2095     seq = lockseq_nested_tas;
2096     break;
2097 #if KMP_USE_FUTEX
2098   case lockseq_futex:
2099     seq = lockseq_nested_futex;
2100     break;
2101 #endif
2102   case lockseq_ticket:
2103     seq = lockseq_nested_ticket;
2104     break;
2105   case lockseq_queuing:
2106     seq = lockseq_nested_queuing;
2107     break;
2108   case lockseq_drdpa:
2109     seq = lockseq_nested_drdpa;
2110     break;
2111   default:
2112     seq = lockseq_nested_queuing;
2113   }
2114   KMP_INIT_I_LOCK(lock, seq);
2115 #if USE_ITT_BUILD
2116   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2117   __kmp_itt_lock_creating(ilk->lock, loc);
2118 #endif
2119 }
2120 
2121 /* initialize the lock with a hint */
2122 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2123                                 uintptr_t hint) {
2124   KMP_DEBUG_ASSERT(__kmp_init_serial);
2125   if (__kmp_env_consistency_check && user_lock == NULL) {
2126     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2127   }
2128 
2129   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2130 
2131 #if OMPT_SUPPORT && OMPT_OPTIONAL
2132   // This is the case, if called from omp_init_lock_with_hint:
2133   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2134   if (!codeptr)
2135     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2136   if (ompt_enabled.ompt_callback_lock_init) {
2137     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2138         ompt_mutex_lock, (omp_lock_hint_t)hint,
2139         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2140         codeptr);
2141   }
2142 #endif
2143 }
2144 
2145 /* initialize the lock with a hint */
2146 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2147                                      void **user_lock, uintptr_t hint) {
2148   KMP_DEBUG_ASSERT(__kmp_init_serial);
2149   if (__kmp_env_consistency_check && user_lock == NULL) {
2150     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2151   }
2152 
2153   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2154 
2155 #if OMPT_SUPPORT && OMPT_OPTIONAL
2156   // This is the case, if called from omp_init_lock_with_hint:
2157   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2158   if (!codeptr)
2159     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2160   if (ompt_enabled.ompt_callback_lock_init) {
2161     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2162         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2163         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2164         codeptr);
2165   }
2166 #endif
2167 }
2168 
2169 #endif // KMP_USE_DYNAMIC_LOCK
2170 
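// A hedged sketch of the call sequence the lock entry points below typically
// serve (the names 'loc', 'gtid' and 'lck' are illustrative; 'lck' stands for
// the user's omp_lock_t object, which reaches the runtime as a void **):
//   __kmpc_init_lock(&loc, gtid, (void **)&lck);
//   __kmpc_set_lock(&loc, gtid, (void **)&lck);
//   /* ... work while holding the lock ... */
//   __kmpc_unset_lock(&loc, gtid, (void **)&lck);
//   __kmpc_destroy_lock(&loc, gtid, (void **)&lck);
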
2171 /* initialize the lock */
2172 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2173 #if KMP_USE_DYNAMIC_LOCK
2174 
2175   KMP_DEBUG_ASSERT(__kmp_init_serial);
2176   if (__kmp_env_consistency_check && user_lock == NULL) {
2177     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2178   }
2179   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2180 
2181 #if OMPT_SUPPORT && OMPT_OPTIONAL
2182   // This is the case, if called from omp_init_lock_with_hint:
2183   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2184   if (!codeptr)
2185     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2186   if (ompt_enabled.ompt_callback_lock_init) {
2187     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2188         ompt_mutex_lock, omp_lock_hint_none,
2189         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2190         codeptr);
2191   }
2192 #endif
2193 
2194 #else // KMP_USE_DYNAMIC_LOCK
2195 
2196   static char const *const func = "omp_init_lock";
2197   kmp_user_lock_p lck;
2198   KMP_DEBUG_ASSERT(__kmp_init_serial);
2199 
2200   if (__kmp_env_consistency_check) {
2201     if (user_lock == NULL) {
2202       KMP_FATAL(LockIsUninitialized, func);
2203     }
2204   }
2205 
2206   KMP_CHECK_USER_LOCK_INIT();
2207 
2208   if ((__kmp_user_lock_kind == lk_tas) &&
2209       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2210     lck = (kmp_user_lock_p)user_lock;
2211   }
2212 #if KMP_USE_FUTEX
2213   else if ((__kmp_user_lock_kind == lk_futex) &&
2214            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2215     lck = (kmp_user_lock_p)user_lock;
2216   }
2217 #endif
2218   else {
2219     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2220   }
2221   INIT_LOCK(lck);
2222   __kmp_set_user_lock_location(lck, loc);
2223 
2224 #if OMPT_SUPPORT && OMPT_OPTIONAL
2225   // This is the case, if called from omp_init_lock_with_hint:
2226   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2227   if (!codeptr)
2228     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2229   if (ompt_enabled.ompt_callback_lock_init) {
2230     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2231         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2232         (omp_wait_id_t)user_lock, codeptr);
2233   }
2234 #endif
2235 
2236 #if USE_ITT_BUILD
2237   __kmp_itt_lock_creating(lck);
2238 #endif /* USE_ITT_BUILD */
2239 
2240 #endif // KMP_USE_DYNAMIC_LOCK
2241 } // __kmpc_init_lock
2242 
2243 /* initialize the lock */
2244 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2245 #if KMP_USE_DYNAMIC_LOCK
2246 
2247   KMP_DEBUG_ASSERT(__kmp_init_serial);
2248   if (__kmp_env_consistency_check && user_lock == NULL) {
2249     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2250   }
2251   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2252 
2253 #if OMPT_SUPPORT && OMPT_OPTIONAL
2254   // This is the case, if called from omp_init_lock_with_hint:
2255   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2256   if (!codeptr)
2257     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2258   if (ompt_enabled.ompt_callback_lock_init) {
2259     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2260         ompt_mutex_nest_lock, omp_lock_hint_none,
2261         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2262         codeptr);
2263   }
2264 #endif
2265 
2266 #else // KMP_USE_DYNAMIC_LOCK
2267 
2268   static char const *const func = "omp_init_nest_lock";
2269   kmp_user_lock_p lck;
2270   KMP_DEBUG_ASSERT(__kmp_init_serial);
2271 
2272   if (__kmp_env_consistency_check) {
2273     if (user_lock == NULL) {
2274       KMP_FATAL(LockIsUninitialized, func);
2275     }
2276   }
2277 
2278   KMP_CHECK_USER_LOCK_INIT();
2279 
2280   if ((__kmp_user_lock_kind == lk_tas) &&
2281       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2282        OMP_NEST_LOCK_T_SIZE)) {
2283     lck = (kmp_user_lock_p)user_lock;
2284   }
2285 #if KMP_USE_FUTEX
2286   else if ((__kmp_user_lock_kind == lk_futex) &&
2287            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2288             OMP_NEST_LOCK_T_SIZE)) {
2289     lck = (kmp_user_lock_p)user_lock;
2290   }
2291 #endif
2292   else {
2293     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2294   }
2295 
2296   INIT_NESTED_LOCK(lck);
2297   __kmp_set_user_lock_location(lck, loc);
2298 
2299 #if OMPT_SUPPORT && OMPT_OPTIONAL
2300   // This is the case, if called from omp_init_lock_with_hint:
2301   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2302   if (!codeptr)
2303     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2304   if (ompt_enabled.ompt_callback_lock_init) {
2305     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2306         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2307         (omp_wait_id_t)user_lock, codeptr);
2308   }
2309 #endif
2310 
2311 #if USE_ITT_BUILD
2312   __kmp_itt_lock_creating(lck);
2313 #endif /* USE_ITT_BUILD */
2314 
2315 #endif // KMP_USE_DYNAMIC_LOCK
2316 } // __kmpc_init_nest_lock
2317 
2318 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2319 #if KMP_USE_DYNAMIC_LOCK
2320 
2321 #if USE_ITT_BUILD
2322   kmp_user_lock_p lck;
2323   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2324     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2325   } else {
2326     lck = (kmp_user_lock_p)user_lock;
2327   }
2328   __kmp_itt_lock_destroyed(lck);
2329 #endif
2330 #if OMPT_SUPPORT && OMPT_OPTIONAL
2331   // This is the case, if called from omp_init_lock_with_hint:
2332   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2333   if (!codeptr)
2334     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2335   if (ompt_enabled.ompt_callback_lock_destroy) {
2336     kmp_user_lock_p lck;
2337     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2338       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2339     } else {
2340       lck = (kmp_user_lock_p)user_lock;
2341     }
2342     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2343         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2344   }
2345 #endif
2346   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2347 #else
2348   kmp_user_lock_p lck;
2349 
2350   if ((__kmp_user_lock_kind == lk_tas) &&
2351       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2352     lck = (kmp_user_lock_p)user_lock;
2353   }
2354 #if KMP_USE_FUTEX
2355   else if ((__kmp_user_lock_kind == lk_futex) &&
2356            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2357     lck = (kmp_user_lock_p)user_lock;
2358   }
2359 #endif
2360   else {
2361     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2362   }
2363 
2364 #if OMPT_SUPPORT && OMPT_OPTIONAL
2365   // This is the case, if called from omp_init_lock_with_hint:
2366   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2367   if (!codeptr)
2368     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2369   if (ompt_enabled.ompt_callback_lock_destroy) {
2370     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2371         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2372   }
2373 #endif
2374 
2375 #if USE_ITT_BUILD
2376   __kmp_itt_lock_destroyed(lck);
2377 #endif /* USE_ITT_BUILD */
2378   DESTROY_LOCK(lck);
2379 
2380   if ((__kmp_user_lock_kind == lk_tas) &&
2381       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2382     ;
2383   }
2384 #if KMP_USE_FUTEX
2385   else if ((__kmp_user_lock_kind == lk_futex) &&
2386            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2387     ;
2388   }
2389 #endif
2390   else {
2391     __kmp_user_lock_free(user_lock, gtid, lck);
2392   }
2393 #endif // KMP_USE_DYNAMIC_LOCK
2394 } // __kmpc_destroy_lock
2395 
2396 /* destroy the lock */
2397 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2398 #if KMP_USE_DYNAMIC_LOCK
2399 
2400 #if USE_ITT_BUILD
2401   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2402   __kmp_itt_lock_destroyed(ilk->lock);
2403 #endif
2404 #if OMPT_SUPPORT && OMPT_OPTIONAL
2405   // This is the case, if called from omp_init_lock_with_hint:
2406   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2407   if (!codeptr)
2408     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2409   if (ompt_enabled.ompt_callback_lock_destroy) {
2410     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2411         ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2412   }
2413 #endif
2414   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2415 
2416 #else // KMP_USE_DYNAMIC_LOCK
2417 
2418   kmp_user_lock_p lck;
2419 
2420   if ((__kmp_user_lock_kind == lk_tas) &&
2421       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2422        OMP_NEST_LOCK_T_SIZE)) {
2423     lck = (kmp_user_lock_p)user_lock;
2424   }
2425 #if KMP_USE_FUTEX
2426   else if ((__kmp_user_lock_kind == lk_futex) &&
2427            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2428             OMP_NEST_LOCK_T_SIZE)) {
2429     lck = (kmp_user_lock_p)user_lock;
2430   }
2431 #endif
2432   else {
2433     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2434   }
2435 
2436 #if OMPT_SUPPORT && OMPT_OPTIONAL
2437   // This is the case, if called from omp_init_lock_with_hint:
2438   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2439   if (!codeptr)
2440     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2441   if (ompt_enabled.ompt_callback_lock_destroy) {
2442     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2443         ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2444   }
2445 #endif
2446 
2447 #if USE_ITT_BUILD
2448   __kmp_itt_lock_destroyed(lck);
2449 #endif /* USE_ITT_BUILD */
2450 
2451   DESTROY_NESTED_LOCK(lck);
2452 
2453   if ((__kmp_user_lock_kind == lk_tas) &&
2454       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2455        OMP_NEST_LOCK_T_SIZE)) {
2456     ;
2457   }
2458 #if KMP_USE_FUTEX
2459   else if ((__kmp_user_lock_kind == lk_futex) &&
2460            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2461             OMP_NEST_LOCK_T_SIZE)) {
2462     ;
2463   }
2464 #endif
2465   else {
2466     __kmp_user_lock_free(user_lock, gtid, lck);
2467   }
2468 #endif // KMP_USE_DYNAMIC_LOCK
2469 } // __kmpc_destroy_nest_lock
2470 
2471 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2472   KMP_COUNT_BLOCK(OMP_set_lock);
2473 #if KMP_USE_DYNAMIC_LOCK
2474   int tag = KMP_EXTRACT_D_TAG(user_lock);
2475 #if USE_ITT_BUILD
2476   __kmp_itt_lock_acquiring(
2477       (kmp_user_lock_p)
2478           user_lock); // itt function will get to the right lock object.
2479 #endif
2480 #if OMPT_SUPPORT && OMPT_OPTIONAL
2481   // This is the case, if called from omp_init_lock_with_hint:
2482   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2483   if (!codeptr)
2484     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2485   if (ompt_enabled.ompt_callback_mutex_acquire) {
2486     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2487         ompt_mutex_lock, omp_lock_hint_none,
2488         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2489         codeptr);
2490   }
2491 #endif
2492 #if KMP_USE_INLINED_TAS
2493   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2494     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2495   } else
2496 #elif KMP_USE_INLINED_FUTEX
2497   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2498     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2499   } else
2500 #endif
2501   {
2502     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2503   }
2504 #if USE_ITT_BUILD
2505   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2506 #endif
2507 #if OMPT_SUPPORT && OMPT_OPTIONAL
2508   if (ompt_enabled.ompt_callback_mutex_acquired) {
2509     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2510         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2511   }
2512 #endif
2513 
2514 #else // KMP_USE_DYNAMIC_LOCK
2515 
2516   kmp_user_lock_p lck;
2517 
2518   if ((__kmp_user_lock_kind == lk_tas) &&
2519       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2520     lck = (kmp_user_lock_p)user_lock;
2521   }
2522 #if KMP_USE_FUTEX
2523   else if ((__kmp_user_lock_kind == lk_futex) &&
2524            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2525     lck = (kmp_user_lock_p)user_lock;
2526   }
2527 #endif
2528   else {
2529     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2530   }
2531 
2532 #if USE_ITT_BUILD
2533   __kmp_itt_lock_acquiring(lck);
2534 #endif /* USE_ITT_BUILD */
2535 #if OMPT_SUPPORT && OMPT_OPTIONAL
2536   // This is the case, if called from omp_init_lock_with_hint:
2537   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2538   if (!codeptr)
2539     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2540   if (ompt_enabled.ompt_callback_mutex_acquire) {
2541     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2542         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2543         (omp_wait_id_t)lck, codeptr);
2544   }
2545 #endif
2546 
2547   ACQUIRE_LOCK(lck, gtid);
2548 
2549 #if USE_ITT_BUILD
2550   __kmp_itt_lock_acquired(lck);
2551 #endif /* USE_ITT_BUILD */
2552 
2553 #if OMPT_SUPPORT && OMPT_OPTIONAL
2554   if (ompt_enabled.ompt_callback_mutex_acquired) {
2555     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2556         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2557   }
2558 #endif
2559 
2560 #endif // KMP_USE_DYNAMIC_LOCK
2561 }
2562 
2563 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2564 #if KMP_USE_DYNAMIC_LOCK
2565 
2566 #if USE_ITT_BUILD
2567   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2568 #endif
2569 #if OMPT_SUPPORT && OMPT_OPTIONAL
2570   // This is the case, if called from omp_init_lock_with_hint:
2571   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2572   if (!codeptr)
2573     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2574   if (ompt_enabled.enabled) {
2575     if (ompt_enabled.ompt_callback_mutex_acquire) {
2576       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2577           ompt_mutex_nest_lock, omp_lock_hint_none,
2578           __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2579           codeptr);
2580     }
2581   }
2582 #endif
2583   int acquire_status =
2584       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2585   (void) acquire_status;
2586 #if USE_ITT_BUILD
2587   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2588 #endif
2589 
2590 #if OMPT_SUPPORT && OMPT_OPTIONAL
2591   if (ompt_enabled.enabled) {
2592     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2593       if (ompt_enabled.ompt_callback_mutex_acquired) {
2594         // lock_first
2595         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2596             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2597       }
2598     } else {
2599       if (ompt_enabled.ompt_callback_nest_lock) {
2600         // lock_next
2601         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2602             ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
2603       }
2604     }
2605   }
2606 #endif
2607 
2608 #else // KMP_USE_DYNAMIC_LOCK
2609   int acquire_status;
2610   kmp_user_lock_p lck;
2611 
2612   if ((__kmp_user_lock_kind == lk_tas) &&
2613       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2614        OMP_NEST_LOCK_T_SIZE)) {
2615     lck = (kmp_user_lock_p)user_lock;
2616   }
2617 #if KMP_USE_FUTEX
2618   else if ((__kmp_user_lock_kind == lk_futex) &&
2619            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2620             OMP_NEST_LOCK_T_SIZE)) {
2621     lck = (kmp_user_lock_p)user_lock;
2622   }
2623 #endif
2624   else {
2625     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2626   }
2627 
2628 #if USE_ITT_BUILD
2629   __kmp_itt_lock_acquiring(lck);
2630 #endif /* USE_ITT_BUILD */
2631 #if OMPT_SUPPORT && OMPT_OPTIONAL
2632   // This is the case, if called from omp_init_lock_with_hint:
2633   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2634   if (!codeptr)
2635     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2636   if (ompt_enabled.enabled) {
2637     if (ompt_enabled.ompt_callback_mutex_acquire) {
2638       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2639           ompt_mutex_nest_lock, omp_lock_hint_none,
2640           __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
2641     }
2642   }
2643 #endif
2644 
2645   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2646 
2647 #if USE_ITT_BUILD
2648   __kmp_itt_lock_acquired(lck);
2649 #endif /* USE_ITT_BUILD */
2650 
2651 #if OMPT_SUPPORT && OMPT_OPTIONAL
2652   if (ompt_enabled.enabled) {
2653     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2654       if (ompt_enabled.ompt_callback_mutex_acquired) {
2655         // lock_first
2656         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2657             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2658       }
2659     } else {
2660       if (ompt_enabled.ompt_callback_nest_lock) {
2661         // lock_next
2662         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2663             ompt_scope_begin, (omp_wait_id_t)lck, codeptr);
2664       }
2665     }
2666   }
2667 #endif
2668 
2669 #endif // KMP_USE_DYNAMIC_LOCK
2670 }
2671 
2672 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2673 #if KMP_USE_DYNAMIC_LOCK
2674 
2675   int tag = KMP_EXTRACT_D_TAG(user_lock);
2676 #if USE_ITT_BUILD
2677   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2678 #endif
2679 #if KMP_USE_INLINED_TAS
2680   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2681     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2682   } else
2683 #elif KMP_USE_INLINED_FUTEX
2684   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2685     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2686   } else
2687 #endif
2688   {
2689     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2690   }
2691 
2692 #if OMPT_SUPPORT && OMPT_OPTIONAL
2693   // This is the case, if called from omp_init_lock_with_hint:
2694   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2695   if (!codeptr)
2696     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2697   if (ompt_enabled.ompt_callback_mutex_released) {
2698     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2699         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2700   }
2701 #endif
2702 
2703 #else // KMP_USE_DYNAMIC_LOCK
2704 
2705   kmp_user_lock_p lck;
2706 
2707   /* Can't use serial interval since not block structured */
2708   /* release the lock */
2709 
2710   if ((__kmp_user_lock_kind == lk_tas) &&
2711       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2712 #if KMP_OS_LINUX &&                                                            \
2713     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2714 // "fast" path implemented to fix customer performance issue
2715 #if USE_ITT_BUILD
2716     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2717 #endif /* USE_ITT_BUILD */
2718     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2719     KMP_MB();
2720 
2721 #if OMPT_SUPPORT && OMPT_OPTIONAL
2722     // This is the case, if called from omp_init_lock_with_hint:
2723     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2724     if (!codeptr)
2725       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2726     if (ompt_enabled.ompt_callback_mutex_released) {
2727       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2728           ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2729     }
2730 #endif
2731 
2732     return;
2733 #else
2734     lck = (kmp_user_lock_p)user_lock;
2735 #endif
2736   }
2737 #if KMP_USE_FUTEX
2738   else if ((__kmp_user_lock_kind == lk_futex) &&
2739            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2740     lck = (kmp_user_lock_p)user_lock;
2741   }
2742 #endif
2743   else {
2744     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2745   }
2746 
2747 #if USE_ITT_BUILD
2748   __kmp_itt_lock_releasing(lck);
2749 #endif /* USE_ITT_BUILD */
2750 
2751   RELEASE_LOCK(lck, gtid);
2752 
2753 #if OMPT_SUPPORT && OMPT_OPTIONAL
2754   // This is the case, if called from omp_init_lock_with_hint:
2755   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2756   if (!codeptr)
2757     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2758   if (ompt_enabled.ompt_callback_mutex_released) {
2759     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2760         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2761   }
2762 #endif
2763 
2764 #endif // KMP_USE_DYNAMIC_LOCK
2765 }
2766 
2767 /* release the lock */
2768 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2769 #if KMP_USE_DYNAMIC_LOCK
2770 
2771 #if USE_ITT_BUILD
2772   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2773 #endif
2774   int release_status =
2775       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2776   (void) release_status;
2777 
2778 #if OMPT_SUPPORT && OMPT_OPTIONAL
2779   // This is the case, if called from omp_init_lock_with_hint:
2780   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2781   if (!codeptr)
2782     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2783   if (ompt_enabled.enabled) {
2784     if (release_status == KMP_LOCK_RELEASED) {
2785       if (ompt_enabled.ompt_callback_mutex_released) {
2786         // release_lock_last
2787         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2788             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2789       }
2790     } else if (ompt_enabled.ompt_callback_nest_lock) {
2791       // release_lock_prev
2792       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2793           ompt_scope_end, (omp_wait_id_t)user_lock, codeptr);
2794     }
2795   }
2796 #endif
2797 
2798 #else // KMP_USE_DYNAMIC_LOCK
2799 
2800   kmp_user_lock_p lck;
2801 
2802   /* Can't use serial interval since not block structured */
2803 
2804   if ((__kmp_user_lock_kind == lk_tas) &&
2805       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2806        OMP_NEST_LOCK_T_SIZE)) {
2807 #if KMP_OS_LINUX &&                                                            \
2808     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2809     // "fast" path implemented to fix customer performance issue
2810     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2811 #if USE_ITT_BUILD
2812     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2813 #endif /* USE_ITT_BUILD */
2814 
2815 #if OMPT_SUPPORT && OMPT_OPTIONAL
2816     int release_status = KMP_LOCK_STILL_HELD;
2817 #endif
2818 
2819     if (--(tl->lk.depth_locked) == 0) {
2820       TCW_4(tl->lk.poll, 0);
2821 #if OMPT_SUPPORT && OMPT_OPTIONAL
2822       release_status = KMP_LOCK_RELEASED;
2823 #endif
2824     }
2825     KMP_MB();
2826 
2827 #if OMPT_SUPPORT && OMPT_OPTIONAL
2828     // This is the case, if called from omp_init_lock_with_hint:
2829     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2830     if (!codeptr)
2831       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2832     if (ompt_enabled.enabled) {
2833       if (release_status == KMP_LOCK_RELEASED) {
2834         if (ompt_enabled.ompt_callback_mutex_released) {
2835           // release_lock_last
2836           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2837               ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2838         }
2839       } else if (ompt_enabled.ompt_callback_nest_lock) {
2840         // release_lock_previous
2841         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (omp_wait_id_t)lck, codeptr);
2843       }
2844     }
2845 #endif
2846 
2847     return;
2848 #else
2849     lck = (kmp_user_lock_p)user_lock;
2850 #endif
2851   }
2852 #if KMP_USE_FUTEX
2853   else if ((__kmp_user_lock_kind == lk_futex) &&
2854            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2855             OMP_NEST_LOCK_T_SIZE)) {
2856     lck = (kmp_user_lock_p)user_lock;
2857   }
2858 #endif
2859   else {
2860     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2861   }
2862 
2863 #if USE_ITT_BUILD
2864   __kmp_itt_lock_releasing(lck);
2865 #endif /* USE_ITT_BUILD */
2866 
2867   int release_status;
2868   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2869 #if OMPT_SUPPORT && OMPT_OPTIONAL
2870   // This is the case, if called from omp_init_lock_with_hint:
2871   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2872   if (!codeptr)
2873     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2874   if (ompt_enabled.enabled) {
2875     if (release_status == KMP_LOCK_RELEASED) {
2876       if (ompt_enabled.ompt_callback_mutex_released) {
2877         // release_lock_last
2878         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2879             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2880       }
2881     } else if (ompt_enabled.ompt_callback_nest_lock) {
2882       // release_lock_previous
2883       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (omp_wait_id_t)lck, codeptr);
2885     }
2886   }
2887 #endif
2888 
2889 #endif // KMP_USE_DYNAMIC_LOCK
2890 }
2891 
2892 /* try to acquire the lock */
2893 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2894   KMP_COUNT_BLOCK(OMP_test_lock);
2895 
2896 #if KMP_USE_DYNAMIC_LOCK
2897   int rc;
2898   int tag = KMP_EXTRACT_D_TAG(user_lock);
2899 #if USE_ITT_BUILD
2900   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2901 #endif
2902 #if OMPT_SUPPORT && OMPT_OPTIONAL
2903   // This is the case, if called from omp_init_lock_with_hint:
2904   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2905   if (!codeptr)
2906     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2907   if (ompt_enabled.ompt_callback_mutex_acquire) {
2908     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2909         ompt_mutex_lock, omp_lock_hint_none,
2910         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2911         codeptr);
2912   }
2913 #endif
2914 #if KMP_USE_INLINED_TAS
2915   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2916     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2917   } else
2918 #elif KMP_USE_INLINED_FUTEX
2919   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2920     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2921   } else
2922 #endif
2923   {
2924     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2925   }
2926   if (rc) {
2927 #if USE_ITT_BUILD
2928     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2929 #endif
2930 #if OMPT_SUPPORT && OMPT_OPTIONAL
2931     if (ompt_enabled.ompt_callback_mutex_acquired) {
2932       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2933           ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2934     }
2935 #endif
2936     return FTN_TRUE;
2937   } else {
2938 #if USE_ITT_BUILD
2939     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2940 #endif
2941     return FTN_FALSE;
2942   }
2943 
2944 #else // KMP_USE_DYNAMIC_LOCK
2945 
2946   kmp_user_lock_p lck;
2947   int rc;
2948 
2949   if ((__kmp_user_lock_kind == lk_tas) &&
2950       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2951     lck = (kmp_user_lock_p)user_lock;
2952   }
2953 #if KMP_USE_FUTEX
2954   else if ((__kmp_user_lock_kind == lk_futex) &&
2955            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2956     lck = (kmp_user_lock_p)user_lock;
2957   }
2958 #endif
2959   else {
2960     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
2961   }
2962 
2963 #if USE_ITT_BUILD
2964   __kmp_itt_lock_acquiring(lck);
2965 #endif /* USE_ITT_BUILD */
2966 #if OMPT_SUPPORT && OMPT_OPTIONAL
2967   // This is the case, if called from omp_init_lock_with_hint:
2968   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2969   if (!codeptr)
2970     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2971   if (ompt_enabled.ompt_callback_mutex_acquire) {
2972     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2973         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2974         (omp_wait_id_t)lck, codeptr);
2975   }
2976 #endif
2977 
2978   rc = TEST_LOCK(lck, gtid);
2979 #if USE_ITT_BUILD
2980   if (rc) {
2981     __kmp_itt_lock_acquired(lck);
2982   } else {
2983     __kmp_itt_lock_cancelled(lck);
2984   }
2985 #endif /* USE_ITT_BUILD */
2986 #if OMPT_SUPPORT && OMPT_OPTIONAL
2987   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
2988     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2989         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2990   }
2991 #endif
2992 
2993   return (rc ? FTN_TRUE : FTN_FALSE);
2994 
2995 /* Can't use serial interval since not block structured */
2996 
2997 #endif // KMP_USE_DYNAMIC_LOCK
2998 }
2999 
3000 /* try to acquire the lock */
3001 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3002 #if KMP_USE_DYNAMIC_LOCK
3003   int rc;
3004 #if USE_ITT_BUILD
3005   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3006 #endif
3007 #if OMPT_SUPPORT && OMPT_OPTIONAL
3008   // This is the case, if called from omp_init_lock_with_hint:
3009   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3010   if (!codeptr)
3011     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3012   if (ompt_enabled.ompt_callback_mutex_acquire) {
3013     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3014         ompt_mutex_nest_lock, omp_lock_hint_none,
3015         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
3016         codeptr);
3017   }
3018 #endif
3019   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3020 #if USE_ITT_BUILD
3021   if (rc) {
3022     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3023   } else {
3024     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3025   }
3026 #endif
3027 #if OMPT_SUPPORT && OMPT_OPTIONAL
3028   if (ompt_enabled.enabled && rc) {
3029     if (rc == 1) {
3030       if (ompt_enabled.ompt_callback_mutex_acquired) {
3031         // lock_first
3032         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3033             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
3034       }
3035     } else {
3036       if (ompt_enabled.ompt_callback_nest_lock) {
3037         // lock_next
3038         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3039             ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
3040       }
3041     }
3042   }
3043 #endif
3044   return rc;
3045 
3046 #else // KMP_USE_DYNAMIC_LOCK
3047 
3048   kmp_user_lock_p lck;
3049   int rc;
3050 
3051   if ((__kmp_user_lock_kind == lk_tas) &&
3052       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3053        OMP_NEST_LOCK_T_SIZE)) {
3054     lck = (kmp_user_lock_p)user_lock;
3055   }
3056 #if KMP_USE_FUTEX
3057   else if ((__kmp_user_lock_kind == lk_futex) &&
3058            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3059             OMP_NEST_LOCK_T_SIZE)) {
3060     lck = (kmp_user_lock_p)user_lock;
3061   }
3062 #endif
3063   else {
3064     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3065   }
3066 
3067 #if USE_ITT_BUILD
3068   __kmp_itt_lock_acquiring(lck);
3069 #endif /* USE_ITT_BUILD */
3070 
3071 #if OMPT_SUPPORT && OMPT_OPTIONAL
3072   // This is the case, if called from omp_init_lock_with_hint:
3073   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3074   if (!codeptr)
3075     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3076   if (ompt_enabled.enabled &&
3077       ompt_enabled.ompt_callback_mutex_acquire) {
3078     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3079         ompt_mutex_nest_lock, omp_lock_hint_none,
3080         __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
3081   }
3082 #endif
3083 
3084   rc = TEST_NESTED_LOCK(lck, gtid);
3085 #if USE_ITT_BUILD
3086   if (rc) {
3087     __kmp_itt_lock_acquired(lck);
3088   } else {
3089     __kmp_itt_lock_cancelled(lck);
3090   }
3091 #endif /* USE_ITT_BUILD */
3092 #if OMPT_SUPPORT && OMPT_OPTIONAL
3093   if (ompt_enabled.enabled && rc) {
3094     if (rc == 1) {
3095       if (ompt_enabled.ompt_callback_mutex_acquired) {
3096         // lock_first
3097         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3098             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
3099       }
3100     } else {
3101       if (ompt_enabled.ompt_callback_nest_lock) {
3102         // lock_next
3103         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3104             ompt_scope_begin, (omp_wait_id_t)lck, codeptr);
3105       }
3106     }
3107   }
3108 #endif
3109   return rc;
3110 
3111 /* Can't use serial interval since not block structured */
3112 
3113 #endif // KMP_USE_DYNAMIC_LOCK
3114 }
3115 
3116 // Interface to fast scalable reduce methods routines
3117 
3118 // keep the selected method in a thread local structure for cross-function
3119 // usage: will be used in __kmpc_end_reduce* functions;
3120 // another solution: to re-determine the method one more time in
3121 // __kmpc_end_reduce* functions (new prototype required then)
3122 // AT: which solution is better?
3123 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3124   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3125 
3126 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3127   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3128 
3129 // description of the packed_reduction_method variable: look at the macros in
3130 // kmp.h
3131 
3132 // used in a critical section reduce block
3133 static __forceinline void
3134 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3135                                           kmp_critical_name *crit) {
3136 
3137   // this lock was visible to a customer and to the threading profile tool as a
3138   // serial overhead span (although it's used for an internal purpose only)
3139   //            why was it visible in previous implementation?
3140   //            should we keep it visible in new reduce block?
3141   kmp_user_lock_p lck;
3142 
3143 #if KMP_USE_DYNAMIC_LOCK
3144 
3145   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3146   // Check if it is initialized.
3147   if (*lk == 0) {
3148     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3149       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3150                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3151     } else {
3152       __kmp_init_indirect_csptr(crit, loc, global_tid,
3153                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3154     }
3155   }
3156   // Branch for accessing the actual lock object and set operation. This
3157   // branching is inevitable since this lock initialization does not follow the
3158   // normal dispatch path (lock table is not used).
3159   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3160     lck = (kmp_user_lock_p)lk;
3161     KMP_DEBUG_ASSERT(lck != NULL);
3162     if (__kmp_env_consistency_check) {
3163       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3164     }
3165     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3166   } else {
3167     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3168     lck = ilk->lock;
3169     KMP_DEBUG_ASSERT(lck != NULL);
3170     if (__kmp_env_consistency_check) {
3171       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3172     }
3173     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3174   }
3175 
3176 #else // KMP_USE_DYNAMIC_LOCK
3177 
3178   // We know that the fast reduction code is only emitted by Intel compilers
3179   // with 32 byte critical sections. If there isn't enough space, then we
3180   // have to use a pointer.
3181   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3182     lck = (kmp_user_lock_p)crit;
3183   } else {
3184     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3185   }
3186   KMP_DEBUG_ASSERT(lck != NULL);
3187 
3188   if (__kmp_env_consistency_check)
3189     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3190 
3191   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3192 
3193 #endif // KMP_USE_DYNAMIC_LOCK
3194 }
3195 
3196 // used in a critical section reduce block
3197 static __forceinline void
3198 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3199                                         kmp_critical_name *crit) {
3200 
3201   kmp_user_lock_p lck;
3202 
3203 #if KMP_USE_DYNAMIC_LOCK
3204 
3205   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3206     lck = (kmp_user_lock_p)crit;
3207     if (__kmp_env_consistency_check)
3208       __kmp_pop_sync(global_tid, ct_critical, loc);
3209     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3210   } else {
3211     kmp_indirect_lock_t *ilk =
3212         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3213     if (__kmp_env_consistency_check)
3214       __kmp_pop_sync(global_tid, ct_critical, loc);
3215     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3216   }
3217 
3218 #else // KMP_USE_DYNAMIC_LOCK
3219 
3220   // We know that the fast reduction code is only emitted by Intel compilers
3221   // with 32 byte critical sections. If there isn't enough space, then we have
3222   // to use a pointer.
3223   if (__kmp_base_user_lock_size > 32) {
3224     lck = *((kmp_user_lock_p *)crit);
3225     KMP_ASSERT(lck != NULL);
3226   } else {
3227     lck = (kmp_user_lock_p)crit;
3228   }
3229 
3230   if (__kmp_env_consistency_check)
3231     __kmp_pop_sync(global_tid, ct_critical, loc);
3232 
3233   __kmp_release_user_lock_with_checks(lck, global_tid);
3234 
3235 #endif // KMP_USE_DYNAMIC_LOCK
3236 } // __kmp_end_critical_section_reduce_block
3237 
3238 #if OMP_40_ENABLED
3239 static __forceinline int
3240 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3241                                      int *task_state) {
3242   kmp_team_t *team;
3243 
3244   // Check whether we are inside a teams construct.
3245   if (th->th.th_teams_microtask) {
3246     *team_p = team = th->th.th_team;
3247     if (team->t.t_level == th->th.th_teams_level) {
3248       // This is reduction at teams construct.
3249       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3250       // Let's swap teams temporarily for the reduction.
3251       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3252       th->th.th_team = team->t.t_parent;
3253       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3254       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3255       *task_state = th->th.th_task_state;
3256       th->th.th_task_state = 0;
3257 
3258       return 1;
3259     }
3260   }
3261   return 0;
3262 }
3263 
3264 static __forceinline void
3265 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3266   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3267   th->th.th_info.ds.ds_tid = 0;
3268   th->th.th_team = team;
3269   th->th.th_team_nproc = team->t.t_nproc;
3270   th->th.th_task_team = team->t.t_task_team[task_state];
3271   th->th.th_task_state = task_state;
3272 }
3273 #endif
3274 
3275 /* 2.a.i. Reduce Block without a terminating barrier */
3276 /*!
3277 @ingroup SYNCHRONIZATION
3278 @param loc source location information
3279 @param global_tid global thread number
3280 @param num_vars number of items (variables) to be reduced
3281 @param reduce_size size of data in bytes to be reduced
3282 @param reduce_data pointer to data to be reduced
3283 @param reduce_func callback function providing reduction operation on two
3284 operands and returning result of reduction in lhs_data
3285 @param lck pointer to the unique lock data structure
3286 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3287 threads if atomic reduction needed
3288 
3289 The nowait version is used for a reduce clause with the nowait argument.
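
As an illustrative sketch only (not the exact code any particular compiler
emits), a <tt>reduction(+:sum)</tt> combined with <tt>nowait</tt> could be
lowered along the following lines; <tt>loc</tt>, <tt>gtid</tt>, <tt>sum</tt>,
<tt>priv_sum</tt>, <tt>reduce_func</tt> and <tt>crit</tt> are illustrative
names, not part of this interface.
@code
static void reduce_func(void *lhs_data, void *rhs_data) {
  *(int *)lhs_data += *(int *)rhs_data; // combine one partial into lhs_data
}
static kmp_critical_name crit; // zero-initialized unique lock data

// executed by every thread of the team; priv_sum is the thread-private copy
switch (__kmpc_reduce_nowait(&loc, gtid, 1, sizeof(int), &priv_sum,
                             reduce_func, &crit)) {
case 1: // combine the (possibly already tree-combined) partial result
  sum += priv_sum;
  __kmpc_end_reduce_nowait(&loc, gtid, &crit);
  break;
case 2: // atomic method: each thread updates the shared variable atomically
  // (an atomic RMW of sum += priv_sum goes here, e.g. a __kmpc_atomic_* call)
  break;
default: // 0: this thread's contribution has already been consumed
  break;
}
@endcode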
3290 */
3291 kmp_int32
3292 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3293                      size_t reduce_size, void *reduce_data,
3294                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3295                      kmp_critical_name *lck) {
3296 
3297   KMP_COUNT_BLOCK(REDUCE_nowait);
3298   int retval = 0;
3299   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3300 #if OMP_40_ENABLED
3301   kmp_info_t *th;
3302   kmp_team_t *team;
3303   int teams_swapped = 0, task_state;
3304 #endif
3305   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3306 
3307   // why do we need this initialization here at all?
3308   // Reduction clause can not be used as a stand-alone directive.
3309 
3310   // do not call __kmp_serial_initialize(), it will be called by
3311   // __kmp_parallel_initialize() if needed
3312   // possible detection of false-positive race by the threadchecker ???
3313   if (!TCR_4(__kmp_init_parallel))
3314     __kmp_parallel_initialize();
3315 
3316 // check correctness of reduce block nesting
3317 #if KMP_USE_DYNAMIC_LOCK
3318   if (__kmp_env_consistency_check)
3319     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3320 #else
3321   if (__kmp_env_consistency_check)
3322     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3323 #endif
3324 
3325 #if OMP_40_ENABLED
3326   th = __kmp_thread_from_gtid(global_tid);
3327   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3328 #endif // OMP_40_ENABLED
3329 
3330   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3331   // the value should be kept in a variable
3332   // the variable should be either a construct-specific or thread-specific
3333   // property, not a team specific property
3334   //     (a thread can reach the next reduce block on the next construct, reduce
3335   //     method may differ on the next construct)
3336   // an ident_t "loc" parameter could be used as a construct-specific property
3337   // (what if loc == 0?)
3338   //     (if both construct-specific and team-specific variables were shared,
3339   //     then unnecessary extra syncs would be needed)
3340   // a thread-specific variable is better regarding two issues above (next
3341   // construct and extra syncs)
3342   // a thread-specific "th_local.reduction_method" variable is used currently
3343   // each thread executes 'determine' and 'set' lines (no need to execute by one
3344   // thread, to avoid unnecessary extra syncs)
3345 
3346   packed_reduction_method = __kmp_determine_reduction_method(
3347       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3348   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3349 
3350   if (packed_reduction_method == critical_reduce_block) {
3351 
3352     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3353     retval = 1;
3354 
3355   } else if (packed_reduction_method == empty_reduce_block) {
3356 
3357     // usage: if team size == 1, no synchronization is required ( Intel
3358     // platforms only )
3359     retval = 1;
3360 
3361   } else if (packed_reduction_method == atomic_reduce_block) {
3362 
3363     retval = 2;
3364 
3365     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3366     // won't be called by the code gen)
3367     //     (this is not ideal: the checking block has already been closed
3368     //      by this 'pop', even though the atomic operation has not been
3369     //      executed yet; it will execute slightly later, literally on the
3370     //      next instruction)
3371     if (__kmp_env_consistency_check)
3372       __kmp_pop_sync(global_tid, ct_reduce, loc);
3373 
3374   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3375                                    tree_reduce_block)) {
3376 
3377 // AT: performance issue: a real barrier here
3378 // AT:     (if master goes slow, other threads are blocked here waiting for the
3379 // master to come and release them)
3380 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3381 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3382 // be confusing to a customer)
3383 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3384 // might go faster and be more in line with sense of NOWAIT
3385 // AT: TO DO: do epcc test and compare times
3386 
3387 // this barrier should be invisible to a customer and to the threading profile
3388 // tool (it's neither a terminating barrier nor customer's code, it's
3389 // used for an internal purpose)
3390 #if OMPT_SUPPORT
3391     // JP: can this barrier potentially lead to task scheduling?
3392     // JP: as long as there is a barrier in the implementation, OMPT should and
3393     // will provide the barrier events
3394     //         so we set-up the necessary frame/return addresses.
3395     omp_frame_t *ompt_frame;
3396     if (ompt_enabled.enabled) {
3397       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3398       if (ompt_frame->enter_frame == NULL)
3399         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3400       OMPT_STORE_RETURN_ADDRESS(global_tid);
3401     }
3402 #endif
3403 #if USE_ITT_NOTIFY
3404     __kmp_threads[global_tid]->th.th_ident = loc;
3405 #endif
3406     retval =
3407         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3408                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3409     retval = (retval != 0) ? (0) : (1);
3410 #if OMPT_SUPPORT && OMPT_OPTIONAL
3411     if (ompt_enabled.enabled) {
3412       ompt_frame->enter_frame = NULL;
3413     }
3414 #endif
3415 
3416     // all other workers except master should do this pop here
3417     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3418     if (__kmp_env_consistency_check) {
3419       if (retval == 0) {
3420         __kmp_pop_sync(global_tid, ct_reduce, loc);
3421       }
3422     }
3423 
3424   } else {
3425 
3426     // should never reach this block
3427     KMP_ASSERT(0); // "unexpected method"
3428   }
3429 #if OMP_40_ENABLED
3430   if (teams_swapped) {
3431     __kmp_restore_swapped_teams(th, team, task_state);
3432   }
3433 #endif
3434   KA_TRACE(
3435       10,
3436       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3437        global_tid, packed_reduction_method, retval));
3438 
3439   return retval;
3440 }
3441 
3442 /*!
3443 @ingroup SYNCHRONIZATION
3444 @param loc source location information
3445 @param global_tid global thread id.
3446 @param lck pointer to the unique lock data structure
3447 
3448 Finish the execution of a reduce nowait.
3449 */
3450 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3451                               kmp_critical_name *lck) {
3452 
3453   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3454 
3455   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3456 
3457   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3458 
3459   if (packed_reduction_method == critical_reduce_block) {
3460 
3461     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3462 
3463   } else if (packed_reduction_method == empty_reduce_block) {
3464 
3465     // usage: if team size == 1, no synchronization is required ( on Intel
3466     // platforms only )
3467 
3468   } else if (packed_reduction_method == atomic_reduce_block) {
3469 
3470     // neither master nor other workers should get here
3471     //     (code gen does not generate this call in case 2: atomic reduce block)
3472     // it would actually be better to remove this else-if entirely; after
3473     // removal this value would be caught by the 'else' branch and assert
3474 
3475   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3476                                    tree_reduce_block)) {
3477 
3478     // only master gets here
3479 
3480   } else {
3481 
3482     // should never reach this block
3483     KMP_ASSERT(0); // "unexpected method"
3484   }
3485 
3486   if (__kmp_env_consistency_check)
3487     __kmp_pop_sync(global_tid, ct_reduce, loc);
3488 
3489   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3490                 global_tid, packed_reduction_method));
3491 
3492   return;
3493 }
3494 
3495 /* 2.a.ii. Reduce Block with a terminating barrier */
3496 
3497 /*!
3498 @ingroup SYNCHRONIZATION
3499 @param loc source location information
3500 @param global_tid global thread number
3501 @param num_vars number of items (variables) to be reduced
3502 @param reduce_size size of data in bytes to be reduced
3503 @param reduce_data pointer to data to be reduced
3504 @param reduce_func callback function providing reduction operation on two
3505 operands and returning result of reduction in lhs_data
3506 @param lck pointer to the unique lock data structure
3507 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3508 threads if atomic reduction needed
3509 
3510 A blocking reduce that includes an implicit barrier.
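
The expected code generation pattern is the same as for
__kmpc_reduce_nowait(), except that threads which get 1 or 2 back are also
expected to call __kmpc_end_reduce() afterwards, since for those methods the
terminating barrier is executed inside that call. A sketch under the same
illustrative naming assumptions as above:
@code
switch (__kmpc_reduce(&loc, gtid, 1, sizeof(int), &priv_sum, reduce_func,
                      &crit)) {
case 1: // combine the partial result, then finish (barrier inside)
  sum += priv_sum;
  __kmpc_end_reduce(&loc, gtid, &crit);
  break;
case 2: // atomic update of the shared variable, then finish (barrier inside)
  // (atomic RMW of sum += priv_sum goes here)
  __kmpc_end_reduce(&loc, gtid, &crit);
  break;
default: // 0: the barrier inside __kmpc_reduce() already completed the work
  break;
}
@endcode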
3511 */
3512 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3513                         size_t reduce_size, void *reduce_data,
3514                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3515                         kmp_critical_name *lck) {
3516   KMP_COUNT_BLOCK(REDUCE_wait);
3517   int retval = 0;
3518   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3519 #if OMP_40_ENABLED
3520   kmp_info_t *th;
3521   kmp_team_t *team;
3522   int teams_swapped = 0, task_state;
3523 #endif
3524 
3525   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3526 
3527   // why do we need this initialization here at all?
3528   // Reduction clause can not be a stand-alone directive.
3529 
3530   // do not call __kmp_serial_initialize(), it will be called by
3531   // __kmp_parallel_initialize() if needed
3532   // possible detection of false-positive race by the threadchecker ???
3533   if (!TCR_4(__kmp_init_parallel))
3534     __kmp_parallel_initialize();
3535 
3536 // check correctness of reduce block nesting
3537 #if KMP_USE_DYNAMIC_LOCK
3538   if (__kmp_env_consistency_check)
3539     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3540 #else
3541   if (__kmp_env_consistency_check)
3542     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3543 #endif
3544 
3545 #if OMP_40_ENABLED
3546   th = __kmp_thread_from_gtid(global_tid);
3547   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3548 #endif // OMP_40_ENABLED
3549 
3550   packed_reduction_method = __kmp_determine_reduction_method(
3551       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3552   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3553 
3554   if (packed_reduction_method == critical_reduce_block) {
3555 
3556     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3557     retval = 1;
3558 
3559   } else if (packed_reduction_method == empty_reduce_block) {
3560 
3561     // usage: if team size == 1, no synchronization is required ( Intel
3562     // platforms only )
3563     retval = 1;
3564 
3565   } else if (packed_reduction_method == atomic_reduce_block) {
3566 
3567     retval = 2;
3568 
3569   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3570                                    tree_reduce_block)) {
3571 
3572 // case tree_reduce_block:
3573 // this barrier should be visible to a customer and to the threading profile
3574 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3575 #if OMPT_SUPPORT
3576     omp_frame_t *ompt_frame;
3577     if (ompt_enabled.enabled) {
3578       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3579       if (ompt_frame->enter_frame == NULL)
3580         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3581       OMPT_STORE_RETURN_ADDRESS(global_tid);
3582     }
3583 #endif
3584 #if USE_ITT_NOTIFY
3585     __kmp_threads[global_tid]->th.th_ident =
3586         loc; // needed for correct notification of frames
3587 #endif
3588     retval =
3589         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3590                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3591     retval = (retval != 0) ? (0) : (1);
3592 #if OMPT_SUPPORT && OMPT_OPTIONAL
3593     if (ompt_enabled.enabled) {
3594       ompt_frame->enter_frame = NULL;
3595     }
3596 #endif
3597 
3598     // all other workers except master should do this pop here
3599     // ( none of other workers except master will enter __kmpc_end_reduce() )
3600     if (__kmp_env_consistency_check) {
3601       if (retval == 0) { // 0: all other workers; 1: master
3602         __kmp_pop_sync(global_tid, ct_reduce, loc);
3603       }
3604     }
3605 
3606   } else {
3607 
3608     // should never reach this block
3609     KMP_ASSERT(0); // "unexpected method"
3610   }
3611 #if OMP_40_ENABLED
3612   if (teams_swapped) {
3613     __kmp_restore_swapped_teams(th, team, task_state);
3614   }
3615 #endif
3616 
3617   KA_TRACE(10,
3618            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3619             global_tid, packed_reduction_method, retval));
3620 
3621   return retval;
3622 }
3623 
3624 /*!
3625 @ingroup SYNCHRONIZATION
3626 @param loc source location information
3627 @param global_tid global thread id.
3628 @param lck pointer to the unique lock data structure
3629 
3630 Finish the execution of a blocking reduce.
3631 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3632 start function.
3633 */
3634 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3635                        kmp_critical_name *lck) {
3636 
3637   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3638 #if OMP_40_ENABLED
3639   kmp_info_t *th;
3640   kmp_team_t *team;
3641   int teams_swapped = 0, task_state;
3642 #endif
3643 
3644   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3645 
3646 #if OMP_40_ENABLED
3647   th = __kmp_thread_from_gtid(global_tid);
3648   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3649 #endif // OMP_40_ENABLED
3650 
3651   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3652 
3653   // this barrier should be visible to a customer and to the threading profile
3654   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3655 
3656   if (packed_reduction_method == critical_reduce_block) {
3657 
3658     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3659 
3660 // TODO: implicit barrier: should be exposed
3661 #if OMPT_SUPPORT
3662     omp_frame_t *ompt_frame;
3663     if (ompt_enabled.enabled) {
3664       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3665       if (ompt_frame->enter_frame == NULL)
3666         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3667       OMPT_STORE_RETURN_ADDRESS(global_tid);
3668     }
3669 #endif
3670 #if USE_ITT_NOTIFY
3671     __kmp_threads[global_tid]->th.th_ident = loc;
3672 #endif
3673     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3674 #if OMPT_SUPPORT && OMPT_OPTIONAL
3675     if (ompt_enabled.enabled) {
3676       ompt_frame->enter_frame = NULL;
3677     }
3678 #endif
3679 
3680   } else if (packed_reduction_method == empty_reduce_block) {
3681 
3682 // usage: if team size==1, no synchronization is required (Intel platforms only)
3683 
3684 // TODO: implicit barrier: should be exposed
3685 #if OMPT_SUPPORT
3686     omp_frame_t *ompt_frame;
3687     if (ompt_enabled.enabled) {
3688       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3689       if (ompt_frame->enter_frame == NULL)
3690         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3691       OMPT_STORE_RETURN_ADDRESS(global_tid);
3692     }
3693 #endif
3694 #if USE_ITT_NOTIFY
3695     __kmp_threads[global_tid]->th.th_ident = loc;
3696 #endif
3697     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3698 #if OMPT_SUPPORT && OMPT_OPTIONAL
3699     if (ompt_enabled.enabled) {
3700       ompt_frame->enter_frame = NULL;
3701     }
3702 #endif
3703 
3704   } else if (packed_reduction_method == atomic_reduce_block) {
3705 
3706 #if OMPT_SUPPORT
3707     omp_frame_t *ompt_frame;
3708     if (ompt_enabled.enabled) {
3709       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3710       if (ompt_frame->enter_frame == NULL)
3711         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3712       OMPT_STORE_RETURN_ADDRESS(global_tid);
3713     }
3714 #endif
3715 // TODO: implicit barrier: should be exposed
3716 #if USE_ITT_NOTIFY
3717     __kmp_threads[global_tid]->th.th_ident = loc;
3718 #endif
3719     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3720 #if OMPT_SUPPORT && OMPT_OPTIONAL
3721     if (ompt_enabled.enabled) {
3722       ompt_frame->enter_frame = NULL;
3723     }
3724 #endif
3725 
3726   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3727                                    tree_reduce_block)) {
3728 
3729     // only master executes here (master releases all other workers)
3730     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3731                             global_tid);
3732 
3733   } else {
3734 
3735     // should never reach this block
3736     KMP_ASSERT(0); // "unexpected method"
3737   }
3738 #if OMP_40_ENABLED
3739   if (teams_swapped) {
3740     __kmp_restore_swapped_teams(th, team, task_state);
3741   }
3742 #endif
3743 
3744   if (__kmp_env_consistency_check)
3745     __kmp_pop_sync(global_tid, ct_reduce, loc);
3746 
3747   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3748                 global_tid, packed_reduction_method));
3749 
3750   return;
3751 }
3752 
3753 #undef __KMP_GET_REDUCTION_METHOD
3754 #undef __KMP_SET_REDUCTION_METHOD
3755 
3756 /* end of interface to fast scalable reduce routines */
3757 
3758 kmp_uint64 __kmpc_get_taskid() {
3759 
3760   kmp_int32 gtid;
3761   kmp_info_t *thread;
3762 
3763   gtid = __kmp_get_gtid();
3764   if (gtid < 0) {
3765     return 0;
3766   }
3767   thread = __kmp_thread_from_gtid(gtid);
3768   return thread->th.th_current_task->td_task_id;
3769 
3770 } // __kmpc_get_taskid
3771 
3772 kmp_uint64 __kmpc_get_parent_taskid() {
3773 
3774   kmp_int32 gtid;
3775   kmp_info_t *thread;
3776   kmp_taskdata_t *parent_task;
3777 
3778   gtid = __kmp_get_gtid();
3779   if (gtid < 0) {
3780     return 0;
3781   }
3782   thread = __kmp_thread_from_gtid(gtid);
3783   parent_task = thread->th.th_current_task->td_parent;
3784   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3785 
3786 } // __kmpc_get_parent_taskid
3787 
3788 #if OMP_45_ENABLED
3789 /*!
3790 @ingroup WORK_SHARING
3791 @param loc  source location information.
3792 @param gtid  global thread number.
3793 @param num_dims  number of associated doacross loops.
3794 @param dims  info on loops bounds.
3795 
3796 Initialize doacross loop information.
3797 The compiler is expected to send us inclusive bounds,
3798 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
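
As an illustrative sketch only, a one-dimensional <tt>ordered(1)</tt> loop
with <tt>depend(sink)</tt>/<tt>depend(source)</tt> clauses might drive this
interface as shown below; <tt>loc</tt>, <tt>gtid</tt> and the per-thread
chunk bounds <tt>lb</tt>/<tt>ub</tt> are illustrative names assumed to be
provided by the surrounding generated code.
@code
struct kmp_dim dims[1];
dims[0].lo = 2; // inclusive lower bound
dims[0].up = 8; // inclusive upper bound
dims[0].st = 2; // stride
__kmpc_doacross_init(&loc, gtid, 1, dims);
for (kmp_int64 i = lb; i <= ub; i += 2) {
  kmp_int64 vec[1];
  vec[0] = i - 2; // depend(sink: i - 2); out-of-range sinks are ignored
  __kmpc_doacross_wait(&loc, gtid, vec);
  // ... ordered loop body ...
  vec[0] = i; // depend(source)
  __kmpc_doacross_post(&loc, gtid, vec);
}
__kmpc_doacross_fini(&loc, gtid);
@endcode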
3799 */
3800 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3801                           const struct kmp_dim *dims) {
3802   int j, idx;
3803   kmp_int64 last, trace_count;
3804   kmp_info_t *th = __kmp_threads[gtid];
3805   kmp_team_t *team = th->th.th_team;
3806   kmp_uint32 *flags;
3807   kmp_disp_t *pr_buf = th->th.th_dispatch;
3808   dispatch_shared_info_t *sh_buf;
3809 
3810   KA_TRACE(
3811       20,
3812       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3813        gtid, num_dims, !team->t.t_serialized));
3814   KMP_DEBUG_ASSERT(dims != NULL);
3815   KMP_DEBUG_ASSERT(num_dims > 0);
3816 
3817   if (team->t.t_serialized) {
3818     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3819     return; // no dependencies if team is serialized
3820   }
3821   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3822   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3823   // the next loop
3824   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3825 
3826   // Save bounds info into allocated private buffer
3827   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3828   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3829       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3830   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3831   pr_buf->th_doacross_info[0] =
3832       (kmp_int64)num_dims; // first element is number of dimensions
3833   // Save also address of num_done in order to access it later without knowing
3834   // the buffer index
3835   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3836   pr_buf->th_doacross_info[2] = dims[0].lo;
3837   pr_buf->th_doacross_info[3] = dims[0].up;
3838   pr_buf->th_doacross_info[4] = dims[0].st;
3839   last = 5;
3840   for (j = 1; j < num_dims; ++j) {
3841     kmp_int64
3842         range_length; // To keep ranges of all dimensions but the first dims[0]
3843     if (dims[j].st == 1) { // most common case
3844       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3845       range_length = dims[j].up - dims[j].lo + 1;
3846     } else {
3847       if (dims[j].st > 0) {
3848         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3849         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3850       } else { // negative increment
3851         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3852         range_length =
3853             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3854       }
3855     }
3856     pr_buf->th_doacross_info[last++] = range_length;
3857     pr_buf->th_doacross_info[last++] = dims[j].lo;
3858     pr_buf->th_doacross_info[last++] = dims[j].up;
3859     pr_buf->th_doacross_info[last++] = dims[j].st;
3860   }
3861 
3862   // Compute total trip count.
3863   // Start with range of dims[0] which we don't need to keep in the buffer.
3864   if (dims[0].st == 1) { // most common case
3865     trace_count = dims[0].up - dims[0].lo + 1;
3866   } else if (dims[0].st > 0) {
3867     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3868     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3869   } else { // negative increment
3870     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3871     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3872   }
3873   for (j = 1; j < num_dims; ++j) {
3874     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3875   }
3876   KMP_DEBUG_ASSERT(trace_count > 0);
3877 
3878   // Check if shared buffer is not occupied by other loop (idx -
3879   // __kmp_dispatch_num_buffers)
3880   if (idx != sh_buf->doacross_buf_idx) {
3881     // Shared buffer is occupied, wait for it to be free
3882     __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3883                        __kmp_eq_4, NULL);
3884   }
3885 #if KMP_32_BIT_ARCH
3886   // Check if we are the first thread. After the CAS the first thread gets 0,
3887   // others get 1 if initialization is in progress, allocated pointer otherwise.
3888   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3889   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3890       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3891 #else
3892   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3893       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3894 #endif
3895   if (flags == NULL) {
3896     // we are the first thread, allocate the array of flags
3897     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3898     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3899     KMP_MB();
3900     sh_buf->doacross_flags = flags;
3901   } else if (flags == (kmp_uint32 *)1) {
3902 #if KMP_32_BIT_ARCH
3903     // initialization is still in progress, need to wait
3904     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3905 #else
3906     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3907 #endif
3908       KMP_YIELD(TRUE);
3909     KMP_MB();
3910   } else {
3911     KMP_MB();
3912   }
3913   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3914   pr_buf->th_doacross_flags =
3915       sh_buf->doacross_flags; // save private copy in order to not
3916   // touch shared buffer on each iteration
3917   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3918 }
3919 
3920 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3921   kmp_int32 shft, num_dims, i;
3922   kmp_uint32 flag;
3923   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3924   kmp_info_t *th = __kmp_threads[gtid];
3925   kmp_team_t *team = th->th.th_team;
3926   kmp_disp_t *pr_buf;
3927   kmp_int64 lo, up, st;
3928 
3929   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3930   if (team->t.t_serialized) {
3931     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3932     return; // no dependencies if team is serialized
3933   }
3934 
3935   // calculate sequential iteration number and check out-of-bounds condition
3936   pr_buf = th->th.th_dispatch;
3937   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3938   num_dims = pr_buf->th_doacross_info[0];
3939   lo = pr_buf->th_doacross_info[2];
3940   up = pr_buf->th_doacross_info[3];
3941   st = pr_buf->th_doacross_info[4];
3942   if (st == 1) { // most common case
3943     if (vec[0] < lo || vec[0] > up) {
3944       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3945                     "bounds [%lld,%lld]\n",
3946                     gtid, vec[0], lo, up));
3947       return;
3948     }
3949     iter_number = vec[0] - lo;
3950   } else if (st > 0) {
3951     if (vec[0] < lo || vec[0] > up) {
3952       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3953                     "bounds [%lld,%lld]\n",
3954                     gtid, vec[0], lo, up));
3955       return;
3956     }
3957     iter_number = (kmp_uint64)(vec[0] - lo) / st;
3958   } else { // negative increment
3959     if (vec[0] > lo || vec[0] < up) {
3960       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3961                     "bounds [%lld,%lld]\n",
3962                     gtid, vec[0], lo, up));
3963       return;
3964     }
3965     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3966   }
3967   for (i = 1; i < num_dims; ++i) {
3968     kmp_int64 iter, ln;
3969     kmp_int32 j = i * 4;
3970     ln = pr_buf->th_doacross_info[j + 1];
3971     lo = pr_buf->th_doacross_info[j + 2];
3972     up = pr_buf->th_doacross_info[j + 3];
3973     st = pr_buf->th_doacross_info[j + 4];
3974     if (st == 1) {
3975       if (vec[i] < lo || vec[i] > up) {
3976         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3977                       "bounds [%lld,%lld]\n",
3978                       gtid, vec[i], lo, up));
3979         return;
3980       }
3981       iter = vec[i] - lo;
3982     } else if (st > 0) {
3983       if (vec[i] < lo || vec[i] > up) {
3984         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3985                       "bounds [%lld,%lld]\n",
3986                       gtid, vec[i], lo, up));
3987         return;
3988       }
3989       iter = (kmp_uint64)(vec[i] - lo) / st;
3990     } else { // st < 0
3991       if (vec[i] > lo || vec[i] < up) {
3992         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3993                       "bounds [%lld,%lld]\n",
3994                       gtid, vec[i], lo, up));
3995         return;
3996       }
3997       iter = (kmp_uint64)(lo - vec[i]) / (-st);
3998     }
3999     iter_number = iter + ln * iter_number;
4000   }
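  // iter_number now holds the row-major linearized iteration index of the
  // "collapsed" loop nest. One completion bit per iteration is kept in
  // th_doacross_flags, so split the index into a 32-bit word index and a bit
  // position within that word.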
4001   shft = iter_number % 32; // use 32-bit granularity
4002   iter_number >>= 5; // divided by 32
4003   flag = 1 << shft;
4004   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4005     KMP_YIELD(TRUE);
4006   }
4007   KMP_MB();
4008   KA_TRACE(20,
4009            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4010             gtid, (iter_number << 5) + shft));
4011 }
4012 
4013 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4014   kmp_int32 shft, num_dims, i;
4015   kmp_uint32 flag;
4016   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4017   kmp_info_t *th = __kmp_threads[gtid];
4018   kmp_team_t *team = th->th.th_team;
4019   kmp_disp_t *pr_buf;
4020   kmp_int64 lo, st;
4021 
4022   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4023   if (team->t.t_serialized) {
4024     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4025     return; // no dependencies if team is serialized
4026   }
4027 
4028   // calculate sequential iteration number (same as in "wait" but no
4029   // out-of-bounds checks)
4030   pr_buf = th->th.th_dispatch;
4031   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4032   num_dims = pr_buf->th_doacross_info[0];
4033   lo = pr_buf->th_doacross_info[2];
4034   st = pr_buf->th_doacross_info[4];
4035   if (st == 1) { // most common case
4036     iter_number = vec[0] - lo;
4037   } else if (st > 0) {
4038     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4039   } else { // negative increment
4040     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4041   }
4042   for (i = 1; i < num_dims; ++i) {
4043     kmp_int64 iter, ln;
4044     kmp_int32 j = i * 4;
4045     ln = pr_buf->th_doacross_info[j + 1];
4046     lo = pr_buf->th_doacross_info[j + 2];
4047     st = pr_buf->th_doacross_info[j + 4];
4048     if (st == 1) {
4049       iter = vec[i] - lo;
4050     } else if (st > 0) {
4051       iter = (kmp_uint64)(vec[i] - lo) / st;
4052     } else { // st < 0
4053       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4054     }
4055     iter_number = iter + ln * iter_number;
4056   }
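  // Map the linearized iteration number onto its completion bit, mirroring
  // the computation in __kmpc_doacross_wait().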
4057   shft = iter_number % 32; // use 32-bit granularity
4058   iter_number >>= 5; // divided by 32
4059   flag = 1 << shft;
4060   KMP_MB();
4061   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4062     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4063   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4064                 (iter_number << 5) + shft));
4065 }
4066 
4067 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4068   kmp_int32 num_done;
4069   kmp_info_t *th = __kmp_threads[gtid];
4070   kmp_team_t *team = th->th.th_team;
4071   kmp_disp_t *pr_buf = th->th.th_dispatch;
4072 
4073   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4074   if (team->t.t_serialized) {
4075     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4076     return; // nothing to do
4077   }
4078   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4079   if (num_done == th->th.th_team_nproc) {
4080     // we are the last thread, need to free shared resources
4081     int idx = pr_buf->th_doacross_buf_idx - 1;
4082     dispatch_shared_info_t *sh_buf =
4083         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4084     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4085                      (kmp_int64)&sh_buf->doacross_num_done);
4086     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4087     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4088     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4089     sh_buf->doacross_flags = NULL;
4090     sh_buf->doacross_num_done = 0;
4091     sh_buf->doacross_buf_idx +=
4092         __kmp_dispatch_num_buffers; // free buffer for future re-use
4093   }
4094   // free private resources (need to keep buffer index forever)
4095   pr_buf->th_doacross_flags = NULL;
4096   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4097   pr_buf->th_doacross_info = NULL;
4098   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4099 }
4100 #endif
4101 
4102 #if OMP_50_ENABLED
4103 int __kmpc_get_target_offload(void) {
4104   if (!__kmp_init_serial) {
4105     __kmp_serial_initialize();
4106   }
4107   return __kmp_target_offload;
4108 }
4109 #endif // OMP_50_ENABLED
4110 
4111 // end of file //
4112