1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
28 // flags will be used in future, e.g. to implement openmp_strict library
29 // restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
 * Shut down the runtime library. This is also optional, and even if called will
58  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
59  * zero.
60  */
61 void __kmpc_end(ident_t *loc) {
62   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
63   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
64   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
65   // returns FALSE and __kmpc_end() will unregister this root (it can cause
66   // library shut down).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 #if KMP_OS_WINDOWS && OMPT_SUPPORT
74   // Normal exit process on Windows does not allow worker threads of the final
75   // parallel region to finish reporting their events, so shutting down the
76   // library here fixes the issue at least for the cases where __kmpc_end() is
77   // placed properly.
78   if (ompt_enabled.enabled)
79     __kmp_internal_end_library(__kmp_gtid_get_specific());
80 #endif
81 }
82 
83 /*!
84 @ingroup THREAD_STATES
85 @param loc Source location information.
86 @return The global thread index of the active thread.
87 
88 This function can be called in any context.
89 
If the runtime has only been entered at the outermost level from a
91 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
92 that which would be returned by omp_get_thread_num() in the outermost
93 active parallel construct. (Or zero if there is no active parallel
94 construct, since the master thread is necessarily thread zero).
95 
96 If multiple non-OpenMP threads all enter an OpenMP construct then this
97 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
99 OpenMP thread ids returned by omp_get_thread_num()).
100 */
101 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
102   kmp_int32 gtid = __kmp_entry_gtid();
103 
104   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
105 
106   return gtid;
107 }
108 
109 /*!
110 @ingroup THREAD_STATES
111 @param loc Source location information.
112 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
113 
114 This function can be called in any context.
115 It returns the total number of threads under the control of the OpenMP runtime.
116 That is not a number that can be determined by any OpenMP standard calls, since
117 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, the runtime maintains
underlying threads even when they are not active (since the cost of creating
and destroying OS threads is high), so this call counts all such threads even
if they are not waiting for work.
122 */
123 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
124   KC_TRACE(10,
125            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
126 
127   return TCR_4(__kmp_all_nth);
128 }
129 
130 /*!
131 @ingroup THREAD_STATES
132 @param loc Source location information.
133 @return The thread number of the calling thread in the innermost active parallel
134 construct.
135 */
136 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
137   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
138   return __kmp_tid_from_gtid(__kmp_entry_gtid());
139 }
140 
141 /*!
142 @ingroup THREAD_STATES
143 @param loc Source location information.
144 @return The number of threads in the innermost active parallel construct.
145 */
146 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
147   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
148 
149   return __kmp_entry_thread()->th.th_team->t.t_nproc;
150 }
151 
152 /*!
153  * @ingroup DEPRECATED
154  * @param loc location description
155  *
156  * This function need not be called. It always returns TRUE.
157  */
158 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
159 #ifndef KMP_DEBUG
160 
161   return TRUE;
162 
163 #else
164 
165   const char *semi2;
166   const char *semi3;
167   int line_no;
168 
169   if (__kmp_par_range == 0) {
170     return TRUE;
171   }
172   semi2 = loc->psource;
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   semi2 = strchr(semi2, ';');
177   if (semi2 == NULL) {
178     return TRUE;
179   }
180   semi2 = strchr(semi2 + 1, ';');
181   if (semi2 == NULL) {
182     return TRUE;
183   }
184   if (__kmp_par_range_filename[0]) {
185     const char *name = semi2 - 1;
186     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
187       name--;
188     }
189     if ((*name == '/') || (*name == ';')) {
190       name++;
191     }
192     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
193       return __kmp_par_range < 0;
194     }
195   }
196   semi3 = strchr(semi2 + 1, ';');
197   if (__kmp_par_range_routine[0]) {
198     if ((semi3 != NULL) && (semi3 > semi2) &&
199         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
200       return __kmp_par_range < 0;
201     }
202   }
203   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
204     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
205       return __kmp_par_range > 0;
206     }
207     return __kmp_par_range < 0;
208   }
209   return TRUE;
210 
211 #endif /* KMP_DEBUG */
212 }
213 
214 /*!
215 @ingroup THREAD_STATES
216 @param loc Source location information.
217 @return 1 if this thread is executing inside an active parallel region, zero if
218 not.
219 */
220 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
221   return __kmp_entry_thread()->th.th_root->r.r_active;
222 }
223 
224 /*!
225 @ingroup PARALLEL
226 @param loc source location information
227 @param global_tid global thread number
228 @param num_threads number of threads requested for this parallel construct
229 
230 Set the number of threads to be used by the next fork spawned by this thread.
231 This call is only required if the parallel construct has a `num_threads` clause.
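
For illustration only (the exact lowering is compiler dependent), a construct
such as
@code
#pragma omp parallel num_threads(4)
@endcode
is typically compiled into a call resembling
@code
__kmpc_push_num_threads(&loc, gtid, 4);
@endcode
issued immediately before the matching __kmpc_fork_call().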
232 */
233 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
234                              kmp_int32 num_threads) {
235   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
236                 global_tid, num_threads));
237 
238   __kmp_push_num_threads(loc, global_tid, num_threads);
239 }
240 
241 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
242   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
243 
244   /* the num_threads are automatically popped */
245 }
246 
247 #if OMP_40_ENABLED
248 
249 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
250                            kmp_int32 proc_bind) {
251   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
252                 proc_bind));
253 
254   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
255 }
256 
257 #endif /* OMP_40_ENABLED */
258 
259 /*!
260 @ingroup PARALLEL
261 @param loc  source location information
262 @param argc  total number of arguments in the ellipsis
263 @param microtask  pointer to callback routine consisting of outlined parallel
264 construct
265 @param ...  pointers to shared variables that aren't global
266 
267 Do the actual fork and call the microtask in the relevant number of threads.
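
For illustration only (the exact lowering is compiler dependent), a construct
such as
@code
#pragma omp parallel shared(a, b)
{ /* body */ }
@endcode
is typically outlined into a microtask and compiled into a call resembling
@code
__kmpc_fork_call(&loc, 2, outlined_body, &a, &b);
@endcode
where outlined_body is a hypothetical name for the compiler-generated routine
that receives the global and bound thread ids followed by pointers to the
shared variables.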
268 */
269 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
270   int gtid = __kmp_entry_gtid();
271 
272 #if (KMP_STATS_ENABLED)
273   // If we were in a serial region, then stop the serial timer, record
274   // the event, and start parallel region timer
275   stats_state_e previous_state = KMP_GET_THREAD_STATE();
276   if (previous_state == stats_state_e::SERIAL_REGION) {
277     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
278   } else {
279     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
280   }
281   int inParallel = __kmpc_in_parallel(loc);
282   if (inParallel) {
283     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
284   } else {
285     KMP_COUNT_BLOCK(OMP_PARALLEL);
286   }
287 #endif
288 
  // Perhaps saving only the thread state (thr_state) would be enough here.
290   {
291     va_list ap;
292     va_start(ap, microtask);
293 
294 #if OMPT_SUPPORT
295     ompt_frame_t *ompt_frame;
296     if (ompt_enabled.enabled) {
297       kmp_info_t *master_th = __kmp_threads[gtid];
298       kmp_team_t *parent_team = master_th->th.th_team;
299       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
300       if (lwt)
301         ompt_frame = &(lwt->ompt_task_info.frame);
302       else {
303         int tid = __kmp_tid_from_gtid(gtid);
304         ompt_frame = &(
305             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
306       }
307       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
308       OMPT_STORE_RETURN_ADDRESS(gtid);
309     }
310 #endif
311 
312 #if INCLUDE_SSC_MARKS
313     SSC_MARK_FORKING();
314 #endif
315     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
316                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
317                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
318 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
319 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
320                     &ap
321 #else
322                     ap
323 #endif
324                     );
325 #if INCLUDE_SSC_MARKS
326     SSC_MARK_JOINING();
327 #endif
328     __kmp_join_call(loc, gtid
329 #if OMPT_SUPPORT
330                     ,
331                     fork_context_intel
332 #endif
333                     );
334 
335     va_end(ap);
336   }
337 
338 #if KMP_STATS_ENABLED
339   if (previous_state == stats_state_e::SERIAL_REGION) {
340     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
341   } else {
342     KMP_POP_PARTITIONED_TIMER();
343   }
344 #endif // KMP_STATS_ENABLED
345 }
346 
347 #if OMP_40_ENABLED
348 /*!
349 @ingroup PARALLEL
350 @param loc source location information
351 @param global_tid global thread number
352 @param num_teams number of teams requested for the teams construct
353 @param num_threads number of threads per team requested for the teams construct
354 
355 Set the number of teams to be used by the teams construct.
356 This call is only required if the teams construct has a `num_teams` clause
357 or a `thread_limit` clause (or both).
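
For illustration only (the exact lowering is compiler dependent), a construct
such as
@code
#pragma omp teams num_teams(8) thread_limit(4)
@endcode
is typically compiled into calls resembling
@code
__kmpc_push_num_teams(&loc, gtid, 8, 4);
__kmpc_fork_teams(&loc, argc, outlined_teams_body, ...);
@endcode
where outlined_teams_body is a hypothetical name for the compiler-generated
routine for the teams region.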
358 */
359 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
360                            kmp_int32 num_teams, kmp_int32 num_threads) {
361   KA_TRACE(20,
362            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
363             global_tid, num_teams, num_threads));
364 
365   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
366 }
367 
368 /*!
369 @ingroup PARALLEL
370 @param loc  source location information
371 @param argc  total number of arguments in the ellipsis
372 @param microtask  pointer to callback routine consisting of outlined teams
373 construct
374 @param ...  pointers to shared variables that aren't global
375 
376 Do the actual fork and call the microtask in the relevant number of threads.
377 */
378 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
379                        ...) {
380   int gtid = __kmp_entry_gtid();
381   kmp_info_t *this_thr = __kmp_threads[gtid];
382   va_list ap;
383   va_start(ap, microtask);
384 
385   KMP_COUNT_BLOCK(OMP_TEAMS);
386 
387   // remember teams entry point and nesting level
388   this_thr->th.th_teams_microtask = microtask;
389   this_thr->th.th_teams_level =
390       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
391 
392 #if OMPT_SUPPORT
393   kmp_team_t *parent_team = this_thr->th.th_team;
394   int tid = __kmp_tid_from_gtid(gtid);
395   if (ompt_enabled.enabled) {
396     parent_team->t.t_implicit_task_taskdata[tid]
397         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
398   }
399   OMPT_STORE_RETURN_ADDRESS(gtid);
400 #endif
401 
  // Check whether __kmpc_push_num_teams was called; if not, set the default
  // number of teams.
404   if (this_thr->th.th_teams_size.nteams == 0) {
405     __kmp_push_num_teams(loc, gtid, 0, 0);
406   }
407   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
408   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
409   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
410 
411   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
412                   VOLATILE_CAST(microtask_t)
413                       __kmp_teams_master, // "wrapped" task
414                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
415 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
416                   &ap
417 #else
418                   ap
419 #endif
420                   );
421   __kmp_join_call(loc, gtid
422 #if OMPT_SUPPORT
423                   ,
424                   fork_context_intel
425 #endif
426                   );
427 
428   this_thr->th.th_teams_microtask = NULL;
429   this_thr->th.th_teams_level = 0;
430   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
431   va_end(ap);
432 }
433 #endif /* OMP_40_ENABLED */
434 
435 // I don't think this function should ever have been exported.
436 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
437 // openmp code ever called it, but it's been exported from the RTL for so
438 // long that I'm afraid to remove the definition.
439 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
440 
441 /*!
442 @ingroup PARALLEL
443 @param loc  source location information
444 @param global_tid  global thread number
445 
446 Enter a serialized parallel construct. This interface is used to handle a
447 conditional parallel region, like this,
448 @code
449 #pragma omp parallel if (condition)
450 @endcode
451 when the condition is false.
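
For illustration only (the exact lowering is compiler dependent), the false
branch is typically compiled into a sequence resembling
@code
__kmpc_serialized_parallel(&loc, gtid);
outlined_body(&gtid, &bound_tid, ...); // run the body on the current thread
__kmpc_end_serialized_parallel(&loc, gtid);
@endcode
where outlined_body and bound_tid are hypothetical names for the
compiler-generated microtask and its bound thread id (zero here).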
452 */
453 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
454 // The implementation is now in kmp_runtime.cpp so that it can share static
455 // functions with kmp_fork_call since the tasks to be done are similar in
456 // each case.
457 #if OMPT_SUPPORT
458   OMPT_STORE_RETURN_ADDRESS(global_tid);
459 #endif
460   __kmp_serialized_parallel(loc, global_tid);
461 }
462 
463 /*!
464 @ingroup PARALLEL
465 @param loc  source location information
466 @param global_tid  global thread number
467 
468 Leave a serialized parallel construct.
469 */
470 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
471   kmp_internal_control_t *top;
472   kmp_info_t *this_thr;
473   kmp_team_t *serial_team;
474 
475   KC_TRACE(10,
476            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
477 
478   /* skip all this code for autopar serialized loops since it results in
479      unacceptable overhead */
480   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
481     return;
482 
483   // Not autopar code
484   if (!TCR_4(__kmp_init_parallel))
485     __kmp_parallel_initialize();
486 
487 #if OMP_50_ENABLED
488   __kmp_resume_if_soft_paused();
489 #endif
490 
491   this_thr = __kmp_threads[global_tid];
492   serial_team = this_thr->th.th_serial_team;
493 
494 #if OMP_45_ENABLED
495   kmp_task_team_t *task_team = this_thr->th.th_task_team;
496 
497   // we need to wait for the proxy tasks before finishing the thread
498   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
499     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
500 #endif
501 
502   KMP_MB();
503   KMP_DEBUG_ASSERT(serial_team);
504   KMP_ASSERT(serial_team->t.t_serialized);
505   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
506   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
507   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
508   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
509 
510 #if OMPT_SUPPORT
511   if (ompt_enabled.enabled &&
512       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
513     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
514     if (ompt_enabled.ompt_callback_implicit_task) {
515       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
516           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
517           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
518     }
519 
    // Reset/clear the task id only after unlinking the task.
521     ompt_data_t *parent_task_data;
522     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
523 
524     if (ompt_enabled.ompt_callback_parallel_end) {
525       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
526           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
527           ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
528     }
529     __ompt_lw_taskteam_unlink(this_thr);
530     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
531   }
532 #endif
533 
534   /* If necessary, pop the internal control stack values and replace the team
535    * values */
536   top = serial_team->t.t_control_stack_top;
537   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
538     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
539     serial_team->t.t_control_stack_top = top->next;
540     __kmp_free(top);
541   }
542 
543   // if( serial_team -> t.t_serialized > 1 )
544   serial_team->t.t_level--;
545 
546   /* pop dispatch buffers stack */
547   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
548   {
549     dispatch_private_info_t *disp_buffer =
550         serial_team->t.t_dispatch->th_disp_buffer;
551     serial_team->t.t_dispatch->th_disp_buffer =
552         serial_team->t.t_dispatch->th_disp_buffer->next;
553     __kmp_free(disp_buffer);
554   }
555 #if OMP_50_ENABLED
556   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
557 #endif
558 
559   --serial_team->t.t_serialized;
560   if (serial_team->t.t_serialized == 0) {
561 
562 /* return to the parallel section */
563 
564 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
565     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
566       __kmp_clear_x87_fpu_status_word();
567       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
568       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
569     }
570 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
571 
572     this_thr->th.th_team = serial_team->t.t_parent;
573     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
574 
575     /* restore values cached in the thread */
576     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
577     this_thr->th.th_team_master =
578         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
579     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
580 
581     /* TODO the below shouldn't need to be adjusted for serialized teams */
582     this_thr->th.th_dispatch =
583         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
584 
585     __kmp_pop_current_task_from_thread(this_thr);
586 
587     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
588     this_thr->th.th_current_task->td_flags.executing = 1;
589 
590     if (__kmp_tasking_mode != tskm_immediate_exec) {
591       // Copy the task team from the new child / old parent team to the thread.
592       this_thr->th.th_task_team =
593           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
594       KA_TRACE(20,
595                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
596                 "team %p\n",
597                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
598     }
599   } else {
600     if (__kmp_tasking_mode != tskm_immediate_exec) {
601       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
602                     "depth of serial team %p to %d\n",
603                     global_tid, serial_team, serial_team->t.t_serialized));
604     }
605   }
606 
607   if (__kmp_env_consistency_check)
608     __kmp_pop_parallel(global_tid, NULL);
609 #if OMPT_SUPPORT
610   if (ompt_enabled.enabled)
611     this_thr->th.ompt_thread_info.state =
612         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
613                                            : ompt_state_work_parallel);
614 #endif
615 }
616 
617 /*!
618 @ingroup SYNCHRONIZATION
619 @param loc  source location information.
620 
621 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
622 depending on the memory ordering convention obeyed by the compiler
623 even that may not be necessary).
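
For illustration only (the exact lowering is compiler dependent), a directive
such as
@code
#pragma omp flush
@endcode
is typically compiled into a single call resembling __kmpc_flush(&loc);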
624 */
625 void __kmpc_flush(ident_t *loc) {
626   KC_TRACE(10, ("__kmpc_flush: called\n"));
627 
  /* We need an explicit __mf() here, since the library uses volatile instead */
629   KMP_MB(); /* Flush all pending memory write invalidates.  */
630 
631 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
632 #if KMP_MIC
633 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
634 // We shouldn't need it, though, since the ABI rules require that
635 // * If the compiler generates NGO stores it also generates the fence
636 // * If users hand-code NGO stores they should insert the fence
637 // therefore no incomplete unordered stores should be visible.
638 #else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
645   if (!__kmp_cpuinfo.initialized) {
646     __kmp_query_cpuid(&__kmp_cpuinfo);
647   }
648   if (!__kmp_cpuinfo.sse2) {
649     // CPU cannot execute SSE2 instructions.
650   } else {
651 #if KMP_COMPILER_ICC
652     _mm_mfence();
653 #elif KMP_COMPILER_MSVC
654     MemoryBarrier();
655 #else
656     __sync_synchronize();
657 #endif // KMP_COMPILER_ICC
658   }
659 #endif // KMP_MIC
660 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here; move along.
662 #elif KMP_ARCH_PPC64
663 // Nothing needed here (we have a real MB above).
664 #if KMP_OS_CNK
665   // The flushing thread needs to yield here; this prevents a
666   // busy-waiting thread from saturating the pipeline. flush is
667   // often used in loops like this:
668   // while (!flag) {
669   //   #pragma omp flush(flag)
670   // }
671   // and adding the yield here is good for at least a 10x speedup
672   // when running >2 threads per core (on the NAS LU benchmark).
673   __kmp_yield(TRUE);
674 #endif
675 #else
676 #error Unknown or unsupported architecture
677 #endif
678 
679 #if OMPT_SUPPORT && OMPT_OPTIONAL
680   if (ompt_enabled.ompt_callback_flush) {
681     ompt_callbacks.ompt_callback(ompt_callback_flush)(
682         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
683   }
684 #endif
685 }
686 
687 /* -------------------------------------------------------------------------- */
688 /*!
689 @ingroup SYNCHRONIZATION
690 @param loc source location information
691 @param global_tid thread id.
692 
693 Execute a barrier.
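
For illustration only (the exact lowering is compiler dependent), an explicit
@code
#pragma omp barrier
@endcode
is typically compiled into a call resembling __kmpc_barrier(&loc, gtid);
the same entry point also implements the implicit barrier at the end of many
worksharing constructs.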
694 */
695 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
696   KMP_COUNT_BLOCK(OMP_BARRIER);
697   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
698 
699   if (!TCR_4(__kmp_init_parallel))
700     __kmp_parallel_initialize();
701 
702 #if OMP_50_ENABLED
703   __kmp_resume_if_soft_paused();
704 #endif
705 
706   if (__kmp_env_consistency_check) {
707     if (loc == 0) {
708       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
709     }
710 
711     __kmp_check_barrier(global_tid, ct_barrier, loc);
712   }
713 
714 #if OMPT_SUPPORT
715   ompt_frame_t *ompt_frame;
716   if (ompt_enabled.enabled) {
717     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
718     if (ompt_frame->enter_frame.ptr == NULL)
719       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
720     OMPT_STORE_RETURN_ADDRESS(global_tid);
721   }
722 #endif
723   __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  //   this function is called when the 'barrier' directive is present or at
  //   the implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required
730 
731   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
732 #if OMPT_SUPPORT && OMPT_OPTIONAL
733   if (ompt_enabled.enabled) {
734     ompt_frame->enter_frame = ompt_data_none;
735   }
736 #endif
737 }
738 
739 /* The BARRIER for a MASTER section is always explicit   */
740 /*!
741 @ingroup WORK_SHARING
742 @param loc  source location information.
@param global_tid  global thread number.
744 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
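
For illustration only (the exact lowering is compiler dependent), a
<tt>master</tt> construct is typically compiled into a pattern resembling
@code
if (__kmpc_master(&loc, gtid)) {
  // ... body of the master region ...
  __kmpc_end_master(&loc, gtid);
}
@endcode
so that __kmpc_end_master() is reached only by the thread that entered the
region.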
745 */
746 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
747   int status = 0;
748 
749   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
750 
751   if (!TCR_4(__kmp_init_parallel))
752     __kmp_parallel_initialize();
753 
754 #if OMP_50_ENABLED
755   __kmp_resume_if_soft_paused();
756 #endif
757 
758   if (KMP_MASTER_GTID(global_tid)) {
759     KMP_COUNT_BLOCK(OMP_MASTER);
760     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
761     status = 1;
762   }
763 
764 #if OMPT_SUPPORT && OMPT_OPTIONAL
765   if (status) {
766     if (ompt_enabled.ompt_callback_master) {
767       kmp_info_t *this_thr = __kmp_threads[global_tid];
768       kmp_team_t *team = this_thr->th.th_team;
769 
770       int tid = __kmp_tid_from_gtid(global_tid);
771       ompt_callbacks.ompt_callback(ompt_callback_master)(
772           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
773           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
774           OMPT_GET_RETURN_ADDRESS(0));
775     }
776   }
777 #endif
778 
779   if (__kmp_env_consistency_check) {
780 #if KMP_USE_DYNAMIC_LOCK
781     if (status)
782       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
783     else
784       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
785 #else
786     if (status)
787       __kmp_push_sync(global_tid, ct_master, loc, NULL);
788     else
789       __kmp_check_sync(global_tid, ct_master, loc, NULL);
790 #endif
791   }
792 
793   return status;
794 }
795 
796 /*!
797 @ingroup WORK_SHARING
798 @param loc  source location information.
@param global_tid  global thread number.
800 
801 Mark the end of a <tt>master</tt> region. This should only be called by the
802 thread that executes the <tt>master</tt> region.
803 */
804 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
805   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
806 
807   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
808   KMP_POP_PARTITIONED_TIMER();
809 
810 #if OMPT_SUPPORT && OMPT_OPTIONAL
811   kmp_info_t *this_thr = __kmp_threads[global_tid];
812   kmp_team_t *team = this_thr->th.th_team;
813   if (ompt_enabled.ompt_callback_master) {
814     int tid = __kmp_tid_from_gtid(global_tid);
815     ompt_callbacks.ompt_callback(ompt_callback_master)(
816         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
817         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
818         OMPT_GET_RETURN_ADDRESS(0));
819   }
820 #endif
821 
822   if (__kmp_env_consistency_check) {
823     if (global_tid < 0)
824       KMP_WARNING(ThreadIdentInvalid);
825 
826     if (KMP_MASTER_GTID(global_tid))
827       __kmp_pop_sync(global_tid, ct_master, loc);
828   }
829 }
830 
831 /*!
832 @ingroup WORK_SHARING
833 @param loc  source location information.
834 @param gtid  global thread number.
835 
836 Start execution of an <tt>ordered</tt> construct.
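
For illustration only (the exact lowering is compiler dependent), an
<tt>ordered</tt> region inside a loop with an ordered clause is typically
compiled into a sequence resembling
@code
__kmpc_ordered(&loc, gtid);
// ... body of the ordered region ...
__kmpc_end_ordered(&loc, gtid);
@endcode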
837 */
838 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
839   int cid = 0;
840   kmp_info_t *th;
841   KMP_DEBUG_ASSERT(__kmp_init_serial);
842 
843   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
844 
845   if (!TCR_4(__kmp_init_parallel))
846     __kmp_parallel_initialize();
847 
848 #if OMP_50_ENABLED
849   __kmp_resume_if_soft_paused();
850 #endif
851 
852 #if USE_ITT_BUILD
853   __kmp_itt_ordered_prep(gtid);
854 // TODO: ordered_wait_id
855 #endif /* USE_ITT_BUILD */
856 
857   th = __kmp_threads[gtid];
858 
859 #if OMPT_SUPPORT && OMPT_OPTIONAL
860   kmp_team_t *team;
861   ompt_wait_id_t lck;
862   void *codeptr_ra;
863   if (ompt_enabled.enabled) {
864     OMPT_STORE_RETURN_ADDRESS(gtid);
865     team = __kmp_team_from_gtid(gtid);
866     lck = (ompt_wait_id_t)&team->t.t_ordered.dt.t_value;
867     /* OMPT state update */
868     th->th.ompt_thread_info.wait_id = lck;
869     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
870 
871     /* OMPT event callback */
872     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
873     if (ompt_enabled.ompt_callback_mutex_acquire) {
874       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
875           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
876           (ompt_wait_id_t)lck, codeptr_ra);
877     }
878   }
879 #endif
880 
881   if (th->th.th_dispatch->th_deo_fcn != 0)
882     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
883   else
884     __kmp_parallel_deo(&gtid, &cid, loc);
885 
886 #if OMPT_SUPPORT && OMPT_OPTIONAL
887   if (ompt_enabled.enabled) {
888     /* OMPT state update */
889     th->th.ompt_thread_info.state = ompt_state_work_parallel;
890     th->th.ompt_thread_info.wait_id = 0;
891 
892     /* OMPT event callback */
893     if (ompt_enabled.ompt_callback_mutex_acquired) {
894       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
895           ompt_mutex_ordered, (ompt_wait_id_t)lck, codeptr_ra);
896     }
897   }
898 #endif
899 
900 #if USE_ITT_BUILD
901   __kmp_itt_ordered_start(gtid);
902 #endif /* USE_ITT_BUILD */
903 }
904 
905 /*!
906 @ingroup WORK_SHARING
907 @param loc  source location information.
908 @param gtid  global thread number.
909 
910 End execution of an <tt>ordered</tt> construct.
911 */
912 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
913   int cid = 0;
914   kmp_info_t *th;
915 
916   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
917 
918 #if USE_ITT_BUILD
919   __kmp_itt_ordered_end(gtid);
920 // TODO: ordered_wait_id
921 #endif /* USE_ITT_BUILD */
922 
923   th = __kmp_threads[gtid];
924 
925   if (th->th.th_dispatch->th_dxo_fcn != 0)
926     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
927   else
928     __kmp_parallel_dxo(&gtid, &cid, loc);
929 
930 #if OMPT_SUPPORT && OMPT_OPTIONAL
931   OMPT_STORE_RETURN_ADDRESS(gtid);
932   if (ompt_enabled.ompt_callback_mutex_released) {
933     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
934         ompt_mutex_ordered,
935         (ompt_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
936         OMPT_LOAD_RETURN_ADDRESS(gtid));
937   }
938 #endif
939 }
940 
941 #if KMP_USE_DYNAMIC_LOCK
942 
943 static __forceinline void
944 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
945                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
946   // Pointer to the allocated indirect lock is written to crit, while indexing
947   // is ignored.
948   void *idx;
949   kmp_indirect_lock_t **lck;
950   lck = (kmp_indirect_lock_t **)crit;
951   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
952   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
953   KMP_SET_I_LOCK_LOCATION(ilk, loc);
954   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
955   KA_TRACE(20,
956            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
957 #if USE_ITT_BUILD
958   __kmp_itt_critical_creating(ilk->lock, loc);
959 #endif
960   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
961   if (status == 0) {
962 #if USE_ITT_BUILD
963     __kmp_itt_critical_destroyed(ilk->lock);
964 #endif
965     // We don't really need to destroy the unclaimed lock here since it will be
966     // cleaned up at program exit.
967     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
968   }
969   KMP_DEBUG_ASSERT(*lck != NULL);
970 }
971 
972 // Fast-path acquire tas lock
973 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
974   {                                                                            \
975     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
976     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
977     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
978     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
979         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
980       kmp_uint32 spins;                                                        \
981       KMP_FSYNC_PREPARE(l);                                                    \
982       KMP_INIT_YIELD(spins);                                                   \
983       if (TCR_4(__kmp_nth) >                                                   \
984           (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
985         KMP_YIELD(TRUE);                                                       \
986       } else {                                                                 \
987         KMP_YIELD_SPIN(spins);                                                 \
988       }                                                                        \
989       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
990       while (                                                                  \
991           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
992           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {  \
993         __kmp_spin_backoff(&backoff);                                          \
994         if (TCR_4(__kmp_nth) >                                                 \
995             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
996           KMP_YIELD(TRUE);                                                     \
997         } else {                                                               \
998           KMP_YIELD_SPIN(spins);                                               \
999         }                                                                      \
1000       }                                                                        \
1001     }                                                                          \
1002     KMP_FSYNC_ACQUIRED(l);                                                     \
1003   }
1004 
1005 // Fast-path test tas lock
1006 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1007   {                                                                            \
1008     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1009     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1010     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1011     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1012          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1013   }
1014 
1015 // Fast-path release tas lock
1016 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1017   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1018 
1019 #if KMP_USE_FUTEX
1020 
1021 #include <sys/syscall.h>
1022 #include <unistd.h>
1023 #ifndef FUTEX_WAIT
1024 #define FUTEX_WAIT 0
1025 #endif
1026 #ifndef FUTEX_WAKE
1027 #define FUTEX_WAKE 1
1028 #endif
1029 
1030 // Fast-path acquire futex lock
1031 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1032   {                                                                            \
1033     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1034     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1035     KMP_MB();                                                                  \
1036     KMP_FSYNC_PREPARE(ftx);                                                    \
1037     kmp_int32 poll_val;                                                        \
1038     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1039                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1040                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1041       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1042       if (!cond) {                                                             \
1043         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1044                                          poll_val |                            \
1045                                              KMP_LOCK_BUSY(1, futex))) {       \
1046           continue;                                                            \
1047         }                                                                      \
1048         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1049       }                                                                        \
1050       kmp_int32 rc;                                                            \
1051       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1052                         NULL, NULL, 0)) != 0) {                                \
1053         continue;                                                              \
1054       }                                                                        \
1055       gtid_code |= 1;                                                          \
1056     }                                                                          \
1057     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1058   }
1059 
1060 // Fast-path test futex lock
1061 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1062   {                                                                            \
1063     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1064     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1065                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1066       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1067       rc = TRUE;                                                               \
1068     } else {                                                                   \
1069       rc = FALSE;                                                              \
1070     }                                                                          \
1071   }
1072 
1073 // Fast-path release futex lock
1074 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1075   {                                                                            \
1076     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1077     KMP_MB();                                                                  \
1078     KMP_FSYNC_RELEASING(ftx);                                                  \
1079     kmp_int32 poll_val =                                                       \
1080         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1081     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1082       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1083               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1084     }                                                                          \
1085     KMP_MB();                                                                  \
1086     KMP_YIELD(TCR_4(__kmp_nth) >                                               \
1087               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
1088   }
1089 
1090 #endif // KMP_USE_FUTEX
1091 
1092 #else // KMP_USE_DYNAMIC_LOCK
1093 
1094 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1095                                                       ident_t const *loc,
1096                                                       kmp_int32 gtid) {
1097   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1098 
1099   // Because of the double-check, the following load doesn't need to be volatile
1100   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1101 
1102   if (lck == NULL) {
1103     void *idx;
1104 
1105     // Allocate & initialize the lock.
1106     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1107     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1108     __kmp_init_user_lock_with_checks(lck);
1109     __kmp_set_user_lock_location(lck, loc);
1110 #if USE_ITT_BUILD
1111     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed without ever being used, but
// that is not a problem, because this is not a real event seen by the user;
// it merely sets a name for the object (lock). See more details in kmp_itt.h.
1117 #endif /* USE_ITT_BUILD */
1118 
1119     // Use a cmpxchg instruction to slam the start of the critical section with
1120     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1121     // and use the lock that the other thread allocated.
1122     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1123 
1124     if (status == 0) {
1125 // Deallocate the lock and reload the value.
1126 #if USE_ITT_BUILD
1127       __kmp_itt_critical_destroyed(lck);
1128 // Let ITT know the lock is destroyed and the same memory location may be reused
1129 // for another purpose.
1130 #endif /* USE_ITT_BUILD */
1131       __kmp_destroy_user_lock_with_checks(lck);
1132       __kmp_user_lock_free(&idx, gtid, lck);
1133       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1134       KMP_DEBUG_ASSERT(lck != NULL);
1135     }
1136   }
1137   return lck;
1138 }
1139 
1140 #endif // KMP_USE_DYNAMIC_LOCK
1141 
1142 /*!
1143 @ingroup WORK_SHARING
1144 @param loc  source location information.
@param global_tid  global thread number.
1146 @param crit identity of the critical section. This could be a pointer to a lock
1147 associated with the critical section, or some other suitably unique value.
1148 
1149 Enter code protected by a `critical` construct.
1150 This function blocks until the executing thread can enter the critical section.
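
For illustration only (the exact lowering is compiler dependent), a construct
such as
@code
#pragma omp critical(name)
{ /* body */ }
@endcode
is typically compiled into a pattern resembling
@code
static kmp_critical_name crit_name; // zero-initialized storage shared by all
                                    // uses of this critical name
__kmpc_critical(&loc, gtid, &crit_name);
// ... body of the critical region ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode
where crit_name is a hypothetical name for the compiler-generated variable.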
1151 */
1152 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1153                      kmp_critical_name *crit) {
1154 #if KMP_USE_DYNAMIC_LOCK
1155 #if OMPT_SUPPORT && OMPT_OPTIONAL
1156   OMPT_STORE_RETURN_ADDRESS(global_tid);
1157 #endif // OMPT_SUPPORT
1158   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1159 #else
1160   KMP_COUNT_BLOCK(OMP_CRITICAL);
1161 #if OMPT_SUPPORT && OMPT_OPTIONAL
1162   ompt_state_t prev_state = ompt_state_undefined;
1163   ompt_thread_info_t ti;
1164 #endif
1165   kmp_user_lock_p lck;
1166 
1167   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1168 
1169   // TODO: add THR_OVHD_STATE
1170 
1171   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1172   KMP_CHECK_USER_LOCK_INIT();
1173 
1174   if ((__kmp_user_lock_kind == lk_tas) &&
1175       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1176     lck = (kmp_user_lock_p)crit;
1177   }
1178 #if KMP_USE_FUTEX
1179   else if ((__kmp_user_lock_kind == lk_futex) &&
1180            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1181     lck = (kmp_user_lock_p)crit;
1182   }
1183 #endif
1184   else { // ticket, queuing or drdpa
1185     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1186   }
1187 
1188   if (__kmp_env_consistency_check)
1189     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1190 
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team.
// Also, even if we are the uber thread, we still have to acquire the lock,
// as we have to contend with sibling threads.
1195 
1196 #if USE_ITT_BUILD
1197   __kmp_itt_critical_acquiring(lck);
1198 #endif /* USE_ITT_BUILD */
1199 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1201   void *codeptr_ra = NULL;
1202   if (ompt_enabled.enabled) {
1203     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1204     /* OMPT state update */
1205     prev_state = ti.state;
1206     ti.wait_id = (ompt_wait_id_t)lck;
1207     ti.state = ompt_state_wait_critical;
1208 
1209     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1211     if (ompt_enabled.ompt_callback_mutex_acquire) {
1212       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1213           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1214           (ompt_wait_id_t)crit, codeptr_ra);
1215     }
1216   }
1217 #endif
1218   // Value of 'crit' should be good for using as a critical_id of the critical
1219   // section directive.
1220   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1221 
1222 #if USE_ITT_BUILD
1223   __kmp_itt_critical_acquired(lck);
1224 #endif /* USE_ITT_BUILD */
1225 #if OMPT_SUPPORT && OMPT_OPTIONAL
1226   if (ompt_enabled.enabled) {
1227     /* OMPT state update */
1228     ti.state = prev_state;
1229     ti.wait_id = 0;
1230 
1231     /* OMPT event callback */
1232     if (ompt_enabled.ompt_callback_mutex_acquired) {
1233       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1234           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr_ra);
1235     }
1236   }
1237 #endif
1238   KMP_POP_PARTITIONED_TIMER();
1239 
1240   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1241   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1242 #endif // KMP_USE_DYNAMIC_LOCK
1243 }
1244 
1245 #if KMP_USE_DYNAMIC_LOCK
1246 
1247 // Converts the given hint to an internal lock implementation
1248 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1249 #if KMP_USE_TSX
1250 #define KMP_TSX_LOCK(seq) lockseq_##seq
1251 #else
1252 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1253 #endif
1254 
1255 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1256 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1257 #else
1258 #define KMP_CPUINFO_RTM 0
1259 #endif
1260 
1261   // Hints that do not require further logic
1262   if (hint & kmp_lock_hint_hle)
1263     return KMP_TSX_LOCK(hle);
1264   if (hint & kmp_lock_hint_rtm)
1265     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1266   if (hint & kmp_lock_hint_adaptive)
1267     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1268 
1269   // Rule out conflicting hints first by returning the default lock
1270   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1271     return __kmp_user_lock_seq;
1272   if ((hint & omp_lock_hint_speculative) &&
1273       (hint & omp_lock_hint_nonspeculative))
1274     return __kmp_user_lock_seq;
1275 
1276   // Do not even consider speculation when it appears to be contended
1277   if (hint & omp_lock_hint_contended)
1278     return lockseq_queuing;
1279 
1280   // Uncontended lock without speculation
1281   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1282     return lockseq_tas;
1283 
1284   // HLE lock for speculation
1285   if (hint & omp_lock_hint_speculative)
1286     return KMP_TSX_LOCK(hle);
1287 
1288   return __kmp_user_lock_seq;
1289 }
1290 
1291 #if OMPT_SUPPORT && OMPT_OPTIONAL
1292 #if KMP_USE_DYNAMIC_LOCK
1293 static kmp_mutex_impl_t
1294 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1295   if (user_lock) {
1296     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1297     case 0:
1298       break;
1299 #if KMP_USE_FUTEX
1300     case locktag_futex:
1301       return kmp_mutex_impl_queuing;
1302 #endif
1303     case locktag_tas:
1304       return kmp_mutex_impl_spin;
1305 #if KMP_USE_TSX
1306     case locktag_hle:
1307       return kmp_mutex_impl_speculative;
1308 #endif
1309     default:
1310       return kmp_mutex_impl_none;
1311     }
1312     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1313   }
1314   KMP_ASSERT(ilock);
1315   switch (ilock->type) {
1316 #if KMP_USE_TSX
1317   case locktag_adaptive:
1318   case locktag_rtm:
1319     return kmp_mutex_impl_speculative;
1320 #endif
1321   case locktag_nested_tas:
1322     return kmp_mutex_impl_spin;
1323 #if KMP_USE_FUTEX
1324   case locktag_nested_futex:
1325 #endif
1326   case locktag_ticket:
1327   case locktag_queuing:
1328   case locktag_drdpa:
1329   case locktag_nested_ticket:
1330   case locktag_nested_queuing:
1331   case locktag_nested_drdpa:
1332     return kmp_mutex_impl_queuing;
1333   default:
1334     return kmp_mutex_impl_none;
1335   }
1336 }
1337 #else
1338 // For locks without dynamic binding
1339 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1340   switch (__kmp_user_lock_kind) {
1341   case lk_tas:
1342     return kmp_mutex_impl_spin;
1343 #if KMP_USE_FUTEX
1344   case lk_futex:
1345 #endif
1346   case lk_ticket:
1347   case lk_queuing:
1348   case lk_drdpa:
1349     return kmp_mutex_impl_queuing;
1350 #if KMP_USE_TSX
1351   case lk_hle:
1352   case lk_rtm:
1353   case lk_adaptive:
1354     return kmp_mutex_impl_speculative;
1355 #endif
1356   default:
1357     return kmp_mutex_impl_none;
1358   }
1359 }
1360 #endif // KMP_USE_DYNAMIC_LOCK
1361 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1362 
1363 /*!
1364 @ingroup WORK_SHARING
1365 @param loc  source location information.
1366 @param global_tid  global thread number.
1367 @param crit identity of the critical section. This could be a pointer to a lock
1368 associated with the critical section, or some other suitably unique value.
1369 @param hint the lock hint.
1370 
1371 Enter code protected by a `critical` construct with a hint. The hint value is
1372 used to suggest a lock implementation. This function blocks until the executing
1373 thread can enter the critical section unless the hint suggests use of
1374 speculative execution and the hardware supports it.
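
For illustration only (the exact lowering is compiler dependent), a construct
such as
@code
#pragma omp critical(name) hint(omp_lock_hint_speculative)
@endcode
is typically compiled into a call resembling
@code
__kmpc_critical_with_hint(&loc, gtid, &crit_name, omp_lock_hint_speculative);
@endcode
with the hint value forwarded from the hint clause; crit_name is a hypothetical
name for the compiler-generated kmp_critical_name variable.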
1375 */
1376 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1377                                kmp_critical_name *crit, uint32_t hint) {
1378   KMP_COUNT_BLOCK(OMP_CRITICAL);
1379   kmp_user_lock_p lck;
1380 #if OMPT_SUPPORT && OMPT_OPTIONAL
1381   ompt_state_t prev_state = ompt_state_undefined;
1382   ompt_thread_info_t ti;
  // The return address was already stored if we were called from __kmpc_critical:
1384   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1385   if (!codeptr)
1386     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1387 #endif
1388 
1389   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1390 
1391   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1392   // Check if it is initialized.
1393   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1394   if (*lk == 0) {
1395     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1396     if (KMP_IS_D_LOCK(lckseq)) {
1397       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1398                                   KMP_GET_D_TAG(lckseq));
1399     } else {
1400       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1401     }
1402   }
1403   // Branch for accessing the actual lock object and set operation. This
1404   // branching is inevitable since this lock initialization does not follow the
1405   // normal dispatch path (lock table is not used).
1406   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1407     lck = (kmp_user_lock_p)lk;
1408     if (__kmp_env_consistency_check) {
1409       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1410                       __kmp_map_hint_to_lock(hint));
1411     }
1412 #if USE_ITT_BUILD
1413     __kmp_itt_critical_acquiring(lck);
1414 #endif
1415 #if OMPT_SUPPORT && OMPT_OPTIONAL
1416     if (ompt_enabled.enabled) {
1417       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1418       /* OMPT state update */
1419       prev_state = ti.state;
1420       ti.wait_id = (ompt_wait_id_t)lck;
1421       ti.state = ompt_state_wait_critical;
1422 
1423       /* OMPT event callback */
1424       if (ompt_enabled.ompt_callback_mutex_acquire) {
1425         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1426             ompt_mutex_critical, (unsigned int)hint,
1427             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)crit, codeptr);
1428       }
1429     }
1430 #endif
1431 #if KMP_USE_INLINED_TAS
1432     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1433       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1434     } else
1435 #elif KMP_USE_INLINED_FUTEX
1436     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1437       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1438     } else
1439 #endif
1440     {
1441       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1442     }
1443   } else {
1444     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1445     lck = ilk->lock;
1446     if (__kmp_env_consistency_check) {
1447       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1448                       __kmp_map_hint_to_lock(hint));
1449     }
1450 #if USE_ITT_BUILD
1451     __kmp_itt_critical_acquiring(lck);
1452 #endif
1453 #if OMPT_SUPPORT && OMPT_OPTIONAL
1454     if (ompt_enabled.enabled) {
1455       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1456       /* OMPT state update */
1457       prev_state = ti.state;
1458       ti.wait_id = (ompt_wait_id_t)lck;
1459       ti.state = ompt_state_wait_critical;
1460 
1461       /* OMPT event callback */
1462       if (ompt_enabled.ompt_callback_mutex_acquire) {
1463         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1464             ompt_mutex_critical, (unsigned int)hint,
1465             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)crit, codeptr);
1466       }
1467     }
1468 #endif
1469     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1470   }
1471   KMP_POP_PARTITIONED_TIMER();
1472 
1473 #if USE_ITT_BUILD
1474   __kmp_itt_critical_acquired(lck);
1475 #endif /* USE_ITT_BUILD */
1476 #if OMPT_SUPPORT && OMPT_OPTIONAL
1477   if (ompt_enabled.enabled) {
1478     /* OMPT state update */
1479     ti.state = prev_state;
1480     ti.wait_id = 0;
1481 
1482     /* OMPT event callback */
1483     if (ompt_enabled.ompt_callback_mutex_acquired) {
1484       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1485           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr);
1486     }
1487   }
1488 #endif
1489 
1490   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1491   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1492 } // __kmpc_critical_with_hint
1493 
1494 #endif // KMP_USE_DYNAMIC_LOCK
1495 
1496 /*!
1497 @ingroup WORK_SHARING
1498 @param loc  source location information.
@param global_tid  global thread number.
1500 @param crit identity of the critical section. This could be a pointer to a lock
1501 associated with the critical section, or some other suitably unique value.
1502 
1503 Leave a critical section, releasing any lock that was held during its execution.
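
As an illustration (a sketch, not actual compiler output; loc, gtid and the
critical name are placeholders), the expected pairing is:
@code
static kmp_critical_name crit_name = {0};
__kmpc_critical(&loc, gtid, &crit_name);
// ... code protected by the critical section ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode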
1504 */
1505 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1506                          kmp_critical_name *crit) {
1507   kmp_user_lock_p lck;
1508 
1509   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1510 
1511 #if KMP_USE_DYNAMIC_LOCK
1512   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1513     lck = (kmp_user_lock_p)crit;
1514     KMP_ASSERT(lck != NULL);
1515     if (__kmp_env_consistency_check) {
1516       __kmp_pop_sync(global_tid, ct_critical, loc);
1517     }
1518 #if USE_ITT_BUILD
1519     __kmp_itt_critical_releasing(lck);
1520 #endif
1521 #if KMP_USE_INLINED_TAS
1522     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1523       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1524     } else
1525 #elif KMP_USE_INLINED_FUTEX
1526     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1527       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1528     } else
1529 #endif
1530     {
1531       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1532     }
1533   } else {
1534     kmp_indirect_lock_t *ilk =
1535         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1536     KMP_ASSERT(ilk != NULL);
1537     lck = ilk->lock;
1538     if (__kmp_env_consistency_check) {
1539       __kmp_pop_sync(global_tid, ct_critical, loc);
1540     }
1541 #if USE_ITT_BUILD
1542     __kmp_itt_critical_releasing(lck);
1543 #endif
1544     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1545   }
1546 
1547 #else // KMP_USE_DYNAMIC_LOCK
1548 
1549   if ((__kmp_user_lock_kind == lk_tas) &&
1550       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1551     lck = (kmp_user_lock_p)crit;
1552   }
1553 #if KMP_USE_FUTEX
1554   else if ((__kmp_user_lock_kind == lk_futex) &&
1555            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1556     lck = (kmp_user_lock_p)crit;
1557   }
1558 #endif
1559   else { // ticket, queuing or drdpa
1560     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1561   }
1562 
1563   KMP_ASSERT(lck != NULL);
1564 
1565   if (__kmp_env_consistency_check)
1566     __kmp_pop_sync(global_tid, ct_critical, loc);
1567 
1568 #if USE_ITT_BUILD
1569   __kmp_itt_critical_releasing(lck);
1570 #endif /* USE_ITT_BUILD */
1571   // Value of 'crit' should be good for using as a critical_id of the critical
1572   // section directive.
1573   __kmp_release_user_lock_with_checks(lck, global_tid);
1574 
1575 #endif // KMP_USE_DYNAMIC_LOCK
1576 
1577 #if OMPT_SUPPORT && OMPT_OPTIONAL
1578   /* OMPT release event triggers after lock is released; place here to trigger
1579    * for all #if branches */
1580   OMPT_STORE_RETURN_ADDRESS(global_tid);
1581   if (ompt_enabled.ompt_callback_mutex_released) {
1582     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1583         ompt_mutex_critical, (ompt_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
1584   }
1585 #endif
1586 
1587   KMP_POP_PARTITIONED_TIMER();
1588   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1589 }
1590 
1591 /*!
1592 @ingroup SYNCHRONIZATION
1593 @param loc source location information
1594 @param global_tid thread id.
1595 @return one if the thread should execute the master block, zero otherwise
1596 
1597 Start execution of a combined barrier and master. The barrier is executed inside
1598 this function.
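
A sketch of the expected pairing (illustrative only; loc and gtid stand for
the values the code generator would supply):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // executed only by the thread chosen as master
  __kmpc_end_barrier_master(&loc, gtid);
}
@endcode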
1599 */
1600 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1601   int status;
1602 
1603   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1604 
1605   if (!TCR_4(__kmp_init_parallel))
1606     __kmp_parallel_initialize();
1607 
1608 #if OMP_50_ENABLED
1609   __kmp_resume_if_soft_paused();
1610 #endif
1611 
1612   if (__kmp_env_consistency_check)
1613     __kmp_check_barrier(global_tid, ct_barrier, loc);
1614 
1615 #if OMPT_SUPPORT
1616   ompt_frame_t *ompt_frame;
1617   if (ompt_enabled.enabled) {
1618     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1619     if (ompt_frame->enter_frame.ptr == NULL)
1620       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1621     OMPT_STORE_RETURN_ADDRESS(global_tid);
1622   }
1623 #endif
1624 #if USE_ITT_NOTIFY
1625   __kmp_threads[global_tid]->th.th_ident = loc;
1626 #endif
1627   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1628 #if OMPT_SUPPORT && OMPT_OPTIONAL
1629   if (ompt_enabled.enabled) {
1630     ompt_frame->enter_frame = ompt_data_none;
1631   }
1632 #endif
1633 
1634   return (status != 0) ? 0 : 1;
1635 }
1636 
1637 /*!
1638 @ingroup SYNCHRONIZATION
1639 @param loc source location information
1640 @param global_tid thread id.
1641 
1642 Complete the execution of a combined barrier and master. This function should
1643 only be called at the completion of the <tt>master</tt> code. Other threads will
1644 still be waiting at the barrier and this call releases them.
1645 */
1646 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1647   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1648 
1649   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1650 }
1651 
1652 /*!
1653 @ingroup SYNCHRONIZATION
1654 @param loc source location information
1655 @param global_tid thread id.
1656 @return one if the thread should execute the master block, zero otherwise
1657 
1658 Start execution of a combined barrier and master(nowait) construct.
1659 The barrier is executed inside this function.
There is no equivalent "end" function, since the other threads do not wait for
the master block to complete; there is nothing left to release.
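
A sketch of the expected use (illustrative only):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // executed only by the thread chosen as master; no "end" call is needed
  // because the other threads have already continued past the barrier
}
@endcode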
1661 */
1662 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1663   kmp_int32 ret;
1664 
1665   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1666 
1667   if (!TCR_4(__kmp_init_parallel))
1668     __kmp_parallel_initialize();
1669 
1670 #if OMP_50_ENABLED
1671   __kmp_resume_if_soft_paused();
1672 #endif
1673 
1674   if (__kmp_env_consistency_check) {
1675     if (loc == 0) {
1676       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1677     }
1678     __kmp_check_barrier(global_tid, ct_barrier, loc);
1679   }
1680 
1681 #if OMPT_SUPPORT
1682   ompt_frame_t *ompt_frame;
1683   if (ompt_enabled.enabled) {
1684     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1685     if (ompt_frame->enter_frame.ptr == NULL)
1686       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1687     OMPT_STORE_RETURN_ADDRESS(global_tid);
1688   }
1689 #endif
1690 #if USE_ITT_NOTIFY
1691   __kmp_threads[global_tid]->th.th_ident = loc;
1692 #endif
1693   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1694 #if OMPT_SUPPORT && OMPT_OPTIONAL
1695   if (ompt_enabled.enabled) {
1696     ompt_frame->enter_frame = ompt_data_none;
1697   }
1698 #endif
1699 
1700   ret = __kmpc_master(loc, global_tid);
1701 
1702   if (__kmp_env_consistency_check) {
1703     /*  there's no __kmpc_end_master called; so the (stats) */
1704     /*  actions of __kmpc_end_master are done here          */
1705 
1706     if (global_tid < 0) {
1707       KMP_WARNING(ThreadIdentInvalid);
1708     }
1709     if (ret) {
1710       /* only one thread should do the pop since only */
1711       /* one did the push (see __kmpc_master())       */
1712 
1713       __kmp_pop_sync(global_tid, ct_master, loc);
1714     }
1715   }
1716 
1717   return (ret);
1718 }
1719 
1720 /* The BARRIER for a SINGLE process section is always explicit   */
1721 /*!
1722 @ingroup WORK_SHARING
1723 @param loc  source location information
1724 @param global_tid  global thread number
1725 @return One if this thread should execute the single construct, zero otherwise.
1726 
1727 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
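
A sketch of how a code generator might use this pair (illustrative only; the
trailing barrier is omitted when nowait is specified):
@code
if (__kmpc_single(&loc, gtid)) {
  // body of the single region, executed by exactly one thread
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // implements the implicit barrier at the end of
                            // the single construct (dropped for nowait)
@endcode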
1730 */
1731 
1732 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1733   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1734 
1735   if (rc) {
1736     // We are going to execute the single statement, so we should count it.
1737     KMP_COUNT_BLOCK(OMP_SINGLE);
1738     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1739   }
1740 
1741 #if OMPT_SUPPORT && OMPT_OPTIONAL
1742   kmp_info_t *this_thr = __kmp_threads[global_tid];
1743   kmp_team_t *team = this_thr->th.th_team;
1744   int tid = __kmp_tid_from_gtid(global_tid);
1745 
1746   if (ompt_enabled.enabled) {
1747     if (rc) {
1748       if (ompt_enabled.ompt_callback_work) {
1749         ompt_callbacks.ompt_callback(ompt_callback_work)(
1750             ompt_work_single_executor, ompt_scope_begin,
1751             &(team->t.ompt_team_info.parallel_data),
1752             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1753             1, OMPT_GET_RETURN_ADDRESS(0));
1754       }
1755     } else {
1756       if (ompt_enabled.ompt_callback_work) {
1757         ompt_callbacks.ompt_callback(ompt_callback_work)(
1758             ompt_work_single_other, ompt_scope_begin,
1759             &(team->t.ompt_team_info.parallel_data),
1760             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1761             1, OMPT_GET_RETURN_ADDRESS(0));
1762         ompt_callbacks.ompt_callback(ompt_callback_work)(
1763             ompt_work_single_other, ompt_scope_end,
1764             &(team->t.ompt_team_info.parallel_data),
1765             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1766             1, OMPT_GET_RETURN_ADDRESS(0));
1767       }
1768     }
1769   }
1770 #endif
1771 
1772   return rc;
1773 }
1774 
1775 /*!
1776 @ingroup WORK_SHARING
1777 @param loc  source location information
1778 @param global_tid  global thread number
1779 
1780 Mark the end of a <tt>single</tt> construct.  This function should
1781 only be called by the thread that executed the block of code protected
1782 by the `single` construct.
1783 */
1784 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1785   __kmp_exit_single(global_tid);
1786   KMP_POP_PARTITIONED_TIMER();
1787 
1788 #if OMPT_SUPPORT && OMPT_OPTIONAL
1789   kmp_info_t *this_thr = __kmp_threads[global_tid];
1790   kmp_team_t *team = this_thr->th.th_team;
1791   int tid = __kmp_tid_from_gtid(global_tid);
1792 
1793   if (ompt_enabled.ompt_callback_work) {
1794     ompt_callbacks.ompt_callback(ompt_callback_work)(
1795         ompt_work_single_executor, ompt_scope_end,
1796         &(team->t.ompt_team_info.parallel_data),
1797         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1798         OMPT_GET_RETURN_ADDRESS(0));
1799   }
1800 #endif
1801 }
1802 
1803 /*!
1804 @ingroup WORK_SHARING
1805 @param loc Source location
1806 @param global_tid Global thread id
1807 
1808 Mark the end of a statically scheduled loop.
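
A sketch of the pairing a compiler typically emits for a statically scheduled
loop (illustrative only; argument values are placeholders and the init entry
point shown is the 32-bit signed variant):
@code
kmp_int32 last, lower = 0, upper = N - 1, stride = 1;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1, 1);
for (kmp_int32 i = lower; i <= upper; ++i) {
  // loop body
}
__kmpc_for_static_fini(&loc, gtid);
@endcode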
1809 */
1810 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1811   KMP_POP_PARTITIONED_TIMER();
1812   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1813 
1814 #if OMPT_SUPPORT && OMPT_OPTIONAL
1815   if (ompt_enabled.ompt_callback_work) {
1816     ompt_work_t ompt_work_type = ompt_work_loop;
1817     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1818     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1819     // Determine workshare type
1820     if (loc != NULL) {
1821       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1822         ompt_work_type = ompt_work_loop;
1823       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1824         ompt_work_type = ompt_work_sections;
1825       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1826         ompt_work_type = ompt_work_distribute;
1827       } else {
1828         // use default set above.
1829         // a warning about this case is provided in __kmpc_for_static_init
1830       }
1831       KMP_DEBUG_ASSERT(ompt_work_type);
1832     }
1833     ompt_callbacks.ompt_callback(ompt_callback_work)(
1834         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1835         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1836   }
1837 #endif
1838   if (__kmp_env_consistency_check)
1839     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1840 }
1841 
// User routines which take C-style arguments (call by value), as opposed to
// the Fortran equivalent routines, which take their arguments by reference
1844 
1845 void ompc_set_num_threads(int arg) {
1846   // !!!!! TODO: check the per-task binding
1847   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1848 }
1849 
1850 void ompc_set_dynamic(int flag) {
1851   kmp_info_t *thread;
1852 
1853   /* For the thread-private implementation of the internal controls */
1854   thread = __kmp_entry_thread();
1855 
1856   __kmp_save_internal_controls(thread);
1857 
1858   set__dynamic(thread, flag ? TRUE : FALSE);
1859 }
1860 
1861 void ompc_set_nested(int flag) {
1862   kmp_info_t *thread;
1863 
1864   /* For the thread-private internal controls implementation */
1865   thread = __kmp_entry_thread();
1866 
1867   __kmp_save_internal_controls(thread);
1868 
1869   set__nested(thread, flag ? TRUE : FALSE);
1870 }
1871 
1872 void ompc_set_max_active_levels(int max_active_levels) {
1873   /* TO DO */
1874   /* we want per-task implementation of this internal control */
1875 
1876   /* For the per-thread internal controls implementation */
1877   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1878 }
1879 
1880 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1881   // !!!!! TODO: check the per-task binding
1882   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1883 }
1884 
1885 int ompc_get_ancestor_thread_num(int level) {
1886   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1887 }
1888 
1889 int ompc_get_team_size(int level) {
1890   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1891 }
1892 
1893 #if OMP_50_ENABLED
1894 /* OpenMP 5.0 Affinity Format API */
1895 
1896 void ompc_set_affinity_format(char const *format) {
1897   if (!__kmp_init_serial) {
1898     __kmp_serial_initialize();
1899   }
1900   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1901                          format, KMP_STRLEN(format) + 1);
1902 }
1903 
1904 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1905   size_t format_size;
1906   if (!__kmp_init_serial) {
1907     __kmp_serial_initialize();
1908   }
1909   format_size = KMP_STRLEN(__kmp_affinity_format);
1910   if (buffer && size) {
1911     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1912                            format_size + 1);
1913   }
1914   return format_size;
1915 }
1916 
1917 void ompc_display_affinity(char const *format) {
1918   int gtid;
1919   if (!TCR_4(__kmp_init_middle)) {
1920     __kmp_middle_initialize();
1921   }
1922   gtid = __kmp_get_gtid();
1923   __kmp_aux_display_affinity(gtid, format);
1924 }
1925 
1926 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1927                              char const *format) {
1928   int gtid;
1929   size_t num_required;
1930   kmp_str_buf_t capture_buf;
1931   if (!TCR_4(__kmp_init_middle)) {
1932     __kmp_middle_initialize();
1933   }
1934   gtid = __kmp_get_gtid();
1935   __kmp_str_buf_init(&capture_buf);
1936   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1937   if (buffer && buf_size) {
1938     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1939                            capture_buf.used + 1);
1940   }
1941   __kmp_str_buf_free(&capture_buf);
1942   return num_required;
1943 }
1944 #endif /* OMP_50_ENABLED */
1945 
1946 void kmpc_set_stacksize(int arg) {
1947   // __kmp_aux_set_stacksize initializes the library if needed
1948   __kmp_aux_set_stacksize(arg);
1949 }
1950 
1951 void kmpc_set_stacksize_s(size_t arg) {
1952   // __kmp_aux_set_stacksize initializes the library if needed
1953   __kmp_aux_set_stacksize(arg);
1954 }
1955 
1956 void kmpc_set_blocktime(int arg) {
1957   int gtid, tid;
1958   kmp_info_t *thread;
1959 
1960   gtid = __kmp_entry_gtid();
1961   tid = __kmp_tid_from_gtid(gtid);
1962   thread = __kmp_thread_from_gtid(gtid);
1963 
1964   __kmp_aux_set_blocktime(arg, thread, tid);
1965 }
1966 
1967 void kmpc_set_library(int arg) {
1968   // __kmp_user_set_library initializes the library if needed
1969   __kmp_user_set_library((enum library_type)arg);
1970 }
1971 
1972 void kmpc_set_defaults(char const *str) {
1973   // __kmp_aux_set_defaults initializes the library if needed
1974   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1975 }
1976 
1977 void kmpc_set_disp_num_buffers(int arg) {
1978   // ignore after initialization because some teams have already
1979   // allocated dispatch buffers
1980   if (__kmp_init_serial == 0 && arg > 0)
1981     __kmp_dispatch_num_buffers = arg;
1982 }
1983 
1984 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1985 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1986   return -1;
1987 #else
1988   if (!TCR_4(__kmp_init_middle)) {
1989     __kmp_middle_initialize();
1990   }
1991   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1992 #endif
1993 }
1994 
1995 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1996 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1997   return -1;
1998 #else
1999   if (!TCR_4(__kmp_init_middle)) {
2000     __kmp_middle_initialize();
2001   }
2002   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2003 #endif
2004 }
2005 
2006 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2007 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2008   return -1;
2009 #else
2010   if (!TCR_4(__kmp_init_middle)) {
2011     __kmp_middle_initialize();
2012   }
2013   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2014 #endif
2015 }
2016 
2017 /* -------------------------------------------------------------------------- */
2018 /*!
2019 @ingroup THREADPRIVATE
2020 @param loc       source location information
2021 @param gtid      global thread number
2022 @param cpy_size  size of the cpy_data buffer
2023 @param cpy_data  pointer to data to be copied
2024 @param cpy_func  helper function to call for copying data
2025 @param didit     flag variable: 1=single thread; 0=not single thread
2026 
2027 __kmpc_copyprivate implements the interface for the private data broadcast
2028 needed for the copyprivate clause associated with a single region in an
2029 OpenMP<sup>*</sup> program (both C and Fortran).
2030 All threads participating in the parallel region call this routine.
2031 One of the threads (called the single thread) should have the <tt>didit</tt>
2032 variable set to 1 and all other threads should have that variable set to 0.
2033 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2034 
2035 The OpenMP specification forbids the use of nowait on the single region when a
2036 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2037 barrier internally to avoid race conditions, so the code generation for the
2038 single region should avoid generating a barrier after the call to @ref
2039 __kmpc_copyprivate.
2040 
2041 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2042 The <tt>loc</tt> parameter is a pointer to source location information.
2043 
Internal implementation: the single thread first copies its descriptor address
(cpy_data) to a team-private location; the other threads then each call the
function pointed to by the parameter cpy_func, which copies the data from the
single thread's buffer into the calling thread's buffer.
2048 
2049 The cpy_func routine used for the copy and the contents of the data area defined
2050 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2051 to be done. For instance, the cpy_data buffer can hold the actual data to be
2052 copied or it may hold a list of pointers to the data. The cpy_func routine must
2053 interpret the cpy_data buffer appropriately.
2054 
2055 The interface to cpy_func is as follows:
2056 @code
2057 void cpy_func( void *destination, void *source )
2058 @endcode
2059 where void *destination is the cpy_data pointer for the thread being copied to
2060 and void *source is the cpy_data pointer for the thread being copied from.
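
For example, for a copyprivate(x) clause where x is an <tt>int</tt>, a
compiler-generated helper might look like the following sketch (names and
data layout are illustrative; each thread passes &x as its cpy_data):
@code
void x_copy_func(void *destination, void *source) {
  *(int *)destination = *(int *)source;
}
@endcode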
2061 */
2062 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2063                         void *cpy_data, void (*cpy_func)(void *, void *),
2064                         kmp_int32 didit) {
2065   void **data_ptr;
2066 
2067   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2068 
2069   KMP_MB();
2070 
2071   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2072 
2073   if (__kmp_env_consistency_check) {
2074     if (loc == 0) {
2075       KMP_WARNING(ConstructIdentInvalid);
2076     }
2077   }
2078 
2079   // ToDo: Optimize the following two barriers into some kind of split barrier
2080 
2081   if (didit)
2082     *data_ptr = cpy_data;
2083 
2084 #if OMPT_SUPPORT
2085   ompt_frame_t *ompt_frame;
2086   if (ompt_enabled.enabled) {
2087     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2088     if (ompt_frame->enter_frame.ptr == NULL)
2089       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2090     OMPT_STORE_RETURN_ADDRESS(gtid);
2091   }
2092 #endif
2093 /* This barrier is not a barrier region boundary */
2094 #if USE_ITT_NOTIFY
2095   __kmp_threads[gtid]->th.th_ident = loc;
2096 #endif
2097   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2098 
2099   if (!didit)
2100     (*cpy_func)(cpy_data, *data_ptr);
2101 
2102 // Consider next barrier a user-visible barrier for barrier region boundaries
2103 // Nesting checks are already handled by the single construct checks
2104 
2105 #if OMPT_SUPPORT
2106   if (ompt_enabled.enabled) {
2107     OMPT_STORE_RETURN_ADDRESS(gtid);
2108   }
2109 #endif
2110 #if USE_ITT_NOTIFY
  // TODO: check if it is needed (e.g. tasks can overwrite the location)
  __kmp_threads[gtid]->th.th_ident = loc;
2113 #endif
2114   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2115 #if OMPT_SUPPORT && OMPT_OPTIONAL
2116   if (ompt_enabled.enabled) {
2117     ompt_frame->enter_frame = ompt_data_none;
2118   }
2119 #endif
2120 }
2121 
2122 /* -------------------------------------------------------------------------- */
2123 
2124 #define INIT_LOCK __kmp_init_user_lock_with_checks
2125 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2126 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2127 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2128 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2129 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2130   __kmp_acquire_nested_user_lock_with_checks_timed
2131 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2132 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2133 #define TEST_LOCK __kmp_test_user_lock_with_checks
2134 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2135 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2136 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2137 
2138 // TODO: Make check abort messages use location info & pass it into
2139 // with_checks routines
2140 
2141 #if KMP_USE_DYNAMIC_LOCK
2142 
2143 // internal lock initializer
2144 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2145                                                     kmp_dyna_lockseq_t seq) {
2146   if (KMP_IS_D_LOCK(seq)) {
2147     KMP_INIT_D_LOCK(lock, seq);
2148 #if USE_ITT_BUILD
2149     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2150 #endif
2151   } else {
2152     KMP_INIT_I_LOCK(lock, seq);
2153 #if USE_ITT_BUILD
2154     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2155     __kmp_itt_lock_creating(ilk->lock, loc);
2156 #endif
2157   }
2158 }
2159 
2160 // internal nest lock initializer
2161 static __forceinline void
2162 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2163                                kmp_dyna_lockseq_t seq) {
2164 #if KMP_USE_TSX
2165   // Don't have nested lock implementation for speculative locks
2166   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2167     seq = __kmp_user_lock_seq;
2168 #endif
2169   switch (seq) {
2170   case lockseq_tas:
2171     seq = lockseq_nested_tas;
2172     break;
2173 #if KMP_USE_FUTEX
2174   case lockseq_futex:
2175     seq = lockseq_nested_futex;
2176     break;
2177 #endif
2178   case lockseq_ticket:
2179     seq = lockseq_nested_ticket;
2180     break;
2181   case lockseq_queuing:
2182     seq = lockseq_nested_queuing;
2183     break;
2184   case lockseq_drdpa:
2185     seq = lockseq_nested_drdpa;
2186     break;
2187   default:
2188     seq = lockseq_nested_queuing;
2189   }
2190   KMP_INIT_I_LOCK(lock, seq);
2191 #if USE_ITT_BUILD
2192   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2193   __kmp_itt_lock_creating(ilk->lock, loc);
2194 #endif
2195 }
2196 
2197 /* initialize the lock with a hint */
2198 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2199                                 uintptr_t hint) {
2200   KMP_DEBUG_ASSERT(__kmp_init_serial);
2201   if (__kmp_env_consistency_check && user_lock == NULL) {
2202     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2203   }
2204 
2205   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2206 
2207 #if OMPT_SUPPORT && OMPT_OPTIONAL
2208   // This is the case, if called from omp_init_lock_with_hint:
2209   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2210   if (!codeptr)
2211     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2212   if (ompt_enabled.ompt_callback_lock_init) {
2213     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2214         ompt_mutex_lock, (omp_lock_hint_t)hint,
2215         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2216         codeptr);
2217   }
2218 #endif
2219 }
2220 
2221 /* initialize the lock with a hint */
2222 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2223                                      void **user_lock, uintptr_t hint) {
2224   KMP_DEBUG_ASSERT(__kmp_init_serial);
2225   if (__kmp_env_consistency_check && user_lock == NULL) {
2226     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2227   }
2228 
2229   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2230 
2231 #if OMPT_SUPPORT && OMPT_OPTIONAL
2232   // This is the case, if called from omp_init_lock_with_hint:
2233   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2234   if (!codeptr)
2235     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2236   if (ompt_enabled.ompt_callback_lock_init) {
2237     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2238         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2239         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2240         codeptr);
2241   }
2242 #endif
2243 }
2244 
2245 #endif // KMP_USE_DYNAMIC_LOCK
2246 
2247 /* initialize the lock */
2248 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2249 #if KMP_USE_DYNAMIC_LOCK
2250 
2251   KMP_DEBUG_ASSERT(__kmp_init_serial);
2252   if (__kmp_env_consistency_check && user_lock == NULL) {
2253     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2254   }
2255   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2256 
2257 #if OMPT_SUPPORT && OMPT_OPTIONAL
2258   // This is the case, if called from omp_init_lock_with_hint:
2259   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2260   if (!codeptr)
2261     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2262   if (ompt_enabled.ompt_callback_lock_init) {
2263     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2264         ompt_mutex_lock, omp_lock_hint_none,
2265         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2266         codeptr);
2267   }
2268 #endif
2269 
2270 #else // KMP_USE_DYNAMIC_LOCK
2271 
2272   static char const *const func = "omp_init_lock";
2273   kmp_user_lock_p lck;
2274   KMP_DEBUG_ASSERT(__kmp_init_serial);
2275 
2276   if (__kmp_env_consistency_check) {
2277     if (user_lock == NULL) {
2278       KMP_FATAL(LockIsUninitialized, func);
2279     }
2280   }
2281 
2282   KMP_CHECK_USER_LOCK_INIT();
2283 
2284   if ((__kmp_user_lock_kind == lk_tas) &&
2285       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2286     lck = (kmp_user_lock_p)user_lock;
2287   }
2288 #if KMP_USE_FUTEX
2289   else if ((__kmp_user_lock_kind == lk_futex) &&
2290            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2291     lck = (kmp_user_lock_p)user_lock;
2292   }
2293 #endif
2294   else {
2295     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2296   }
2297   INIT_LOCK(lck);
2298   __kmp_set_user_lock_location(lck, loc);
2299 
2300 #if OMPT_SUPPORT && OMPT_OPTIONAL
2301   // This is the case, if called from omp_init_lock_with_hint:
2302   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2303   if (!codeptr)
2304     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2305   if (ompt_enabled.ompt_callback_lock_init) {
2306     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2307         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2308         (ompt_wait_id_t)user_lock, codeptr);
2309   }
2310 #endif
2311 
2312 #if USE_ITT_BUILD
2313   __kmp_itt_lock_creating(lck);
2314 #endif /* USE_ITT_BUILD */
2315 
2316 #endif // KMP_USE_DYNAMIC_LOCK
2317 } // __kmpc_init_lock
2318 
2319 /* initialize the lock */
2320 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2321 #if KMP_USE_DYNAMIC_LOCK
2322 
2323   KMP_DEBUG_ASSERT(__kmp_init_serial);
2324   if (__kmp_env_consistency_check && user_lock == NULL) {
2325     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2326   }
2327   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2328 
2329 #if OMPT_SUPPORT && OMPT_OPTIONAL
2330   // This is the case, if called from omp_init_lock_with_hint:
2331   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2332   if (!codeptr)
2333     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2334   if (ompt_enabled.ompt_callback_lock_init) {
2335     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2336         ompt_mutex_nest_lock, omp_lock_hint_none,
2337         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2338         codeptr);
2339   }
2340 #endif
2341 
2342 #else // KMP_USE_DYNAMIC_LOCK
2343 
2344   static char const *const func = "omp_init_nest_lock";
2345   kmp_user_lock_p lck;
2346   KMP_DEBUG_ASSERT(__kmp_init_serial);
2347 
2348   if (__kmp_env_consistency_check) {
2349     if (user_lock == NULL) {
2350       KMP_FATAL(LockIsUninitialized, func);
2351     }
2352   }
2353 
2354   KMP_CHECK_USER_LOCK_INIT();
2355 
2356   if ((__kmp_user_lock_kind == lk_tas) &&
2357       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2358        OMP_NEST_LOCK_T_SIZE)) {
2359     lck = (kmp_user_lock_p)user_lock;
2360   }
2361 #if KMP_USE_FUTEX
2362   else if ((__kmp_user_lock_kind == lk_futex) &&
2363            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2364             OMP_NEST_LOCK_T_SIZE)) {
2365     lck = (kmp_user_lock_p)user_lock;
2366   }
2367 #endif
2368   else {
2369     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2370   }
2371 
2372   INIT_NESTED_LOCK(lck);
2373   __kmp_set_user_lock_location(lck, loc);
2374 
2375 #if OMPT_SUPPORT && OMPT_OPTIONAL
2376   // This is the case, if called from omp_init_lock_with_hint:
2377   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2378   if (!codeptr)
2379     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2380   if (ompt_enabled.ompt_callback_lock_init) {
2381     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2382         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2383         (ompt_wait_id_t)user_lock, codeptr);
2384   }
2385 #endif
2386 
2387 #if USE_ITT_BUILD
2388   __kmp_itt_lock_creating(lck);
2389 #endif /* USE_ITT_BUILD */
2390 
2391 #endif // KMP_USE_DYNAMIC_LOCK
2392 } // __kmpc_init_nest_lock
2393 
2394 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2395 #if KMP_USE_DYNAMIC_LOCK
2396 
2397 #if USE_ITT_BUILD
2398   kmp_user_lock_p lck;
2399   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2400     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2401   } else {
2402     lck = (kmp_user_lock_p)user_lock;
2403   }
2404   __kmp_itt_lock_destroyed(lck);
2405 #endif
2406 #if OMPT_SUPPORT && OMPT_OPTIONAL
2407   // This is the case, if called from omp_init_lock_with_hint:
2408   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2409   if (!codeptr)
2410     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2411   if (ompt_enabled.ompt_callback_lock_destroy) {
2412     kmp_user_lock_p lck;
2413     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2414       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2415     } else {
2416       lck = (kmp_user_lock_p)user_lock;
2417     }
2418     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2419         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2420   }
2421 #endif
2422   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2423 #else
2424   kmp_user_lock_p lck;
2425 
2426   if ((__kmp_user_lock_kind == lk_tas) &&
2427       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2428     lck = (kmp_user_lock_p)user_lock;
2429   }
2430 #if KMP_USE_FUTEX
2431   else if ((__kmp_user_lock_kind == lk_futex) &&
2432            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2433     lck = (kmp_user_lock_p)user_lock;
2434   }
2435 #endif
2436   else {
2437     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2438   }
2439 
2440 #if OMPT_SUPPORT && OMPT_OPTIONAL
2441   // This is the case, if called from omp_init_lock_with_hint:
2442   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2443   if (!codeptr)
2444     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2445   if (ompt_enabled.ompt_callback_lock_destroy) {
2446     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2447         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2448   }
2449 #endif
2450 
2451 #if USE_ITT_BUILD
2452   __kmp_itt_lock_destroyed(lck);
2453 #endif /* USE_ITT_BUILD */
2454   DESTROY_LOCK(lck);
2455 
2456   if ((__kmp_user_lock_kind == lk_tas) &&
2457       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2458     ;
2459   }
2460 #if KMP_USE_FUTEX
2461   else if ((__kmp_user_lock_kind == lk_futex) &&
2462            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2463     ;
2464   }
2465 #endif
2466   else {
2467     __kmp_user_lock_free(user_lock, gtid, lck);
2468   }
2469 #endif // KMP_USE_DYNAMIC_LOCK
2470 } // __kmpc_destroy_lock
2471 
2472 /* destroy the lock */
2473 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2474 #if KMP_USE_DYNAMIC_LOCK
2475 
2476 #if USE_ITT_BUILD
2477   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2478   __kmp_itt_lock_destroyed(ilk->lock);
2479 #endif
2480 #if OMPT_SUPPORT && OMPT_OPTIONAL
2481   // This is the case, if called from omp_init_lock_with_hint:
2482   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2483   if (!codeptr)
2484     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2485   if (ompt_enabled.ompt_callback_lock_destroy) {
2486     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2487         ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2488   }
2489 #endif
2490   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2491 
2492 #else // KMP_USE_DYNAMIC_LOCK
2493 
2494   kmp_user_lock_p lck;
2495 
2496   if ((__kmp_user_lock_kind == lk_tas) &&
2497       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2498        OMP_NEST_LOCK_T_SIZE)) {
2499     lck = (kmp_user_lock_p)user_lock;
2500   }
2501 #if KMP_USE_FUTEX
2502   else if ((__kmp_user_lock_kind == lk_futex) &&
2503            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2504             OMP_NEST_LOCK_T_SIZE)) {
2505     lck = (kmp_user_lock_p)user_lock;
2506   }
2507 #endif
2508   else {
2509     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2510   }
2511 
2512 #if OMPT_SUPPORT && OMPT_OPTIONAL
2513   // This is the case, if called from omp_init_lock_with_hint:
2514   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2515   if (!codeptr)
2516     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2517   if (ompt_enabled.ompt_callback_lock_destroy) {
2518     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2519         ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2520   }
2521 #endif
2522 
2523 #if USE_ITT_BUILD
2524   __kmp_itt_lock_destroyed(lck);
2525 #endif /* USE_ITT_BUILD */
2526 
2527   DESTROY_NESTED_LOCK(lck);
2528 
2529   if ((__kmp_user_lock_kind == lk_tas) &&
2530       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2531        OMP_NEST_LOCK_T_SIZE)) {
2532     ;
2533   }
2534 #if KMP_USE_FUTEX
2535   else if ((__kmp_user_lock_kind == lk_futex) &&
2536            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2537             OMP_NEST_LOCK_T_SIZE)) {
2538     ;
2539   }
2540 #endif
2541   else {
2542     __kmp_user_lock_free(user_lock, gtid, lck);
2543   }
2544 #endif // KMP_USE_DYNAMIC_LOCK
2545 } // __kmpc_destroy_nest_lock
2546 
2547 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2548   KMP_COUNT_BLOCK(OMP_set_lock);
2549 #if KMP_USE_DYNAMIC_LOCK
2550   int tag = KMP_EXTRACT_D_TAG(user_lock);
2551 #if USE_ITT_BUILD
2552   __kmp_itt_lock_acquiring(
2553       (kmp_user_lock_p)
2554           user_lock); // itt function will get to the right lock object.
2555 #endif
2556 #if OMPT_SUPPORT && OMPT_OPTIONAL
2557   // This is the case, if called from omp_init_lock_with_hint:
2558   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2559   if (!codeptr)
2560     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2561   if (ompt_enabled.ompt_callback_mutex_acquire) {
2562     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2563         ompt_mutex_lock, omp_lock_hint_none,
2564         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2565         codeptr);
2566   }
2567 #endif
2568 #if KMP_USE_INLINED_TAS
2569   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2570     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2571   } else
2572 #elif KMP_USE_INLINED_FUTEX
2573   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2574     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2575   } else
2576 #endif
2577   {
2578     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2579   }
2580 #if USE_ITT_BUILD
2581   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2582 #endif
2583 #if OMPT_SUPPORT && OMPT_OPTIONAL
2584   if (ompt_enabled.ompt_callback_mutex_acquired) {
2585     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2586         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2587   }
2588 #endif
2589 
2590 #else // KMP_USE_DYNAMIC_LOCK
2591 
2592   kmp_user_lock_p lck;
2593 
2594   if ((__kmp_user_lock_kind == lk_tas) &&
2595       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2596     lck = (kmp_user_lock_p)user_lock;
2597   }
2598 #if KMP_USE_FUTEX
2599   else if ((__kmp_user_lock_kind == lk_futex) &&
2600            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2601     lck = (kmp_user_lock_p)user_lock;
2602   }
2603 #endif
2604   else {
2605     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2606   }
2607 
2608 #if USE_ITT_BUILD
2609   __kmp_itt_lock_acquiring(lck);
2610 #endif /* USE_ITT_BUILD */
2611 #if OMPT_SUPPORT && OMPT_OPTIONAL
2612   // This is the case, if called from omp_init_lock_with_hint:
2613   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2614   if (!codeptr)
2615     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2616   if (ompt_enabled.ompt_callback_mutex_acquire) {
2617     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2618         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2619         (ompt_wait_id_t)lck, codeptr);
2620   }
2621 #endif
2622 
2623   ACQUIRE_LOCK(lck, gtid);
2624 
2625 #if USE_ITT_BUILD
2626   __kmp_itt_lock_acquired(lck);
2627 #endif /* USE_ITT_BUILD */
2628 
2629 #if OMPT_SUPPORT && OMPT_OPTIONAL
2630   if (ompt_enabled.ompt_callback_mutex_acquired) {
2631     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2632         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2633   }
2634 #endif
2635 
2636 #endif // KMP_USE_DYNAMIC_LOCK
2637 }
2638 
2639 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2640 #if KMP_USE_DYNAMIC_LOCK
2641 
2642 #if USE_ITT_BUILD
2643   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2644 #endif
2645 #if OMPT_SUPPORT && OMPT_OPTIONAL
2646   // This is the case, if called from omp_init_lock_with_hint:
2647   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2648   if (!codeptr)
2649     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2650   if (ompt_enabled.enabled) {
2651     if (ompt_enabled.ompt_callback_mutex_acquire) {
2652       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2653           ompt_mutex_nest_lock, omp_lock_hint_none,
2654           __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2655           codeptr);
2656     }
2657   }
2658 #endif
2659   int acquire_status =
2660       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2661   (void) acquire_status;
2662 #if USE_ITT_BUILD
2663   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2664 #endif
2665 
2666 #if OMPT_SUPPORT && OMPT_OPTIONAL
2667   if (ompt_enabled.enabled) {
2668     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2669       if (ompt_enabled.ompt_callback_mutex_acquired) {
2670         // lock_first
2671         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2672             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2673       }
2674     } else {
2675       if (ompt_enabled.ompt_callback_nest_lock) {
2676         // lock_next
2677         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2678             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
2679       }
2680     }
2681   }
2682 #endif
2683 
2684 #else // KMP_USE_DYNAMIC_LOCK
2685   int acquire_status;
2686   kmp_user_lock_p lck;
2687 
2688   if ((__kmp_user_lock_kind == lk_tas) &&
2689       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2690        OMP_NEST_LOCK_T_SIZE)) {
2691     lck = (kmp_user_lock_p)user_lock;
2692   }
2693 #if KMP_USE_FUTEX
2694   else if ((__kmp_user_lock_kind == lk_futex) &&
2695            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2696             OMP_NEST_LOCK_T_SIZE)) {
2697     lck = (kmp_user_lock_p)user_lock;
2698   }
2699 #endif
2700   else {
2701     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2702   }
2703 
2704 #if USE_ITT_BUILD
2705   __kmp_itt_lock_acquiring(lck);
2706 #endif /* USE_ITT_BUILD */
2707 #if OMPT_SUPPORT && OMPT_OPTIONAL
2708   // This is the case, if called from omp_init_lock_with_hint:
2709   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2710   if (!codeptr)
2711     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2712   if (ompt_enabled.enabled) {
2713     if (ompt_enabled.ompt_callback_mutex_acquire) {
2714       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2715           ompt_mutex_nest_lock, omp_lock_hint_none,
2716           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
2717     }
2718   }
2719 #endif
2720 
2721   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2722 
2723 #if USE_ITT_BUILD
2724   __kmp_itt_lock_acquired(lck);
2725 #endif /* USE_ITT_BUILD */
2726 
2727 #if OMPT_SUPPORT && OMPT_OPTIONAL
2728   if (ompt_enabled.enabled) {
2729     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2730       if (ompt_enabled.ompt_callback_mutex_acquired) {
2731         // lock_first
2732         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2733             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2734       }
2735     } else {
2736       if (ompt_enabled.ompt_callback_nest_lock) {
2737         // lock_next
2738         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2739             ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
2740       }
2741     }
2742   }
2743 #endif
2744 
2745 #endif // KMP_USE_DYNAMIC_LOCK
2746 }
2747 
2748 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2749 #if KMP_USE_DYNAMIC_LOCK
2750 
2751   int tag = KMP_EXTRACT_D_TAG(user_lock);
2752 #if USE_ITT_BUILD
2753   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2754 #endif
2755 #if KMP_USE_INLINED_TAS
2756   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2757     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2758   } else
2759 #elif KMP_USE_INLINED_FUTEX
2760   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2761     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2762   } else
2763 #endif
2764   {
2765     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2766   }
2767 
2768 #if OMPT_SUPPORT && OMPT_OPTIONAL
2769   // This is the case, if called from omp_init_lock_with_hint:
2770   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2771   if (!codeptr)
2772     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2773   if (ompt_enabled.ompt_callback_mutex_released) {
2774     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2775         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2776   }
2777 #endif
2778 
2779 #else // KMP_USE_DYNAMIC_LOCK
2780 
2781   kmp_user_lock_p lck;
2782 
2783   /* Can't use serial interval since not block structured */
2784   /* release the lock */
2785 
2786   if ((__kmp_user_lock_kind == lk_tas) &&
2787       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2788 #if KMP_OS_LINUX &&                                                            \
2789     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2790 // "fast" path implemented to fix customer performance issue
2791 #if USE_ITT_BUILD
2792     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2793 #endif /* USE_ITT_BUILD */
2794     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2795     KMP_MB();
2796 
2797 #if OMPT_SUPPORT && OMPT_OPTIONAL
2798     // This is the case, if called from omp_init_lock_with_hint:
2799     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2800     if (!codeptr)
2801       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2802     if (ompt_enabled.ompt_callback_mutex_released) {
2803       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2804           ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2805     }
2806 #endif
2807 
2808     return;
2809 #else
2810     lck = (kmp_user_lock_p)user_lock;
2811 #endif
2812   }
2813 #if KMP_USE_FUTEX
2814   else if ((__kmp_user_lock_kind == lk_futex) &&
2815            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2816     lck = (kmp_user_lock_p)user_lock;
2817   }
2818 #endif
2819   else {
2820     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2821   }
2822 
2823 #if USE_ITT_BUILD
2824   __kmp_itt_lock_releasing(lck);
2825 #endif /* USE_ITT_BUILD */
2826 
2827   RELEASE_LOCK(lck, gtid);
2828 
2829 #if OMPT_SUPPORT && OMPT_OPTIONAL
2830   // This is the case, if called from omp_init_lock_with_hint:
2831   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2832   if (!codeptr)
2833     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2834   if (ompt_enabled.ompt_callback_mutex_released) {
2835     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2836         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2837   }
2838 #endif
2839 
2840 #endif // KMP_USE_DYNAMIC_LOCK
2841 }
2842 
2843 /* release the lock */
2844 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2845 #if KMP_USE_DYNAMIC_LOCK
2846 
2847 #if USE_ITT_BUILD
2848   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2849 #endif
2850   int release_status =
2851       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2852   (void) release_status;
2853 
2854 #if OMPT_SUPPORT && OMPT_OPTIONAL
2855   // This is the case, if called from omp_init_lock_with_hint:
2856   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2857   if (!codeptr)
2858     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2859   if (ompt_enabled.enabled) {
2860     if (release_status == KMP_LOCK_RELEASED) {
2861       if (ompt_enabled.ompt_callback_mutex_released) {
2862         // release_lock_last
2863         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2864             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2865       }
2866     } else if (ompt_enabled.ompt_callback_nest_lock) {
2867       // release_lock_prev
2868       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2869           ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr);
2870     }
2871   }
2872 #endif
2873 
2874 #else // KMP_USE_DYNAMIC_LOCK
2875 
2876   kmp_user_lock_p lck;
2877 
2878   /* Can't use serial interval since not block structured */
2879 
2880   if ((__kmp_user_lock_kind == lk_tas) &&
2881       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2882        OMP_NEST_LOCK_T_SIZE)) {
2883 #if KMP_OS_LINUX &&                                                            \
2884     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2885     // "fast" path implemented to fix customer performance issue
2886     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2887 #if USE_ITT_BUILD
2888     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2889 #endif /* USE_ITT_BUILD */
2890 
2891 #if OMPT_SUPPORT && OMPT_OPTIONAL
2892     int release_status = KMP_LOCK_STILL_HELD;
2893 #endif
2894 
2895     if (--(tl->lk.depth_locked) == 0) {
2896       TCW_4(tl->lk.poll, 0);
2897 #if OMPT_SUPPORT && OMPT_OPTIONAL
2898       release_status = KMP_LOCK_RELEASED;
2899 #endif
2900     }
2901     KMP_MB();
2902 
2903 #if OMPT_SUPPORT && OMPT_OPTIONAL
2904     // This is the case, if called from omp_init_lock_with_hint:
2905     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2906     if (!codeptr)
2907       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2908     if (ompt_enabled.enabled) {
2909       if (release_status == KMP_LOCK_RELEASED) {
2910         if (ompt_enabled.ompt_callback_mutex_released) {
2911           // release_lock_last
2912           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2913               ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2914         }
2915       } else if (ompt_enabled.ompt_callback_nest_lock) {
2916         // release_lock_previous
2917         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)lck, codeptr);
2919       }
2920     }
2921 #endif
2922 
2923     return;
2924 #else
2925     lck = (kmp_user_lock_p)user_lock;
2926 #endif
2927   }
2928 #if KMP_USE_FUTEX
2929   else if ((__kmp_user_lock_kind == lk_futex) &&
2930            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2931             OMP_NEST_LOCK_T_SIZE)) {
2932     lck = (kmp_user_lock_p)user_lock;
2933   }
2934 #endif
2935   else {
2936     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2937   }
2938 
2939 #if USE_ITT_BUILD
2940   __kmp_itt_lock_releasing(lck);
2941 #endif /* USE_ITT_BUILD */
2942 
2943   int release_status;
2944   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2945 #if OMPT_SUPPORT && OMPT_OPTIONAL
2946   // This is the case, if called from omp_init_lock_with_hint:
2947   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2948   if (!codeptr)
2949     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2950   if (ompt_enabled.enabled) {
2951     if (release_status == KMP_LOCK_RELEASED) {
2952       if (ompt_enabled.ompt_callback_mutex_released) {
2953         // release_lock_last
2954         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2955             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2956       }
2957     } else if (ompt_enabled.ompt_callback_nest_lock) {
2958       // release_lock_previous
2959       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)lck, codeptr);
2961     }
2962   }
2963 #endif
2964 
2965 #endif // KMP_USE_DYNAMIC_LOCK
2966 }
2967 
2968 /* try to acquire the lock */
2969 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2970   KMP_COUNT_BLOCK(OMP_test_lock);
2971 
2972 #if KMP_USE_DYNAMIC_LOCK
2973   int rc;
2974   int tag = KMP_EXTRACT_D_TAG(user_lock);
2975 #if USE_ITT_BUILD
2976   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2977 #endif
2978 #if OMPT_SUPPORT && OMPT_OPTIONAL
2979   // This is the case, if called from omp_init_lock_with_hint:
2980   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2981   if (!codeptr)
2982     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2983   if (ompt_enabled.ompt_callback_mutex_acquire) {
2984     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2985         ompt_mutex_lock, omp_lock_hint_none,
2986         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2987         codeptr);
2988   }
2989 #endif
2990 #if KMP_USE_INLINED_TAS
2991   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2992     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2993   } else
2994 #elif KMP_USE_INLINED_FUTEX
2995   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2996     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2997   } else
2998 #endif
2999   {
3000     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3001   }
3002   if (rc) {
3003 #if USE_ITT_BUILD
3004     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3005 #endif
3006 #if OMPT_SUPPORT && OMPT_OPTIONAL
3007     if (ompt_enabled.ompt_callback_mutex_acquired) {
3008       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3009           ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
3010     }
3011 #endif
3012     return FTN_TRUE;
3013   } else {
3014 #if USE_ITT_BUILD
3015     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3016 #endif
3017     return FTN_FALSE;
3018   }
3019 
3020 #else // KMP_USE_DYNAMIC_LOCK
3021 
3022   kmp_user_lock_p lck;
3023   int rc;
3024 
3025   if ((__kmp_user_lock_kind == lk_tas) &&
3026       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3027     lck = (kmp_user_lock_p)user_lock;
3028   }
3029 #if KMP_USE_FUTEX
3030   else if ((__kmp_user_lock_kind == lk_futex) &&
3031            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3032     lck = (kmp_user_lock_p)user_lock;
3033   }
3034 #endif
3035   else {
3036     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3037   }
3038 
3039 #if USE_ITT_BUILD
3040   __kmp_itt_lock_acquiring(lck);
3041 #endif /* USE_ITT_BUILD */
3042 #if OMPT_SUPPORT && OMPT_OPTIONAL
3043   // This is the case, if called from omp_init_lock_with_hint:
3044   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3045   if (!codeptr)
3046     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3047   if (ompt_enabled.ompt_callback_mutex_acquire) {
3048     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3049         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3050         (ompt_wait_id_t)lck, codeptr);
3051   }
3052 #endif
3053 
3054   rc = TEST_LOCK(lck, gtid);
3055 #if USE_ITT_BUILD
3056   if (rc) {
3057     __kmp_itt_lock_acquired(lck);
3058   } else {
3059     __kmp_itt_lock_cancelled(lck);
3060   }
3061 #endif /* USE_ITT_BUILD */
3062 #if OMPT_SUPPORT && OMPT_OPTIONAL
3063   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3064     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3065         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
3066   }
3067 #endif
3068 
3069   return (rc ? FTN_TRUE : FTN_FALSE);
3070 
3071 /* Can't use serial interval since not block structured */
3072 
3073 #endif // KMP_USE_DYNAMIC_LOCK
3074 }
3075 
3076 /* try to acquire the lock */
3077 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3078 #if KMP_USE_DYNAMIC_LOCK
3079   int rc;
3080 #if USE_ITT_BUILD
3081   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3082 #endif
3083 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3085   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3086   if (!codeptr)
3087     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3088   if (ompt_enabled.ompt_callback_mutex_acquire) {
3089     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3090         ompt_mutex_nest_lock, omp_lock_hint_none,
3091         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
3092         codeptr);
3093   }
3094 #endif
3095   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3096 #if USE_ITT_BUILD
3097   if (rc) {
3098     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3099   } else {
3100     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3101   }
3102 #endif
3103 #if OMPT_SUPPORT && OMPT_OPTIONAL
3104   if (ompt_enabled.enabled && rc) {
3105     if (rc == 1) {
3106       if (ompt_enabled.ompt_callback_mutex_acquired) {
3107         // lock_first
3108         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3109             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
3110       }
3111     } else {
3112       if (ompt_enabled.ompt_callback_nest_lock) {
3113         // lock_next
3114         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3115             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
3116       }
3117     }
3118   }
3119 #endif
3120   return rc;
3121 
3122 #else // KMP_USE_DYNAMIC_LOCK
3123 
3124   kmp_user_lock_p lck;
3125   int rc;
3126 
3127   if ((__kmp_user_lock_kind == lk_tas) &&
3128       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3129        OMP_NEST_LOCK_T_SIZE)) {
3130     lck = (kmp_user_lock_p)user_lock;
3131   }
3132 #if KMP_USE_FUTEX
3133   else if ((__kmp_user_lock_kind == lk_futex) &&
3134            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3135             OMP_NEST_LOCK_T_SIZE)) {
3136     lck = (kmp_user_lock_p)user_lock;
3137   }
3138 #endif
3139   else {
3140     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3141   }
3142 
3143 #if USE_ITT_BUILD
3144   __kmp_itt_lock_acquiring(lck);
3145 #endif /* USE_ITT_BUILD */
3146 
3147 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3149   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3150   if (!codeptr)
3151     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
  }
3158 #endif
3159 
3160   rc = TEST_NESTED_LOCK(lck, gtid);
3161 #if USE_ITT_BUILD
3162   if (rc) {
3163     __kmp_itt_lock_acquired(lck);
3164   } else {
3165     __kmp_itt_lock_cancelled(lck);
3166   }
3167 #endif /* USE_ITT_BUILD */
3168 #if OMPT_SUPPORT && OMPT_OPTIONAL
3169   if (ompt_enabled.enabled && rc) {
3170     if (rc == 1) {
3171       if (ompt_enabled.ompt_callback_mutex_acquired) {
3172         // lock_first
3173         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3174             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
3175       }
3176     } else {
3177       if (ompt_enabled.ompt_callback_nest_lock) {
3178         // lock_next
3179         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
3181       }
3182     }
3183   }
3184 #endif
3185   return rc;
3186 
3187 /* Can't use serial interval since not block structured */
3188 
3189 #endif // KMP_USE_DYNAMIC_LOCK
3190 }
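
// Illustrative sketch (not part of the runtime, compiled out below):
// user-level nested-lock usage whose results line up with the OMPT reporting
// above. The first successful omp_test_nest_lock() on a given lock returns a
// nesting count of 1 ("lock_first", reported via mutex_acquired); each
// further successful re-acquisition by the owner returns a count > 1
// ("lock_next", reported via the nest_lock callback). The helper and variable
// names below are hypothetical example code, not runtime internals.
#if 0
#include <omp.h>
#include <stdio.h>

static omp_nest_lock_t example_nest_lock; // hypothetical user lock

static void example_nest_lock_usage(void) {
  omp_init_nest_lock(&example_nest_lock);

  int first = omp_test_nest_lock(&example_nest_lock);  // expect 1: "lock_first"
  int second = omp_test_nest_lock(&example_nest_lock); // expect 2: "lock_next"
  printf("nesting counts: %d, %d\n", first, second);

  // Every successful test must be matched by an unset.
  omp_unset_nest_lock(&example_nest_lock); // "release_lock_previous"
  omp_unset_nest_lock(&example_nest_lock); // "release_lock_last"
  omp_destroy_nest_lock(&example_nest_lock);
}
#endif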
3191 
3192 // Interface to fast scalable reduce methods routines
3193 
3194 // keep the selected method in a thread local structure for cross-function
3195 // usage: will be used in __kmpc_end_reduce* functions;
3196 // another solution: to re-determine the method one more time in
3197 // __kmpc_end_reduce* functions (new prototype required then)
3198 // AT: which solution is better?
3199 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3200   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3201 
3202 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3203   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3204 
3205 // description of the packed_reduction_method variable: look at the macros in
3206 // kmp.h
3207 
3208 // used in a critical section reduce block
3209 static __forceinline void
3210 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3211                                           kmp_critical_name *crit) {
3212 
3213   // this lock was visible to a customer and to the threading profile tool as a
3214   // serial overhead span (although it's used for an internal purpose only)
3215   //            why was it visible in previous implementation?
3216   //            should we keep it visible in new reduce block?
3217   kmp_user_lock_p lck;
3218 
3219 #if KMP_USE_DYNAMIC_LOCK
3220 
3221   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3222   // Check if it is initialized.
3223   if (*lk == 0) {
3224     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3225       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3226                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3227     } else {
3228       __kmp_init_indirect_csptr(crit, loc, global_tid,
3229                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3230     }
3231   }
3232   // Branch for accessing the actual lock object and set operation. This
3233   // branching is inevitable since this lock initialization does not follow the
3234   // normal dispatch path (lock table is not used).
3235   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3236     lck = (kmp_user_lock_p)lk;
3237     KMP_DEBUG_ASSERT(lck != NULL);
3238     if (__kmp_env_consistency_check) {
3239       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3240     }
3241     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3242   } else {
3243     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3244     lck = ilk->lock;
3245     KMP_DEBUG_ASSERT(lck != NULL);
3246     if (__kmp_env_consistency_check) {
3247       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3248     }
3249     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3250   }
3251 
3252 #else // KMP_USE_DYNAMIC_LOCK
3253 
3254   // We know that the fast reduction code is only emitted by Intel compilers
3255   // with 32 byte critical sections. If there isn't enough space, then we
3256   // have to use a pointer.
3257   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3258     lck = (kmp_user_lock_p)crit;
3259   } else {
3260     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3261   }
3262   KMP_DEBUG_ASSERT(lck != NULL);
3263 
3264   if (__kmp_env_consistency_check)
3265     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3266 
3267   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3268 
3269 #endif // KMP_USE_DYNAMIC_LOCK
3270 }
3271 
3272 // used in a critical section reduce block
3273 static __forceinline void
3274 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3275                                         kmp_critical_name *crit) {
3276 
3277   kmp_user_lock_p lck;
3278 
3279 #if KMP_USE_DYNAMIC_LOCK
3280 
3281   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3282     lck = (kmp_user_lock_p)crit;
3283     if (__kmp_env_consistency_check)
3284       __kmp_pop_sync(global_tid, ct_critical, loc);
3285     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3286   } else {
3287     kmp_indirect_lock_t *ilk =
3288         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3289     if (__kmp_env_consistency_check)
3290       __kmp_pop_sync(global_tid, ct_critical, loc);
3291     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3292   }
3293 
3294 #else // KMP_USE_DYNAMIC_LOCK
3295 
3296   // We know that the fast reduction code is only emitted by Intel compilers
3297   // with 32 byte critical sections. If there isn't enough space, then we have
3298   // to use a pointer.
3299   if (__kmp_base_user_lock_size > 32) {
3300     lck = *((kmp_user_lock_p *)crit);
3301     KMP_ASSERT(lck != NULL);
3302   } else {
3303     lck = (kmp_user_lock_p)crit;
3304   }
3305 
3306   if (__kmp_env_consistency_check)
3307     __kmp_pop_sync(global_tid, ct_critical, loc);
3308 
3309   __kmp_release_user_lock_with_checks(lck, global_tid);
3310 
3311 #endif // KMP_USE_DYNAMIC_LOCK
3312 } // __kmp_end_critical_section_reduce_block
3313 
3314 #if OMP_40_ENABLED
3315 static __forceinline int
3316 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3317                                      int *task_state) {
3318   kmp_team_t *team;
3319 
  // Check if we are inside a teams construct.
3321   if (th->th.th_teams_microtask) {
3322     *team_p = team = th->th.th_team;
3323     if (team->t.t_level == th->th.th_teams_level) {
3324       // This is reduction at teams construct.
3325       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3326       // Let's swap teams temporarily for the reduction.
3327       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3328       th->th.th_team = team->t.t_parent;
3329       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3330       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3331       *task_state = th->th.th_task_state;
3332       th->th.th_task_state = 0;
3333 
3334       return 1;
3335     }
3336   }
3337   return 0;
3338 }
3339 
3340 static __forceinline void
3341 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3342   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3343   th->th.th_info.ds.ds_tid = 0;
3344   th->th.th_team = team;
3345   th->th.th_team_nproc = team->t.t_nproc;
3346   th->th.th_task_team = team->t.t_task_team[task_state];
3347   th->th.th_task_state = task_state;
3348 }
3349 #endif
3350 
3351 /* 2.a.i. Reduce Block without a terminating barrier */
3352 /*!
3353 @ingroup SYNCHRONIZATION
3354 @param loc source location information
3355 @param global_tid global thread number
3356 @param num_vars number of items (variables) to be reduced
3357 @param reduce_size size of data in bytes to be reduced
3358 @param reduce_data pointer to data to be reduced
3359 @param reduce_func callback function providing reduction operation on two
3360 operands and returning result of reduction in lhs_data
3361 @param lck pointer to the unique lock data structure
3362 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3363 threads if atomic reduction needed
3364 
3365 The nowait version is used for a reduce clause with the nowait argument.
3366 */
3367 kmp_int32
3368 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3369                      size_t reduce_size, void *reduce_data,
3370                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3371                      kmp_critical_name *lck) {
3372 
3373   KMP_COUNT_BLOCK(REDUCE_nowait);
3374   int retval = 0;
3375   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3376 #if OMP_40_ENABLED
3377   kmp_info_t *th;
3378   kmp_team_t *team;
3379   int teams_swapped = 0, task_state;
3380 #endif
3381   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3382 
3383   // why do we need this initialization here at all?
3384   // Reduction clause can not be used as a stand-alone directive.
3385 
3386   // do not call __kmp_serial_initialize(), it will be called by
3387   // __kmp_parallel_initialize() if needed
3388   // possible detection of false-positive race by the threadchecker ???
3389   if (!TCR_4(__kmp_init_parallel))
3390     __kmp_parallel_initialize();
3391 
3392 #if OMP_50_ENABLED
3393   __kmp_resume_if_soft_paused();
3394 #endif
3395 
3396 // check correctness of reduce block nesting
3397 #if KMP_USE_DYNAMIC_LOCK
3398   if (__kmp_env_consistency_check)
3399     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3400 #else
3401   if (__kmp_env_consistency_check)
3402     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3403 #endif
3404 
3405 #if OMP_40_ENABLED
3406   th = __kmp_thread_from_gtid(global_tid);
3407   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3408 #endif // OMP_40_ENABLED
3409 
  // the packed_reduction_method value will be reused by the __kmpc_end_reduce*
  // functions, so the value should be kept in a variable
  // the variable should be either a construct-specific or a thread-specific
  // property, not a team-specific property
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on the next construct)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unnecessary extra syncs would be needed)
  // a thread-specific variable is better with regard to the two issues above
  // (next construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes the 'determine' and 'set' lines (there is no need to
  // restrict this to one thread, and doing it per thread avoids unnecessary
  // extra syncs)
3425 
3426   packed_reduction_method = __kmp_determine_reduction_method(
3427       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3428   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3429 
3430   if (packed_reduction_method == critical_reduce_block) {
3431 
3432     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3433     retval = 1;
3434 
3435   } else if (packed_reduction_method == empty_reduce_block) {
3436 
3437     // usage: if team size == 1, no synchronization is required ( Intel
3438     // platforms only )
3439     retval = 1;
3440 
3441   } else if (packed_reduction_method == atomic_reduce_block) {
3442 
3443     retval = 2;
3444 
    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (this is not quite right, because the checking block has been closed
    //     by this 'pop', but the atomic operation has not been executed yet;
    //     it will be executed slightly later, literally on the next
    //     instruction)
3451     if (__kmp_env_consistency_check)
3452       __kmp_pop_sync(global_tid, ct_reduce, loc);
3453 
3454   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3455                                    tree_reduce_block)) {
3456 
3457 // AT: performance issue: a real barrier here
3458 // AT:     (if master goes slow, other threads are blocked here waiting for the
3459 // master to come and release them)
3460 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3461 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3462 // be confusing to a customer)
3463 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3464 // might go faster and be more in line with sense of NOWAIT
3465 // AT: TO DO: do epcc test and compare times
3466 
3467 // this barrier should be invisible to a customer and to the threading profile
3468 // tool (it's neither a terminating barrier nor customer's code, it's
3469 // used for an internal purpose)
3470 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
3472     // JP: as long as there is a barrier in the implementation, OMPT should and
3473     // will provide the barrier events
3474     //         so we set-up the necessary frame/return addresses.
3475     ompt_frame_t *ompt_frame;
3476     if (ompt_enabled.enabled) {
3477       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3478       if (ompt_frame->enter_frame.ptr == NULL)
3479         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3480       OMPT_STORE_RETURN_ADDRESS(global_tid);
3481     }
3482 #endif
3483 #if USE_ITT_NOTIFY
3484     __kmp_threads[global_tid]->th.th_ident = loc;
3485 #endif
3486     retval =
3487         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3488                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3489     retval = (retval != 0) ? (0) : (1);
3490 #if OMPT_SUPPORT && OMPT_OPTIONAL
3491     if (ompt_enabled.enabled) {
3492       ompt_frame->enter_frame = ompt_data_none;
3493     }
3494 #endif
3495 
3496     // all other workers except master should do this pop here
    //     (none of the other workers will get to __kmpc_end_reduce_nowait())
3498     if (__kmp_env_consistency_check) {
3499       if (retval == 0) {
3500         __kmp_pop_sync(global_tid, ct_reduce, loc);
3501       }
3502     }
3503 
3504   } else {
3505 
3506     // should never reach this block
3507     KMP_ASSERT(0); // "unexpected method"
3508   }
3509 #if OMP_40_ENABLED
3510   if (teams_swapped) {
3511     __kmp_restore_swapped_teams(th, team, task_state);
3512   }
3513 #endif
3514   KA_TRACE(
3515       10,
3516       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3517        global_tid, packed_reduction_method, retval));
3518 
3519   return retval;
3520 }
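
// Illustrative sketch (not part of the runtime, compiled out below): roughly
// the sequence a compiler might emit for
//   #pragma omp parallel for reduction(+ : sum) nowait
// against __kmpc_reduce_nowait()/__kmpc_end_reduce_nowait(), following the
// return-code contract documented above: 1 means combine and call the end
// function, 2 means combine atomically with no end call, 0 means nothing to
// do (a tree-reduction barrier already combined the partial values). The
// helper, combiner, and lock names below are hypothetical illustrations, not
// what any particular compiler emits.
#if 0
static kmp_critical_name example_reduce_lock; // hypothetical per-construct lock

static void example_sum_combiner(void *lhs_data, void *rhs_data) {
  *(int *)lhs_data += *(int *)rhs_data; // reduce rhs into lhs
}

static void example_reduce_nowait(ident_t *loc, kmp_int32 gtid,
                                  int *shared_sum, int private_sum) {
  switch (__kmpc_reduce_nowait(loc, gtid, /*num_vars=*/1, sizeof(int),
                               &private_sum, example_sum_combiner,
                               &example_reduce_lock)) {
  case 1: // combine under the runtime's protection (critical/master path)
    *shared_sum += private_sum;
    __kmpc_end_reduce_nowait(loc, gtid, &example_reduce_lock);
    break;
  case 2: // combine with an atomic update; code gen emits no end call here
#pragma omp atomic
    *shared_sum += private_sum;
    break;
  default: // 0: other workers after a tree reduction; nothing left to do
    break;
  }
}
#endif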
3521 
3522 /*!
3523 @ingroup SYNCHRONIZATION
3524 @param loc source location information
3525 @param global_tid global thread id.
3526 @param lck pointer to the unique lock data structure
3527 
3528 Finish the execution of a reduce nowait.
3529 */
3530 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3531                               kmp_critical_name *lck) {
3532 
3533   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3534 
3535   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3536 
3537   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3538 
3539   if (packed_reduction_method == critical_reduce_block) {
3540 
3541     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3542 
3543   } else if (packed_reduction_method == empty_reduce_block) {
3544 
3545     // usage: if team size == 1, no synchronization is required ( on Intel
3546     // platforms only )
3547 
3548   } else if (packed_reduction_method == atomic_reduce_block) {
3549 
3550     // neither master nor other workers should get here
3551     //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it's better to remove this else-if entirely; after removal
    // this value will be checked by the 'else' branch and will assert
3554 
3555   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3556                                    tree_reduce_block)) {
3557 
3558     // only master gets here
3559 
3560   } else {
3561 
3562     // should never reach this block
3563     KMP_ASSERT(0); // "unexpected method"
3564   }
3565 
3566   if (__kmp_env_consistency_check)
3567     __kmp_pop_sync(global_tid, ct_reduce, loc);
3568 
3569   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3570                 global_tid, packed_reduction_method));
3571 
3572   return;
3573 }
3574 
3575 /* 2.a.ii. Reduce Block with a terminating barrier */
3576 
3577 /*!
3578 @ingroup SYNCHRONIZATION
3579 @param loc source location information
3580 @param global_tid global thread number
3581 @param num_vars number of items (variables) to be reduced
3582 @param reduce_size size of data in bytes to be reduced
3583 @param reduce_data pointer to data to be reduced
3584 @param reduce_func callback function providing reduction operation on two
3585 operands and returning result of reduction in lhs_data
3586 @param lck pointer to the unique lock data structure
3587 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3588 threads if atomic reduction needed
3589 
3590 A blocking reduce that includes an implicit barrier.
3591 */
3592 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3593                         size_t reduce_size, void *reduce_data,
3594                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3595                         kmp_critical_name *lck) {
3596   KMP_COUNT_BLOCK(REDUCE_wait);
3597   int retval = 0;
3598   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3599 #if OMP_40_ENABLED
3600   kmp_info_t *th;
3601   kmp_team_t *team;
3602   int teams_swapped = 0, task_state;
3603 #endif
3604 
3605   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3606 
3607   // why do we need this initialization here at all?
3608   // Reduction clause can not be a stand-alone directive.
3609 
3610   // do not call __kmp_serial_initialize(), it will be called by
3611   // __kmp_parallel_initialize() if needed
3612   // possible detection of false-positive race by the threadchecker ???
3613   if (!TCR_4(__kmp_init_parallel))
3614     __kmp_parallel_initialize();
3615 
3616 #if OMP_50_ENABLED
3617   __kmp_resume_if_soft_paused();
3618 #endif
3619 
3620 // check correctness of reduce block nesting
3621 #if KMP_USE_DYNAMIC_LOCK
3622   if (__kmp_env_consistency_check)
3623     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3624 #else
3625   if (__kmp_env_consistency_check)
3626     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3627 #endif
3628 
3629 #if OMP_40_ENABLED
3630   th = __kmp_thread_from_gtid(global_tid);
3631   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3632 #endif // OMP_40_ENABLED
3633 
3634   packed_reduction_method = __kmp_determine_reduction_method(
3635       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3636   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3637 
3638   if (packed_reduction_method == critical_reduce_block) {
3639 
3640     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3641     retval = 1;
3642 
3643   } else if (packed_reduction_method == empty_reduce_block) {
3644 
3645     // usage: if team size == 1, no synchronization is required ( Intel
3646     // platforms only )
3647     retval = 1;
3648 
3649   } else if (packed_reduction_method == atomic_reduce_block) {
3650 
3651     retval = 2;
3652 
3653   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3654                                    tree_reduce_block)) {
3655 
3656 // case tree_reduce_block:
3657 // this barrier should be visible to a customer and to the threading profile
3658 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3659 #if OMPT_SUPPORT
3660     ompt_frame_t *ompt_frame;
3661     if (ompt_enabled.enabled) {
3662       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3663       if (ompt_frame->enter_frame.ptr == NULL)
3664         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3665       OMPT_STORE_RETURN_ADDRESS(global_tid);
3666     }
3667 #endif
3668 #if USE_ITT_NOTIFY
3669     __kmp_threads[global_tid]->th.th_ident =
3670         loc; // needed for correct notification of frames
3671 #endif
3672     retval =
3673         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3674                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3675     retval = (retval != 0) ? (0) : (1);
3676 #if OMPT_SUPPORT && OMPT_OPTIONAL
3677     if (ompt_enabled.enabled) {
3678       ompt_frame->enter_frame = ompt_data_none;
3679     }
3680 #endif
3681 
3682     // all other workers except master should do this pop here
    // (none of the workers other than master will enter __kmpc_end_reduce())
3684     if (__kmp_env_consistency_check) {
3685       if (retval == 0) { // 0: all other workers; 1: master
3686         __kmp_pop_sync(global_tid, ct_reduce, loc);
3687       }
3688     }
3689 
3690   } else {
3691 
3692     // should never reach this block
3693     KMP_ASSERT(0); // "unexpected method"
3694   }
3695 #if OMP_40_ENABLED
3696   if (teams_swapped) {
3697     __kmp_restore_swapped_teams(th, team, task_state);
3698   }
3699 #endif
3700 
3701   KA_TRACE(10,
3702            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3703             global_tid, packed_reduction_method, retval));
3704 
3705   return retval;
3706 }
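
// Illustrative sketch (not part of the runtime, compiled out below): the
// blocking variant differs from the nowait sketch above in that
// __kmpc_end_reduce() supplies the terminating barrier, so generated code for
// both the critical (return value 1) and atomic (return value 2) cases calls
// it; workers released by the tree-reduction barrier (return value 0) do not.
// The helper and lock names below are hypothetical.
#if 0
static kmp_critical_name example_reduce_lock2; // hypothetical lock

static void example_reduce_blocking(ident_t *loc, kmp_int32 gtid,
                                    int *shared_sum, int private_sum,
                                    void (*combiner)(void *, void *)) {
  int ret = __kmpc_reduce(loc, gtid, /*num_vars=*/1, sizeof(int), &private_sum,
                          combiner, &example_reduce_lock2);
  if (ret == 1) {
    *shared_sum += private_sum; // combine under the runtime's protection
    __kmpc_end_reduce(loc, gtid, &example_reduce_lock2);
  } else if (ret == 2) {
#pragma omp atomic
    *shared_sum += private_sum; // combine atomically
    __kmpc_end_reduce(loc, gtid, &example_reduce_lock2);
  }
  // ret == 0: worker in a tree reduction; its value was already combined and
  // it is released only when the master reaches __kmpc_end_reduce(), so there
  // is nothing more to do here.
}
#endif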
3707 
3708 /*!
3709 @ingroup SYNCHRONIZATION
3710 @param loc source location information
3711 @param global_tid global thread id.
3712 @param lck pointer to the unique lock data structure
3713 
3714 Finish the execution of a blocking reduce.
3715 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3716 start function.
3717 */
3718 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3719                        kmp_critical_name *lck) {
3720 
3721   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3722 #if OMP_40_ENABLED
3723   kmp_info_t *th;
3724   kmp_team_t *team;
3725   int teams_swapped = 0, task_state;
3726 #endif
3727 
3728   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3729 
3730 #if OMP_40_ENABLED
3731   th = __kmp_thread_from_gtid(global_tid);
3732   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3733 #endif // OMP_40_ENABLED
3734 
3735   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3736 
3737   // this barrier should be visible to a customer and to the threading profile
3738   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3739 
3740   if (packed_reduction_method == critical_reduce_block) {
3741 
3742     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3743 
3744 // TODO: implicit barrier: should be exposed
3745 #if OMPT_SUPPORT
3746     ompt_frame_t *ompt_frame;
3747     if (ompt_enabled.enabled) {
3748       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3749       if (ompt_frame->enter_frame.ptr == NULL)
3750         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3751       OMPT_STORE_RETURN_ADDRESS(global_tid);
3752     }
3753 #endif
3754 #if USE_ITT_NOTIFY
3755     __kmp_threads[global_tid]->th.th_ident = loc;
3756 #endif
3757     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3758 #if OMPT_SUPPORT && OMPT_OPTIONAL
3759     if (ompt_enabled.enabled) {
3760       ompt_frame->enter_frame = ompt_data_none;
3761     }
3762 #endif
3763 
3764   } else if (packed_reduction_method == empty_reduce_block) {
3765 
3766 // usage: if team size==1, no synchronization is required (Intel platforms only)
3767 
3768 // TODO: implicit barrier: should be exposed
3769 #if OMPT_SUPPORT
3770     ompt_frame_t *ompt_frame;
3771     if (ompt_enabled.enabled) {
3772       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3773       if (ompt_frame->enter_frame.ptr == NULL)
3774         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3775       OMPT_STORE_RETURN_ADDRESS(global_tid);
3776     }
3777 #endif
3778 #if USE_ITT_NOTIFY
3779     __kmp_threads[global_tid]->th.th_ident = loc;
3780 #endif
3781     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3782 #if OMPT_SUPPORT && OMPT_OPTIONAL
3783     if (ompt_enabled.enabled) {
3784       ompt_frame->enter_frame = ompt_data_none;
3785     }
3786 #endif
3787 
3788   } else if (packed_reduction_method == atomic_reduce_block) {
3789 
3790 #if OMPT_SUPPORT
3791     ompt_frame_t *ompt_frame;
3792     if (ompt_enabled.enabled) {
3793       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3794       if (ompt_frame->enter_frame.ptr == NULL)
3795         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3796       OMPT_STORE_RETURN_ADDRESS(global_tid);
3797     }
3798 #endif
3799 // TODO: implicit barrier: should be exposed
3800 #if USE_ITT_NOTIFY
3801     __kmp_threads[global_tid]->th.th_ident = loc;
3802 #endif
3803     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3804 #if OMPT_SUPPORT && OMPT_OPTIONAL
3805     if (ompt_enabled.enabled) {
3806       ompt_frame->enter_frame = ompt_data_none;
3807     }
3808 #endif
3809 
3810   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3811                                    tree_reduce_block)) {
3812 
3813     // only master executes here (master releases all other workers)
3814     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3815                             global_tid);
3816 
3817   } else {
3818 
3819     // should never reach this block
3820     KMP_ASSERT(0); // "unexpected method"
3821   }
3822 #if OMP_40_ENABLED
3823   if (teams_swapped) {
3824     __kmp_restore_swapped_teams(th, team, task_state);
3825   }
3826 #endif
3827 
3828   if (__kmp_env_consistency_check)
3829     __kmp_pop_sync(global_tid, ct_reduce, loc);
3830 
3831   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3832                 global_tid, packed_reduction_method));
3833 
3834   return;
3835 }
3836 
3837 #undef __KMP_GET_REDUCTION_METHOD
3838 #undef __KMP_SET_REDUCTION_METHOD
3839 
3840 /* end of interface to fast scalable reduce routines */
3841 
3842 kmp_uint64 __kmpc_get_taskid() {
3843 
3844   kmp_int32 gtid;
3845   kmp_info_t *thread;
3846 
3847   gtid = __kmp_get_gtid();
3848   if (gtid < 0) {
3849     return 0;
3850   }
3851   thread = __kmp_thread_from_gtid(gtid);
3852   return thread->th.th_current_task->td_task_id;
3853 
3854 } // __kmpc_get_taskid
3855 
3856 kmp_uint64 __kmpc_get_parent_taskid() {
3857 
3858   kmp_int32 gtid;
3859   kmp_info_t *thread;
3860   kmp_taskdata_t *parent_task;
3861 
3862   gtid = __kmp_get_gtid();
3863   if (gtid < 0) {
3864     return 0;
3865   }
3866   thread = __kmp_thread_from_gtid(gtid);
3867   parent_task = thread->th.th_current_task->td_parent;
3868   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3869 
3870 } // __kmpc_get_parent_taskid
3871 
3872 #if OMP_45_ENABLED
3873 /*!
3874 @ingroup WORK_SHARING
3875 @param loc  source location information.
3876 @param gtid  global thread number.
3877 @param num_dims  number of associated doacross loops.
3878 @param dims  info on loops bounds.
3879 
3880 Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for (i=2; i<9; i+=2) the bounds are lo=2, up=8, st=2.
3883 */
3884 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3885                           const struct kmp_dim *dims) {
3886   int j, idx;
3887   kmp_int64 last, trace_count;
3888   kmp_info_t *th = __kmp_threads[gtid];
3889   kmp_team_t *team = th->th.th_team;
3890   kmp_uint32 *flags;
3891   kmp_disp_t *pr_buf = th->th.th_dispatch;
3892   dispatch_shared_info_t *sh_buf;
3893 
3894   KA_TRACE(
3895       20,
3896       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3897        gtid, num_dims, !team->t.t_serialized));
3898   KMP_DEBUG_ASSERT(dims != NULL);
3899   KMP_DEBUG_ASSERT(num_dims > 0);
3900 
3901   if (team->t.t_serialized) {
3902     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3903     return; // no dependencies if team is serialized
3904   }
3905   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3906   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3907   // the next loop
3908   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3909 
3910   // Save bounds info into allocated private buffer
3911   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3912   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3913       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3914   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3915   pr_buf->th_doacross_info[0] =
3916       (kmp_int64)num_dims; // first element is number of dimensions
3917   // Save also address of num_done in order to access it later without knowing
3918   // the buffer index
3919   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3920   pr_buf->th_doacross_info[2] = dims[0].lo;
3921   pr_buf->th_doacross_info[3] = dims[0].up;
3922   pr_buf->th_doacross_info[4] = dims[0].st;
3923   last = 5;
3924   for (j = 1; j < num_dims; ++j) {
3925     kmp_int64
3926         range_length; // To keep ranges of all dimensions but the first dims[0]
3927     if (dims[j].st == 1) { // most common case
3928       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3929       range_length = dims[j].up - dims[j].lo + 1;
3930     } else {
3931       if (dims[j].st > 0) {
3932         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3933         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3934       } else { // negative increment
3935         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3936         range_length =
3937             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3938       }
3939     }
3940     pr_buf->th_doacross_info[last++] = range_length;
3941     pr_buf->th_doacross_info[last++] = dims[j].lo;
3942     pr_buf->th_doacross_info[last++] = dims[j].up;
3943     pr_buf->th_doacross_info[last++] = dims[j].st;
3944   }
3945 
3946   // Compute total trip count.
3947   // Start with range of dims[0] which we don't need to keep in the buffer.
3948   if (dims[0].st == 1) { // most common case
3949     trace_count = dims[0].up - dims[0].lo + 1;
3950   } else if (dims[0].st > 0) {
3951     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3952     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3953   } else { // negative increment
3954     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3955     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3956   }
3957   for (j = 1; j < num_dims; ++j) {
3958     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3959   }
3960   KMP_DEBUG_ASSERT(trace_count > 0);
3961 
3962   // Check if shared buffer is not occupied by other loop (idx -
3963   // __kmp_dispatch_num_buffers)
3964   if (idx != sh_buf->doacross_buf_idx) {
3965     // Shared buffer is occupied, wait for it to be free
3966     __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3967                        __kmp_eq_4, NULL);
3968   }
3969 #if KMP_32_BIT_ARCH
3970   // Check if we are the first thread. After the CAS the first thread gets 0,
3971   // others get 1 if initialization is in progress, allocated pointer otherwise.
3972   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3973   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3974       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3975 #else
3976   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3977       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3978 #endif
3979   if (flags == NULL) {
3980     // we are the first thread, allocate the array of flags
3981     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3982     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3983     KMP_MB();
3984     sh_buf->doacross_flags = flags;
3985   } else if (flags == (kmp_uint32 *)1) {
3986 #if KMP_32_BIT_ARCH
3987     // initialization is still in progress, need to wait
3988     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3989 #else
3990     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3991 #endif
3992       KMP_YIELD(TRUE);
3993     KMP_MB();
3994   } else {
3995     KMP_MB();
3996   }
3997   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3998   pr_buf->th_doacross_flags =
3999       sh_buf->doacross_flags; // save private copy in order to not
4000   // touch shared buffer on each iteration
4001   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4002 }
4003 
4004 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4005   kmp_int32 shft, num_dims, i;
4006   kmp_uint32 flag;
4007   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4008   kmp_info_t *th = __kmp_threads[gtid];
4009   kmp_team_t *team = th->th.th_team;
4010   kmp_disp_t *pr_buf;
4011   kmp_int64 lo, up, st;
4012 
4013   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4014   if (team->t.t_serialized) {
4015     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4016     return; // no dependencies if team is serialized
4017   }
4018 
4019   // calculate sequential iteration number and check out-of-bounds condition
4020   pr_buf = th->th.th_dispatch;
4021   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4022   num_dims = pr_buf->th_doacross_info[0];
4023   lo = pr_buf->th_doacross_info[2];
4024   up = pr_buf->th_doacross_info[3];
4025   st = pr_buf->th_doacross_info[4];
4026   if (st == 1) { // most common case
4027     if (vec[0] < lo || vec[0] > up) {
4028       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4029                     "bounds [%lld,%lld]\n",
4030                     gtid, vec[0], lo, up));
4031       return;
4032     }
4033     iter_number = vec[0] - lo;
4034   } else if (st > 0) {
4035     if (vec[0] < lo || vec[0] > up) {
4036       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4037                     "bounds [%lld,%lld]\n",
4038                     gtid, vec[0], lo, up));
4039       return;
4040     }
4041     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4042   } else { // negative increment
4043     if (vec[0] > lo || vec[0] < up) {
4044       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4045                     "bounds [%lld,%lld]\n",
4046                     gtid, vec[0], lo, up));
4047       return;
4048     }
4049     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4050   }
4051   for (i = 1; i < num_dims; ++i) {
4052     kmp_int64 iter, ln;
4053     kmp_int32 j = i * 4;
4054     ln = pr_buf->th_doacross_info[j + 1];
4055     lo = pr_buf->th_doacross_info[j + 2];
4056     up = pr_buf->th_doacross_info[j + 3];
4057     st = pr_buf->th_doacross_info[j + 4];
4058     if (st == 1) {
4059       if (vec[i] < lo || vec[i] > up) {
4060         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4061                       "bounds [%lld,%lld]\n",
4062                       gtid, vec[i], lo, up));
4063         return;
4064       }
4065       iter = vec[i] - lo;
4066     } else if (st > 0) {
4067       if (vec[i] < lo || vec[i] > up) {
4068         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4069                       "bounds [%lld,%lld]\n",
4070                       gtid, vec[i], lo, up));
4071         return;
4072       }
4073       iter = (kmp_uint64)(vec[i] - lo) / st;
4074     } else { // st < 0
4075       if (vec[i] > lo || vec[i] < up) {
4076         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4077                       "bounds [%lld,%lld]\n",
4078                       gtid, vec[i], lo, up));
4079         return;
4080       }
4081       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4082     }
4083     iter_number = iter + ln * iter_number;
4084   }
4085   shft = iter_number % 32; // use 32-bit granularity
4086   iter_number >>= 5; // divided by 32
4087   flag = 1 << shft;
4088   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4089     KMP_YIELD(TRUE);
4090   }
4091   KMP_MB();
4092   KA_TRACE(20,
4093            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4094             gtid, (iter_number << 5) + shft));
4095 }
4096 
4097 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4098   kmp_int32 shft, num_dims, i;
4099   kmp_uint32 flag;
4100   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4101   kmp_info_t *th = __kmp_threads[gtid];
4102   kmp_team_t *team = th->th.th_team;
4103   kmp_disp_t *pr_buf;
4104   kmp_int64 lo, st;
4105 
4106   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4107   if (team->t.t_serialized) {
4108     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4109     return; // no dependencies if team is serialized
4110   }
4111 
4112   // calculate sequential iteration number (same as in "wait" but no
4113   // out-of-bounds checks)
4114   pr_buf = th->th.th_dispatch;
4115   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4116   num_dims = pr_buf->th_doacross_info[0];
4117   lo = pr_buf->th_doacross_info[2];
4118   st = pr_buf->th_doacross_info[4];
4119   if (st == 1) { // most common case
4120     iter_number = vec[0] - lo;
4121   } else if (st > 0) {
4122     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4123   } else { // negative increment
4124     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4125   }
4126   for (i = 1; i < num_dims; ++i) {
4127     kmp_int64 iter, ln;
4128     kmp_int32 j = i * 4;
4129     ln = pr_buf->th_doacross_info[j + 1];
4130     lo = pr_buf->th_doacross_info[j + 2];
4131     st = pr_buf->th_doacross_info[j + 4];
4132     if (st == 1) {
4133       iter = vec[i] - lo;
4134     } else if (st > 0) {
4135       iter = (kmp_uint64)(vec[i] - lo) / st;
4136     } else { // st < 0
4137       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4138     }
4139     iter_number = iter + ln * iter_number;
4140   }
4141   shft = iter_number % 32; // use 32-bit granularity
4142   iter_number >>= 5; // divided by 32
4143   flag = 1 << shft;
4144   KMP_MB();
4145   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4146     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4147   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4148                 (iter_number << 5) + shft));
4149 }
4150 
4151 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4152   kmp_int32 num_done;
4153   kmp_info_t *th = __kmp_threads[gtid];
4154   kmp_team_t *team = th->th.th_team;
4155   kmp_disp_t *pr_buf = th->th.th_dispatch;
4156 
4157   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4158   if (team->t.t_serialized) {
4159     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4160     return; // nothing to do
4161   }
4162   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4163   if (num_done == th->th.th_team_nproc) {
4164     // we are the last thread, need to free shared resources
4165     int idx = pr_buf->th_doacross_buf_idx - 1;
4166     dispatch_shared_info_t *sh_buf =
4167         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4168     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4169                      (kmp_int64)&sh_buf->doacross_num_done);
4170     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4171     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4172     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4173     sh_buf->doacross_flags = NULL;
4174     sh_buf->doacross_num_done = 0;
4175     sh_buf->doacross_buf_idx +=
4176         __kmp_dispatch_num_buffers; // free buffer for future re-use
4177   }
4178   // free private resources (need to keep buffer index forever)
4179   pr_buf->th_doacross_flags = NULL;
4180   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4181   pr_buf->th_doacross_info = NULL;
4182   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4183 }
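
// Illustrative sketch (not part of the runtime, compiled out below): roughly
// how compiler-generated code might drive the doacross entry points above for
// a loop such as
//   #pragma omp for ordered(1)
//   for (i = 2; i < 9; i += 2) { /* depend(sink: i-2) ... depend(source) */ }
// Per the __kmpc_doacross_init() documentation, bounds are passed inclusively:
// lo = 2, up = 8, st = 2. The chunking of iterations among threads and the
// loop body are omitted; the helper and variable names are hypothetical.
#if 0
static void example_doacross(ident_t *loc, int gtid) {
  struct kmp_dim dim;
  dim.lo = 2; // inclusive lower bound
  dim.up = 8; // inclusive upper bound
  dim.st = 2; // stride
  __kmpc_doacross_init(loc, gtid, /*num_dims=*/1, &dim);

  // In real generated code each thread walks only its own chunk of iterations.
  for (kmp_int64 i = 2; i <= 8; i += 2) {
    kmp_int64 sink_vec = i - 2;
    if (i > 2)
      __kmpc_doacross_wait(loc, gtid, &sink_vec); // ordered depend(sink: i-2)

    /* ... loop body ... */

    kmp_int64 source_vec = i;
    __kmpc_doacross_post(loc, gtid, &source_vec); // ordered depend(source)
  }
  __kmpc_doacross_fini(loc, gtid);
}
#endif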
4184 #endif
4185 
4186 #if OMP_50_ENABLED
4187 int __kmpc_get_target_offload(void) {
4188   if (!__kmp_init_serial) {
4189     __kmp_serial_initialize();
4190   }
4191   return __kmp_target_offload;
4192 }
4193 
4194 int __kmpc_pause_resource(kmp_pause_status_t level) {
4195   if (!__kmp_init_serial) {
4196     return 1; // Can't pause if runtime is not initialized
4197   }
4198   return __kmp_pause_resource(level);
4199 }
4200 #endif // OMP_50_ENABLED
4201 
4202 // end of file //
4203