1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
// flags will be used in the future, e.g., to implement openmp_strict library
// restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
57  * Shutdown the runtime library. This is also optional, and even if called will
58  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
59  * zero.
60  */
61 void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister
  // this root (which can cause library shutdown).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 #if KMP_OS_WINDOWS && OMPT_SUPPORT
74   // Normal exit process on Windows does not allow worker threads of the final
75   // parallel region to finish reporting their events, so shutting down the
76   // library here fixes the issue at least for the cases where __kmpc_end() is
77   // placed properly.
78   if (ompt_enabled.enabled)
79     __kmp_internal_end_library(__kmp_gtid_get_specific());
80 #endif
81 }
82 
83 /*!
84 @ingroup THREAD_STATES
85 @param loc Source location information.
86 @return The global thread index of the active thread.
87 
88 This function can be called in any context.
89 
If the runtime has only been entered at the outermost level from a
91 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
92 that which would be returned by omp_get_thread_num() in the outermost
93 active parallel construct. (Or zero if there is no active parallel
94 construct, since the master thread is necessarily thread zero).
95 
96 If multiple non-OpenMP threads all enter an OpenMP construct then this
97 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
99 OpenMP thread ids returned by omp_get_thread_num()).
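
For example, assuming <tt>loc</tt> is the ident_t describing the call site
(a minimal sketch), the value returned here is what the other __kmpc_ entry
points expect as their global_tid argument:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
@endcode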
100 */
101 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
102   kmp_int32 gtid = __kmp_entry_gtid();
103 
104   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
105 
106   return gtid;
107 }
108 
109 /*!
110 @ingroup THREAD_STATES
111 @param loc Source location information.
112 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
113 
114 This function can be called in any context.
115 It returns the total number of threads under the control of the OpenMP runtime.
116 That is not a number that can be determined by any OpenMP standard calls, since
117 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, because the runtime keeps
underlying threads alive even when they are not active (the cost of creating
and destroying OS threads is high), this call counts all such threads whether
or not they are currently doing useful work.
122 */
123 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
124   KC_TRACE(10,
125            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
126 
127   return TCR_4(__kmp_all_nth);
128 }
129 
130 /*!
131 @ingroup THREAD_STATES
132 @param loc Source location information.
133 @return The thread number of the calling thread in the innermost active parallel
134 construct.
135 */
136 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
137   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
138   return __kmp_tid_from_gtid(__kmp_entry_gtid());
139 }
140 
141 /*!
142 @ingroup THREAD_STATES
143 @param loc Source location information.
144 @return The number of threads in the innermost active parallel construct.
145 */
146 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
147   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
148 
149   return __kmp_entry_thread()->th.th_team->t.t_nproc;
150 }
151 
152 /*!
153  * @ingroup DEPRECATED
154  * @param loc location description
155  *
156  * This function need not be called. It always returns TRUE.
157  */
158 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
159 #ifndef KMP_DEBUG
160 
161   return TRUE;
162 
163 #else
164 
165   const char *semi2;
166   const char *semi3;
167   int line_no;
168 
169   if (__kmp_par_range == 0) {
170     return TRUE;
171   }
172   semi2 = loc->psource;
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   semi2 = strchr(semi2, ';');
177   if (semi2 == NULL) {
178     return TRUE;
179   }
180   semi2 = strchr(semi2 + 1, ';');
181   if (semi2 == NULL) {
182     return TRUE;
183   }
184   if (__kmp_par_range_filename[0]) {
185     const char *name = semi2 - 1;
186     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
187       name--;
188     }
189     if ((*name == '/') || (*name == ';')) {
190       name++;
191     }
192     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
193       return __kmp_par_range < 0;
194     }
195   }
196   semi3 = strchr(semi2 + 1, ';');
197   if (__kmp_par_range_routine[0]) {
198     if ((semi3 != NULL) && (semi3 > semi2) &&
199         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
200       return __kmp_par_range < 0;
201     }
202   }
  // Guard against a malformed psource string with no line-number field.
  if ((semi3 != NULL) && (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1)) {
204     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
205       return __kmp_par_range > 0;
206     }
207     return __kmp_par_range < 0;
208   }
209   return TRUE;
210 
211 #endif /* KMP_DEBUG */
212 }
213 
214 /*!
215 @ingroup THREAD_STATES
216 @param loc Source location information.
217 @return 1 if this thread is executing inside an active parallel region, zero if
218 not.
219 */
220 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
221   return __kmp_entry_thread()->th.th_root->r.r_active;
222 }
223 
224 /*!
225 @ingroup PARALLEL
226 @param loc source location information
227 @param global_tid global thread number
228 @param num_threads number of threads requested for this parallel construct
229 
230 Set the number of threads to be used by the next fork spawned by this thread.
231 This call is only required if the parallel construct has a `num_threads` clause.
232 */
233 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
234                              kmp_int32 num_threads) {
235   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
236                 global_tid, num_threads));
237 
238   __kmp_push_num_threads(loc, global_tid, num_threads);
239 }
240 
241 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
242   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
243 
  /* the num_threads value is automatically popped */
245 }
246 
247 #if OMP_40_ENABLED
248 
249 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
250                            kmp_int32 proc_bind) {
251   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
252                 proc_bind));
253 
254   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
255 }
256 
257 #endif /* OMP_40_ENABLED */
258 
259 /*!
260 @ingroup PARALLEL
261 @param loc  source location information
262 @param argc  total number of arguments in the ellipsis
263 @param microtask  pointer to callback routine consisting of outlined parallel
264 construct
265 @param ...  pointers to shared variables that aren't global
266 
267 Do the actual fork and call the microtask in the relevant number of threads.
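
For illustration, a parallel region with one shared local variable might be
lowered roughly like this (a sketch only; outlined_body, its signature and
the argument passing are illustrative, not the exact code a compiler emits):
@code
void outlined_body(kmp_int32 *gtid, kmp_int32 *bound_tid, int *a) {
  // ... body of the parallel region, executed by every thread ...
}

int a = 0;
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined_body, &a);
@endcode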
268 */
269 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
270   int gtid = __kmp_entry_gtid();
271 
272 #if (KMP_STATS_ENABLED)
273   // If we were in a serial region, then stop the serial timer, record
274   // the event, and start parallel region timer
275   stats_state_e previous_state = KMP_GET_THREAD_STATE();
276   if (previous_state == stats_state_e::SERIAL_REGION) {
277     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
278   } else {
279     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
280   }
281   int inParallel = __kmpc_in_parallel(loc);
282   if (inParallel) {
283     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
284   } else {
285     KMP_COUNT_BLOCK(OMP_PARALLEL);
286   }
287 #endif
288 
  // maybe saving thr_state is enough here
290   {
291     va_list ap;
292     va_start(ap, microtask);
293 
294 #if OMPT_SUPPORT
295     ompt_frame_t *ompt_frame;
296     if (ompt_enabled.enabled) {
297       kmp_info_t *master_th = __kmp_threads[gtid];
298       kmp_team_t *parent_team = master_th->th.th_team;
299       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
300       if (lwt)
301         ompt_frame = &(lwt->ompt_task_info.frame);
302       else {
303         int tid = __kmp_tid_from_gtid(gtid);
304         ompt_frame = &(
305             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
306       }
307       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
308       OMPT_STORE_RETURN_ADDRESS(gtid);
309     }
310 #endif
311 
312 #if INCLUDE_SSC_MARKS
313     SSC_MARK_FORKING();
314 #endif
315     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
316                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
317                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
318 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
319 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
320                     &ap
321 #else
322                     ap
323 #endif
324                     );
325 #if INCLUDE_SSC_MARKS
326     SSC_MARK_JOINING();
327 #endif
328     __kmp_join_call(loc, gtid
329 #if OMPT_SUPPORT
330                     ,
331                     fork_context_intel
332 #endif
333                     );
334 
335     va_end(ap);
336   }
337 
338 #if KMP_STATS_ENABLED
339   if (previous_state == stats_state_e::SERIAL_REGION) {
340     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
341   } else {
342     KMP_POP_PARTITIONED_TIMER();
343   }
344 #endif // KMP_STATS_ENABLED
345 }
346 
347 #if OMP_40_ENABLED
348 /*!
349 @ingroup PARALLEL
350 @param loc source location information
351 @param global_tid global thread number
352 @param num_teams number of teams requested for the teams construct
353 @param num_threads number of threads per team requested for the teams construct
354 
355 Set the number of teams to be used by the teams construct.
356 This call is only required if the teams construct has a `num_teams` clause
357 or a `thread_limit` clause (or both).
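
For illustration, `#pragma omp teams num_teams(2) thread_limit(8)` might be
lowered roughly as follows (a sketch; outlined_teams_body and the gtid
handling are illustrative):
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_teams(&loc, gtid, 2, 8);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams_body);
@endcode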
358 */
359 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
360                            kmp_int32 num_teams, kmp_int32 num_threads) {
361   KA_TRACE(20,
362            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
363             global_tid, num_teams, num_threads));
364 
365   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
366 }
367 
368 /*!
369 @ingroup PARALLEL
370 @param loc  source location information
371 @param argc  total number of arguments in the ellipsis
372 @param microtask  pointer to callback routine consisting of outlined teams
373 construct
374 @param ...  pointers to shared variables that aren't global
375 
376 Do the actual fork and call the microtask in the relevant number of threads.
377 */
378 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
379                        ...) {
380   int gtid = __kmp_entry_gtid();
381   kmp_info_t *this_thr = __kmp_threads[gtid];
382   va_list ap;
383   va_start(ap, microtask);
384 
385 #if KMP_STATS_ENABLED
386   KMP_COUNT_BLOCK(OMP_TEAMS);
387   stats_state_e previous_state = KMP_GET_THREAD_STATE();
388   if (previous_state == stats_state_e::SERIAL_REGION) {
389     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
390   } else {
391     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
392   }
393 #endif
394 
395   // remember teams entry point and nesting level
396   this_thr->th.th_teams_microtask = microtask;
397   this_thr->th.th_teams_level =
398       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
399 
400 #if OMPT_SUPPORT
401   kmp_team_t *parent_team = this_thr->th.th_team;
402   int tid = __kmp_tid_from_gtid(gtid);
403   if (ompt_enabled.enabled) {
404     parent_team->t.t_implicit_task_taskdata[tid]
405         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
406   }
407   OMPT_STORE_RETURN_ADDRESS(gtid);
408 #endif
409 
  // Check whether __kmpc_push_num_teams was called; if not, set the default
  // number of teams.
412   if (this_thr->th.th_teams_size.nteams == 0) {
413     __kmp_push_num_teams(loc, gtid, 0, 0);
414   }
415   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
416   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
417   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
418 
419   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
420                   VOLATILE_CAST(microtask_t)
421                       __kmp_teams_master, // "wrapped" task
422                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
423 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
424                   &ap
425 #else
426                   ap
427 #endif
428                   );
429   __kmp_join_call(loc, gtid
430 #if OMPT_SUPPORT
431                   ,
432                   fork_context_intel
433 #endif
434                   );
435 
436   // Pop current CG root off list
437   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
438   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
439   this_thr->th.th_cg_roots = tmp->up;
440   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
441                  " to node %p. cg_nthreads was %d\n",
442                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
443   __kmp_free(tmp);
444   // Restore current task's thread_limit from CG root
445   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
446   this_thr->th.th_current_task->td_icvs.thread_limit =
447       this_thr->th.th_cg_roots->cg_thread_limit;
448 
449   this_thr->th.th_teams_microtask = NULL;
450   this_thr->th.th_teams_level = 0;
451   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
452   va_end(ap);
453 #if KMP_STATS_ENABLED
454   if (previous_state == stats_state_e::SERIAL_REGION) {
455     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
456   } else {
457     KMP_POP_PARTITIONED_TIMER();
458   }
459 #endif // KMP_STATS_ENABLED
460 }
461 #endif /* OMP_40_ENABLED */
462 
463 // I don't think this function should ever have been exported.
464 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
465 // openmp code ever called it, but it's been exported from the RTL for so
466 // long that I'm afraid to remove the definition.
467 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
468 
469 /*!
470 @ingroup PARALLEL
471 @param loc  source location information
472 @param global_tid  global thread number
473 
474 Enter a serialized parallel construct. This interface is used to handle a
475 conditional parallel region, like this,
476 @code
477 #pragma omp parallel if (condition)
478 @endcode
479 when the condition is false.
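
For illustration, the construct above might be lowered roughly as follows
(a sketch only; outlined_body, the bound tid handling and the argument
passing are illustrative):
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
if (condition) {
  __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
} else {
  kmp_int32 bound_tid = 0;
  __kmpc_serialized_parallel(&loc, gtid);
  outlined_body(&gtid, &bound_tid); // run the region body on this thread
  __kmpc_end_serialized_parallel(&loc, gtid);
}
@endcode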
480 */
481 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
482 // The implementation is now in kmp_runtime.cpp so that it can share static
483 // functions with kmp_fork_call since the tasks to be done are similar in
484 // each case.
485 #if OMPT_SUPPORT
486   OMPT_STORE_RETURN_ADDRESS(global_tid);
487 #endif
488   __kmp_serialized_parallel(loc, global_tid);
489 }
490 
491 /*!
492 @ingroup PARALLEL
493 @param loc  source location information
494 @param global_tid  global thread number
495 
496 Leave a serialized parallel construct.
497 */
498 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
499   kmp_internal_control_t *top;
500   kmp_info_t *this_thr;
501   kmp_team_t *serial_team;
502 
503   KC_TRACE(10,
504            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
505 
506   /* skip all this code for autopar serialized loops since it results in
507      unacceptable overhead */
508   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
509     return;
510 
511   // Not autopar code
512   if (!TCR_4(__kmp_init_parallel))
513     __kmp_parallel_initialize();
514 
515 #if OMP_50_ENABLED
516   __kmp_resume_if_soft_paused();
517 #endif
518 
519   this_thr = __kmp_threads[global_tid];
520   serial_team = this_thr->th.th_serial_team;
521 
522 #if OMP_45_ENABLED
523   kmp_task_team_t *task_team = this_thr->th.th_task_team;
524 
525   // we need to wait for the proxy tasks before finishing the thread
526   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
527     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
528 #endif
529 
530   KMP_MB();
531   KMP_DEBUG_ASSERT(serial_team);
532   KMP_ASSERT(serial_team->t.t_serialized);
533   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
534   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
535   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
536   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
537 
538 #if OMPT_SUPPORT
539   if (ompt_enabled.enabled &&
540       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
541     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
542     if (ompt_enabled.ompt_callback_implicit_task) {
543       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
544           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
545           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
546     }
547 
    // clear the task id only after unlinking the task
549     ompt_data_t *parent_task_data;
550     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
551 
552     if (ompt_enabled.ompt_callback_parallel_end) {
553       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
554           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
555           ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
556     }
557     __ompt_lw_taskteam_unlink(this_thr);
558     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
559   }
560 #endif
561 
562   /* If necessary, pop the internal control stack values and replace the team
563    * values */
564   top = serial_team->t.t_control_stack_top;
565   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
566     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
567     serial_team->t.t_control_stack_top = top->next;
568     __kmp_free(top);
569   }
570 
571   // if( serial_team -> t.t_serialized > 1 )
572   serial_team->t.t_level--;
573 
574   /* pop dispatch buffers stack */
575   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
576   {
577     dispatch_private_info_t *disp_buffer =
578         serial_team->t.t_dispatch->th_disp_buffer;
579     serial_team->t.t_dispatch->th_disp_buffer =
580         serial_team->t.t_dispatch->th_disp_buffer->next;
581     __kmp_free(disp_buffer);
582   }
583 #if OMP_50_ENABLED
584   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
585 #endif
586 
587   --serial_team->t.t_serialized;
588   if (serial_team->t.t_serialized == 0) {
589 
590 /* return to the parallel section */
591 
592 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
593     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
594       __kmp_clear_x87_fpu_status_word();
595       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
596       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
597     }
598 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
599 
600     this_thr->th.th_team = serial_team->t.t_parent;
601     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
602 
603     /* restore values cached in the thread */
604     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
605     this_thr->th.th_team_master =
606         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
607     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
608 
609     /* TODO the below shouldn't need to be adjusted for serialized teams */
610     this_thr->th.th_dispatch =
611         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
612 
613     __kmp_pop_current_task_from_thread(this_thr);
614 
615     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
616     this_thr->th.th_current_task->td_flags.executing = 1;
617 
618     if (__kmp_tasking_mode != tskm_immediate_exec) {
619       // Copy the task team from the new child / old parent team to the thread.
620       this_thr->th.th_task_team =
621           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
622       KA_TRACE(20,
623                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
624                 "team %p\n",
625                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
626     }
627   } else {
628     if (__kmp_tasking_mode != tskm_immediate_exec) {
629       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
630                     "depth of serial team %p to %d\n",
631                     global_tid, serial_team, serial_team->t.t_serialized));
632     }
633   }
634 
635   if (__kmp_env_consistency_check)
636     __kmp_pop_parallel(global_tid, NULL);
637 #if OMPT_SUPPORT
638   if (ompt_enabled.enabled)
639     this_thr->th.ompt_thread_info.state =
640         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
641                                            : ompt_state_work_parallel);
642 #endif
643 }
644 
645 /*!
646 @ingroup SYNCHRONIZATION
647 @param loc  source location information.
648 
649 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
650 depending on the memory ordering convention obeyed by the compiler
651 even that may not be necessary).
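
For example, a bare `#pragma omp flush` is expected to map to a single call:
@code
__kmpc_flush(&loc);
@endcode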
652 */
653 void __kmpc_flush(ident_t *loc) {
654   KC_TRACE(10, ("__kmpc_flush: called\n"));
655 
  /* need explicit __mf() here since the library uses volatile instead */
657   KMP_MB(); /* Flush all pending memory write invalidates.  */
658 
659 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
660 #if KMP_MIC
661 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
662 // We shouldn't need it, though, since the ABI rules require that
663 // * If the compiler generates NGO stores it also generates the fence
664 // * If users hand-code NGO stores they should insert the fence
665 // therefore no incomplete unordered stores should be visible.
666 #else
667   // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
673   if (!__kmp_cpuinfo.initialized) {
674     __kmp_query_cpuid(&__kmp_cpuinfo);
675   }
676   if (!__kmp_cpuinfo.sse2) {
677     // CPU cannot execute SSE2 instructions.
678   } else {
679 #if KMP_COMPILER_ICC
680     _mm_mfence();
681 #elif KMP_COMPILER_MSVC
682     MemoryBarrier();
683 #else
684     __sync_synchronize();
685 #endif // KMP_COMPILER_ICC
686   }
687 #endif // KMP_MIC
688 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here; move along.
690 #elif KMP_ARCH_PPC64
691 // Nothing needed here (we have a real MB above).
692 #if KMP_OS_CNK
693   // The flushing thread needs to yield here; this prevents a
694   // busy-waiting thread from saturating the pipeline. flush is
695   // often used in loops like this:
696   // while (!flag) {
697   //   #pragma omp flush(flag)
698   // }
699   // and adding the yield here is good for at least a 10x speedup
700   // when running >2 threads per core (on the NAS LU benchmark).
701   __kmp_yield();
702 #endif
703 #else
704 #error Unknown or unsupported architecture
705 #endif
706 
707 #if OMPT_SUPPORT && OMPT_OPTIONAL
708   if (ompt_enabled.ompt_callback_flush) {
709     ompt_callbacks.ompt_callback(ompt_callback_flush)(
710         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
711   }
712 #endif
713 }
714 
715 /* -------------------------------------------------------------------------- */
716 /*!
717 @ingroup SYNCHRONIZATION
718 @param loc source location information
719 @param global_tid thread id.
720 
721 Execute a barrier.
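
For example, an explicit `#pragma omp barrier` is expected to map to a call
like this, with gtid being the caller's global thread number:
@code
__kmpc_barrier(&loc, gtid);
@endcode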
722 */
723 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
724   KMP_COUNT_BLOCK(OMP_BARRIER);
725   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
726 
727   if (!TCR_4(__kmp_init_parallel))
728     __kmp_parallel_initialize();
729 
730 #if OMP_50_ENABLED
731   __kmp_resume_if_soft_paused();
732 #endif
733 
734   if (__kmp_env_consistency_check) {
735     if (loc == 0) {
736       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
737     }
738 
739     __kmp_check_barrier(global_tid, ct_barrier, loc);
740   }
741 
742 #if OMPT_SUPPORT
743   ompt_frame_t *ompt_frame;
744   if (ompt_enabled.enabled) {
745     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
746     if (ompt_frame->enter_frame.ptr == NULL)
747       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
748     OMPT_STORE_RETURN_ADDRESS(global_tid);
749   }
750 #endif
751   __kmp_threads[global_tid]->th.th_ident = loc;
752   // TODO: explicit barrier_wait_id:
753   //   this function is called when 'barrier' directive is present or
754   //   implicit barrier at the end of a worksharing construct.
755   // 1) better to add a per-thread barrier counter to a thread data structure
756   // 2) set to 0 when a new team is created
  // 3) no sync is required
758 
759   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
760 #if OMPT_SUPPORT && OMPT_OPTIONAL
761   if (ompt_enabled.enabled) {
762     ompt_frame->enter_frame = ompt_data_none;
763   }
764 #endif
765 }
766 
767 /* The BARRIER for a MASTER section is always explicit   */
768 /*!
769 @ingroup WORK_SHARING
770 @param loc  source location information.
@param global_tid  global thread number.
772 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
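
For illustration, the body of a <tt>master</tt> construct is typically
guarded like this (a sketch; only the thread for which this call returns 1
executes the body and the matching __kmpc_end_master()):
@code
if (__kmpc_master(&loc, gtid)) {
  // ... body of the master region ...
  __kmpc_end_master(&loc, gtid);
}
@endcode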
773 */
774 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
775   int status = 0;
776 
777   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
778 
779   if (!TCR_4(__kmp_init_parallel))
780     __kmp_parallel_initialize();
781 
782 #if OMP_50_ENABLED
783   __kmp_resume_if_soft_paused();
784 #endif
785 
786   if (KMP_MASTER_GTID(global_tid)) {
787     KMP_COUNT_BLOCK(OMP_MASTER);
788     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
789     status = 1;
790   }
791 
792 #if OMPT_SUPPORT && OMPT_OPTIONAL
793   if (status) {
794     if (ompt_enabled.ompt_callback_master) {
795       kmp_info_t *this_thr = __kmp_threads[global_tid];
796       kmp_team_t *team = this_thr->th.th_team;
797 
798       int tid = __kmp_tid_from_gtid(global_tid);
799       ompt_callbacks.ompt_callback(ompt_callback_master)(
800           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
801           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
802           OMPT_GET_RETURN_ADDRESS(0));
803     }
804   }
805 #endif
806 
807   if (__kmp_env_consistency_check) {
808 #if KMP_USE_DYNAMIC_LOCK
809     if (status)
810       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
811     else
812       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
813 #else
814     if (status)
815       __kmp_push_sync(global_tid, ct_master, loc, NULL);
816     else
817       __kmp_check_sync(global_tid, ct_master, loc, NULL);
818 #endif
819   }
820 
821   return status;
822 }
823 
824 /*!
825 @ingroup WORK_SHARING
826 @param loc  source location information.
@param global_tid  global thread number.
828 
829 Mark the end of a <tt>master</tt> region. This should only be called by the
830 thread that executes the <tt>master</tt> region.
831 */
832 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
833   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
834 
835   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
836   KMP_POP_PARTITIONED_TIMER();
837 
838 #if OMPT_SUPPORT && OMPT_OPTIONAL
839   kmp_info_t *this_thr = __kmp_threads[global_tid];
840   kmp_team_t *team = this_thr->th.th_team;
841   if (ompt_enabled.ompt_callback_master) {
842     int tid = __kmp_tid_from_gtid(global_tid);
843     ompt_callbacks.ompt_callback(ompt_callback_master)(
844         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
845         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
846         OMPT_GET_RETURN_ADDRESS(0));
847   }
848 #endif
849 
850   if (__kmp_env_consistency_check) {
851     if (global_tid < 0)
852       KMP_WARNING(ThreadIdentInvalid);
853 
854     if (KMP_MASTER_GTID(global_tid))
855       __kmp_pop_sync(global_tid, ct_master, loc);
856   }
857 }
858 
859 /*!
860 @ingroup WORK_SHARING
861 @param loc  source location information.
862 @param gtid  global thread number.
863 
864 Start execution of an <tt>ordered</tt> construct.
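
For illustration, the body of an <tt>ordered</tt> construct is typically
bracketed like this (a sketch; gtid handling is up to the compiler):
@code
__kmpc_ordered(&loc, gtid);
// ... body executed in the iteration order of the enclosing loop ...
__kmpc_end_ordered(&loc, gtid);
@endcode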
865 */
866 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
867   int cid = 0;
868   kmp_info_t *th;
869   KMP_DEBUG_ASSERT(__kmp_init_serial);
870 
871   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
872 
873   if (!TCR_4(__kmp_init_parallel))
874     __kmp_parallel_initialize();
875 
876 #if OMP_50_ENABLED
877   __kmp_resume_if_soft_paused();
878 #endif
879 
880 #if USE_ITT_BUILD
881   __kmp_itt_ordered_prep(gtid);
882 // TODO: ordered_wait_id
883 #endif /* USE_ITT_BUILD */
884 
885   th = __kmp_threads[gtid];
886 
887 #if OMPT_SUPPORT && OMPT_OPTIONAL
888   kmp_team_t *team;
889   ompt_wait_id_t lck;
890   void *codeptr_ra;
891   if (ompt_enabled.enabled) {
892     OMPT_STORE_RETURN_ADDRESS(gtid);
893     team = __kmp_team_from_gtid(gtid);
894     lck = (ompt_wait_id_t)&team->t.t_ordered.dt.t_value;
895     /* OMPT state update */
896     th->th.ompt_thread_info.wait_id = lck;
897     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
898 
899     /* OMPT event callback */
900     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
901     if (ompt_enabled.ompt_callback_mutex_acquire) {
902       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
903           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
904           (ompt_wait_id_t)lck, codeptr_ra);
905     }
906   }
907 #endif
908 
909   if (th->th.th_dispatch->th_deo_fcn != 0)
910     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
911   else
912     __kmp_parallel_deo(&gtid, &cid, loc);
913 
914 #if OMPT_SUPPORT && OMPT_OPTIONAL
915   if (ompt_enabled.enabled) {
916     /* OMPT state update */
917     th->th.ompt_thread_info.state = ompt_state_work_parallel;
918     th->th.ompt_thread_info.wait_id = 0;
919 
920     /* OMPT event callback */
921     if (ompt_enabled.ompt_callback_mutex_acquired) {
922       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
923           ompt_mutex_ordered, (ompt_wait_id_t)lck, codeptr_ra);
924     }
925   }
926 #endif
927 
928 #if USE_ITT_BUILD
929   __kmp_itt_ordered_start(gtid);
930 #endif /* USE_ITT_BUILD */
931 }
932 
933 /*!
934 @ingroup WORK_SHARING
935 @param loc  source location information.
936 @param gtid  global thread number.
937 
938 End execution of an <tt>ordered</tt> construct.
939 */
940 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
941   int cid = 0;
942   kmp_info_t *th;
943 
944   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
945 
946 #if USE_ITT_BUILD
947   __kmp_itt_ordered_end(gtid);
948 // TODO: ordered_wait_id
949 #endif /* USE_ITT_BUILD */
950 
951   th = __kmp_threads[gtid];
952 
953   if (th->th.th_dispatch->th_dxo_fcn != 0)
954     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
955   else
956     __kmp_parallel_dxo(&gtid, &cid, loc);
957 
958 #if OMPT_SUPPORT && OMPT_OPTIONAL
959   OMPT_STORE_RETURN_ADDRESS(gtid);
960   if (ompt_enabled.ompt_callback_mutex_released) {
961     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
962         ompt_mutex_ordered,
963         (ompt_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
964         OMPT_LOAD_RETURN_ADDRESS(gtid));
965   }
966 #endif
967 }
968 
969 #if KMP_USE_DYNAMIC_LOCK
970 
971 static __forceinline void
972 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
973                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
974   // Pointer to the allocated indirect lock is written to crit, while indexing
975   // is ignored.
976   void *idx;
977   kmp_indirect_lock_t **lck;
978   lck = (kmp_indirect_lock_t **)crit;
979   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
980   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
981   KMP_SET_I_LOCK_LOCATION(ilk, loc);
982   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
983   KA_TRACE(20,
984            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
985 #if USE_ITT_BUILD
986   __kmp_itt_critical_creating(ilk->lock, loc);
987 #endif
988   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
989   if (status == 0) {
990 #if USE_ITT_BUILD
991     __kmp_itt_critical_destroyed(ilk->lock);
992 #endif
993     // We don't really need to destroy the unclaimed lock here since it will be
994     // cleaned up at program exit.
995     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
996   }
997   KMP_DEBUG_ASSERT(*lck != NULL);
998 }
999 
1000 // Fast-path acquire tas lock
1001 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1002   {                                                                            \
1003     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1004     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1005     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1006     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1007         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1008       kmp_uint32 spins;                                                        \
1009       KMP_FSYNC_PREPARE(l);                                                    \
1010       KMP_INIT_YIELD(spins);                                                   \
1011       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1012       do {                                                                     \
1013         if (TCR_4(__kmp_nth) >                                                 \
1014             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1015           KMP_YIELD(TRUE);                                                     \
1016         } else {                                                               \
1017           KMP_YIELD_SPIN(spins);                                               \
1018         }                                                                      \
1019         __kmp_spin_backoff(&backoff);                                          \
1020       } while (                                                                \
1021           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1022           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1023     }                                                                          \
1024     KMP_FSYNC_ACQUIRED(l);                                                     \
1025   }
1026 
1027 // Fast-path test tas lock
1028 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1029   {                                                                            \
1030     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1031     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1032     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1033     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1034          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1035   }
1036 
1037 // Fast-path release tas lock
1038 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1039   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1040 
1041 #if KMP_USE_FUTEX
1042 
1043 #include <sys/syscall.h>
1044 #include <unistd.h>
1045 #ifndef FUTEX_WAIT
1046 #define FUTEX_WAIT 0
1047 #endif
1048 #ifndef FUTEX_WAKE
1049 #define FUTEX_WAKE 1
1050 #endif
1051 
1052 // Fast-path acquire futex lock
1053 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1054   {                                                                            \
1055     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1056     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1057     KMP_MB();                                                                  \
1058     KMP_FSYNC_PREPARE(ftx);                                                    \
1059     kmp_int32 poll_val;                                                        \
1060     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1061                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1062                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1063       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1064       if (!cond) {                                                             \
1065         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1066                                          poll_val |                            \
1067                                              KMP_LOCK_BUSY(1, futex))) {       \
1068           continue;                                                            \
1069         }                                                                      \
1070         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1071       }                                                                        \
1072       kmp_int32 rc;                                                            \
1073       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1074                         NULL, NULL, 0)) != 0) {                                \
1075         continue;                                                              \
1076       }                                                                        \
1077       gtid_code |= 1;                                                          \
1078     }                                                                          \
1079     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1080   }
1081 
1082 // Fast-path test futex lock
1083 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1084   {                                                                            \
1085     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1086     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1087                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1088       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1089       rc = TRUE;                                                               \
1090     } else {                                                                   \
1091       rc = FALSE;                                                              \
1092     }                                                                          \
1093   }
1094 
1095 // Fast-path release futex lock
1096 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1097   {                                                                            \
1098     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1099     KMP_MB();                                                                  \
1100     KMP_FSYNC_RELEASING(ftx);                                                  \
1101     kmp_int32 poll_val =                                                       \
1102         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1103     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1104       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1105               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1106     }                                                                          \
1107     KMP_MB();                                                                  \
1108     KMP_YIELD_OVERSUB();                                                       \
1109   }
1110 
1111 #endif // KMP_USE_FUTEX
1112 
1113 #else // KMP_USE_DYNAMIC_LOCK
1114 
1115 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1116                                                       ident_t const *loc,
1117                                                       kmp_int32 gtid) {
1118   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1119 
1120   // Because of the double-check, the following load doesn't need to be volatile
1121   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1122 
1123   if (lck == NULL) {
1124     void *idx;
1125 
1126     // Allocate & initialize the lock.
1127     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1128     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1129     __kmp_init_user_lock_with_checks(lck);
1130     __kmp_set_user_lock_location(lck, loc);
1131 #if USE_ITT_BUILD
1132     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first use of
// the underlying lock. This is the only place where we can guarantee it.
// There is a chance the lock will be destroyed without ever being used, but
// that is not a problem, because this is not a real event seen by the user;
// it merely sets a name for the object (lock). See kmp_itt.h for details.
1138 #endif /* USE_ITT_BUILD */
1139 
1140     // Use a cmpxchg instruction to slam the start of the critical section with
1141     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1142     // and use the lock that the other thread allocated.
1143     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1144 
1145     if (status == 0) {
1146 // Deallocate the lock and reload the value.
1147 #if USE_ITT_BUILD
1148       __kmp_itt_critical_destroyed(lck);
1149 // Let ITT know the lock is destroyed and the same memory location may be reused
1150 // for another purpose.
1151 #endif /* USE_ITT_BUILD */
1152       __kmp_destroy_user_lock_with_checks(lck);
1153       __kmp_user_lock_free(&idx, gtid, lck);
1154       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1155       KMP_DEBUG_ASSERT(lck != NULL);
1156     }
1157   }
1158   return lck;
1159 }
1160 
1161 #endif // KMP_USE_DYNAMIC_LOCK
1162 
1163 /*!
1164 @ingroup WORK_SHARING
1165 @param loc  source location information.
@param global_tid  global thread number.
1167 @param crit identity of the critical section. This could be a pointer to a lock
1168 associated with the critical section, or some other suitably unique value.
1169 
1170 Enter code protected by a `critical` construct.
1171 This function blocks until the executing thread can enter the critical section.
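
For illustration, a named `critical` construct might be lowered roughly as
follows (a sketch; the zero-initialized static variable standing in for the
critical name is an assumption about typical code generation):
@code
static kmp_critical_name crit_name; // one per critical name, zero-initialized
__kmpc_critical(&loc, gtid, &crit_name);
// ... body of the critical section ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode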
1172 */
1173 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1174                      kmp_critical_name *crit) {
1175 #if KMP_USE_DYNAMIC_LOCK
1176 #if OMPT_SUPPORT && OMPT_OPTIONAL
1177   OMPT_STORE_RETURN_ADDRESS(global_tid);
1178 #endif // OMPT_SUPPORT
1179   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1180 #else
1181   KMP_COUNT_BLOCK(OMP_CRITICAL);
1182 #if OMPT_SUPPORT && OMPT_OPTIONAL
1183   ompt_state_t prev_state = ompt_state_undefined;
1184   ompt_thread_info_t ti;
1185 #endif
1186   kmp_user_lock_p lck;
1187 
1188   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1189 
1190   // TODO: add THR_OVHD_STATE
1191 
1192   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1193   KMP_CHECK_USER_LOCK_INIT();
1194 
1195   if ((__kmp_user_lock_kind == lk_tas) &&
1196       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1197     lck = (kmp_user_lock_p)crit;
1198   }
1199 #if KMP_USE_FUTEX
1200   else if ((__kmp_user_lock_kind == lk_futex) &&
1201            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1202     lck = (kmp_user_lock_p)crit;
1203   }
1204 #endif
1205   else { // ticket, queuing or drdpa
1206     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1207   }
1208 
1209   if (__kmp_env_consistency_check)
1210     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1211 
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team.
// Also, even if we are the uber thread, we still have to acquire the lock,
// as we may have to contend with sibling threads.
1216 
1217 #if USE_ITT_BUILD
1218   __kmp_itt_critical_acquiring(lck);
1219 #endif /* USE_ITT_BUILD */
1220 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1222   void *codeptr_ra = NULL;
1223   if (ompt_enabled.enabled) {
1224     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1225     /* OMPT state update */
1226     prev_state = ti.state;
1227     ti.wait_id = (ompt_wait_id_t)lck;
1228     ti.state = ompt_state_wait_critical;
1229 
1230     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1232     if (ompt_enabled.ompt_callback_mutex_acquire) {
1233       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1234           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1235           (ompt_wait_id_t)crit, codeptr_ra);
1236     }
1237   }
1238 #endif
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1241   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1242 
1243 #if USE_ITT_BUILD
1244   __kmp_itt_critical_acquired(lck);
1245 #endif /* USE_ITT_BUILD */
1246 #if OMPT_SUPPORT && OMPT_OPTIONAL
1247   if (ompt_enabled.enabled) {
1248     /* OMPT state update */
1249     ti.state = prev_state;
1250     ti.wait_id = 0;
1251 
1252     /* OMPT event callback */
1253     if (ompt_enabled.ompt_callback_mutex_acquired) {
1254       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1255           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr_ra);
1256     }
1257   }
1258 #endif
1259   KMP_POP_PARTITIONED_TIMER();
1260 
1261   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1262   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1263 #endif // KMP_USE_DYNAMIC_LOCK
1264 }
1265 
1266 #if KMP_USE_DYNAMIC_LOCK
1267 
1268 // Converts the given hint to an internal lock implementation
1269 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1270 #if KMP_USE_TSX
1271 #define KMP_TSX_LOCK(seq) lockseq_##seq
1272 #else
1273 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1274 #endif
1275 
1276 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1277 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1278 #else
1279 #define KMP_CPUINFO_RTM 0
1280 #endif
1281 
1282   // Hints that do not require further logic
1283   if (hint & kmp_lock_hint_hle)
1284     return KMP_TSX_LOCK(hle);
1285   if (hint & kmp_lock_hint_rtm)
1286     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1287   if (hint & kmp_lock_hint_adaptive)
1288     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1289 
1290   // Rule out conflicting hints first by returning the default lock
1291   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1292     return __kmp_user_lock_seq;
1293   if ((hint & omp_lock_hint_speculative) &&
1294       (hint & omp_lock_hint_nonspeculative))
1295     return __kmp_user_lock_seq;
1296 
1297   // Do not even consider speculation when it appears to be contended
1298   if (hint & omp_lock_hint_contended)
1299     return lockseq_queuing;
1300 
1301   // Uncontended lock without speculation
1302   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1303     return lockseq_tas;
1304 
1305   // HLE lock for speculation
1306   if (hint & omp_lock_hint_speculative)
1307     return KMP_TSX_LOCK(hle);
1308 
1309   return __kmp_user_lock_seq;
1310 }
1311 
1312 #if OMPT_SUPPORT && OMPT_OPTIONAL
1313 #if KMP_USE_DYNAMIC_LOCK
1314 static kmp_mutex_impl_t
1315 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1316   if (user_lock) {
1317     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1318     case 0:
1319       break;
1320 #if KMP_USE_FUTEX
1321     case locktag_futex:
1322       return kmp_mutex_impl_queuing;
1323 #endif
1324     case locktag_tas:
1325       return kmp_mutex_impl_spin;
1326 #if KMP_USE_TSX
1327     case locktag_hle:
1328       return kmp_mutex_impl_speculative;
1329 #endif
1330     default:
1331       return kmp_mutex_impl_none;
1332     }
1333     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1334   }
1335   KMP_ASSERT(ilock);
1336   switch (ilock->type) {
1337 #if KMP_USE_TSX
1338   case locktag_adaptive:
1339   case locktag_rtm:
1340     return kmp_mutex_impl_speculative;
1341 #endif
1342   case locktag_nested_tas:
1343     return kmp_mutex_impl_spin;
1344 #if KMP_USE_FUTEX
1345   case locktag_nested_futex:
1346 #endif
1347   case locktag_ticket:
1348   case locktag_queuing:
1349   case locktag_drdpa:
1350   case locktag_nested_ticket:
1351   case locktag_nested_queuing:
1352   case locktag_nested_drdpa:
1353     return kmp_mutex_impl_queuing;
1354   default:
1355     return kmp_mutex_impl_none;
1356   }
1357 }
1358 #else
1359 // For locks without dynamic binding
1360 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1361   switch (__kmp_user_lock_kind) {
1362   case lk_tas:
1363     return kmp_mutex_impl_spin;
1364 #if KMP_USE_FUTEX
1365   case lk_futex:
1366 #endif
1367   case lk_ticket:
1368   case lk_queuing:
1369   case lk_drdpa:
1370     return kmp_mutex_impl_queuing;
1371 #if KMP_USE_TSX
1372   case lk_hle:
1373   case lk_rtm:
1374   case lk_adaptive:
1375     return kmp_mutex_impl_speculative;
1376 #endif
1377   default:
1378     return kmp_mutex_impl_none;
1379   }
1380 }
1381 #endif // KMP_USE_DYNAMIC_LOCK
1382 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1383 
1384 /*!
1385 @ingroup WORK_SHARING
1386 @param loc  source location information.
1387 @param global_tid  global thread number.
1388 @param crit identity of the critical section. This could be a pointer to a lock
1389 associated with the critical section, or some other suitably unique value.
1390 @param hint the lock hint.
1391 
1392 Enter code protected by a `critical` construct with a hint. The hint value is
1393 used to suggest a lock implementation. This function blocks until the executing
1394 thread can enter the critical section unless the hint suggests use of
1395 speculative execution and the hardware supports it.
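
For illustration,
`#pragma omp critical (name) hint(omp_lock_hint_speculative)` might be
lowered roughly as follows (a sketch; the static name variable is
illustrative):
@code
static kmp_critical_name name_lock;
__kmpc_critical_with_hint(&loc, gtid, &name_lock, omp_lock_hint_speculative);
// ... body of the critical section ...
__kmpc_end_critical(&loc, gtid, &name_lock);
@endcode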
1396 */
1397 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1398                                kmp_critical_name *crit, uint32_t hint) {
1399   KMP_COUNT_BLOCK(OMP_CRITICAL);
1400   kmp_user_lock_p lck;
1401 #if OMPT_SUPPORT && OMPT_OPTIONAL
1402   ompt_state_t prev_state = ompt_state_undefined;
1403   ompt_thread_info_t ti;
  // This is the case if we were called from __kmpc_critical:
1405   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1406   if (!codeptr)
1407     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1408 #endif
1409 
1410   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1411 
1412   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1413   // Check if it is initialized.
1414   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1415   if (*lk == 0) {
1416     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1417     if (KMP_IS_D_LOCK(lckseq)) {
1418       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1419                                   KMP_GET_D_TAG(lckseq));
1420     } else {
1421       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1422     }
1423   }
1424   // Branch for accessing the actual lock object and set operation. This
1425   // branching is inevitable since this lock initialization does not follow the
1426   // normal dispatch path (lock table is not used).
1427   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1428     lck = (kmp_user_lock_p)lk;
1429     if (__kmp_env_consistency_check) {
1430       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1431                       __kmp_map_hint_to_lock(hint));
1432     }
1433 #if USE_ITT_BUILD
1434     __kmp_itt_critical_acquiring(lck);
1435 #endif
1436 #if OMPT_SUPPORT && OMPT_OPTIONAL
1437     if (ompt_enabled.enabled) {
1438       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1439       /* OMPT state update */
1440       prev_state = ti.state;
1441       ti.wait_id = (ompt_wait_id_t)lck;
1442       ti.state = ompt_state_wait_critical;
1443 
1444       /* OMPT event callback */
1445       if (ompt_enabled.ompt_callback_mutex_acquire) {
1446         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1447             ompt_mutex_critical, (unsigned int)hint,
1448             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)crit, codeptr);
1449       }
1450     }
1451 #endif
1452 #if KMP_USE_INLINED_TAS
1453     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1454       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1455     } else
1456 #elif KMP_USE_INLINED_FUTEX
1457     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1458       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1459     } else
1460 #endif
1461     {
1462       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1463     }
1464   } else {
1465     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1466     lck = ilk->lock;
1467     if (__kmp_env_consistency_check) {
1468       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1469                       __kmp_map_hint_to_lock(hint));
1470     }
1471 #if USE_ITT_BUILD
1472     __kmp_itt_critical_acquiring(lck);
1473 #endif
1474 #if OMPT_SUPPORT && OMPT_OPTIONAL
1475     if (ompt_enabled.enabled) {
1476       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1477       /* OMPT state update */
1478       prev_state = ti.state;
1479       ti.wait_id = (ompt_wait_id_t)lck;
1480       ti.state = ompt_state_wait_critical;
1481 
1482       /* OMPT event callback */
1483       if (ompt_enabled.ompt_callback_mutex_acquire) {
1484         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1485             ompt_mutex_critical, (unsigned int)hint,
1486             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)crit, codeptr);
1487       }
1488     }
1489 #endif
1490     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1491   }
1492   KMP_POP_PARTITIONED_TIMER();
1493 
1494 #if USE_ITT_BUILD
1495   __kmp_itt_critical_acquired(lck);
1496 #endif /* USE_ITT_BUILD */
1497 #if OMPT_SUPPORT && OMPT_OPTIONAL
1498   if (ompt_enabled.enabled) {
1499     /* OMPT state update */
1500     ti.state = prev_state;
1501     ti.wait_id = 0;
1502 
1503     /* OMPT event callback */
1504     if (ompt_enabled.ompt_callback_mutex_acquired) {
1505       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1506           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr);
1507     }
1508   }
1509 #endif
1510 
1511   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1512   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1513 } // __kmpc_critical_with_hint
1514 
1515 #endif // KMP_USE_DYNAMIC_LOCK
1516 
1517 /*!
1518 @ingroup WORK_SHARING
1519 @param loc  source location information.
@param global_tid  global thread number.
1521 @param crit identity of the critical section. This could be a pointer to a lock
1522 associated with the critical section, or some other suitably unique value.
1523 
1524 Leave a critical section, releasing any lock that was held during its execution.
1525 */
1526 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1527                          kmp_critical_name *crit) {
1528   kmp_user_lock_p lck;
1529 
1530   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1531 
1532 #if KMP_USE_DYNAMIC_LOCK
1533   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1534     lck = (kmp_user_lock_p)crit;
1535     KMP_ASSERT(lck != NULL);
1536     if (__kmp_env_consistency_check) {
1537       __kmp_pop_sync(global_tid, ct_critical, loc);
1538     }
1539 #if USE_ITT_BUILD
1540     __kmp_itt_critical_releasing(lck);
1541 #endif
1542 #if KMP_USE_INLINED_TAS
1543     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1544       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1545     } else
1546 #elif KMP_USE_INLINED_FUTEX
1547     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1548       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1549     } else
1550 #endif
1551     {
1552       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1553     }
1554   } else {
1555     kmp_indirect_lock_t *ilk =
1556         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1557     KMP_ASSERT(ilk != NULL);
1558     lck = ilk->lock;
1559     if (__kmp_env_consistency_check) {
1560       __kmp_pop_sync(global_tid, ct_critical, loc);
1561     }
1562 #if USE_ITT_BUILD
1563     __kmp_itt_critical_releasing(lck);
1564 #endif
1565     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1566   }
1567 
1568 #else // KMP_USE_DYNAMIC_LOCK
1569 
1570   if ((__kmp_user_lock_kind == lk_tas) &&
1571       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1572     lck = (kmp_user_lock_p)crit;
1573   }
1574 #if KMP_USE_FUTEX
1575   else if ((__kmp_user_lock_kind == lk_futex) &&
1576            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1577     lck = (kmp_user_lock_p)crit;
1578   }
1579 #endif
1580   else { // ticket, queuing or drdpa
1581     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1582   }
1583 
1584   KMP_ASSERT(lck != NULL);
1585 
1586   if (__kmp_env_consistency_check)
1587     __kmp_pop_sync(global_tid, ct_critical, loc);
1588 
1589 #if USE_ITT_BUILD
1590   __kmp_itt_critical_releasing(lck);
1591 #endif /* USE_ITT_BUILD */
  // The value of 'crit' is suitable for use as the critical_id of the
  // critical section directive.
1594   __kmp_release_user_lock_with_checks(lck, global_tid);
1595 
1596 #endif // KMP_USE_DYNAMIC_LOCK
1597 
1598 #if OMPT_SUPPORT && OMPT_OPTIONAL
1599   /* OMPT release event triggers after lock is released; place here to trigger
1600    * for all #if branches */
1601   OMPT_STORE_RETURN_ADDRESS(global_tid);
1602   if (ompt_enabled.ompt_callback_mutex_released) {
1603     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1604         ompt_mutex_critical, (ompt_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
1605   }
1606 #endif
1607 
1608   KMP_POP_PARTITIONED_TIMER();
1609   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1610 }
1611 
1612 /*!
1613 @ingroup SYNCHRONIZATION
1614 @param loc source location information
1615 @param global_tid thread id.
1616 @return one if the thread should execute the master block, zero otherwise
1617 
1618 Start execution of a combined barrier and master. The barrier is executed inside
1619 this function.
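
For illustration (a hedged sketch, not exact compiler output), the construct
is expected to be emitted as:
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // ... code executed only by the thread that owns the master block ...
  __kmpc_end_barrier_master(&loc, gtid);
}
@endcode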
1620 */
1621 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1622   int status;
1623 
1624   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1625 
1626   if (!TCR_4(__kmp_init_parallel))
1627     __kmp_parallel_initialize();
1628 
1629 #if OMP_50_ENABLED
1630   __kmp_resume_if_soft_paused();
1631 #endif
1632 
1633   if (__kmp_env_consistency_check)
1634     __kmp_check_barrier(global_tid, ct_barrier, loc);
1635 
1636 #if OMPT_SUPPORT
1637   ompt_frame_t *ompt_frame;
1638   if (ompt_enabled.enabled) {
1639     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1640     if (ompt_frame->enter_frame.ptr == NULL)
1641       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1642     OMPT_STORE_RETURN_ADDRESS(global_tid);
1643   }
1644 #endif
1645 #if USE_ITT_NOTIFY
1646   __kmp_threads[global_tid]->th.th_ident = loc;
1647 #endif
1648   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1649 #if OMPT_SUPPORT && OMPT_OPTIONAL
1650   if (ompt_enabled.enabled) {
1651     ompt_frame->enter_frame = ompt_data_none;
1652   }
1653 #endif
1654 
1655   return (status != 0) ? 0 : 1;
1656 }
1657 
1658 /*!
1659 @ingroup SYNCHRONIZATION
1660 @param loc source location information
1661 @param global_tid thread id.
1662 
1663 Complete the execution of a combined barrier and master. This function should
1664 only be called at the completion of the <tt>master</tt> code. Other threads will
1665 still be waiting at the barrier and this call releases them.
1666 */
1667 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1668   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1669 
1670   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1671 }
1672 
1673 /*!
1674 @ingroup SYNCHRONIZATION
1675 @param loc source location information
1676 @param global_tid thread id.
1677 @return one if the thread should execute the master block, zero otherwise
1678 
Start execution of a combined barrier and master (nowait) construct.
The barrier is executed inside this function.
There is no equivalent "end" function, since the master block requires no
further runtime action at its end; any cleanup that __kmpc_end_master would
otherwise perform is done inside this call.
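
An illustrative usage sketch (placeholder names, no closing runtime call):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // ... code executed only by the thread that owns the master block ...
}
@endcode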
1682 */
1683 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1684   kmp_int32 ret;
1685 
1686   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1687 
1688   if (!TCR_4(__kmp_init_parallel))
1689     __kmp_parallel_initialize();
1690 
1691 #if OMP_50_ENABLED
1692   __kmp_resume_if_soft_paused();
1693 #endif
1694 
1695   if (__kmp_env_consistency_check) {
1696     if (loc == 0) {
1697       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1698     }
1699     __kmp_check_barrier(global_tid, ct_barrier, loc);
1700   }
1701 
1702 #if OMPT_SUPPORT
1703   ompt_frame_t *ompt_frame;
1704   if (ompt_enabled.enabled) {
1705     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1706     if (ompt_frame->enter_frame.ptr == NULL)
1707       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1708     OMPT_STORE_RETURN_ADDRESS(global_tid);
1709   }
1710 #endif
1711 #if USE_ITT_NOTIFY
1712   __kmp_threads[global_tid]->th.th_ident = loc;
1713 #endif
1714   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1715 #if OMPT_SUPPORT && OMPT_OPTIONAL
1716   if (ompt_enabled.enabled) {
1717     ompt_frame->enter_frame = ompt_data_none;
1718   }
1719 #endif
1720 
1721   ret = __kmpc_master(loc, global_tid);
1722 
1723   if (__kmp_env_consistency_check) {
    /* There is no __kmpc_end_master call for this construct, so the */
    /* actions of __kmpc_end_master are performed here instead.      */
1726 
1727     if (global_tid < 0) {
1728       KMP_WARNING(ThreadIdentInvalid);
1729     }
1730     if (ret) {
1731       /* only one thread should do the pop since only */
1732       /* one did the push (see __kmpc_master())       */
1733 
1734       __kmp_pop_sync(global_tid, ct_master, loc);
1735     }
1736   }
1737 
1738   return (ret);
1739 }
1740 
1741 /* The BARRIER for a SINGLE process section is always explicit   */
1742 /*!
1743 @ingroup WORK_SHARING
1744 @param loc  source location information
1745 @param global_tid  global thread number
1746 @return One if this thread should execute the single construct, zero otherwise.
1747 
1748 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
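
A hedged sketch of the expected lowering; the trailing explicit barrier is
only emitted when the construct has no <tt>nowait</tt> clause:
@code
if (__kmpc_single(&loc, gtid)) {
  // ... code executed by exactly one thread of the team ...
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // omitted when 'nowait' is present
@endcode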
1751 */
1752 
1753 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1754   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1755 
1756   if (rc) {
1757     // We are going to execute the single statement, so we should count it.
1758     KMP_COUNT_BLOCK(OMP_SINGLE);
1759     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1760   }
1761 
1762 #if OMPT_SUPPORT && OMPT_OPTIONAL
1763   kmp_info_t *this_thr = __kmp_threads[global_tid];
1764   kmp_team_t *team = this_thr->th.th_team;
1765   int tid = __kmp_tid_from_gtid(global_tid);
1766 
1767   if (ompt_enabled.enabled) {
1768     if (rc) {
1769       if (ompt_enabled.ompt_callback_work) {
1770         ompt_callbacks.ompt_callback(ompt_callback_work)(
1771             ompt_work_single_executor, ompt_scope_begin,
1772             &(team->t.ompt_team_info.parallel_data),
1773             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1774             1, OMPT_GET_RETURN_ADDRESS(0));
1775       }
1776     } else {
1777       if (ompt_enabled.ompt_callback_work) {
1778         ompt_callbacks.ompt_callback(ompt_callback_work)(
1779             ompt_work_single_other, ompt_scope_begin,
1780             &(team->t.ompt_team_info.parallel_data),
1781             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1782             1, OMPT_GET_RETURN_ADDRESS(0));
1783         ompt_callbacks.ompt_callback(ompt_callback_work)(
1784             ompt_work_single_other, ompt_scope_end,
1785             &(team->t.ompt_team_info.parallel_data),
1786             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1787             1, OMPT_GET_RETURN_ADDRESS(0));
1788       }
1789     }
1790   }
1791 #endif
1792 
1793   return rc;
1794 }
1795 
1796 /*!
1797 @ingroup WORK_SHARING
1798 @param loc  source location information
1799 @param global_tid  global thread number
1800 
1801 Mark the end of a <tt>single</tt> construct.  This function should
1802 only be called by the thread that executed the block of code protected
1803 by the `single` construct.
1804 */
1805 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1806   __kmp_exit_single(global_tid);
1807   KMP_POP_PARTITIONED_TIMER();
1808 
1809 #if OMPT_SUPPORT && OMPT_OPTIONAL
1810   kmp_info_t *this_thr = __kmp_threads[global_tid];
1811   kmp_team_t *team = this_thr->th.th_team;
1812   int tid = __kmp_tid_from_gtid(global_tid);
1813 
1814   if (ompt_enabled.ompt_callback_work) {
1815     ompt_callbacks.ompt_callback(ompt_callback_work)(
1816         ompt_work_single_executor, ompt_scope_end,
1817         &(team->t.ompt_team_info.parallel_data),
1818         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1819         OMPT_GET_RETURN_ADDRESS(0));
1820   }
1821 #endif
1822 }
1823 
1824 /*!
1825 @ingroup WORK_SHARING
1826 @param loc Source location
1827 @param global_tid Global thread id
1828 
1829 Mark the end of a statically scheduled loop.
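
For context, a statically scheduled loop is bracketed by an init call
(implemented in kmp_sched.cpp) and this fini call. A rough, illustrative
sketch for a 32-bit loop with default static scheduling, where <tt>N</tt> is
a placeholder trip count:
@code
kmp_int32 last = 0, lower = 0, upper = N - 1, stride = 1;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1, 0);
for (kmp_int32 i = lower; i <= upper; ++i) {
  // ... loop body executed for this thread's chunk ...
}
__kmpc_for_static_fini(&loc, gtid);
@endcode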
1830 */
1831 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1832   KMP_POP_PARTITIONED_TIMER();
1833   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1834 
1835 #if OMPT_SUPPORT && OMPT_OPTIONAL
1836   if (ompt_enabled.ompt_callback_work) {
1837     ompt_work_t ompt_work_type = ompt_work_loop;
1838     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1839     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1840     // Determine workshare type
1841     if (loc != NULL) {
1842       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1843         ompt_work_type = ompt_work_loop;
1844       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1845         ompt_work_type = ompt_work_sections;
1846       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1847         ompt_work_type = ompt_work_distribute;
1848       } else {
1849         // use default set above.
1850         // a warning about this case is provided in __kmpc_for_static_init
1851       }
1852       KMP_DEBUG_ASSERT(ompt_work_type);
1853     }
1854     ompt_callbacks.ompt_callback(ompt_callback_work)(
1855         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1856         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1857   }
1858 #endif
1859   if (__kmp_env_consistency_check)
1860     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1861 }
1862 
1863 // User routines which take C-style arguments (call by value)
1864 // different from the Fortran equivalent routines
1865 
1866 void ompc_set_num_threads(int arg) {
1867   // !!!!! TODO: check the per-task binding
1868   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1869 }
1870 
1871 void ompc_set_dynamic(int flag) {
1872   kmp_info_t *thread;
1873 
1874   /* For the thread-private implementation of the internal controls */
1875   thread = __kmp_entry_thread();
1876 
1877   __kmp_save_internal_controls(thread);
1878 
1879   set__dynamic(thread, flag ? TRUE : FALSE);
1880 }
1881 
1882 void ompc_set_nested(int flag) {
1883   kmp_info_t *thread;
1884 
1885   /* For the thread-private internal controls implementation */
1886   thread = __kmp_entry_thread();
1887 
1888   __kmp_save_internal_controls(thread);
1889 
1890   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1891 }
1892 
1893 void ompc_set_max_active_levels(int max_active_levels) {
1894   /* TO DO */
1895   /* we want per-task implementation of this internal control */
1896 
1897   /* For the per-thread internal controls implementation */
1898   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1899 }
1900 
1901 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1902   // !!!!! TODO: check the per-task binding
1903   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1904 }
1905 
1906 int ompc_get_ancestor_thread_num(int level) {
1907   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1908 }
1909 
1910 int ompc_get_team_size(int level) {
1911   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1912 }
1913 
1914 #if OMP_50_ENABLED
1915 /* OpenMP 5.0 Affinity Format API */
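// These entry points back the OpenMP 5.0 affinity-format routines declared in
// omp.h. An illustrative (non-normative) user-level usage, with an arbitrary
// buffer size and example format string:
//
//   char buf[256];
//   omp_set_affinity_format("host=%H thread=%0.4n affinity=%A");
//   #pragma omp parallel
//   { omp_display_affinity(NULL); } // NULL => use the affinity-format ICV
//   size_t n = omp_capture_affinity(buf, sizeof(buf), NULL);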
1916 
1917 void ompc_set_affinity_format(char const *format) {
1918   if (!__kmp_init_serial) {
1919     __kmp_serial_initialize();
1920   }
1921   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1922                          format, KMP_STRLEN(format) + 1);
1923 }
1924 
1925 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1926   size_t format_size;
1927   if (!__kmp_init_serial) {
1928     __kmp_serial_initialize();
1929   }
1930   format_size = KMP_STRLEN(__kmp_affinity_format);
1931   if (buffer && size) {
1932     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1933                            format_size + 1);
1934   }
1935   return format_size;
1936 }
1937 
1938 void ompc_display_affinity(char const *format) {
1939   int gtid;
1940   if (!TCR_4(__kmp_init_middle)) {
1941     __kmp_middle_initialize();
1942   }
1943   gtid = __kmp_get_gtid();
1944   __kmp_aux_display_affinity(gtid, format);
1945 }
1946 
1947 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1948                              char const *format) {
1949   int gtid;
1950   size_t num_required;
1951   kmp_str_buf_t capture_buf;
1952   if (!TCR_4(__kmp_init_middle)) {
1953     __kmp_middle_initialize();
1954   }
1955   gtid = __kmp_get_gtid();
1956   __kmp_str_buf_init(&capture_buf);
1957   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1958   if (buffer && buf_size) {
1959     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1960                            capture_buf.used + 1);
1961   }
1962   __kmp_str_buf_free(&capture_buf);
1963   return num_required;
1964 }
1965 #endif /* OMP_50_ENABLED */
1966 
1967 void kmpc_set_stacksize(int arg) {
1968   // __kmp_aux_set_stacksize initializes the library if needed
1969   __kmp_aux_set_stacksize(arg);
1970 }
1971 
1972 void kmpc_set_stacksize_s(size_t arg) {
1973   // __kmp_aux_set_stacksize initializes the library if needed
1974   __kmp_aux_set_stacksize(arg);
1975 }
1976 
1977 void kmpc_set_blocktime(int arg) {
1978   int gtid, tid;
1979   kmp_info_t *thread;
1980 
1981   gtid = __kmp_entry_gtid();
1982   tid = __kmp_tid_from_gtid(gtid);
1983   thread = __kmp_thread_from_gtid(gtid);
1984 
1985   __kmp_aux_set_blocktime(arg, thread, tid);
1986 }
1987 
1988 void kmpc_set_library(int arg) {
1989   // __kmp_user_set_library initializes the library if needed
1990   __kmp_user_set_library((enum library_type)arg);
1991 }
1992 
1993 void kmpc_set_defaults(char const *str) {
1994   // __kmp_aux_set_defaults initializes the library if needed
1995   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1996 }
1997 
1998 void kmpc_set_disp_num_buffers(int arg) {
1999   // ignore after initialization because some teams have already
2000   // allocated dispatch buffers
2001   if (__kmp_init_serial == 0 && arg > 0)
2002     __kmp_dispatch_num_buffers = arg;
2003 }
2004 
2005 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2006 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2007   return -1;
2008 #else
2009   if (!TCR_4(__kmp_init_middle)) {
2010     __kmp_middle_initialize();
2011   }
2012   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2013 #endif
2014 }
2015 
2016 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2017 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2018   return -1;
2019 #else
2020   if (!TCR_4(__kmp_init_middle)) {
2021     __kmp_middle_initialize();
2022   }
2023   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2024 #endif
2025 }
2026 
2027 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2028 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2029   return -1;
2030 #else
2031   if (!TCR_4(__kmp_init_middle)) {
2032     __kmp_middle_initialize();
2033   }
2034   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2035 #endif
2036 }
2037 
2038 /* -------------------------------------------------------------------------- */
2039 /*!
2040 @ingroup THREADPRIVATE
2041 @param loc       source location information
2042 @param gtid      global thread number
2043 @param cpy_size  size of the cpy_data buffer
2044 @param cpy_data  pointer to data to be copied
2045 @param cpy_func  helper function to call for copying data
2046 @param didit     flag variable: 1=single thread; 0=not single thread
2047 
2048 __kmpc_copyprivate implements the interface for the private data broadcast
2049 needed for the copyprivate clause associated with a single region in an
2050 OpenMP<sup>*</sup> program (both C and Fortran).
2051 All threads participating in the parallel region call this routine.
2052 One of the threads (called the single thread) should have the <tt>didit</tt>
2053 variable set to 1 and all other threads should have that variable set to 0.
2054 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2055 
2056 The OpenMP specification forbids the use of nowait on the single region when a
2057 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2058 barrier internally to avoid race conditions, so the code generation for the
2059 single region should avoid generating a barrier after the call to @ref
2060 __kmpc_copyprivate.
2061 
2062 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2063 The <tt>loc</tt> parameter is a pointer to source location information.
2064 
Internal implementation: The single thread will first copy its descriptor
address (cpy_data) to a team-private location, then the other threads will each
call the function pointed to by the parameter cpy_func, which copies from the
single thread's buffer into the calling thread's cpy_data buffer.
2069 
2070 The cpy_func routine used for the copy and the contents of the data area defined
2071 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2072 to be done. For instance, the cpy_data buffer can hold the actual data to be
2073 copied or it may hold a list of pointers to the data. The cpy_func routine must
2074 interpret the cpy_data buffer appropriately.
2075 
2076 The interface to cpy_func is as follows:
2077 @code
2078 void cpy_func( void *destination, void *source )
2079 @endcode
2080 where void *destination is the cpy_data pointer for the thread being copied to
2081 and void *source is the cpy_data pointer for the thread being copied from.
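
For illustration only (not tied to any particular compiler), a cpy_func that
broadcasts a single <tt>int</tt> held directly in the cpy_data buffer could be:
@code
static void copy_int(void *destination, void *source) {
  *(int *)destination = *(int *)source; // copy the value broadcast by the
                                        // single thread
}
// every thread calls, passing the address of its private copy:
//   __kmpc_copyprivate(&loc, gtid, sizeof(int), &priv, copy_int, didit);
@endcode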
2082 */
2083 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2084                         void *cpy_data, void (*cpy_func)(void *, void *),
2085                         kmp_int32 didit) {
2086   void **data_ptr;
2087 
2088   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2089 
2090   KMP_MB();
2091 
2092   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2093 
2094   if (__kmp_env_consistency_check) {
2095     if (loc == 0) {
2096       KMP_WARNING(ConstructIdentInvalid);
2097     }
2098   }
2099 
2100   // ToDo: Optimize the following two barriers into some kind of split barrier
2101 
2102   if (didit)
2103     *data_ptr = cpy_data;
2104 
2105 #if OMPT_SUPPORT
2106   ompt_frame_t *ompt_frame;
2107   if (ompt_enabled.enabled) {
2108     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2109     if (ompt_frame->enter_frame.ptr == NULL)
2110       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2111     OMPT_STORE_RETURN_ADDRESS(gtid);
2112   }
2113 #endif
2114 /* This barrier is not a barrier region boundary */
2115 #if USE_ITT_NOTIFY
2116   __kmp_threads[gtid]->th.th_ident = loc;
2117 #endif
2118   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2119 
2120   if (!didit)
2121     (*cpy_func)(cpy_data, *data_ptr);
2122 
2123 // Consider next barrier a user-visible barrier for barrier region boundaries
2124 // Nesting checks are already handled by the single construct checks
2125 
2126 #if OMPT_SUPPORT
2127   if (ompt_enabled.enabled) {
2128     OMPT_STORE_RETURN_ADDRESS(gtid);
2129   }
2130 #endif
2131 #if USE_ITT_NOTIFY
2132   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2133 // tasks can overwrite the location)
2134 #endif
2135   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2136 #if OMPT_SUPPORT && OMPT_OPTIONAL
2137   if (ompt_enabled.enabled) {
2138     ompt_frame->enter_frame = ompt_data_none;
2139   }
2140 #endif
2141 }
2142 
2143 /* -------------------------------------------------------------------------- */
2144 
2145 #define INIT_LOCK __kmp_init_user_lock_with_checks
2146 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2147 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2148 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2149 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2150 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2151   __kmp_acquire_nested_user_lock_with_checks_timed
2152 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2153 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2154 #define TEST_LOCK __kmp_test_user_lock_with_checks
2155 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2156 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2157 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2158 
2159 // TODO: Make check abort messages use location info & pass it into
2160 // with_checks routines
2161 
2162 #if KMP_USE_DYNAMIC_LOCK
2163 
2164 // internal lock initializer
2165 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2166                                                     kmp_dyna_lockseq_t seq) {
2167   if (KMP_IS_D_LOCK(seq)) {
2168     KMP_INIT_D_LOCK(lock, seq);
2169 #if USE_ITT_BUILD
2170     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2171 #endif
2172   } else {
2173     KMP_INIT_I_LOCK(lock, seq);
2174 #if USE_ITT_BUILD
2175     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2176     __kmp_itt_lock_creating(ilk->lock, loc);
2177 #endif
2178   }
2179 }
2180 
2181 // internal nest lock initializer
2182 static __forceinline void
2183 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2184                                kmp_dyna_lockseq_t seq) {
2185 #if KMP_USE_TSX
2186   // Don't have nested lock implementation for speculative locks
2187   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2188     seq = __kmp_user_lock_seq;
2189 #endif
2190   switch (seq) {
2191   case lockseq_tas:
2192     seq = lockseq_nested_tas;
2193     break;
2194 #if KMP_USE_FUTEX
2195   case lockseq_futex:
2196     seq = lockseq_nested_futex;
2197     break;
2198 #endif
2199   case lockseq_ticket:
2200     seq = lockseq_nested_ticket;
2201     break;
2202   case lockseq_queuing:
2203     seq = lockseq_nested_queuing;
2204     break;
2205   case lockseq_drdpa:
2206     seq = lockseq_nested_drdpa;
2207     break;
2208   default:
2209     seq = lockseq_nested_queuing;
2210   }
2211   KMP_INIT_I_LOCK(lock, seq);
2212 #if USE_ITT_BUILD
2213   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2214   __kmp_itt_lock_creating(ilk->lock, loc);
2215 #endif
2216 }
2217 
2218 /* initialize the lock with a hint */
2219 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2220                                 uintptr_t hint) {
2221   KMP_DEBUG_ASSERT(__kmp_init_serial);
2222   if (__kmp_env_consistency_check && user_lock == NULL) {
2223     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2224   }
2225 
2226   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2227 
2228 #if OMPT_SUPPORT && OMPT_OPTIONAL
2229   // This is the case, if called from omp_init_lock_with_hint:
2230   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2231   if (!codeptr)
2232     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2233   if (ompt_enabled.ompt_callback_lock_init) {
2234     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2235         ompt_mutex_lock, (omp_lock_hint_t)hint,
2236         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2237         codeptr);
2238   }
2239 #endif
2240 }
2241 
/* initialize the nest lock with a hint */
2243 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2244                                      void **user_lock, uintptr_t hint) {
2245   KMP_DEBUG_ASSERT(__kmp_init_serial);
2246   if (__kmp_env_consistency_check && user_lock == NULL) {
2247     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2248   }
2249 
2250   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2251 
2252 #if OMPT_SUPPORT && OMPT_OPTIONAL
2253   // This is the case, if called from omp_init_lock_with_hint:
2254   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2255   if (!codeptr)
2256     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2257   if (ompt_enabled.ompt_callback_lock_init) {
2258     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2259         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2260         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2261         codeptr);
2262   }
2263 #endif
2264 }
2265 
2266 #endif // KMP_USE_DYNAMIC_LOCK
2267 
2268 /* initialize the lock */
2269 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2270 #if KMP_USE_DYNAMIC_LOCK
2271 
2272   KMP_DEBUG_ASSERT(__kmp_init_serial);
2273   if (__kmp_env_consistency_check && user_lock == NULL) {
2274     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2275   }
2276   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2277 
2278 #if OMPT_SUPPORT && OMPT_OPTIONAL
2279   // This is the case, if called from omp_init_lock_with_hint:
2280   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2281   if (!codeptr)
2282     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2283   if (ompt_enabled.ompt_callback_lock_init) {
2284     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2285         ompt_mutex_lock, omp_lock_hint_none,
2286         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2287         codeptr);
2288   }
2289 #endif
2290 
2291 #else // KMP_USE_DYNAMIC_LOCK
2292 
2293   static char const *const func = "omp_init_lock";
2294   kmp_user_lock_p lck;
2295   KMP_DEBUG_ASSERT(__kmp_init_serial);
2296 
2297   if (__kmp_env_consistency_check) {
2298     if (user_lock == NULL) {
2299       KMP_FATAL(LockIsUninitialized, func);
2300     }
2301   }
2302 
2303   KMP_CHECK_USER_LOCK_INIT();
2304 
2305   if ((__kmp_user_lock_kind == lk_tas) &&
2306       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2307     lck = (kmp_user_lock_p)user_lock;
2308   }
2309 #if KMP_USE_FUTEX
2310   else if ((__kmp_user_lock_kind == lk_futex) &&
2311            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2312     lck = (kmp_user_lock_p)user_lock;
2313   }
2314 #endif
2315   else {
2316     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2317   }
2318   INIT_LOCK(lck);
2319   __kmp_set_user_lock_location(lck, loc);
2320 
2321 #if OMPT_SUPPORT && OMPT_OPTIONAL
2322   // This is the case, if called from omp_init_lock_with_hint:
2323   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2324   if (!codeptr)
2325     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2326   if (ompt_enabled.ompt_callback_lock_init) {
2327     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2328         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2329         (ompt_wait_id_t)user_lock, codeptr);
2330   }
2331 #endif
2332 
2333 #if USE_ITT_BUILD
2334   __kmp_itt_lock_creating(lck);
2335 #endif /* USE_ITT_BUILD */
2336 
2337 #endif // KMP_USE_DYNAMIC_LOCK
2338 } // __kmpc_init_lock
2339 
/* initialize the nest lock */
2341 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2342 #if KMP_USE_DYNAMIC_LOCK
2343 
2344   KMP_DEBUG_ASSERT(__kmp_init_serial);
2345   if (__kmp_env_consistency_check && user_lock == NULL) {
2346     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2347   }
2348   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2349 
2350 #if OMPT_SUPPORT && OMPT_OPTIONAL
2351   // This is the case, if called from omp_init_lock_with_hint:
2352   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2353   if (!codeptr)
2354     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2355   if (ompt_enabled.ompt_callback_lock_init) {
2356     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2357         ompt_mutex_nest_lock, omp_lock_hint_none,
2358         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2359         codeptr);
2360   }
2361 #endif
2362 
2363 #else // KMP_USE_DYNAMIC_LOCK
2364 
2365   static char const *const func = "omp_init_nest_lock";
2366   kmp_user_lock_p lck;
2367   KMP_DEBUG_ASSERT(__kmp_init_serial);
2368 
2369   if (__kmp_env_consistency_check) {
2370     if (user_lock == NULL) {
2371       KMP_FATAL(LockIsUninitialized, func);
2372     }
2373   }
2374 
2375   KMP_CHECK_USER_LOCK_INIT();
2376 
2377   if ((__kmp_user_lock_kind == lk_tas) &&
2378       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2379        OMP_NEST_LOCK_T_SIZE)) {
2380     lck = (kmp_user_lock_p)user_lock;
2381   }
2382 #if KMP_USE_FUTEX
2383   else if ((__kmp_user_lock_kind == lk_futex) &&
2384            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2385             OMP_NEST_LOCK_T_SIZE)) {
2386     lck = (kmp_user_lock_p)user_lock;
2387   }
2388 #endif
2389   else {
2390     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2391   }
2392 
2393   INIT_NESTED_LOCK(lck);
2394   __kmp_set_user_lock_location(lck, loc);
2395 
2396 #if OMPT_SUPPORT && OMPT_OPTIONAL
2397   // This is the case, if called from omp_init_lock_with_hint:
2398   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2399   if (!codeptr)
2400     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2401   if (ompt_enabled.ompt_callback_lock_init) {
2402     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2403         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2404         (ompt_wait_id_t)user_lock, codeptr);
2405   }
2406 #endif
2407 
2408 #if USE_ITT_BUILD
2409   __kmp_itt_lock_creating(lck);
2410 #endif /* USE_ITT_BUILD */
2411 
2412 #endif // KMP_USE_DYNAMIC_LOCK
2413 } // __kmpc_init_nest_lock
2414 
2415 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2416 #if KMP_USE_DYNAMIC_LOCK
2417 
2418 #if USE_ITT_BUILD
2419   kmp_user_lock_p lck;
2420   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2421     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2422   } else {
2423     lck = (kmp_user_lock_p)user_lock;
2424   }
2425   __kmp_itt_lock_destroyed(lck);
2426 #endif
2427 #if OMPT_SUPPORT && OMPT_OPTIONAL
2428   // This is the case, if called from omp_init_lock_with_hint:
2429   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2430   if (!codeptr)
2431     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2432   if (ompt_enabled.ompt_callback_lock_destroy) {
2433     kmp_user_lock_p lck;
2434     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2435       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2436     } else {
2437       lck = (kmp_user_lock_p)user_lock;
2438     }
2439     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2440         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2441   }
2442 #endif
2443   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2444 #else
2445   kmp_user_lock_p lck;
2446 
2447   if ((__kmp_user_lock_kind == lk_tas) &&
2448       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2449     lck = (kmp_user_lock_p)user_lock;
2450   }
2451 #if KMP_USE_FUTEX
2452   else if ((__kmp_user_lock_kind == lk_futex) &&
2453            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2454     lck = (kmp_user_lock_p)user_lock;
2455   }
2456 #endif
2457   else {
2458     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2459   }
2460 
2461 #if OMPT_SUPPORT && OMPT_OPTIONAL
2462   // This is the case, if called from omp_init_lock_with_hint:
2463   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2464   if (!codeptr)
2465     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2466   if (ompt_enabled.ompt_callback_lock_destroy) {
2467     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2468         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2469   }
2470 #endif
2471 
2472 #if USE_ITT_BUILD
2473   __kmp_itt_lock_destroyed(lck);
2474 #endif /* USE_ITT_BUILD */
2475   DESTROY_LOCK(lck);
2476 
2477   if ((__kmp_user_lock_kind == lk_tas) &&
2478       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2479     ;
2480   }
2481 #if KMP_USE_FUTEX
2482   else if ((__kmp_user_lock_kind == lk_futex) &&
2483            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2484     ;
2485   }
2486 #endif
2487   else {
2488     __kmp_user_lock_free(user_lock, gtid, lck);
2489   }
2490 #endif // KMP_USE_DYNAMIC_LOCK
2491 } // __kmpc_destroy_lock
2492 
/* destroy the nest lock */
2494 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2495 #if KMP_USE_DYNAMIC_LOCK
2496 
2497 #if USE_ITT_BUILD
2498   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2499   __kmp_itt_lock_destroyed(ilk->lock);
2500 #endif
2501 #if OMPT_SUPPORT && OMPT_OPTIONAL
2502   // This is the case, if called from omp_init_lock_with_hint:
2503   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2504   if (!codeptr)
2505     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2506   if (ompt_enabled.ompt_callback_lock_destroy) {
2507     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2508         ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2509   }
2510 #endif
2511   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2512 
2513 #else // KMP_USE_DYNAMIC_LOCK
2514 
2515   kmp_user_lock_p lck;
2516 
2517   if ((__kmp_user_lock_kind == lk_tas) &&
2518       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2519        OMP_NEST_LOCK_T_SIZE)) {
2520     lck = (kmp_user_lock_p)user_lock;
2521   }
2522 #if KMP_USE_FUTEX
2523   else if ((__kmp_user_lock_kind == lk_futex) &&
2524            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2525             OMP_NEST_LOCK_T_SIZE)) {
2526     lck = (kmp_user_lock_p)user_lock;
2527   }
2528 #endif
2529   else {
2530     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2531   }
2532 
2533 #if OMPT_SUPPORT && OMPT_OPTIONAL
2534   // This is the case, if called from omp_init_lock_with_hint:
2535   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2536   if (!codeptr)
2537     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2538   if (ompt_enabled.ompt_callback_lock_destroy) {
2539     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2540         ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2541   }
2542 #endif
2543 
2544 #if USE_ITT_BUILD
2545   __kmp_itt_lock_destroyed(lck);
2546 #endif /* USE_ITT_BUILD */
2547 
2548   DESTROY_NESTED_LOCK(lck);
2549 
2550   if ((__kmp_user_lock_kind == lk_tas) &&
2551       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2552        OMP_NEST_LOCK_T_SIZE)) {
2553     ;
2554   }
2555 #if KMP_USE_FUTEX
2556   else if ((__kmp_user_lock_kind == lk_futex) &&
2557            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2558             OMP_NEST_LOCK_T_SIZE)) {
2559     ;
2560   }
2561 #endif
2562   else {
2563     __kmp_user_lock_free(user_lock, gtid, lck);
2564   }
2565 #endif // KMP_USE_DYNAMIC_LOCK
2566 } // __kmpc_destroy_nest_lock
2567 
2568 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2569   KMP_COUNT_BLOCK(OMP_set_lock);
2570 #if KMP_USE_DYNAMIC_LOCK
2571   int tag = KMP_EXTRACT_D_TAG(user_lock);
2572 #if USE_ITT_BUILD
2573   __kmp_itt_lock_acquiring(
2574       (kmp_user_lock_p)
2575           user_lock); // itt function will get to the right lock object.
2576 #endif
2577 #if OMPT_SUPPORT && OMPT_OPTIONAL
2578   // This is the case, if called from omp_init_lock_with_hint:
2579   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2580   if (!codeptr)
2581     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2582   if (ompt_enabled.ompt_callback_mutex_acquire) {
2583     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2584         ompt_mutex_lock, omp_lock_hint_none,
2585         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2586         codeptr);
2587   }
2588 #endif
2589 #if KMP_USE_INLINED_TAS
2590   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2591     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2592   } else
2593 #elif KMP_USE_INLINED_FUTEX
2594   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2595     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2596   } else
2597 #endif
2598   {
2599     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2600   }
2601 #if USE_ITT_BUILD
2602   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2603 #endif
2604 #if OMPT_SUPPORT && OMPT_OPTIONAL
2605   if (ompt_enabled.ompt_callback_mutex_acquired) {
2606     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2607         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2608   }
2609 #endif
2610 
2611 #else // KMP_USE_DYNAMIC_LOCK
2612 
2613   kmp_user_lock_p lck;
2614 
2615   if ((__kmp_user_lock_kind == lk_tas) &&
2616       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2617     lck = (kmp_user_lock_p)user_lock;
2618   }
2619 #if KMP_USE_FUTEX
2620   else if ((__kmp_user_lock_kind == lk_futex) &&
2621            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2622     lck = (kmp_user_lock_p)user_lock;
2623   }
2624 #endif
2625   else {
2626     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2627   }
2628 
2629 #if USE_ITT_BUILD
2630   __kmp_itt_lock_acquiring(lck);
2631 #endif /* USE_ITT_BUILD */
2632 #if OMPT_SUPPORT && OMPT_OPTIONAL
2633   // This is the case, if called from omp_init_lock_with_hint:
2634   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2635   if (!codeptr)
2636     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2637   if (ompt_enabled.ompt_callback_mutex_acquire) {
2638     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2639         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2640         (ompt_wait_id_t)lck, codeptr);
2641   }
2642 #endif
2643 
2644   ACQUIRE_LOCK(lck, gtid);
2645 
2646 #if USE_ITT_BUILD
2647   __kmp_itt_lock_acquired(lck);
2648 #endif /* USE_ITT_BUILD */
2649 
2650 #if OMPT_SUPPORT && OMPT_OPTIONAL
2651   if (ompt_enabled.ompt_callback_mutex_acquired) {
2652     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2653         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2654   }
2655 #endif
2656 
2657 #endif // KMP_USE_DYNAMIC_LOCK
2658 }
2659 
2660 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2661 #if KMP_USE_DYNAMIC_LOCK
2662 
2663 #if USE_ITT_BUILD
2664   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2665 #endif
2666 #if OMPT_SUPPORT && OMPT_OPTIONAL
2667   // This is the case, if called from omp_init_lock_with_hint:
2668   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2669   if (!codeptr)
2670     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2671   if (ompt_enabled.enabled) {
2672     if (ompt_enabled.ompt_callback_mutex_acquire) {
2673       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2674           ompt_mutex_nest_lock, omp_lock_hint_none,
2675           __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2676           codeptr);
2677     }
2678   }
2679 #endif
2680   int acquire_status =
2681       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2682   (void) acquire_status;
2683 #if USE_ITT_BUILD
2684   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2685 #endif
2686 
2687 #if OMPT_SUPPORT && OMPT_OPTIONAL
2688   if (ompt_enabled.enabled) {
2689     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2690       if (ompt_enabled.ompt_callback_mutex_acquired) {
2691         // lock_first
2692         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2693             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2694       }
2695     } else {
2696       if (ompt_enabled.ompt_callback_nest_lock) {
2697         // lock_next
2698         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2699             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
2700       }
2701     }
2702   }
2703 #endif
2704 
2705 #else // KMP_USE_DYNAMIC_LOCK
2706   int acquire_status;
2707   kmp_user_lock_p lck;
2708 
2709   if ((__kmp_user_lock_kind == lk_tas) &&
2710       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2711        OMP_NEST_LOCK_T_SIZE)) {
2712     lck = (kmp_user_lock_p)user_lock;
2713   }
2714 #if KMP_USE_FUTEX
2715   else if ((__kmp_user_lock_kind == lk_futex) &&
2716            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2717             OMP_NEST_LOCK_T_SIZE)) {
2718     lck = (kmp_user_lock_p)user_lock;
2719   }
2720 #endif
2721   else {
2722     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2723   }
2724 
2725 #if USE_ITT_BUILD
2726   __kmp_itt_lock_acquiring(lck);
2727 #endif /* USE_ITT_BUILD */
2728 #if OMPT_SUPPORT && OMPT_OPTIONAL
2729   // This is the case, if called from omp_init_lock_with_hint:
2730   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2731   if (!codeptr)
2732     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2733   if (ompt_enabled.enabled) {
2734     if (ompt_enabled.ompt_callback_mutex_acquire) {
2735       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2736           ompt_mutex_nest_lock, omp_lock_hint_none,
2737           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
2738     }
2739   }
2740 #endif
2741 
2742   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2743 
2744 #if USE_ITT_BUILD
2745   __kmp_itt_lock_acquired(lck);
2746 #endif /* USE_ITT_BUILD */
2747 
2748 #if OMPT_SUPPORT && OMPT_OPTIONAL
2749   if (ompt_enabled.enabled) {
2750     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2751       if (ompt_enabled.ompt_callback_mutex_acquired) {
2752         // lock_first
2753         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2754             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2755       }
2756     } else {
2757       if (ompt_enabled.ompt_callback_nest_lock) {
2758         // lock_next
2759         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2760             ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
2761       }
2762     }
2763   }
2764 #endif
2765 
2766 #endif // KMP_USE_DYNAMIC_LOCK
2767 }
2768 
2769 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2770 #if KMP_USE_DYNAMIC_LOCK
2771 
2772   int tag = KMP_EXTRACT_D_TAG(user_lock);
2773 #if USE_ITT_BUILD
2774   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2775 #endif
2776 #if KMP_USE_INLINED_TAS
2777   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2778     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2779   } else
2780 #elif KMP_USE_INLINED_FUTEX
2781   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2782     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2783   } else
2784 #endif
2785   {
2786     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2787   }
2788 
2789 #if OMPT_SUPPORT && OMPT_OPTIONAL
2790   // This is the case, if called from omp_init_lock_with_hint:
2791   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2792   if (!codeptr)
2793     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2794   if (ompt_enabled.ompt_callback_mutex_released) {
2795     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2796         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2797   }
2798 #endif
2799 
2800 #else // KMP_USE_DYNAMIC_LOCK
2801 
2802   kmp_user_lock_p lck;
2803 
2804   /* Can't use serial interval since not block structured */
2805   /* release the lock */
2806 
2807   if ((__kmp_user_lock_kind == lk_tas) &&
2808       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2809 #if KMP_OS_LINUX &&                                                            \
2810     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2811 // "fast" path implemented to fix customer performance issue
2812 #if USE_ITT_BUILD
2813     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2814 #endif /* USE_ITT_BUILD */
2815     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2816     KMP_MB();
2817 
2818 #if OMPT_SUPPORT && OMPT_OPTIONAL
2819     // This is the case, if called from omp_init_lock_with_hint:
2820     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2821     if (!codeptr)
2822       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2823     if (ompt_enabled.ompt_callback_mutex_released) {
2824       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
          ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2826     }
2827 #endif
2828 
2829     return;
2830 #else
2831     lck = (kmp_user_lock_p)user_lock;
2832 #endif
2833   }
2834 #if KMP_USE_FUTEX
2835   else if ((__kmp_user_lock_kind == lk_futex) &&
2836            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2837     lck = (kmp_user_lock_p)user_lock;
2838   }
2839 #endif
2840   else {
2841     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2842   }
2843 
2844 #if USE_ITT_BUILD
2845   __kmp_itt_lock_releasing(lck);
2846 #endif /* USE_ITT_BUILD */
2847 
2848   RELEASE_LOCK(lck, gtid);
2849 
2850 #if OMPT_SUPPORT && OMPT_OPTIONAL
2851   // This is the case, if called from omp_init_lock_with_hint:
2852   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2853   if (!codeptr)
2854     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2855   if (ompt_enabled.ompt_callback_mutex_released) {
2856     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2857         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2858   }
2859 #endif
2860 
2861 #endif // KMP_USE_DYNAMIC_LOCK
2862 }
2863 
2864 /* release the lock */
2865 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2866 #if KMP_USE_DYNAMIC_LOCK
2867 
2868 #if USE_ITT_BUILD
2869   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2870 #endif
2871   int release_status =
2872       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2873   (void) release_status;
2874 
2875 #if OMPT_SUPPORT && OMPT_OPTIONAL
2876   // This is the case, if called from omp_init_lock_with_hint:
2877   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2878   if (!codeptr)
2879     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2880   if (ompt_enabled.enabled) {
2881     if (release_status == KMP_LOCK_RELEASED) {
2882       if (ompt_enabled.ompt_callback_mutex_released) {
2883         // release_lock_last
2884         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2885             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2886       }
2887     } else if (ompt_enabled.ompt_callback_nest_lock) {
2888       // release_lock_prev
2889       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2890           ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr);
2891     }
2892   }
2893 #endif
2894 
2895 #else // KMP_USE_DYNAMIC_LOCK
2896 
2897   kmp_user_lock_p lck;
2898 
2899   /* Can't use serial interval since not block structured */
2900 
2901   if ((__kmp_user_lock_kind == lk_tas) &&
2902       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2903        OMP_NEST_LOCK_T_SIZE)) {
2904 #if KMP_OS_LINUX &&                                                            \
2905     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2906     // "fast" path implemented to fix customer performance issue
2907     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2908 #if USE_ITT_BUILD
2909     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2910 #endif /* USE_ITT_BUILD */
2911 
2912 #if OMPT_SUPPORT && OMPT_OPTIONAL
2913     int release_status = KMP_LOCK_STILL_HELD;
2914 #endif
2915 
2916     if (--(tl->lk.depth_locked) == 0) {
2917       TCW_4(tl->lk.poll, 0);
2918 #if OMPT_SUPPORT && OMPT_OPTIONAL
2919       release_status = KMP_LOCK_RELEASED;
2920 #endif
2921     }
2922     KMP_MB();
2923 
2924 #if OMPT_SUPPORT && OMPT_OPTIONAL
2925     // This is the case, if called from omp_init_lock_with_hint:
2926     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2927     if (!codeptr)
2928       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2929     if (ompt_enabled.enabled) {
2930       if (release_status == KMP_LOCK_RELEASED) {
2931         if (ompt_enabled.ompt_callback_mutex_released) {
2932           // release_lock_last
2933           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
              ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2935         }
2936       } else if (ompt_enabled.ompt_callback_nest_lock) {
2937         // release_lock_previous
2938         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_mutex_scope_end, (ompt_wait_id_t)user_lock, codeptr);
2940       }
2941     }
2942 #endif
2943 
2944     return;
2945 #else
2946     lck = (kmp_user_lock_p)user_lock;
2947 #endif
2948   }
2949 #if KMP_USE_FUTEX
2950   else if ((__kmp_user_lock_kind == lk_futex) &&
2951            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2952             OMP_NEST_LOCK_T_SIZE)) {
2953     lck = (kmp_user_lock_p)user_lock;
2954   }
2955 #endif
2956   else {
2957     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2958   }
2959 
2960 #if USE_ITT_BUILD
2961   __kmp_itt_lock_releasing(lck);
2962 #endif /* USE_ITT_BUILD */
2963 
2964   int release_status;
2965   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2966 #if OMPT_SUPPORT && OMPT_OPTIONAL
2967   // This is the case, if called from omp_init_lock_with_hint:
2968   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2969   if (!codeptr)
2970     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2971   if (ompt_enabled.enabled) {
2972     if (release_status == KMP_LOCK_RELEASED) {
2973       if (ompt_enabled.ompt_callback_mutex_released) {
2974         // release_lock_last
2975         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2976             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2977       }
2978     } else if (ompt_enabled.ompt_callback_nest_lock) {
2979       // release_lock_previous
2980       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2981           ompt_mutex_scope_end, (ompt_wait_id_t)lck, codeptr);
2982     }
2983   }
2984 #endif
2985 
2986 #endif // KMP_USE_DYNAMIC_LOCK
2987 }
2988 
2989 /* try to acquire the lock */
2990 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2991   KMP_COUNT_BLOCK(OMP_test_lock);
2992 
2993 #if KMP_USE_DYNAMIC_LOCK
2994   int rc;
2995   int tag = KMP_EXTRACT_D_TAG(user_lock);
2996 #if USE_ITT_BUILD
2997   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2998 #endif
2999 #if OMPT_SUPPORT && OMPT_OPTIONAL
3000   // This is the case, if called from omp_init_lock_with_hint:
3001   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3002   if (!codeptr)
3003     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3004   if (ompt_enabled.ompt_callback_mutex_acquire) {
3005     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3006         ompt_mutex_lock, omp_lock_hint_none,
3007         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
3008         codeptr);
3009   }
3010 #endif
3011 #if KMP_USE_INLINED_TAS
3012   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3013     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3014   } else
3015 #elif KMP_USE_INLINED_FUTEX
3016   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3017     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3018   } else
3019 #endif
3020   {
3021     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3022   }
3023   if (rc) {
3024 #if USE_ITT_BUILD
3025     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3026 #endif
3027 #if OMPT_SUPPORT && OMPT_OPTIONAL
3028     if (ompt_enabled.ompt_callback_mutex_acquired) {
3029       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3030           ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
3031     }
3032 #endif
3033     return FTN_TRUE;
3034   } else {
3035 #if USE_ITT_BUILD
3036     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3037 #endif
3038     return FTN_FALSE;
3039   }
3040 
3041 #else // KMP_USE_DYNAMIC_LOCK
3042 
3043   kmp_user_lock_p lck;
3044   int rc;
3045 
3046   if ((__kmp_user_lock_kind == lk_tas) &&
3047       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3048     lck = (kmp_user_lock_p)user_lock;
3049   }
3050 #if KMP_USE_FUTEX
3051   else if ((__kmp_user_lock_kind == lk_futex) &&
3052            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3053     lck = (kmp_user_lock_p)user_lock;
3054   }
3055 #endif
3056   else {
3057     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3058   }
3059 
3060 #if USE_ITT_BUILD
3061   __kmp_itt_lock_acquiring(lck);
3062 #endif /* USE_ITT_BUILD */
3063 #if OMPT_SUPPORT && OMPT_OPTIONAL
3064   // This is the case, if called from omp_init_lock_with_hint:
3065   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3066   if (!codeptr)
3067     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3068   if (ompt_enabled.ompt_callback_mutex_acquire) {
3069     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3070         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3071         (ompt_wait_id_t)lck, codeptr);
3072   }
3073 #endif
3074 
3075   rc = TEST_LOCK(lck, gtid);
3076 #if USE_ITT_BUILD
3077   if (rc) {
3078     __kmp_itt_lock_acquired(lck);
3079   } else {
3080     __kmp_itt_lock_cancelled(lck);
3081   }
3082 #endif /* USE_ITT_BUILD */
3083 #if OMPT_SUPPORT && OMPT_OPTIONAL
3084   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3085     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3086         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
3087   }
3088 #endif
3089 
3090   return (rc ? FTN_TRUE : FTN_FALSE);
3091 
3092 /* Can't use serial interval since not block structured */
3093 
3094 #endif // KMP_USE_DYNAMIC_LOCK
3095 }
3096 
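/* Illustrative usage sketch (comment only, not part of the runtime build): a
   minimal user program exercising the simple-lock test path through the
   standard OpenMP API from <omp.h>. Whether a given compiler routes these
   calls through __kmpc_test_lock() is an assumption about its code generator.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_lock_t l;
       omp_init_lock(&l);
     #pragma omp parallel num_threads(4)
       {
         if (omp_test_lock(&l)) { // nonzero only if the lock was acquired
           printf("thread %d got the lock\n", omp_get_thread_num());
           omp_unset_lock(&l);
         }
       }
       omp_destroy_lock(&l);
       return 0;
     }
*/
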
3097 /* try to acquire the lock */
3098 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3099 #if KMP_USE_DYNAMIC_LOCK
3100   int rc;
3101 #if USE_ITT_BUILD
3102   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3103 #endif
3104 #if OMPT_SUPPORT && OMPT_OPTIONAL
3105   // This is the case, if called from omp_init_lock_with_hint:
3106   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3107   if (!codeptr)
3108     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3109   if (ompt_enabled.ompt_callback_mutex_acquire) {
3110     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3111         ompt_mutex_nest_lock, omp_lock_hint_none,
3112         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
3113         codeptr);
3114   }
3115 #endif
3116   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3117 #if USE_ITT_BUILD
3118   if (rc) {
3119     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3120   } else {
3121     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3122   }
3123 #endif
3124 #if OMPT_SUPPORT && OMPT_OPTIONAL
3125   if (ompt_enabled.enabled && rc) {
3126     if (rc == 1) {
3127       if (ompt_enabled.ompt_callback_mutex_acquired) {
3128         // lock_first
3129         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3130             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
3131       }
3132     } else {
3133       if (ompt_enabled.ompt_callback_nest_lock) {
3134         // lock_next
3135         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3136             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
3137       }
3138     }
3139   }
3140 #endif
3141   return rc;
3142 
3143 #else // KMP_USE_DYNAMIC_LOCK
3144 
3145   kmp_user_lock_p lck;
3146   int rc;
3147 
3148   if ((__kmp_user_lock_kind == lk_tas) &&
3149       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3150        OMP_NEST_LOCK_T_SIZE)) {
3151     lck = (kmp_user_lock_p)user_lock;
3152   }
3153 #if KMP_USE_FUTEX
3154   else if ((__kmp_user_lock_kind == lk_futex) &&
3155            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3156             OMP_NEST_LOCK_T_SIZE)) {
3157     lck = (kmp_user_lock_p)user_lock;
3158   }
3159 #endif
3160   else {
3161     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3162   }
3163 
3164 #if USE_ITT_BUILD
3165   __kmp_itt_lock_acquiring(lck);
3166 #endif /* USE_ITT_BUILD */
3167 
3168 #if OMPT_SUPPORT && OMPT_OPTIONAL
3169   // This is the case, if called from omp_init_lock_with_hint:
3170   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3171   if (!codeptr)
3172     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3173   if (ompt_enabled.enabled &&
3174       ompt_enabled.ompt_callback_mutex_acquire) {
3175     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3176         ompt_mutex_nest_lock, omp_lock_hint_none,
3177         __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
3178   }
3179 #endif
3180 
3181   rc = TEST_NESTED_LOCK(lck, gtid);
3182 #if USE_ITT_BUILD
3183   if (rc) {
3184     __kmp_itt_lock_acquired(lck);
3185   } else {
3186     __kmp_itt_lock_cancelled(lck);
3187   }
3188 #endif /* USE_ITT_BUILD */
3189 #if OMPT_SUPPORT && OMPT_OPTIONAL
3190   if (ompt_enabled.enabled && rc) {
3191     if (rc == 1) {
3192       if (ompt_enabled.ompt_callback_mutex_acquired) {
3193         // lock_first
3194         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3195             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
3196       }
3197     } else {
3198       if (ompt_enabled.ompt_callback_nest_lock) {
3199         // lock_next
3200         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3201             ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
3202       }
3203     }
3204   }
3205 #endif
3206   return rc;
3207 
3208 /* Can't use serial interval since not block structured */
3209 
3210 #endif // KMP_USE_DYNAMIC_LOCK
3211 }
3212 
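/* Illustrative usage sketch (comment only): the nestable variant.
   omp_test_nest_lock() returns the new nesting count on success and 0 on
   failure, which is why the OMPT code above distinguishes rc == 1 (first
   acquisition) from rc > 1 (re-acquisition by the owning thread). Whether a
   compiler lowers these calls to __kmpc_test_nest_lock() is an assumption.

     #include <omp.h>

     int demo_nest_lock(void) {
       omp_nest_lock_t l;
       omp_init_nest_lock(&l);
       int first = omp_test_nest_lock(&l);  // 1: first acquisition
       int second = omp_test_nest_lock(&l); // 2: owner re-acquires
       omp_unset_nest_lock(&l);
       omp_unset_nest_lock(&l);
       omp_destroy_nest_lock(&l);
       return first + second; // 3
     }
*/
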
3213 // Interface to fast scalable reduce methods routines
3214 
3215 // keep the selected method in a thread local structure for cross-function
3216 // usage: will be used in __kmpc_end_reduce* functions;
3217 // another solution: to re-determine the method one more time in
3218 // __kmpc_end_reduce* functions (new prototype required then)
3219 // AT: which solution is better?
3220 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3221   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3222 
3223 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3224   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3225 
3226 // description of the packed_reduction_method variable: look at the macros in
3227 // kmp.h
3228 
3229 // used in a critical section reduce block
3230 static __forceinline void
3231 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3232                                           kmp_critical_name *crit) {
3233 
3234   // this lock was visible to a customer and to the threading profile tool as a
3235   // serial overhead span (although it's used for an internal purpose only)
3236   //            why was it visible in previous implementation?
3237   //            should we keep it visible in new reduce block?
3238   kmp_user_lock_p lck;
3239 
3240 #if KMP_USE_DYNAMIC_LOCK
3241 
3242   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3243   // Check if it is initialized.
3244   if (*lk == 0) {
3245     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3246       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3247                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3248     } else {
3249       __kmp_init_indirect_csptr(crit, loc, global_tid,
3250                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3251     }
3252   }
3253   // Branch for accessing the actual lock object and set operation. This
3254   // branching is inevitable since this lock initialization does not follow the
3255   // normal dispatch path (lock table is not used).
3256   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3257     lck = (kmp_user_lock_p)lk;
3258     KMP_DEBUG_ASSERT(lck != NULL);
3259     if (__kmp_env_consistency_check) {
3260       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3261     }
3262     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3263   } else {
3264     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3265     lck = ilk->lock;
3266     KMP_DEBUG_ASSERT(lck != NULL);
3267     if (__kmp_env_consistency_check) {
3268       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3269     }
3270     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3271   }
3272 
3273 #else // KMP_USE_DYNAMIC_LOCK
3274 
3275   // We know that the fast reduction code is only emitted by Intel compilers
3276   // with 32 byte critical sections. If there isn't enough space, then we
3277   // have to use a pointer.
3278   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3279     lck = (kmp_user_lock_p)crit;
3280   } else {
3281     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3282   }
3283   KMP_DEBUG_ASSERT(lck != NULL);
3284 
3285   if (__kmp_env_consistency_check)
3286     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3287 
3288   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3289 
3290 #endif // KMP_USE_DYNAMIC_LOCK
3291 }
3292 
3293 // used in a critical section reduce block
3294 static __forceinline void
3295 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3296                                         kmp_critical_name *crit) {
3297 
3298   kmp_user_lock_p lck;
3299 
3300 #if KMP_USE_DYNAMIC_LOCK
3301 
3302   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3303     lck = (kmp_user_lock_p)crit;
3304     if (__kmp_env_consistency_check)
3305       __kmp_pop_sync(global_tid, ct_critical, loc);
3306     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3307   } else {
3308     kmp_indirect_lock_t *ilk =
3309         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3310     if (__kmp_env_consistency_check)
3311       __kmp_pop_sync(global_tid, ct_critical, loc);
3312     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3313   }
3314 
3315 #else // KMP_USE_DYNAMIC_LOCK
3316 
3317   // We know that the fast reduction code is only emitted by Intel compilers
3318   // with 32 byte critical sections. If there isn't enough space, then we have
3319   // to use a pointer.
3320   if (__kmp_base_user_lock_size > 32) {
3321     lck = *((kmp_user_lock_p *)crit);
3322     KMP_ASSERT(lck != NULL);
3323   } else {
3324     lck = (kmp_user_lock_p)crit;
3325   }
3326 
3327   if (__kmp_env_consistency_check)
3328     __kmp_pop_sync(global_tid, ct_critical, loc);
3329 
3330   __kmp_release_user_lock_with_checks(lck, global_tid);
3331 
3332 #endif // KMP_USE_DYNAMIC_LOCK
3333 } // __kmp_end_critical_section_reduce_block
3334 
3335 #if OMP_40_ENABLED
3336 static __forceinline int
3337 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3338                                      int *task_state) {
3339   kmp_team_t *team;
3340 
3341   // Check if we are inside a teams construct.
3342   if (th->th.th_teams_microtask) {
3343     *team_p = team = th->th.th_team;
3344     if (team->t.t_level == th->th.th_teams_level) {
3345       // This is reduction at teams construct.
3346       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3347       // Let's swap teams temporarily for the reduction.
3348       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3349       th->th.th_team = team->t.t_parent;
3350       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3351       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3352       *task_state = th->th.th_task_state;
3353       th->th.th_task_state = 0;
3354 
3355       return 1;
3356     }
3357   }
3358   return 0;
3359 }
3360 
3361 static __forceinline void
3362 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3363   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3364   th->th.th_info.ds.ds_tid = 0;
3365   th->th.th_team = team;
3366   th->th.th_team_nproc = team->t.t_nproc;
3367   th->th.th_task_team = team->t.t_task_team[task_state];
3368   th->th.th_task_state = task_state;
3369 }
3370 #endif
3371 
3372 /* 2.a.i. Reduce Block without a terminating barrier */
3373 /*!
3374 @ingroup SYNCHRONIZATION
3375 @param loc source location information
3376 @param global_tid global thread number
3377 @param num_vars number of items (variables) to be reduced
3378 @param reduce_size size of data in bytes to be reduced
3379 @param reduce_data pointer to data to be reduced
3380 @param reduce_func callback function providing reduction operation on two
3381 operands and returning result of reduction in lhs_data
3382 @param lck pointer to the unique lock data structure
3383 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3384 threads if atomic reduction needed
3385 
3386 The nowait version is used for a reduce clause with the nowait argument.
3387 */
3388 kmp_int32
3389 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3390                      size_t reduce_size, void *reduce_data,
3391                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3392                      kmp_critical_name *lck) {
3393 
3394   KMP_COUNT_BLOCK(REDUCE_nowait);
3395   int retval = 0;
3396   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3397 #if OMP_40_ENABLED
3398   kmp_info_t *th;
3399   kmp_team_t *team;
3400   int teams_swapped = 0, task_state;
3401 #endif
3402   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3403 
3404   // why do we need this initialization here at all?
3405   // Reduction clause can not be used as a stand-alone directive.
3406 
3407   // do not call __kmp_serial_initialize(), it will be called by
3408   // __kmp_parallel_initialize() if needed
3409   // possible detection of false-positive race by the threadchecker ???
3410   if (!TCR_4(__kmp_init_parallel))
3411     __kmp_parallel_initialize();
3412 
3413 #if OMP_50_ENABLED
3414   __kmp_resume_if_soft_paused();
3415 #endif
3416 
3417 // check correctness of reduce block nesting
3418 #if KMP_USE_DYNAMIC_LOCK
3419   if (__kmp_env_consistency_check)
3420     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3421 #else
3422   if (__kmp_env_consistency_check)
3423     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3424 #endif
3425 
3426 #if OMP_40_ENABLED
3427   th = __kmp_thread_from_gtid(global_tid);
3428   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3429 #endif // OMP_40_ENABLED
3430 
3431   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3432   // the value should be kept in a variable
3433   // the variable should be either a construct-specific or thread-specific
3434   // property, not a team specific property
3435   //     (a thread can reach the next reduce block on the next construct, reduce
3436   //     method may differ on the next construct)
3437   // an ident_t "loc" parameter could be used as a construct-specific property
3438   // (what if loc == 0?)
3439   //     (if both construct-specific and team-specific variables were shared,
3440   //     then unnecessary extra syncs would be needed)
3441   // a thread-specific variable is better regarding two issues above (next
3442   // construct and extra syncs)
3443   // a thread-specific "th_local.reduction_method" variable is used currently
3444   // each thread executes the 'determine' and 'set' lines (there is no need to
3445   // restrict this to one thread, which would only add unnecessary syncs)
3446 
3447   packed_reduction_method = __kmp_determine_reduction_method(
3448       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3449   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3450 
3451   if (packed_reduction_method == critical_reduce_block) {
3452 
3453     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3454     retval = 1;
3455 
3456   } else if (packed_reduction_method == empty_reduce_block) {
3457 
3458     // usage: if team size == 1, no synchronization is required ( Intel
3459     // platforms only )
3460     retval = 1;
3461 
3462   } else if (packed_reduction_method == atomic_reduce_block) {
3463 
3464     retval = 2;
3465 
3466     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3467     // won't be called by the code gen)
3468     //     (it's not quite good, because the checking block has been closed by
3469     //     this 'pop',
3470     //      but atomic operation has not been executed yet, will be executed
3471     //      slightly later, literally on next instruction)
3472     if (__kmp_env_consistency_check)
3473       __kmp_pop_sync(global_tid, ct_reduce, loc);
3474 
3475   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3476                                    tree_reduce_block)) {
3477 
3478 // AT: performance issue: a real barrier here
3479 // AT:     (if master goes slow, other threads are blocked here waiting for the
3480 // master to come and release them)
3481 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3482 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3483 // be confusing to a customer)
3484 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3485 // might go faster and be more in line with sense of NOWAIT
3486 // AT: TO DO: do epcc test and compare times
3487 
3488 // this barrier should be invisible to a customer and to the threading profile
3489 // tool (it's neither a terminating barrier nor customer's code, it's
3490 // used for an internal purpose)
3491 #if OMPT_SUPPORT
3492     // JP: can this barrier potentially lead to task scheduling?
3493     // JP: as long as there is a barrier in the implementation, OMPT should and
3494     // will provide the barrier events
3495     //         so we set-up the necessary frame/return addresses.
3496     ompt_frame_t *ompt_frame;
3497     if (ompt_enabled.enabled) {
3498       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3499       if (ompt_frame->enter_frame.ptr == NULL)
3500         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3501       OMPT_STORE_RETURN_ADDRESS(global_tid);
3502     }
3503 #endif
3504 #if USE_ITT_NOTIFY
3505     __kmp_threads[global_tid]->th.th_ident = loc;
3506 #endif
3507     retval =
3508         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3509                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3510     retval = (retval != 0) ? (0) : (1);
3511 #if OMPT_SUPPORT && OMPT_OPTIONAL
3512     if (ompt_enabled.enabled) {
3513       ompt_frame->enter_frame = ompt_data_none;
3514     }
3515 #endif
3516 
3517     // all other workers except master should do this pop here
3518     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3519     if (__kmp_env_consistency_check) {
3520       if (retval == 0) {
3521         __kmp_pop_sync(global_tid, ct_reduce, loc);
3522       }
3523     }
3524 
3525   } else {
3526 
3527     // should never reach this block
3528     KMP_ASSERT(0); // "unexpected method"
3529   }
3530 #if OMP_40_ENABLED
3531   if (teams_swapped) {
3532     __kmp_restore_swapped_teams(th, team, task_state);
3533   }
3534 #endif
3535   KA_TRACE(
3536       10,
3537       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3538        global_tid, packed_reduction_method, retval));
3539 
3540   return retval;
3541 }
3542 
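/* Codegen sketch (comment only, not the authoritative lowering): one way a
   compiler might drive __kmpc_reduce_nowait() for a construct such as
   `#pragma omp parallel for reduction(+ : sum) nowait`. The names loc, gtid,
   priv, shared, my_reduce_func and my_crit are hypothetical placeholders for
   values the compiler-generated code already has; only the __kmpc_* calls and
   the meaning of the return value come from the routine above.

     static void my_reduce_func(void *lhs, void *rhs) {
       *(long *)lhs += *(long *)rhs; // combine two partial sums
     }

     static kmp_critical_name my_crit; // zero-initialized lock/name storage

     // In the outlined parallel body, with `priv` holding this thread's
     // partial sum and `shared` the original reduction variable:
     switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(long), &priv,
                                  my_reduce_func, &my_crit)) {
     case 1: // master (or every thread, for the critical-section method)
       shared += priv;
       __kmpc_end_reduce_nowait(loc, gtid, &my_crit);
       break;
     case 2: // atomic method: each thread combines atomically
       // e.g. under #pragma omp atomic
       shared += priv;
       break;
     default: // 0: other workers after a tree reduction; nothing left to do
       break;
     }
*/
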
3543 /*!
3544 @ingroup SYNCHRONIZATION
3545 @param loc source location information
3546 @param global_tid global thread id.
3547 @param lck pointer to the unique lock data structure
3548 
3549 Finish the execution of a reduce nowait.
3550 */
3551 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3552                               kmp_critical_name *lck) {
3553 
3554   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3555 
3556   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3557 
3558   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3559 
3560   if (packed_reduction_method == critical_reduce_block) {
3561 
3562     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3563 
3564   } else if (packed_reduction_method == empty_reduce_block) {
3565 
3566     // usage: if team size == 1, no synchronization is required ( on Intel
3567     // platforms only )
3568 
3569   } else if (packed_reduction_method == atomic_reduce_block) {
3570 
3571     // neither master nor other workers should get here
3572     //     (code gen does not generate this call in case 2: atomic reduce block)
3573     // actually it would be better to remove this else-if entirely;
3574     // after removal the value would be caught by the 'else' branch and assert
3575 
3576   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3577                                    tree_reduce_block)) {
3578 
3579     // only master gets here
3580 
3581   } else {
3582 
3583     // should never reach this block
3584     KMP_ASSERT(0); // "unexpected method"
3585   }
3586 
3587   if (__kmp_env_consistency_check)
3588     __kmp_pop_sync(global_tid, ct_reduce, loc);
3589 
3590   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3591                 global_tid, packed_reduction_method));
3592 
3593   return;
3594 }
3595 
3596 /* 2.a.ii. Reduce Block with a terminating barrier */
3597 
3598 /*!
3599 @ingroup SYNCHRONIZATION
3600 @param loc source location information
3601 @param global_tid global thread number
3602 @param num_vars number of items (variables) to be reduced
3603 @param reduce_size size of data in bytes to be reduced
3604 @param reduce_data pointer to data to be reduced
3605 @param reduce_func callback function providing reduction operation on two
3606 operands and returning result of reduction in lhs_data
3607 @param lck pointer to the unique lock data structure
3608 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3609 threads if atomic reduction needed
3610 
3611 A blocking reduce that includes an implicit barrier.
3612 */
3613 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3614                         size_t reduce_size, void *reduce_data,
3615                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3616                         kmp_critical_name *lck) {
3617   KMP_COUNT_BLOCK(REDUCE_wait);
3618   int retval = 0;
3619   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3620 #if OMP_40_ENABLED
3621   kmp_info_t *th;
3622   kmp_team_t *team;
3623   int teams_swapped = 0, task_state;
3624 #endif
3625 
3626   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3627 
3628   // why do we need this initialization here at all?
3629   // Reduction clause can not be a stand-alone directive.
3630 
3631   // do not call __kmp_serial_initialize(), it will be called by
3632   // __kmp_parallel_initialize() if needed
3633   // possible detection of false-positive race by the threadchecker ???
3634   if (!TCR_4(__kmp_init_parallel))
3635     __kmp_parallel_initialize();
3636 
3637 #if OMP_50_ENABLED
3638   __kmp_resume_if_soft_paused();
3639 #endif
3640 
3641 // check correctness of reduce block nesting
3642 #if KMP_USE_DYNAMIC_LOCK
3643   if (__kmp_env_consistency_check)
3644     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3645 #else
3646   if (__kmp_env_consistency_check)
3647     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3648 #endif
3649 
3650 #if OMP_40_ENABLED
3651   th = __kmp_thread_from_gtid(global_tid);
3652   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3653 #endif // OMP_40_ENABLED
3654 
3655   packed_reduction_method = __kmp_determine_reduction_method(
3656       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3657   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3658 
3659   if (packed_reduction_method == critical_reduce_block) {
3660 
3661     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3662     retval = 1;
3663 
3664   } else if (packed_reduction_method == empty_reduce_block) {
3665 
3666     // usage: if team size == 1, no synchronization is required ( Intel
3667     // platforms only )
3668     retval = 1;
3669 
3670   } else if (packed_reduction_method == atomic_reduce_block) {
3671 
3672     retval = 2;
3673 
3674   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3675                                    tree_reduce_block)) {
3676 
3677 // case tree_reduce_block:
3678 // this barrier should be visible to a customer and to the threading profile
3679 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3680 #if OMPT_SUPPORT
3681     ompt_frame_t *ompt_frame;
3682     if (ompt_enabled.enabled) {
3683       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3684       if (ompt_frame->enter_frame.ptr == NULL)
3685         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3686       OMPT_STORE_RETURN_ADDRESS(global_tid);
3687     }
3688 #endif
3689 #if USE_ITT_NOTIFY
3690     __kmp_threads[global_tid]->th.th_ident =
3691         loc; // needed for correct notification of frames
3692 #endif
3693     retval =
3694         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3695                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3696     retval = (retval != 0) ? (0) : (1);
3697 #if OMPT_SUPPORT && OMPT_OPTIONAL
3698     if (ompt_enabled.enabled) {
3699       ompt_frame->enter_frame = ompt_data_none;
3700     }
3701 #endif
3702 
3703     // all other workers except master should do this pop here
3704     // ( none of other workers except master will enter __kmpc_end_reduce() )
3705     if (__kmp_env_consistency_check) {
3706       if (retval == 0) { // 0: all other workers; 1: master
3707         __kmp_pop_sync(global_tid, ct_reduce, loc);
3708       }
3709     }
3710 
3711   } else {
3712 
3713     // should never reach this block
3714     KMP_ASSERT(0); // "unexpected method"
3715   }
3716 #if OMP_40_ENABLED
3717   if (teams_swapped) {
3718     __kmp_restore_swapped_teams(th, team, task_state);
3719   }
3720 #endif
3721 
3722   KA_TRACE(10,
3723            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3724             global_tid, packed_reduction_method, retval));
3725 
3726   return retval;
3727 }
3728 
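/* Source-level sketch (comment only): the kind of construct that reaches the
   blocking reduce path, i.e. a reduction whose worksharing loop has no
   `nowait` clause, so the terminating barrier described above is required.
   This is plain OpenMP; the mapping to __kmpc_reduce()/__kmpc_end_reduce()
   is compiler-dependent.

     #include <omp.h>

     long sum_of_squares(const long *a, int n) {
       long sum = 0;
     #pragma omp parallel for reduction(+ : sum)
       for (int i = 0; i < n; ++i)
         sum += a[i] * a[i];
       return sum; // every thread has passed the implicit barrier here
     }
*/
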
3729 /*!
3730 @ingroup SYNCHRONIZATION
3731 @param loc source location information
3732 @param global_tid global thread id.
3733 @param lck pointer to the unique lock data structure
3734 
3735 Finish the execution of a blocking reduce.
3736 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3737 start function.
3738 */
3739 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3740                        kmp_critical_name *lck) {
3741 
3742   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3743 #if OMP_40_ENABLED
3744   kmp_info_t *th;
3745   kmp_team_t *team;
3746   int teams_swapped = 0, task_state;
3747 #endif
3748 
3749   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3750 
3751 #if OMP_40_ENABLED
3752   th = __kmp_thread_from_gtid(global_tid);
3753   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3754 #endif // OMP_40_ENABLED
3755 
3756   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3757 
3758   // this barrier should be visible to a customer and to the threading profile
3759   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3760 
3761   if (packed_reduction_method == critical_reduce_block) {
3762 
3763     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3764 
3765 // TODO: implicit barrier: should be exposed
3766 #if OMPT_SUPPORT
3767     ompt_frame_t *ompt_frame;
3768     if (ompt_enabled.enabled) {
3769       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3770       if (ompt_frame->enter_frame.ptr == NULL)
3771         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3772       OMPT_STORE_RETURN_ADDRESS(global_tid);
3773     }
3774 #endif
3775 #if USE_ITT_NOTIFY
3776     __kmp_threads[global_tid]->th.th_ident = loc;
3777 #endif
3778     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3779 #if OMPT_SUPPORT && OMPT_OPTIONAL
3780     if (ompt_enabled.enabled) {
3781       ompt_frame->enter_frame = ompt_data_none;
3782     }
3783 #endif
3784 
3785   } else if (packed_reduction_method == empty_reduce_block) {
3786 
3787 // usage: if team size==1, no synchronization is required (Intel platforms only)
3788 
3789 // TODO: implicit barrier: should be exposed
3790 #if OMPT_SUPPORT
3791     ompt_frame_t *ompt_frame;
3792     if (ompt_enabled.enabled) {
3793       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3794       if (ompt_frame->enter_frame.ptr == NULL)
3795         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3796       OMPT_STORE_RETURN_ADDRESS(global_tid);
3797     }
3798 #endif
3799 #if USE_ITT_NOTIFY
3800     __kmp_threads[global_tid]->th.th_ident = loc;
3801 #endif
3802     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3803 #if OMPT_SUPPORT && OMPT_OPTIONAL
3804     if (ompt_enabled.enabled) {
3805       ompt_frame->enter_frame = ompt_data_none;
3806     }
3807 #endif
3808 
3809   } else if (packed_reduction_method == atomic_reduce_block) {
3810 
3811 #if OMPT_SUPPORT
3812     ompt_frame_t *ompt_frame;
3813     if (ompt_enabled.enabled) {
3814       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3815       if (ompt_frame->enter_frame.ptr == NULL)
3816         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3817       OMPT_STORE_RETURN_ADDRESS(global_tid);
3818     }
3819 #endif
3820 // TODO: implicit barrier: should be exposed
3821 #if USE_ITT_NOTIFY
3822     __kmp_threads[global_tid]->th.th_ident = loc;
3823 #endif
3824     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3825 #if OMPT_SUPPORT && OMPT_OPTIONAL
3826     if (ompt_enabled.enabled) {
3827       ompt_frame->enter_frame = ompt_data_none;
3828     }
3829 #endif
3830 
3831   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3832                                    tree_reduce_block)) {
3833 
3834     // only master executes here (master releases all other workers)
3835     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3836                             global_tid);
3837 
3838   } else {
3839 
3840     // should never reach this block
3841     KMP_ASSERT(0); // "unexpected method"
3842   }
3843 #if OMP_40_ENABLED
3844   if (teams_swapped) {
3845     __kmp_restore_swapped_teams(th, team, task_state);
3846   }
3847 #endif
3848 
3849   if (__kmp_env_consistency_check)
3850     __kmp_pop_sync(global_tid, ct_reduce, loc);
3851 
3852   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3853                 global_tid, packed_reduction_method));
3854 
3855   return;
3856 }
3857 
3858 #undef __KMP_GET_REDUCTION_METHOD
3859 #undef __KMP_SET_REDUCTION_METHOD
3860 
3861 /* end of interface to fast scalable reduce routines */
3862 
3863 kmp_uint64 __kmpc_get_taskid() {
3864 
3865   kmp_int32 gtid;
3866   kmp_info_t *thread;
3867 
3868   gtid = __kmp_get_gtid();
3869   if (gtid < 0) {
3870     return 0;
3871   }
3872   thread = __kmp_thread_from_gtid(gtid);
3873   return thread->th.th_current_task->td_task_id;
3874 
3875 } // __kmpc_get_taskid
3876 
3877 kmp_uint64 __kmpc_get_parent_taskid() {
3878 
3879   kmp_int32 gtid;
3880   kmp_info_t *thread;
3881   kmp_taskdata_t *parent_task;
3882 
3883   gtid = __kmp_get_gtid();
3884   if (gtid < 0) {
3885     return 0;
3886   }
3887   thread = __kmp_thread_from_gtid(gtid);
3888   parent_task = thread->th.th_current_task->td_parent;
3889   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3890 
3891 } // __kmpc_get_parent_taskid
3892 
3893 #if OMP_45_ENABLED
3894 /*!
3895 @ingroup WORK_SHARING
3896 @param loc  source location information.
3897 @param gtid  global thread number.
3898 @param num_dims  number of associated doacross loops.
3899 @param dims  info on loops bounds.
3900 
3901 Initialize doacross loop information.
3902 Expect the compiler to send us inclusive bounds,
3903 e.g. for(i=2;i<9;i+=2) gives lo=2, up=8, st=2.
3904 */
3905 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3906                           const struct kmp_dim *dims) {
3907   int j, idx;
3908   kmp_int64 last, trace_count;
3909   kmp_info_t *th = __kmp_threads[gtid];
3910   kmp_team_t *team = th->th.th_team;
3911   kmp_uint32 *flags;
3912   kmp_disp_t *pr_buf = th->th.th_dispatch;
3913   dispatch_shared_info_t *sh_buf;
3914 
3915   KA_TRACE(
3916       20,
3917       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3918        gtid, num_dims, !team->t.t_serialized));
3919   KMP_DEBUG_ASSERT(dims != NULL);
3920   KMP_DEBUG_ASSERT(num_dims > 0);
3921 
3922   if (team->t.t_serialized) {
3923     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3924     return; // no dependencies if team is serialized
3925   }
3926   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3927   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3928   // the next loop
3929   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3930 
3931   // Save bounds info into allocated private buffer
3932   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3933   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3934       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3935   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3936   pr_buf->th_doacross_info[0] =
3937       (kmp_int64)num_dims; // first element is number of dimensions
3938   // Save also address of num_done in order to access it later without knowing
3939   // the buffer index
3940   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3941   pr_buf->th_doacross_info[2] = dims[0].lo;
3942   pr_buf->th_doacross_info[3] = dims[0].up;
3943   pr_buf->th_doacross_info[4] = dims[0].st;
3944   last = 5;
3945   for (j = 1; j < num_dims; ++j) {
3946     kmp_int64
3947         range_length; // To keep ranges of all dimensions but the first dims[0]
3948     if (dims[j].st == 1) { // most common case
3949       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3950       range_length = dims[j].up - dims[j].lo + 1;
3951     } else {
3952       if (dims[j].st > 0) {
3953         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3954         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3955       } else { // negative increment
3956         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3957         range_length =
3958             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3959       }
3960     }
3961     pr_buf->th_doacross_info[last++] = range_length;
3962     pr_buf->th_doacross_info[last++] = dims[j].lo;
3963     pr_buf->th_doacross_info[last++] = dims[j].up;
3964     pr_buf->th_doacross_info[last++] = dims[j].st;
3965   }
3966 
3967   // Compute total trip count.
3968   // Start with range of dims[0] which we don't need to keep in the buffer.
3969   if (dims[0].st == 1) { // most common case
3970     trace_count = dims[0].up - dims[0].lo + 1;
3971   } else if (dims[0].st > 0) {
3972     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3973     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3974   } else { // negative increment
3975     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3976     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3977   }
3978   for (j = 1; j < num_dims; ++j) {
3979     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3980   }
3981   KMP_DEBUG_ASSERT(trace_count > 0);
3982 
3983   // Check if shared buffer is not occupied by other loop (idx -
3984   // __kmp_dispatch_num_buffers)
3985   if (idx != sh_buf->doacross_buf_idx) {
3986     // Shared buffer is occupied, wait for it to be free
3987     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3988                  __kmp_eq_4, NULL);
3989   }
3990 #if KMP_32_BIT_ARCH
3991   // Check if we are the first thread. After the CAS the first thread gets 0,
3992   // others get 1 if initialization is in progress, allocated pointer otherwise.
3993   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3994   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3995       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3996 #else
3997   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3998       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3999 #endif
4000   if (flags == NULL) {
4001     // we are the first thread, allocate the array of flags
4002     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
4003     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4004     KMP_MB();
4005     sh_buf->doacross_flags = flags;
4006   } else if (flags == (kmp_uint32 *)1) {
4007 #if KMP_32_BIT_ARCH
4008     // initialization is still in progress, need to wait
4009     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4010 #else
4011     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4012 #endif
4013       KMP_YIELD(TRUE);
4014     KMP_MB();
4015   } else {
4016     KMP_MB();
4017   }
4018   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4019   pr_buf->th_doacross_flags =
4020       sh_buf->doacross_flags; // save private copy in order to not
4021   // touch shared buffer on each iteration
4022   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4023 }
4024 
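/* Illustrative sketch (comment only): the bounds a compiler would pass for
   the loop in the doc comment above, for(i=2;i<9;i+=2), i.e. inclusive lo=2,
   up=8, st=2. The field names lo/up/st match the accesses made by
   __kmpc_doacross_init(); loc and gtid are placeholders for values the
   compiler-generated code already has, and the exact call sequence emitted
   around such a loop is compiler-dependent.

     struct kmp_dim dim;
     dim.lo = 2; // first iteration value
     dim.up = 8; // last iteration value (inclusive)
     dim.st = 2; // loop increment
     __kmpc_doacross_init(loc, gtid, 1, &dim);
*/
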
4025 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4026   kmp_int32 shft, num_dims, i;
4027   kmp_uint32 flag;
4028   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4029   kmp_info_t *th = __kmp_threads[gtid];
4030   kmp_team_t *team = th->th.th_team;
4031   kmp_disp_t *pr_buf;
4032   kmp_int64 lo, up, st;
4033 
4034   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4035   if (team->t.t_serialized) {
4036     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4037     return; // no dependencies if team is serialized
4038   }
4039 
4040   // calculate sequential iteration number and check out-of-bounds condition
4041   pr_buf = th->th.th_dispatch;
4042   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4043   num_dims = pr_buf->th_doacross_info[0];
4044   lo = pr_buf->th_doacross_info[2];
4045   up = pr_buf->th_doacross_info[3];
4046   st = pr_buf->th_doacross_info[4];
4047   if (st == 1) { // most common case
4048     if (vec[0] < lo || vec[0] > up) {
4049       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4050                     "bounds [%lld,%lld]\n",
4051                     gtid, vec[0], lo, up));
4052       return;
4053     }
4054     iter_number = vec[0] - lo;
4055   } else if (st > 0) {
4056     if (vec[0] < lo || vec[0] > up) {
4057       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4058                     "bounds [%lld,%lld]\n",
4059                     gtid, vec[0], lo, up));
4060       return;
4061     }
4062     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4063   } else { // negative increment
4064     if (vec[0] > lo || vec[0] < up) {
4065       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4066                     "bounds [%lld,%lld]\n",
4067                     gtid, vec[0], lo, up));
4068       return;
4069     }
4070     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4071   }
4072   for (i = 1; i < num_dims; ++i) {
4073     kmp_int64 iter, ln;
4074     kmp_int32 j = i * 4;
4075     ln = pr_buf->th_doacross_info[j + 1];
4076     lo = pr_buf->th_doacross_info[j + 2];
4077     up = pr_buf->th_doacross_info[j + 3];
4078     st = pr_buf->th_doacross_info[j + 4];
4079     if (st == 1) {
4080       if (vec[i] < lo || vec[i] > up) {
4081         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4082                       "bounds [%lld,%lld]\n",
4083                       gtid, vec[i], lo, up));
4084         return;
4085       }
4086       iter = vec[i] - lo;
4087     } else if (st > 0) {
4088       if (vec[i] < lo || vec[i] > up) {
4089         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4090                       "bounds [%lld,%lld]\n",
4091                       gtid, vec[i], lo, up));
4092         return;
4093       }
4094       iter = (kmp_uint64)(vec[i] - lo) / st;
4095     } else { // st < 0
4096       if (vec[i] > lo || vec[i] < up) {
4097         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4098                       "bounds [%lld,%lld]\n",
4099                       gtid, vec[i], lo, up));
4100         return;
4101       }
4102       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4103     }
4104     iter_number = iter + ln * iter_number;
4105   }
4106   shft = iter_number % 32; // use 32-bit granularity
4107   iter_number >>= 5; // divided by 32
4108   flag = 1 << shft;
4109   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4110     KMP_YIELD(TRUE);
4111   }
4112   KMP_MB();
4113   KA_TRACE(20,
4114            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4115             gtid, (iter_number << 5) + shft));
4116 }
4117 
4118 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4119   kmp_int32 shft, num_dims, i;
4120   kmp_uint32 flag;
4121   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4122   kmp_info_t *th = __kmp_threads[gtid];
4123   kmp_team_t *team = th->th.th_team;
4124   kmp_disp_t *pr_buf;
4125   kmp_int64 lo, st;
4126 
4127   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4128   if (team->t.t_serialized) {
4129     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4130     return; // no dependencies if team is serialized
4131   }
4132 
4133   // calculate sequential iteration number (same as in "wait" but no
4134   // out-of-bounds checks)
4135   pr_buf = th->th.th_dispatch;
4136   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4137   num_dims = pr_buf->th_doacross_info[0];
4138   lo = pr_buf->th_doacross_info[2];
4139   st = pr_buf->th_doacross_info[4];
4140   if (st == 1) { // most common case
4141     iter_number = vec[0] - lo;
4142   } else if (st > 0) {
4143     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4144   } else { // negative increment
4145     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4146   }
4147   for (i = 1; i < num_dims; ++i) {
4148     kmp_int64 iter, ln;
4149     kmp_int32 j = i * 4;
4150     ln = pr_buf->th_doacross_info[j + 1];
4151     lo = pr_buf->th_doacross_info[j + 2];
4152     st = pr_buf->th_doacross_info[j + 4];
4153     if (st == 1) {
4154       iter = vec[i] - lo;
4155     } else if (st > 0) {
4156       iter = (kmp_uint64)(vec[i] - lo) / st;
4157     } else { // st < 0
4158       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4159     }
4160     iter_number = iter + ln * iter_number;
4161   }
4162   shft = iter_number % 32; // use 32-bit granularity
4163   iter_number >>= 5; // divided by 32
4164   flag = 1 << shft;
4165   KMP_MB();
4166   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4167     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4168   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4169                 (iter_number << 5) + shft));
4170 }
4171 
4172 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4173   kmp_int32 num_done;
4174   kmp_info_t *th = __kmp_threads[gtid];
4175   kmp_team_t *team = th->th.th_team;
4176   kmp_disp_t *pr_buf = th->th.th_dispatch;
4177 
4178   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4179   if (team->t.t_serialized) {
4180     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4181     return; // nothing to do
4182   }
4183   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4184   if (num_done == th->th.th_team_nproc) {
4185     // we are the last thread, need to free shared resources
4186     int idx = pr_buf->th_doacross_buf_idx - 1;
4187     dispatch_shared_info_t *sh_buf =
4188         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4189     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4190                      (kmp_int64)&sh_buf->doacross_num_done);
4191     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4192     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4193     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4194     sh_buf->doacross_flags = NULL;
4195     sh_buf->doacross_num_done = 0;
4196     sh_buf->doacross_buf_idx +=
4197         __kmp_dispatch_num_buffers; // free buffer for future re-use
4198   }
4199   // free private resources (need to keep buffer index forever)
4200   pr_buf->th_doacross_flags = NULL;
4201   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4202   pr_buf->th_doacross_info = NULL;
4203   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4204 }
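
/* Source-level sketch (comment only): a doacross loop in standard OpenMP 4.5
   syntax. A conforming compiler implements the cross-iteration dependence
   with some init/wait/post/fini sequence; whether it uses exactly the
   __kmpc_doacross_* entry points above is compiler-dependent.

     void forward_sweep(double *a, int n) {
     #pragma omp parallel for ordered(1)
       for (int i = 1; i < n; ++i) {
     #pragma omp ordered depend(sink : i - 1)
         a[i] += a[i - 1]; // consume the value produced by iteration i-1
     #pragma omp ordered depend(source)
       }
     }
*/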
4205 #endif
4206 
4207 #if OMP_50_ENABLED
4208 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
4209 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4210   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4211 }
4212 
4213 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4214   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4215 }
4216 
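/* Illustrative usage sketch (comment only): allocating through the OpenMP 5.0
   allocator API that these thin wrappers expose. omp_default_mem_alloc is a
   predefined allocator handle from <omp.h>; error handling is omitted.

     #include <stddef.h>
     #include <omp.h>

     double *make_buffer(size_t n) {
       double *p = (double *)omp_alloc(n * sizeof(double), omp_default_mem_alloc);
       return p; // caller releases with omp_free(p, omp_default_mem_alloc)
     }
*/
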
4217 int __kmpc_get_target_offload(void) {
4218   if (!__kmp_init_serial) {
4219     __kmp_serial_initialize();
4220   }
4221   return __kmp_target_offload;
4222 }
4223 
4224 int __kmpc_pause_resource(kmp_pause_status_t level) {
4225   if (!__kmp_init_serial) {
4226     return 1; // Can't pause if runtime is not initialized
4227   }
4228   return __kmp_pause_resource(level);
4229 }
4230 #endif // OMP_50_ENABLED
4231 
4232 // end of file //
4233