1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
// flags will be used in the future, e.g., to implement openmp_strict library
// restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
 * Shut down the runtime library. This is also optional, and even if called it
 * will not do anything unless the `KMP_IGNORE_MPPEND` environment variable is
 * set to zero.
60  */
61 void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
  // root (which can cause library shutdown).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 #if KMP_OS_WINDOWS && OMPT_SUPPORT
74   // Normal exit process on Windows does not allow worker threads of the final
75   // parallel region to finish reporting their events, so shutting down the
76   // library here fixes the issue at least for the cases where __kmpc_end() is
77   // placed properly.
78   if (ompt_enabled.enabled)
79     __kmp_internal_end_library(__kmp_gtid_get_specific());
80 #endif
81 }
82 
83 /*!
84 @ingroup THREAD_STATES
85 @param loc Source location information.
86 @return The global thread index of the active thread.
87 
88 This function can be called in any context.
89 
If the runtime has only been entered at the outermost level from a
91 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
92 that which would be returned by omp_get_thread_num() in the outermost
93 active parallel construct. (Or zero if there is no active parallel
94 construct, since the master thread is necessarily thread zero).
95 
96 If multiple non-OpenMP threads all enter an OpenMP construct then this
97 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
99 OpenMP thread ids returned by omp_get_thread_num()).
100 */
101 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
102   kmp_int32 gtid = __kmp_entry_gtid();
103 
104   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
105 
106   return gtid;
107 }
108 
109 /*!
110 @ingroup THREAD_STATES
111 @param loc Source location information.
112 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
113 
114 This function can be called in any context.
115 It returns the total number of threads under the control of the OpenMP runtime.
116 That is not a number that can be determined by any OpenMP standard calls, since
117 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, because the runtime keeps
underlying threads even when they are not active (the cost of creating and
destroying OS threads is high), this call counts all such threads even if
they are not currently waiting for work.
122 */
123 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
124   KC_TRACE(10,
125            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
126 
127   return TCR_4(__kmp_all_nth);
128 }
129 
130 /*!
131 @ingroup THREAD_STATES
132 @param loc Source location information.
133 @return The thread number of the calling thread in the innermost active parallel
134 construct.
135 */
136 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
137   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
138   return __kmp_tid_from_gtid(__kmp_entry_gtid());
139 }
140 
141 /*!
142 @ingroup THREAD_STATES
143 @param loc Source location information.
144 @return The number of threads in the innermost active parallel construct.
145 */
146 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
147   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
148 
149   return __kmp_entry_thread()->th.th_team->t.t_nproc;
150 }
151 
152 /*!
153  * @ingroup DEPRECATED
154  * @param loc location description
155  *
156  * This function need not be called. It always returns TRUE.
157  */
158 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
159 #ifndef KMP_DEBUG
160 
161   return TRUE;
162 
163 #else
164 
165   const char *semi2;
166   const char *semi3;
167   int line_no;
168 
169   if (__kmp_par_range == 0) {
170     return TRUE;
171   }
172   semi2 = loc->psource;
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   semi2 = strchr(semi2, ';');
177   if (semi2 == NULL) {
178     return TRUE;
179   }
180   semi2 = strchr(semi2 + 1, ';');
181   if (semi2 == NULL) {
182     return TRUE;
183   }
184   if (__kmp_par_range_filename[0]) {
185     const char *name = semi2 - 1;
186     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
187       name--;
188     }
189     if ((*name == '/') || (*name == ';')) {
190       name++;
191     }
192     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
193       return __kmp_par_range < 0;
194     }
195   }
196   semi3 = strchr(semi2 + 1, ';');
197   if (__kmp_par_range_routine[0]) {
198     if ((semi3 != NULL) && (semi3 > semi2) &&
199         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
200       return __kmp_par_range < 0;
201     }
202   }
203   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
204     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
205       return __kmp_par_range > 0;
206     }
207     return __kmp_par_range < 0;
208   }
209   return TRUE;
210 
211 #endif /* KMP_DEBUG */
212 }
213 
214 /*!
215 @ingroup THREAD_STATES
216 @param loc Source location information.
217 @return 1 if this thread is executing inside an active parallel region, zero if
218 not.
219 */
220 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
221   return __kmp_entry_thread()->th.th_root->r.r_active;
222 }
223 
224 /*!
225 @ingroup PARALLEL
226 @param loc source location information
227 @param global_tid global thread number
228 @param num_threads number of threads requested for this parallel construct
229 
230 Set the number of threads to be used by the next fork spawned by this thread.
231 This call is only required if the parallel construct has a `num_threads` clause.
232 */
233 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
234                              kmp_int32 num_threads) {
235   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
236                 global_tid, num_threads));
237 
238   __kmp_push_num_threads(loc, global_tid, num_threads);
239 }
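
/* Illustrative sketch only (not part of the runtime): for a construct such as
   `#pragma omp parallel num_threads(4)`, a compiler typically emits the push
   immediately before the matching fork, roughly:

     kmp_int32 gtid = __kmpc_global_thread_num(&loc);
     __kmpc_push_num_threads(&loc, gtid, 4);
     __kmpc_fork_call(&loc, nargs, (kmpc_micro)outlined_fn, shared_args...);

   Here `loc`, `nargs`, `outlined_fn` and `shared_args` are placeholders for
   compiler-generated values, not names defined by this library. */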
240 
241 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
242   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
243 
244   /* the num_threads are automatically popped */
245 }
246 
247 #if OMP_40_ENABLED
248 
249 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
250                            kmp_int32 proc_bind) {
251   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
252                 proc_bind));
253 
254   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
255 }
256 
257 #endif /* OMP_40_ENABLED */
258 
259 /*!
260 @ingroup PARALLEL
261 @param loc  source location information
262 @param argc  total number of arguments in the ellipsis
263 @param microtask  pointer to callback routine consisting of outlined parallel
264 construct
265 @param ...  pointers to shared variables that aren't global
266 
267 Do the actual fork and call the microtask in the relevant number of threads.
268 */
269 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
270   int gtid = __kmp_entry_gtid();
271 
272 #if (KMP_STATS_ENABLED)
273   // If we were in a serial region, then stop the serial timer, record
274   // the event, and start parallel region timer
275   stats_state_e previous_state = KMP_GET_THREAD_STATE();
276   if (previous_state == stats_state_e::SERIAL_REGION) {
277     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
278   } else {
279     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
280   }
281   int inParallel = __kmpc_in_parallel(loc);
282   if (inParallel) {
283     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
284   } else {
285     KMP_COUNT_BLOCK(OMP_PARALLEL);
286   }
287 #endif
288 
  // maybe saving thr_state would be enough here
290   {
291     va_list ap;
292     va_start(ap, microtask);
293 
294 #if OMPT_SUPPORT
295     ompt_frame_t *ompt_frame;
296     if (ompt_enabled.enabled) {
297       kmp_info_t *master_th = __kmp_threads[gtid];
298       kmp_team_t *parent_team = master_th->th.th_team;
299       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
300       if (lwt)
301         ompt_frame = &(lwt->ompt_task_info.frame);
302       else {
303         int tid = __kmp_tid_from_gtid(gtid);
304         ompt_frame = &(
305             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
306       }
307       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
308       OMPT_STORE_RETURN_ADDRESS(gtid);
309     }
310 #endif
311 
312 #if INCLUDE_SSC_MARKS
313     SSC_MARK_FORKING();
314 #endif
315     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
316                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
317                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
318 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
319 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
320                     &ap
321 #else
322                     ap
323 #endif
324                     );
325 #if INCLUDE_SSC_MARKS
326     SSC_MARK_JOINING();
327 #endif
328     __kmp_join_call(loc, gtid
329 #if OMPT_SUPPORT
330                     ,
331                     fork_context_intel
332 #endif
333                     );
334 
335     va_end(ap);
336   }
337 
338 #if KMP_STATS_ENABLED
339   if (previous_state == stats_state_e::SERIAL_REGION) {
340     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
341   } else {
342     KMP_POP_PARTITIONED_TIMER();
343   }
344 #endif // KMP_STATS_ENABLED
345 }
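
/* Illustrative sketch only (not part of the runtime): the microtask passed to
   __kmpc_fork_call() is the compiler-outlined body of the parallel region.
   Its first two parameters receive the global and bound thread ids, followed
   by one pointer per shared variable passed through the ellipsis, roughly:

     void outlined_fn(kmp_int32 *global_tid, kmp_int32 *bound_tid, int *x) {
       // ... body of `#pragma omp parallel shared(x)` ...
     }

     __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined_fn, &x);

   `outlined_fn`, `loc` and `x` are placeholder names used for illustration. */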
346 
347 #if OMP_40_ENABLED
348 /*!
349 @ingroup PARALLEL
350 @param loc source location information
351 @param global_tid global thread number
352 @param num_teams number of teams requested for the teams construct
353 @param num_threads number of threads per team requested for the teams construct
354 
355 Set the number of teams to be used by the teams construct.
356 This call is only required if the teams construct has a `num_teams` clause
357 or a `thread_limit` clause (or both).
358 */
359 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
360                            kmp_int32 num_teams, kmp_int32 num_threads) {
361   KA_TRACE(20,
362            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
363             global_tid, num_teams, num_threads));
364 
365   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
366 }
367 
368 /*!
369 @ingroup PARALLEL
370 @param loc  source location information
371 @param argc  total number of arguments in the ellipsis
372 @param microtask  pointer to callback routine consisting of outlined teams
373 construct
374 @param ...  pointers to shared variables that aren't global
375 
376 Do the actual fork and call the microtask in the relevant number of threads.
377 */
378 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
379                        ...) {
380   int gtid = __kmp_entry_gtid();
381   kmp_info_t *this_thr = __kmp_threads[gtid];
382   va_list ap;
383   va_start(ap, microtask);
384 
385   KMP_COUNT_BLOCK(OMP_TEAMS);
386 
387   // remember teams entry point and nesting level
388   this_thr->th.th_teams_microtask = microtask;
389   this_thr->th.th_teams_level =
390       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
391 
392 #if OMPT_SUPPORT
393   kmp_team_t *parent_team = this_thr->th.th_team;
394   int tid = __kmp_tid_from_gtid(gtid);
395   if (ompt_enabled.enabled) {
396     parent_team->t.t_implicit_task_taskdata[tid]
397         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
398   }
399   OMPT_STORE_RETURN_ADDRESS(gtid);
400 #endif
401 
  // Check whether __kmpc_push_num_teams was called; if not, set the default
  // number of teams.
404   if (this_thr->th.th_teams_size.nteams == 0) {
405     __kmp_push_num_teams(loc, gtid, 0, 0);
406   }
407   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
408   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
409   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
410 
411   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
412                   VOLATILE_CAST(microtask_t)
413                       __kmp_teams_master, // "wrapped" task
414                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
415 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
416                   &ap
417 #else
418                   ap
419 #endif
420                   );
421   __kmp_join_call(loc, gtid
422 #if OMPT_SUPPORT
423                   ,
424                   fork_context_intel
425 #endif
426                   );
427 
428   // Pop current CG root off list
429   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
430   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
431   this_thr->th.th_cg_roots = tmp->up;
432   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
433                  " to node %p. cg_nthreads was %d\n",
434                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
435   __kmp_free(tmp);
436   // Restore current task's thread_limit from CG root
437   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
438   this_thr->th.th_current_task->td_icvs.thread_limit =
439       this_thr->th.th_cg_roots->cg_thread_limit;
440 
441   this_thr->th.th_teams_microtask = NULL;
442   this_thr->th.th_teams_level = 0;
443   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
444   va_end(ap);
445 }
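
/* Illustrative sketch only (not part of the runtime): for
   `#pragma omp teams num_teams(2) thread_limit(8)`, a compiler typically
   pushes the clause values and then forks the teams, roughly:

     __kmpc_push_num_teams(&loc, gtid, 2, 8);
     __kmpc_fork_teams(&loc, nargs, (kmpc_micro)teams_outlined_fn,
                       shared_args...);

   If neither clause is present the push call is omitted and the defaults are
   chosen above (see the th_teams_size.nteams == 0 check). `loc`, `gtid`,
   `nargs`, `teams_outlined_fn` and `shared_args` are placeholders. */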
446 #endif /* OMP_40_ENABLED */
447 
448 // I don't think this function should ever have been exported.
449 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
450 // openmp code ever called it, but it's been exported from the RTL for so
451 // long that I'm afraid to remove the definition.
452 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
453 
454 /*!
455 @ingroup PARALLEL
456 @param loc  source location information
457 @param global_tid  global thread number
458 
459 Enter a serialized parallel construct. This interface is used to handle a
460 conditional parallel region, like this,
461 @code
462 #pragma omp parallel if (condition)
463 @endcode
464 when the condition is false.
465 */
466 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
467 // The implementation is now in kmp_runtime.cpp so that it can share static
468 // functions with kmp_fork_call since the tasks to be done are similar in
469 // each case.
470 #if OMPT_SUPPORT
471   OMPT_STORE_RETURN_ADDRESS(global_tid);
472 #endif
473   __kmp_serialized_parallel(loc, global_tid);
474 }
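
/* Illustrative sketch only (not part of the runtime): for
   `#pragma omp parallel if (cond)`, a compiler typically emits roughly:

     if (cond) {
       __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined_fn, &x);
     } else {
       kmp_int32 gtid = __kmpc_global_thread_num(&loc);
       kmp_int32 bound_tid = 0;
       __kmpc_serialized_parallel(&loc, gtid);
       outlined_fn(&gtid, &bound_tid, &x);
       __kmpc_end_serialized_parallel(&loc, gtid);
     }

   so the outlined body still runs, but only on the encountering thread.
   `cond`, `loc`, `outlined_fn` and `x` are placeholder names. */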
475 
476 /*!
477 @ingroup PARALLEL
478 @param loc  source location information
479 @param global_tid  global thread number
480 
481 Leave a serialized parallel construct.
482 */
483 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
484   kmp_internal_control_t *top;
485   kmp_info_t *this_thr;
486   kmp_team_t *serial_team;
487 
488   KC_TRACE(10,
489            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
490 
491   /* skip all this code for autopar serialized loops since it results in
492      unacceptable overhead */
493   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
494     return;
495 
496   // Not autopar code
497   if (!TCR_4(__kmp_init_parallel))
498     __kmp_parallel_initialize();
499 
500 #if OMP_50_ENABLED
501   __kmp_resume_if_soft_paused();
502 #endif
503 
504   this_thr = __kmp_threads[global_tid];
505   serial_team = this_thr->th.th_serial_team;
506 
507 #if OMP_45_ENABLED
508   kmp_task_team_t *task_team = this_thr->th.th_task_team;
509 
510   // we need to wait for the proxy tasks before finishing the thread
511   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
512     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
513 #endif
514 
515   KMP_MB();
516   KMP_DEBUG_ASSERT(serial_team);
517   KMP_ASSERT(serial_team->t.t_serialized);
518   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
519   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
520   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
521   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
522 
523 #if OMPT_SUPPORT
524   if (ompt_enabled.enabled &&
525       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
526     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
527     if (ompt_enabled.ompt_callback_implicit_task) {
528       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
529           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
530           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
531     }
532 
    // reset/clear the task id only after unlinking the task
534     ompt_data_t *parent_task_data;
535     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
536 
537     if (ompt_enabled.ompt_callback_parallel_end) {
538       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
539           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
540           ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
541     }
542     __ompt_lw_taskteam_unlink(this_thr);
543     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
544   }
545 #endif
546 
547   /* If necessary, pop the internal control stack values and replace the team
548    * values */
549   top = serial_team->t.t_control_stack_top;
550   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
551     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
552     serial_team->t.t_control_stack_top = top->next;
553     __kmp_free(top);
554   }
555 
556   // if( serial_team -> t.t_serialized > 1 )
557   serial_team->t.t_level--;
558 
559   /* pop dispatch buffers stack */
560   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
561   {
562     dispatch_private_info_t *disp_buffer =
563         serial_team->t.t_dispatch->th_disp_buffer;
564     serial_team->t.t_dispatch->th_disp_buffer =
565         serial_team->t.t_dispatch->th_disp_buffer->next;
566     __kmp_free(disp_buffer);
567   }
568 #if OMP_50_ENABLED
569   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
570 #endif
571 
572   --serial_team->t.t_serialized;
573   if (serial_team->t.t_serialized == 0) {
574 
575 /* return to the parallel section */
576 
577 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
578     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
579       __kmp_clear_x87_fpu_status_word();
580       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
581       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
582     }
583 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
584 
585     this_thr->th.th_team = serial_team->t.t_parent;
586     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
587 
588     /* restore values cached in the thread */
589     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
590     this_thr->th.th_team_master =
591         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
592     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
593 
594     /* TODO the below shouldn't need to be adjusted for serialized teams */
595     this_thr->th.th_dispatch =
596         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
597 
598     __kmp_pop_current_task_from_thread(this_thr);
599 
600     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
601     this_thr->th.th_current_task->td_flags.executing = 1;
602 
603     if (__kmp_tasking_mode != tskm_immediate_exec) {
604       // Copy the task team from the new child / old parent team to the thread.
605       this_thr->th.th_task_team =
606           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
607       KA_TRACE(20,
608                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
609                 "team %p\n",
610                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
611     }
612   } else {
613     if (__kmp_tasking_mode != tskm_immediate_exec) {
614       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
615                     "depth of serial team %p to %d\n",
616                     global_tid, serial_team, serial_team->t.t_serialized));
617     }
618   }
619 
620   if (__kmp_env_consistency_check)
621     __kmp_pop_parallel(global_tid, NULL);
622 #if OMPT_SUPPORT
623   if (ompt_enabled.enabled)
624     this_thr->th.ompt_thread_info.state =
625         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
626                                            : ompt_state_work_parallel);
627 #endif
628 }
629 
630 /*!
631 @ingroup SYNCHRONIZATION
632 @param loc  source location information.
633 
634 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
635 depending on the memory ordering convention obeyed by the compiler
636 even that may not be necessary).
637 */
638 void __kmpc_flush(ident_t *loc) {
639   KC_TRACE(10, ("__kmpc_flush: called\n"));
640 
  /* need an explicit __mf() here since the library uses volatile instead */
642   KMP_MB(); /* Flush all pending memory write invalidates.  */
643 
644 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
645 #if KMP_MIC
646 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
647 // We shouldn't need it, though, since the ABI rules require that
648 // * If the compiler generates NGO stores it also generates the fence
649 // * If users hand-code NGO stores they should insert the fence
650 // therefore no incomplete unordered stores should be visible.
651 #else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed here (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
658   if (!__kmp_cpuinfo.initialized) {
659     __kmp_query_cpuid(&__kmp_cpuinfo);
660   }
661   if (!__kmp_cpuinfo.sse2) {
662     // CPU cannot execute SSE2 instructions.
663   } else {
664 #if KMP_COMPILER_ICC
665     _mm_mfence();
666 #elif KMP_COMPILER_MSVC
667     MemoryBarrier();
668 #else
669     __sync_synchronize();
670 #endif // KMP_COMPILER_ICC
671   }
672 #endif // KMP_MIC
673 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here; move along.
675 #elif KMP_ARCH_PPC64
676 // Nothing needed here (we have a real MB above).
677 #if KMP_OS_CNK
678   // The flushing thread needs to yield here; this prevents a
679   // busy-waiting thread from saturating the pipeline. flush is
680   // often used in loops like this:
681   // while (!flag) {
682   //   #pragma omp flush(flag)
683   // }
684   // and adding the yield here is good for at least a 10x speedup
685   // when running >2 threads per core (on the NAS LU benchmark).
686   __kmp_yield();
687 #endif
688 #else
689 #error Unknown or unsupported architecture
690 #endif
691 
692 #if OMPT_SUPPORT && OMPT_OPTIONAL
693   if (ompt_enabled.ompt_callback_flush) {
694     ompt_callbacks.ompt_callback(ompt_callback_flush)(
695         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
696   }
697 #endif
698 }
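
/* Illustrative sketch only (not part of the runtime): a bare
   `#pragma omp flush` is typically lowered to a single call, roughly:

     __kmpc_flush(&loc);

   where `loc` is a placeholder for the compiler-generated source location
   descriptor. */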
699 
700 /* -------------------------------------------------------------------------- */
701 /*!
702 @ingroup SYNCHRONIZATION
703 @param loc source location information
704 @param global_tid thread id.
705 
706 Execute a barrier.
707 */
708 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
709   KMP_COUNT_BLOCK(OMP_BARRIER);
710   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
711 
712   if (!TCR_4(__kmp_init_parallel))
713     __kmp_parallel_initialize();
714 
715 #if OMP_50_ENABLED
716   __kmp_resume_if_soft_paused();
717 #endif
718 
719   if (__kmp_env_consistency_check) {
720     if (loc == 0) {
721       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
722     }
723 
724     __kmp_check_barrier(global_tid, ct_barrier, loc);
725   }
726 
727 #if OMPT_SUPPORT
728   ompt_frame_t *ompt_frame;
729   if (ompt_enabled.enabled) {
730     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
731     if (ompt_frame->enter_frame.ptr == NULL)
732       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
733     OMPT_STORE_RETURN_ADDRESS(global_tid);
734   }
735 #endif
736   __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  //   this function is called when the 'barrier' directive is present or at
  //   the implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required
743 
744   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
745 #if OMPT_SUPPORT && OMPT_OPTIONAL
746   if (ompt_enabled.enabled) {
747     ompt_frame->enter_frame = ompt_data_none;
748   }
749 #endif
750 }
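
/* Illustrative sketch only (not part of the runtime): an explicit
   `#pragma omp barrier` is typically lowered to, roughly:

     __kmpc_barrier(&loc, __kmpc_global_thread_num(&loc));

   As noted in the comment above, the same entry point also serves the
   implicit barrier at the end of a worksharing construct without a `nowait`
   clause. `loc` is a placeholder for the compiler-generated location. */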
751 
752 /* The BARRIER for a MASTER section is always explicit   */
753 /*!
754 @ingroup WORK_SHARING
755 @param loc  source location information.
@param global_tid  global thread number.
757 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
758 */
759 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
760   int status = 0;
761 
762   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
763 
764   if (!TCR_4(__kmp_init_parallel))
765     __kmp_parallel_initialize();
766 
767 #if OMP_50_ENABLED
768   __kmp_resume_if_soft_paused();
769 #endif
770 
771   if (KMP_MASTER_GTID(global_tid)) {
772     KMP_COUNT_BLOCK(OMP_MASTER);
773     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
774     status = 1;
775   }
776 
777 #if OMPT_SUPPORT && OMPT_OPTIONAL
778   if (status) {
779     if (ompt_enabled.ompt_callback_master) {
780       kmp_info_t *this_thr = __kmp_threads[global_tid];
781       kmp_team_t *team = this_thr->th.th_team;
782 
783       int tid = __kmp_tid_from_gtid(global_tid);
784       ompt_callbacks.ompt_callback(ompt_callback_master)(
785           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
786           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
787           OMPT_GET_RETURN_ADDRESS(0));
788     }
789   }
790 #endif
791 
792   if (__kmp_env_consistency_check) {
793 #if KMP_USE_DYNAMIC_LOCK
794     if (status)
795       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
796     else
797       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
798 #else
799     if (status)
800       __kmp_push_sync(global_tid, ct_master, loc, NULL);
801     else
802       __kmp_check_sync(global_tid, ct_master, loc, NULL);
803 #endif
804   }
805 
806   return status;
807 }
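
/* Illustrative sketch only (not part of the runtime): a `#pragma omp master`
   region is typically lowered so that only the thread for which
   __kmpc_master() returns 1 executes the body and the matching end call,
   roughly:

     if (__kmpc_master(&loc, gtid)) {
       // ... body of the master region ...
       __kmpc_end_master(&loc, gtid);
     }

   There is no implied barrier; the other threads simply skip the block.
   `loc` and `gtid` are placeholders. */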
808 
809 /*!
810 @ingroup WORK_SHARING
811 @param loc  source location information.
@param global_tid  global thread number.
813 
814 Mark the end of a <tt>master</tt> region. This should only be called by the
815 thread that executes the <tt>master</tt> region.
816 */
817 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
818   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
819 
820   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
821   KMP_POP_PARTITIONED_TIMER();
822 
823 #if OMPT_SUPPORT && OMPT_OPTIONAL
824   kmp_info_t *this_thr = __kmp_threads[global_tid];
825   kmp_team_t *team = this_thr->th.th_team;
826   if (ompt_enabled.ompt_callback_master) {
827     int tid = __kmp_tid_from_gtid(global_tid);
828     ompt_callbacks.ompt_callback(ompt_callback_master)(
829         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
830         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
831         OMPT_GET_RETURN_ADDRESS(0));
832   }
833 #endif
834 
835   if (__kmp_env_consistency_check) {
836     if (global_tid < 0)
837       KMP_WARNING(ThreadIdentInvalid);
838 
839     if (KMP_MASTER_GTID(global_tid))
840       __kmp_pop_sync(global_tid, ct_master, loc);
841   }
842 }
843 
844 /*!
845 @ingroup WORK_SHARING
846 @param loc  source location information.
847 @param gtid  global thread number.
848 
849 Start execution of an <tt>ordered</tt> construct.
850 */
851 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
852   int cid = 0;
853   kmp_info_t *th;
854   KMP_DEBUG_ASSERT(__kmp_init_serial);
855 
856   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
857 
858   if (!TCR_4(__kmp_init_parallel))
859     __kmp_parallel_initialize();
860 
861 #if OMP_50_ENABLED
862   __kmp_resume_if_soft_paused();
863 #endif
864 
865 #if USE_ITT_BUILD
866   __kmp_itt_ordered_prep(gtid);
867 // TODO: ordered_wait_id
868 #endif /* USE_ITT_BUILD */
869 
870   th = __kmp_threads[gtid];
871 
872 #if OMPT_SUPPORT && OMPT_OPTIONAL
873   kmp_team_t *team;
874   ompt_wait_id_t lck;
875   void *codeptr_ra;
876   if (ompt_enabled.enabled) {
877     OMPT_STORE_RETURN_ADDRESS(gtid);
878     team = __kmp_team_from_gtid(gtid);
879     lck = (ompt_wait_id_t)&team->t.t_ordered.dt.t_value;
880     /* OMPT state update */
881     th->th.ompt_thread_info.wait_id = lck;
882     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
883 
884     /* OMPT event callback */
885     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
886     if (ompt_enabled.ompt_callback_mutex_acquire) {
887       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
888           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
889           (ompt_wait_id_t)lck, codeptr_ra);
890     }
891   }
892 #endif
893 
894   if (th->th.th_dispatch->th_deo_fcn != 0)
895     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
896   else
897     __kmp_parallel_deo(&gtid, &cid, loc);
898 
899 #if OMPT_SUPPORT && OMPT_OPTIONAL
900   if (ompt_enabled.enabled) {
901     /* OMPT state update */
902     th->th.ompt_thread_info.state = ompt_state_work_parallel;
903     th->th.ompt_thread_info.wait_id = 0;
904 
905     /* OMPT event callback */
906     if (ompt_enabled.ompt_callback_mutex_acquired) {
907       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
908           ompt_mutex_ordered, (ompt_wait_id_t)lck, codeptr_ra);
909     }
910   }
911 #endif
912 
913 #if USE_ITT_BUILD
914   __kmp_itt_ordered_start(gtid);
915 #endif /* USE_ITT_BUILD */
916 }
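
/* Illustrative sketch only (not part of the runtime): within a loop that has
   an `ordered` clause, the `#pragma omp ordered` block is bracketed by the
   two calls, roughly:

     __kmpc_ordered(&loc, gtid);
     // ... body that must execute in iteration order ...
     __kmpc_end_ordered(&loc, gtid);

   `loc` and `gtid` are placeholders for compiler-generated values. */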
917 
918 /*!
919 @ingroup WORK_SHARING
920 @param loc  source location information.
921 @param gtid  global thread number.
922 
923 End execution of an <tt>ordered</tt> construct.
924 */
925 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
926   int cid = 0;
927   kmp_info_t *th;
928 
929   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
930 
931 #if USE_ITT_BUILD
932   __kmp_itt_ordered_end(gtid);
933 // TODO: ordered_wait_id
934 #endif /* USE_ITT_BUILD */
935 
936   th = __kmp_threads[gtid];
937 
938   if (th->th.th_dispatch->th_dxo_fcn != 0)
939     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
940   else
941     __kmp_parallel_dxo(&gtid, &cid, loc);
942 
943 #if OMPT_SUPPORT && OMPT_OPTIONAL
944   OMPT_STORE_RETURN_ADDRESS(gtid);
945   if (ompt_enabled.ompt_callback_mutex_released) {
946     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
947         ompt_mutex_ordered,
948         (ompt_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
949         OMPT_LOAD_RETURN_ADDRESS(gtid));
950   }
951 #endif
952 }
953 
954 #if KMP_USE_DYNAMIC_LOCK
955 
956 static __forceinline void
957 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
958                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
959   // Pointer to the allocated indirect lock is written to crit, while indexing
960   // is ignored.
961   void *idx;
962   kmp_indirect_lock_t **lck;
963   lck = (kmp_indirect_lock_t **)crit;
964   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
965   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
966   KMP_SET_I_LOCK_LOCATION(ilk, loc);
967   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
968   KA_TRACE(20,
969            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
970 #if USE_ITT_BUILD
971   __kmp_itt_critical_creating(ilk->lock, loc);
972 #endif
973   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
974   if (status == 0) {
975 #if USE_ITT_BUILD
976     __kmp_itt_critical_destroyed(ilk->lock);
977 #endif
978     // We don't really need to destroy the unclaimed lock here since it will be
979     // cleaned up at program exit.
980     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
981   }
982   KMP_DEBUG_ASSERT(*lck != NULL);
983 }
984 
985 // Fast-path acquire tas lock
986 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
987   {                                                                            \
988     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
989     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
990     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
991     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
992         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
993       kmp_uint32 spins;                                                        \
994       KMP_FSYNC_PREPARE(l);                                                    \
995       KMP_INIT_YIELD(spins);                                                   \
996       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
997       do {                                                                     \
998         if (TCR_4(__kmp_nth) >                                                 \
999             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1000           KMP_YIELD(TRUE);                                                     \
1001         } else {                                                               \
1002           KMP_YIELD_SPIN(spins);                                               \
1003         }                                                                      \
1004         __kmp_spin_backoff(&backoff);                                          \
1005       } while (                                                                \
1006           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1007           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1008     }                                                                          \
1009     KMP_FSYNC_ACQUIRED(l);                                                     \
1010   }
1011 
1012 // Fast-path test tas lock
1013 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1014   {                                                                            \
1015     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1016     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1017     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1018     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1019          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1020   }
1021 
1022 // Fast-path release tas lock
1023 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1024   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1025 
1026 #if KMP_USE_FUTEX
1027 
1028 #include <sys/syscall.h>
1029 #include <unistd.h>
1030 #ifndef FUTEX_WAIT
1031 #define FUTEX_WAIT 0
1032 #endif
1033 #ifndef FUTEX_WAKE
1034 #define FUTEX_WAKE 1
1035 #endif
1036 
1037 // Fast-path acquire futex lock
1038 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1039   {                                                                            \
1040     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1041     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1042     KMP_MB();                                                                  \
1043     KMP_FSYNC_PREPARE(ftx);                                                    \
1044     kmp_int32 poll_val;                                                        \
1045     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1046                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1047                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1048       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1049       if (!cond) {                                                             \
1050         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1051                                          poll_val |                            \
1052                                              KMP_LOCK_BUSY(1, futex))) {       \
1053           continue;                                                            \
1054         }                                                                      \
1055         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1056       }                                                                        \
1057       kmp_int32 rc;                                                            \
1058       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1059                         NULL, NULL, 0)) != 0) {                                \
1060         continue;                                                              \
1061       }                                                                        \
1062       gtid_code |= 1;                                                          \
1063     }                                                                          \
1064     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1065   }
1066 
1067 // Fast-path test futex lock
1068 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1069   {                                                                            \
1070     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1071     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1072                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1073       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1074       rc = TRUE;                                                               \
1075     } else {                                                                   \
1076       rc = FALSE;                                                              \
1077     }                                                                          \
1078   }
1079 
1080 // Fast-path release futex lock
1081 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1082   {                                                                            \
1083     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1084     KMP_MB();                                                                  \
1085     KMP_FSYNC_RELEASING(ftx);                                                  \
1086     kmp_int32 poll_val =                                                       \
1087         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1088     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1089       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1090               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1091     }                                                                          \
1092     KMP_MB();                                                                  \
1093     KMP_YIELD_OVERSUB();                                                       \
1094   }
1095 
1096 #endif // KMP_USE_FUTEX
1097 
1098 #else // KMP_USE_DYNAMIC_LOCK
1099 
1100 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1101                                                       ident_t const *loc,
1102                                                       kmp_int32 gtid) {
1103   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1104 
1105   // Because of the double-check, the following load doesn't need to be volatile
1106   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1107 
1108   if (lck == NULL) {
1109     void *idx;
1110 
1111     // Allocate & initialize the lock.
1112     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1113     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1114     __kmp_init_user_lock_with_checks(lck);
1115     __kmp_set_user_lock_location(lck, loc);
1116 #if USE_ITT_BUILD
1117     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed without ever being used, but
// that is not a problem, because this is not a real event seen by the user;
// it merely sets a name for the object (lock). See more details in kmp_itt.h.
1123 #endif /* USE_ITT_BUILD */
1124 
1125     // Use a cmpxchg instruction to slam the start of the critical section with
1126     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1127     // and use the lock that the other thread allocated.
1128     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1129 
1130     if (status == 0) {
1131 // Deallocate the lock and reload the value.
1132 #if USE_ITT_BUILD
1133       __kmp_itt_critical_destroyed(lck);
1134 // Let ITT know the lock is destroyed and the same memory location may be reused
1135 // for another purpose.
1136 #endif /* USE_ITT_BUILD */
1137       __kmp_destroy_user_lock_with_checks(lck);
1138       __kmp_user_lock_free(&idx, gtid, lck);
1139       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1140       KMP_DEBUG_ASSERT(lck != NULL);
1141     }
1142   }
1143   return lck;
1144 }
1145 
1146 #endif // KMP_USE_DYNAMIC_LOCK
1147 
1148 /*!
1149 @ingroup WORK_SHARING
1150 @param loc  source location information.
@param global_tid  global thread number.
1152 @param crit identity of the critical section. This could be a pointer to a lock
1153 associated with the critical section, or some other suitably unique value.
1154 
1155 Enter code protected by a `critical` construct.
1156 This function blocks until the executing thread can enter the critical section.
1157 */
1158 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1159                      kmp_critical_name *crit) {
1160 #if KMP_USE_DYNAMIC_LOCK
1161 #if OMPT_SUPPORT && OMPT_OPTIONAL
1162   OMPT_STORE_RETURN_ADDRESS(global_tid);
1163 #endif // OMPT_SUPPORT
1164   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1165 #else
1166   KMP_COUNT_BLOCK(OMP_CRITICAL);
1167 #if OMPT_SUPPORT && OMPT_OPTIONAL
1168   ompt_state_t prev_state = ompt_state_undefined;
1169   ompt_thread_info_t ti;
1170 #endif
1171   kmp_user_lock_p lck;
1172 
1173   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1174 
1175   // TODO: add THR_OVHD_STATE
1176 
1177   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1178   KMP_CHECK_USER_LOCK_INIT();
1179 
1180   if ((__kmp_user_lock_kind == lk_tas) &&
1181       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1182     lck = (kmp_user_lock_p)crit;
1183   }
1184 #if KMP_USE_FUTEX
1185   else if ((__kmp_user_lock_kind == lk_futex) &&
1186            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1187     lck = (kmp_user_lock_p)crit;
1188   }
1189 #endif
1190   else { // ticket, queuing or drdpa
1191     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1192   }
1193 
1194   if (__kmp_env_consistency_check)
1195     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1196 
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team.
// Also, even if we are the uber thread, we still have to acquire the lock,
// as we have to contend with sibling threads.
1201 
1202 #if USE_ITT_BUILD
1203   __kmp_itt_critical_acquiring(lck);
1204 #endif /* USE_ITT_BUILD */
1205 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1207   void *codeptr_ra = NULL;
1208   if (ompt_enabled.enabled) {
1209     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1210     /* OMPT state update */
1211     prev_state = ti.state;
1212     ti.wait_id = (ompt_wait_id_t)lck;
1213     ti.state = ompt_state_wait_critical;
1214 
1215     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1217     if (ompt_enabled.ompt_callback_mutex_acquire) {
1218       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1219           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1220           (ompt_wait_id_t)crit, codeptr_ra);
1221     }
1222   }
1223 #endif
  // Value of 'crit' should be good for use as a critical_id of the critical
  // section directive.
1226   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1227 
1228 #if USE_ITT_BUILD
1229   __kmp_itt_critical_acquired(lck);
1230 #endif /* USE_ITT_BUILD */
1231 #if OMPT_SUPPORT && OMPT_OPTIONAL
1232   if (ompt_enabled.enabled) {
1233     /* OMPT state update */
1234     ti.state = prev_state;
1235     ti.wait_id = 0;
1236 
1237     /* OMPT event callback */
1238     if (ompt_enabled.ompt_callback_mutex_acquired) {
1239       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1240           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr_ra);
1241     }
1242   }
1243 #endif
1244   KMP_POP_PARTITIONED_TIMER();
1245 
1246   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1247   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1248 #endif // KMP_USE_DYNAMIC_LOCK
1249 }
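
/* Illustrative sketch only (not part of the runtime): for
   `#pragma omp critical (name)`, a compiler typically allocates one
   zero-initialized kmp_critical_name object per critical name, shared by
   every occurrence of that name, and brackets the body with the enter/exit
   calls, roughly:

     static kmp_critical_name crit_name = {0};

     __kmpc_critical(&loc, gtid, &crit_name);
     // ... body of the critical region ...
     __kmpc_end_critical(&loc, gtid, &crit_name);

   `crit_name`, `loc` and `gtid` are placeholder names for illustration. */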
1250 
1251 #if KMP_USE_DYNAMIC_LOCK
1252 
1253 // Converts the given hint to an internal lock implementation
1254 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1255 #if KMP_USE_TSX
1256 #define KMP_TSX_LOCK(seq) lockseq_##seq
1257 #else
1258 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1259 #endif
1260 
1261 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1262 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1263 #else
1264 #define KMP_CPUINFO_RTM 0
1265 #endif
1266 
1267   // Hints that do not require further logic
1268   if (hint & kmp_lock_hint_hle)
1269     return KMP_TSX_LOCK(hle);
1270   if (hint & kmp_lock_hint_rtm)
1271     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1272   if (hint & kmp_lock_hint_adaptive)
1273     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1274 
1275   // Rule out conflicting hints first by returning the default lock
1276   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1277     return __kmp_user_lock_seq;
1278   if ((hint & omp_lock_hint_speculative) &&
1279       (hint & omp_lock_hint_nonspeculative))
1280     return __kmp_user_lock_seq;
1281 
1282   // Do not even consider speculation when it appears to be contended
1283   if (hint & omp_lock_hint_contended)
1284     return lockseq_queuing;
1285 
1286   // Uncontended lock without speculation
1287   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1288     return lockseq_tas;
1289 
1290   // HLE lock for speculation
1291   if (hint & omp_lock_hint_speculative)
1292     return KMP_TSX_LOCK(hle);
1293 
1294   return __kmp_user_lock_seq;
1295 }
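
// For example, per the rules above:
//   __kmp_map_hint_to_lock(omp_lock_hint_contended)   -> lockseq_queuing
//   __kmp_map_hint_to_lock(omp_lock_hint_uncontended) -> lockseq_tas
//   conflicting combinations (e.g. contended | uncontended) fall back to
//   __kmp_user_lock_seq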
1296 
1297 #if OMPT_SUPPORT && OMPT_OPTIONAL
1298 #if KMP_USE_DYNAMIC_LOCK
1299 static kmp_mutex_impl_t
1300 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1301   if (user_lock) {
1302     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1303     case 0:
1304       break;
1305 #if KMP_USE_FUTEX
1306     case locktag_futex:
1307       return kmp_mutex_impl_queuing;
1308 #endif
1309     case locktag_tas:
1310       return kmp_mutex_impl_spin;
1311 #if KMP_USE_TSX
1312     case locktag_hle:
1313       return kmp_mutex_impl_speculative;
1314 #endif
1315     default:
1316       return kmp_mutex_impl_none;
1317     }
1318     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1319   }
1320   KMP_ASSERT(ilock);
1321   switch (ilock->type) {
1322 #if KMP_USE_TSX
1323   case locktag_adaptive:
1324   case locktag_rtm:
1325     return kmp_mutex_impl_speculative;
1326 #endif
1327   case locktag_nested_tas:
1328     return kmp_mutex_impl_spin;
1329 #if KMP_USE_FUTEX
1330   case locktag_nested_futex:
1331 #endif
1332   case locktag_ticket:
1333   case locktag_queuing:
1334   case locktag_drdpa:
1335   case locktag_nested_ticket:
1336   case locktag_nested_queuing:
1337   case locktag_nested_drdpa:
1338     return kmp_mutex_impl_queuing;
1339   default:
1340     return kmp_mutex_impl_none;
1341   }
1342 }
1343 #else
1344 // For locks without dynamic binding
1345 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1346   switch (__kmp_user_lock_kind) {
1347   case lk_tas:
1348     return kmp_mutex_impl_spin;
1349 #if KMP_USE_FUTEX
1350   case lk_futex:
1351 #endif
1352   case lk_ticket:
1353   case lk_queuing:
1354   case lk_drdpa:
1355     return kmp_mutex_impl_queuing;
1356 #if KMP_USE_TSX
1357   case lk_hle:
1358   case lk_rtm:
1359   case lk_adaptive:
1360     return kmp_mutex_impl_speculative;
1361 #endif
1362   default:
1363     return kmp_mutex_impl_none;
1364   }
1365 }
1366 #endif // KMP_USE_DYNAMIC_LOCK
1367 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1368 
1369 /*!
1370 @ingroup WORK_SHARING
1371 @param loc  source location information.
1372 @param global_tid  global thread number.
1373 @param crit identity of the critical section. This could be a pointer to a lock
1374 associated with the critical section, or some other suitably unique value.
1375 @param hint the lock hint.
1376 
1377 Enter code protected by a `critical` construct with a hint. The hint value is
1378 used to suggest a lock implementation. This function blocks until the executing
1379 thread can enter the critical section unless the hint suggests use of
1380 speculative execution and the hardware supports it.
1381 */
1382 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1383                                kmp_critical_name *crit, uint32_t hint) {
1384   KMP_COUNT_BLOCK(OMP_CRITICAL);
1385   kmp_user_lock_p lck;
1386 #if OMPT_SUPPORT && OMPT_OPTIONAL
1387   ompt_state_t prev_state = ompt_state_undefined;
1388   ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
1390   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1391   if (!codeptr)
1392     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1393 #endif
1394 
1395   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1396 
1397   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1398   // Check if it is initialized.
1399   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1400   if (*lk == 0) {
1401     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1402     if (KMP_IS_D_LOCK(lckseq)) {
1403       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1404                                   KMP_GET_D_TAG(lckseq));
1405     } else {
1406       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1407     }
1408   }
1409   // Branch for accessing the actual lock object and set operation. This
1410   // branching is inevitable since this lock initialization does not follow the
1411   // normal dispatch path (lock table is not used).
1412   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1413     lck = (kmp_user_lock_p)lk;
1414     if (__kmp_env_consistency_check) {
1415       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1416                       __kmp_map_hint_to_lock(hint));
1417     }
1418 #if USE_ITT_BUILD
1419     __kmp_itt_critical_acquiring(lck);
1420 #endif
1421 #if OMPT_SUPPORT && OMPT_OPTIONAL
1422     if (ompt_enabled.enabled) {
1423       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1424       /* OMPT state update */
1425       prev_state = ti.state;
1426       ti.wait_id = (ompt_wait_id_t)lck;
1427       ti.state = ompt_state_wait_critical;
1428 
1429       /* OMPT event callback */
1430       if (ompt_enabled.ompt_callback_mutex_acquire) {
1431         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1432             ompt_mutex_critical, (unsigned int)hint,
1433             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)crit, codeptr);
1434       }
1435     }
1436 #endif
1437 #if KMP_USE_INLINED_TAS
1438     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1439       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1440     } else
1441 #elif KMP_USE_INLINED_FUTEX
1442     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1443       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1444     } else
1445 #endif
1446     {
1447       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1448     }
1449   } else {
1450     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1451     lck = ilk->lock;
1452     if (__kmp_env_consistency_check) {
1453       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1454                       __kmp_map_hint_to_lock(hint));
1455     }
1456 #if USE_ITT_BUILD
1457     __kmp_itt_critical_acquiring(lck);
1458 #endif
1459 #if OMPT_SUPPORT && OMPT_OPTIONAL
1460     if (ompt_enabled.enabled) {
1461       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1462       /* OMPT state update */
1463       prev_state = ti.state;
1464       ti.wait_id = (ompt_wait_id_t)lck;
1465       ti.state = ompt_state_wait_critical;
1466 
1467       /* OMPT event callback */
1468       if (ompt_enabled.ompt_callback_mutex_acquire) {
1469         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1470             ompt_mutex_critical, (unsigned int)hint,
1471             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)crit, codeptr);
1472       }
1473     }
1474 #endif
1475     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1476   }
1477   KMP_POP_PARTITIONED_TIMER();
1478 
1479 #if USE_ITT_BUILD
1480   __kmp_itt_critical_acquired(lck);
1481 #endif /* USE_ITT_BUILD */
1482 #if OMPT_SUPPORT && OMPT_OPTIONAL
1483   if (ompt_enabled.enabled) {
1484     /* OMPT state update */
1485     ti.state = prev_state;
1486     ti.wait_id = 0;
1487 
1488     /* OMPT event callback */
1489     if (ompt_enabled.ompt_callback_mutex_acquired) {
1490       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1491           ompt_mutex_critical, (ompt_wait_id_t)crit, codeptr);
1492     }
1493   }
1494 #endif
1495 
1496   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1497   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1498 } // __kmpc_critical_with_hint
1499 
1500 #endif // KMP_USE_DYNAMIC_LOCK
1501 
1502 /*!
1503 @ingroup WORK_SHARING
1504 @param loc  source location information.
@param global_tid  global thread number.
1506 @param crit identity of the critical section. This could be a pointer to a lock
1507 associated with the critical section, or some other suitably unique value.
1508 
1509 Leave a critical section, releasing any lock that was held during its execution.
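
A minimal sketch, assuming typical compiler lowering, of the enter/exit pairing
for a <tt>critical</tt> construct (illustrative only; the lock variable name is
hypothetical, and the variable must be zero-initialized and shared by all
threads reaching the construct):
@code
static kmp_critical_name crit_lock_var; /* zero-initialized, one per construct */

__kmpc_critical(&loc, global_tid, &crit_lock_var);
/* ... user code protected by the critical section ... */
__kmpc_end_critical(&loc, global_tid, &crit_lock_var);
@endcode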
1510 */
1511 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1512                          kmp_critical_name *crit) {
1513   kmp_user_lock_p lck;
1514 
1515   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1516 
1517 #if KMP_USE_DYNAMIC_LOCK
1518   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1519     lck = (kmp_user_lock_p)crit;
1520     KMP_ASSERT(lck != NULL);
1521     if (__kmp_env_consistency_check) {
1522       __kmp_pop_sync(global_tid, ct_critical, loc);
1523     }
1524 #if USE_ITT_BUILD
1525     __kmp_itt_critical_releasing(lck);
1526 #endif
1527 #if KMP_USE_INLINED_TAS
1528     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1529       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1530     } else
1531 #elif KMP_USE_INLINED_FUTEX
1532     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1533       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1534     } else
1535 #endif
1536     {
1537       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1538     }
1539   } else {
1540     kmp_indirect_lock_t *ilk =
1541         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1542     KMP_ASSERT(ilk != NULL);
1543     lck = ilk->lock;
1544     if (__kmp_env_consistency_check) {
1545       __kmp_pop_sync(global_tid, ct_critical, loc);
1546     }
1547 #if USE_ITT_BUILD
1548     __kmp_itt_critical_releasing(lck);
1549 #endif
1550     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1551   }
1552 
1553 #else // KMP_USE_DYNAMIC_LOCK
1554 
1555   if ((__kmp_user_lock_kind == lk_tas) &&
1556       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1557     lck = (kmp_user_lock_p)crit;
1558   }
1559 #if KMP_USE_FUTEX
1560   else if ((__kmp_user_lock_kind == lk_futex) &&
1561            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1562     lck = (kmp_user_lock_p)crit;
1563   }
1564 #endif
1565   else { // ticket, queuing or drdpa
1566     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1567   }
1568 
1569   KMP_ASSERT(lck != NULL);
1570 
1571   if (__kmp_env_consistency_check)
1572     __kmp_pop_sync(global_tid, ct_critical, loc);
1573 
1574 #if USE_ITT_BUILD
1575   __kmp_itt_critical_releasing(lck);
1576 #endif /* USE_ITT_BUILD */
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1579   __kmp_release_user_lock_with_checks(lck, global_tid);
1580 
1581 #endif // KMP_USE_DYNAMIC_LOCK
1582 
1583 #if OMPT_SUPPORT && OMPT_OPTIONAL
1584   /* OMPT release event triggers after lock is released; place here to trigger
1585    * for all #if branches */
1586   OMPT_STORE_RETURN_ADDRESS(global_tid);
1587   if (ompt_enabled.ompt_callback_mutex_released) {
1588     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1589         ompt_mutex_critical, (ompt_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
1590   }
1591 #endif
1592 
1593   KMP_POP_PARTITIONED_TIMER();
1594   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1595 }
1596 
1597 /*!
1598 @ingroup SYNCHRONIZATION
1599 @param loc source location information
1600 @param global_tid thread id.
1601 @return one if the thread should execute the master block, zero otherwise
1602 
1603 Start execution of a combined barrier and master. The barrier is executed inside
1604 this function.
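
A minimal usage sketch (illustrative only; variable names are hypothetical).
Only the thread for which this call returns one executes the block and makes
the matching @ref __kmpc_end_barrier_master call:
@code
if (__kmpc_barrier_master(&loc, global_tid)) {
  /* ... code executed only by the master thread ... */
  __kmpc_end_barrier_master(&loc, global_tid);
}
@endcode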
1605 */
1606 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1607   int status;
1608 
1609   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1610 
1611   if (!TCR_4(__kmp_init_parallel))
1612     __kmp_parallel_initialize();
1613 
1614 #if OMP_50_ENABLED
1615   __kmp_resume_if_soft_paused();
1616 #endif
1617 
1618   if (__kmp_env_consistency_check)
1619     __kmp_check_barrier(global_tid, ct_barrier, loc);
1620 
1621 #if OMPT_SUPPORT
1622   ompt_frame_t *ompt_frame;
1623   if (ompt_enabled.enabled) {
1624     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1625     if (ompt_frame->enter_frame.ptr == NULL)
1626       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1627     OMPT_STORE_RETURN_ADDRESS(global_tid);
1628   }
1629 #endif
1630 #if USE_ITT_NOTIFY
1631   __kmp_threads[global_tid]->th.th_ident = loc;
1632 #endif
1633   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1634 #if OMPT_SUPPORT && OMPT_OPTIONAL
1635   if (ompt_enabled.enabled) {
1636     ompt_frame->enter_frame = ompt_data_none;
1637   }
1638 #endif
1639 
1640   return (status != 0) ? 0 : 1;
1641 }
1642 
1643 /*!
1644 @ingroup SYNCHRONIZATION
1645 @param loc source location information
1646 @param global_tid thread id.
1647 
1648 Complete the execution of a combined barrier and master. This function should
1649 only be called at the completion of the <tt>master</tt> code. Other threads will
1650 still be waiting at the barrier and this call releases them.
1651 */
1652 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1653   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1654 
1655   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1656 }
1657 
1658 /*!
1659 @ingroup SYNCHRONIZATION
1660 @param loc source location information
1661 @param global_tid thread id.
1662 @return one if the thread should execute the master block, zero otherwise
1663 
1664 Start execution of a combined barrier and master(nowait) construct.
1665 The barrier is executed inside this function.
There is no equivalent "end" function, since the other threads do not wait for
the master code to complete: the barrier is executed before the master block,
and the implied nowait means there is nothing to release afterwards.
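
A minimal usage sketch (illustrative only; variable names are hypothetical).
Unlike @ref __kmpc_barrier_master, no end call is made:
@code
if (__kmpc_barrier_master_nowait(&loc, global_tid)) {
  /* ... code executed only by the master thread; the other threads have
     already continued past the barrier ... */
}
@endcode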
1667 */
1668 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1669   kmp_int32 ret;
1670 
1671   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1672 
1673   if (!TCR_4(__kmp_init_parallel))
1674     __kmp_parallel_initialize();
1675 
1676 #if OMP_50_ENABLED
1677   __kmp_resume_if_soft_paused();
1678 #endif
1679 
1680   if (__kmp_env_consistency_check) {
1681     if (loc == 0) {
1682       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1683     }
1684     __kmp_check_barrier(global_tid, ct_barrier, loc);
1685   }
1686 
1687 #if OMPT_SUPPORT
1688   ompt_frame_t *ompt_frame;
1689   if (ompt_enabled.enabled) {
1690     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1691     if (ompt_frame->enter_frame.ptr == NULL)
1692       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1693     OMPT_STORE_RETURN_ADDRESS(global_tid);
1694   }
1695 #endif
1696 #if USE_ITT_NOTIFY
1697   __kmp_threads[global_tid]->th.th_ident = loc;
1698 #endif
1699   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1700 #if OMPT_SUPPORT && OMPT_OPTIONAL
1701   if (ompt_enabled.enabled) {
1702     ompt_frame->enter_frame = ompt_data_none;
1703   }
1704 #endif
1705 
1706   ret = __kmpc_master(loc, global_tid);
1707 
1708   if (__kmp_env_consistency_check) {
    /* There is no __kmpc_end_master call, so the consistency-check actions */
    /* of __kmpc_end_master are performed here.                             */
1711 
1712     if (global_tid < 0) {
1713       KMP_WARNING(ThreadIdentInvalid);
1714     }
1715     if (ret) {
1716       /* only one thread should do the pop since only */
1717       /* one did the push (see __kmpc_master())       */
1718 
1719       __kmp_pop_sync(global_tid, ct_master, loc);
1720     }
1721   }
1722 
1723   return (ret);
1724 }
1725 
1726 /* The BARRIER for a SINGLE process section is always explicit   */
1727 /*!
1728 @ingroup WORK_SHARING
1729 @param loc  source location information
1730 @param global_tid  global thread number
1731 @return One if this thread should execute the single construct, zero otherwise.
1732 
1733 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
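
A minimal sketch of a plausible lowering (illustrative only; variable names are
hypothetical, and the trailing @ref __kmpc_barrier call is emitted only when
the construct does not carry a nowait clause):
@code
if (__kmpc_single(&loc, global_tid)) {
  /* ... code executed by exactly one thread ... */
  __kmpc_end_single(&loc, global_tid);
}
__kmpc_barrier(&loc, global_tid); /* explicit barrier, omitted for nowait */
@endcode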
1736 */
1737 
1738 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1739   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1740 
1741   if (rc) {
1742     // We are going to execute the single statement, so we should count it.
1743     KMP_COUNT_BLOCK(OMP_SINGLE);
1744     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1745   }
1746 
1747 #if OMPT_SUPPORT && OMPT_OPTIONAL
1748   kmp_info_t *this_thr = __kmp_threads[global_tid];
1749   kmp_team_t *team = this_thr->th.th_team;
1750   int tid = __kmp_tid_from_gtid(global_tid);
1751 
1752   if (ompt_enabled.enabled) {
1753     if (rc) {
1754       if (ompt_enabled.ompt_callback_work) {
1755         ompt_callbacks.ompt_callback(ompt_callback_work)(
1756             ompt_work_single_executor, ompt_scope_begin,
1757             &(team->t.ompt_team_info.parallel_data),
1758             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1759             1, OMPT_GET_RETURN_ADDRESS(0));
1760       }
1761     } else {
1762       if (ompt_enabled.ompt_callback_work) {
1763         ompt_callbacks.ompt_callback(ompt_callback_work)(
1764             ompt_work_single_other, ompt_scope_begin,
1765             &(team->t.ompt_team_info.parallel_data),
1766             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1767             1, OMPT_GET_RETURN_ADDRESS(0));
1768         ompt_callbacks.ompt_callback(ompt_callback_work)(
1769             ompt_work_single_other, ompt_scope_end,
1770             &(team->t.ompt_team_info.parallel_data),
1771             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1772             1, OMPT_GET_RETURN_ADDRESS(0));
1773       }
1774     }
1775   }
1776 #endif
1777 
1778   return rc;
1779 }
1780 
1781 /*!
1782 @ingroup WORK_SHARING
1783 @param loc  source location information
1784 @param global_tid  global thread number
1785 
1786 Mark the end of a <tt>single</tt> construct.  This function should
1787 only be called by the thread that executed the block of code protected
1788 by the `single` construct.
1789 */
1790 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1791   __kmp_exit_single(global_tid);
1792   KMP_POP_PARTITIONED_TIMER();
1793 
1794 #if OMPT_SUPPORT && OMPT_OPTIONAL
1795   kmp_info_t *this_thr = __kmp_threads[global_tid];
1796   kmp_team_t *team = this_thr->th.th_team;
1797   int tid = __kmp_tid_from_gtid(global_tid);
1798 
1799   if (ompt_enabled.ompt_callback_work) {
1800     ompt_callbacks.ompt_callback(ompt_callback_work)(
1801         ompt_work_single_executor, ompt_scope_end,
1802         &(team->t.ompt_team_info.parallel_data),
1803         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1804         OMPT_GET_RETURN_ADDRESS(0));
1805   }
1806 #endif
1807 }
1808 
1809 /*!
1810 @ingroup WORK_SHARING
1811 @param loc Source location
1812 @param global_tid Global thread id
1813 
1814 Mark the end of a statically scheduled loop.
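
A hedged sketch of the pairing with the corresponding init call (illustrative
only; the 32-bit signed variant __kmpc_for_static_init_4 is shown and all
variable names, including N, are hypothetical):
@code
kmp_int32 last = 0, lower = 0, upper = N - 1, stride = 1;
__kmpc_for_static_init_4(&loc, global_tid, kmp_sch_static, &last, &lower,
                         &upper, &stride, /*incr=*/1, /*chunk=*/1);
for (kmp_int32 i = lower; i <= upper; ++i) {
  /* ... loop body ... */
}
__kmpc_for_static_fini(&loc, global_tid);
@endcode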
1815 */
1816 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1817   KMP_POP_PARTITIONED_TIMER();
1818   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1819 
1820 #if OMPT_SUPPORT && OMPT_OPTIONAL
1821   if (ompt_enabled.ompt_callback_work) {
1822     ompt_work_t ompt_work_type = ompt_work_loop;
1823     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1824     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1825     // Determine workshare type
1826     if (loc != NULL) {
1827       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1828         ompt_work_type = ompt_work_loop;
1829       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1830         ompt_work_type = ompt_work_sections;
1831       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1832         ompt_work_type = ompt_work_distribute;
1833       } else {
1834         // use default set above.
1835         // a warning about this case is provided in __kmpc_for_static_init
1836       }
1837       KMP_DEBUG_ASSERT(ompt_work_type);
1838     }
1839     ompt_callbacks.ompt_callback(ompt_callback_work)(
1840         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1841         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1842   }
1843 #endif
1844   if (__kmp_env_consistency_check)
1845     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1846 }
1847 
// User routines which take C-style arguments (call by value), as opposed to
// the Fortran equivalent routines, which take their arguments by reference
1850 
1851 void ompc_set_num_threads(int arg) {
1852   // !!!!! TODO: check the per-task binding
1853   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1854 }
1855 
1856 void ompc_set_dynamic(int flag) {
1857   kmp_info_t *thread;
1858 
1859   /* For the thread-private implementation of the internal controls */
1860   thread = __kmp_entry_thread();
1861 
1862   __kmp_save_internal_controls(thread);
1863 
1864   set__dynamic(thread, flag ? TRUE : FALSE);
1865 }
1866 
1867 void ompc_set_nested(int flag) {
1868   kmp_info_t *thread;
1869 
1870   /* For the thread-private internal controls implementation */
1871   thread = __kmp_entry_thread();
1872 
1873   __kmp_save_internal_controls(thread);
1874 
1875   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1876 }
1877 
1878 void ompc_set_max_active_levels(int max_active_levels) {
1879   /* TO DO */
1880   /* we want per-task implementation of this internal control */
1881 
1882   /* For the per-thread internal controls implementation */
1883   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1884 }
1885 
1886 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1887   // !!!!! TODO: check the per-task binding
1888   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1889 }
1890 
1891 int ompc_get_ancestor_thread_num(int level) {
1892   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1893 }
1894 
1895 int ompc_get_team_size(int level) {
1896   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1897 }
1898 
1899 #if OMP_50_ENABLED
1900 /* OpenMP 5.0 Affinity Format API */
1901 
1902 void ompc_set_affinity_format(char const *format) {
1903   if (!__kmp_init_serial) {
1904     __kmp_serial_initialize();
1905   }
1906   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1907                          format, KMP_STRLEN(format) + 1);
1908 }
1909 
1910 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1911   size_t format_size;
1912   if (!__kmp_init_serial) {
1913     __kmp_serial_initialize();
1914   }
1915   format_size = KMP_STRLEN(__kmp_affinity_format);
1916   if (buffer && size) {
1917     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1918                            format_size + 1);
1919   }
1920   return format_size;
1921 }
1922 
1923 void ompc_display_affinity(char const *format) {
1924   int gtid;
1925   if (!TCR_4(__kmp_init_middle)) {
1926     __kmp_middle_initialize();
1927   }
1928   gtid = __kmp_get_gtid();
1929   __kmp_aux_display_affinity(gtid, format);
1930 }
1931 
1932 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1933                              char const *format) {
1934   int gtid;
1935   size_t num_required;
1936   kmp_str_buf_t capture_buf;
1937   if (!TCR_4(__kmp_init_middle)) {
1938     __kmp_middle_initialize();
1939   }
1940   gtid = __kmp_get_gtid();
1941   __kmp_str_buf_init(&capture_buf);
1942   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1943   if (buffer && buf_size) {
1944     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1945                            capture_buf.used + 1);
1946   }
1947   __kmp_str_buf_free(&capture_buf);
1948   return num_required;
1949 }
1950 #endif /* OMP_50_ENABLED */
1951 
1952 void kmpc_set_stacksize(int arg) {
1953   // __kmp_aux_set_stacksize initializes the library if needed
1954   __kmp_aux_set_stacksize(arg);
1955 }
1956 
1957 void kmpc_set_stacksize_s(size_t arg) {
1958   // __kmp_aux_set_stacksize initializes the library if needed
1959   __kmp_aux_set_stacksize(arg);
1960 }
1961 
1962 void kmpc_set_blocktime(int arg) {
1963   int gtid, tid;
1964   kmp_info_t *thread;
1965 
1966   gtid = __kmp_entry_gtid();
1967   tid = __kmp_tid_from_gtid(gtid);
1968   thread = __kmp_thread_from_gtid(gtid);
1969 
1970   __kmp_aux_set_blocktime(arg, thread, tid);
1971 }
1972 
1973 void kmpc_set_library(int arg) {
1974   // __kmp_user_set_library initializes the library if needed
1975   __kmp_user_set_library((enum library_type)arg);
1976 }
1977 
1978 void kmpc_set_defaults(char const *str) {
1979   // __kmp_aux_set_defaults initializes the library if needed
1980   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1981 }
1982 
1983 void kmpc_set_disp_num_buffers(int arg) {
1984   // ignore after initialization because some teams have already
1985   // allocated dispatch buffers
1986   if (__kmp_init_serial == 0 && arg > 0)
1987     __kmp_dispatch_num_buffers = arg;
1988 }
1989 
1990 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1991 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1992   return -1;
1993 #else
1994   if (!TCR_4(__kmp_init_middle)) {
1995     __kmp_middle_initialize();
1996   }
1997   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1998 #endif
1999 }
2000 
2001 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2002 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2003   return -1;
2004 #else
2005   if (!TCR_4(__kmp_init_middle)) {
2006     __kmp_middle_initialize();
2007   }
2008   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2009 #endif
2010 }
2011 
2012 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2013 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2014   return -1;
2015 #else
2016   if (!TCR_4(__kmp_init_middle)) {
2017     __kmp_middle_initialize();
2018   }
2019   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2020 #endif
2021 }
2022 
2023 /* -------------------------------------------------------------------------- */
2024 /*!
2025 @ingroup THREADPRIVATE
2026 @param loc       source location information
2027 @param gtid      global thread number
2028 @param cpy_size  size of the cpy_data buffer
2029 @param cpy_data  pointer to data to be copied
2030 @param cpy_func  helper function to call for copying data
2031 @param didit     flag variable: 1=single thread; 0=not single thread
2032 
2033 __kmpc_copyprivate implements the interface for the private data broadcast
2034 needed for the copyprivate clause associated with a single region in an
2035 OpenMP<sup>*</sup> program (both C and Fortran).
2036 All threads participating in the parallel region call this routine.
2037 One of the threads (called the single thread) should have the <tt>didit</tt>
2038 variable set to 1 and all other threads should have that variable set to 0.
2039 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2040 
2041 The OpenMP specification forbids the use of nowait on the single region when a
2042 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2043 barrier internally to avoid race conditions, so the code generation for the
2044 single region should avoid generating a barrier after the call to @ref
2045 __kmpc_copyprivate.
2046 
2047 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2048 The <tt>loc</tt> parameter is a pointer to source location information.
2049 
Internal implementation: the single thread first copies its descriptor address
(cpy_data) to a team-private location, then each of the other threads calls the
function pointed to by the parameter cpy_func, which carries out the copy using
the cpy_data buffers.
2054 
2055 The cpy_func routine used for the copy and the contents of the data area defined
2056 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2057 to be done. For instance, the cpy_data buffer can hold the actual data to be
2058 copied or it may hold a list of pointers to the data. The cpy_func routine must
2059 interpret the cpy_data buffer appropriately.
2060 
2061 The interface to cpy_func is as follows:
2062 @code
2063 void cpy_func( void *destination, void *source )
2064 @endcode
2065 where void *destination is the cpy_data pointer for the thread being copied to
2066 and void *source is the cpy_data pointer for the thread being copied from.
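
A hedged sketch of how a compiler might use this routine to lower
<tt>single copyprivate(x)</tt> for a double-precision variable x (all names
below, including the helper __copy_x and compute_value, are hypothetical):
@code
static void __copy_x(void *dst, void *src) { /* generated copy helper */
  *(double *)dst = *(double *)src;
}

/* Every thread passes the address of its own copy of x as cpy_data. */
kmp_int32 didit = __kmpc_single(&loc, gtid);
if (didit) {
  x = compute_value();          /* body of the single region */
  __kmpc_end_single(&loc, gtid);
}
__kmpc_copyprivate(&loc, gtid, sizeof(double), &x, __copy_x, didit);
@endcode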
2067 */
2068 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2069                         void *cpy_data, void (*cpy_func)(void *, void *),
2070                         kmp_int32 didit) {
2071   void **data_ptr;
2072 
2073   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2074 
2075   KMP_MB();
2076 
2077   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2078 
2079   if (__kmp_env_consistency_check) {
2080     if (loc == 0) {
2081       KMP_WARNING(ConstructIdentInvalid);
2082     }
2083   }
2084 
2085   // ToDo: Optimize the following two barriers into some kind of split barrier
2086 
2087   if (didit)
2088     *data_ptr = cpy_data;
2089 
2090 #if OMPT_SUPPORT
2091   ompt_frame_t *ompt_frame;
2092   if (ompt_enabled.enabled) {
2093     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2094     if (ompt_frame->enter_frame.ptr == NULL)
2095       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2096     OMPT_STORE_RETURN_ADDRESS(gtid);
2097   }
2098 #endif
2099 /* This barrier is not a barrier region boundary */
2100 #if USE_ITT_NOTIFY
2101   __kmp_threads[gtid]->th.th_ident = loc;
2102 #endif
2103   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2104 
2105   if (!didit)
2106     (*cpy_func)(cpy_data, *data_ptr);
2107 
// Consider the next barrier a user-visible barrier for barrier region
// boundaries. Nesting checks are already handled by the single construct
// checks.
2110 
2111 #if OMPT_SUPPORT
2112   if (ompt_enabled.enabled) {
2113     OMPT_STORE_RETURN_ADDRESS(gtid);
2114   }
2115 #endif
2116 #if USE_ITT_NOTIFY
  // TODO: check if this is needed (e.g. tasks can overwrite the location)
  __kmp_threads[gtid]->th.th_ident = loc;
2119 #endif
2120   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2121 #if OMPT_SUPPORT && OMPT_OPTIONAL
2122   if (ompt_enabled.enabled) {
2123     ompt_frame->enter_frame = ompt_data_none;
2124   }
2125 #endif
2126 }
2127 
2128 /* -------------------------------------------------------------------------- */
2129 
2130 #define INIT_LOCK __kmp_init_user_lock_with_checks
2131 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2132 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2133 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2134 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2135 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2136   __kmp_acquire_nested_user_lock_with_checks_timed
2137 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2138 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2139 #define TEST_LOCK __kmp_test_user_lock_with_checks
2140 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2141 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2142 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2143 
2144 // TODO: Make check abort messages use location info & pass it into
2145 // with_checks routines
2146 
2147 #if KMP_USE_DYNAMIC_LOCK
2148 
2149 // internal lock initializer
2150 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2151                                                     kmp_dyna_lockseq_t seq) {
2152   if (KMP_IS_D_LOCK(seq)) {
2153     KMP_INIT_D_LOCK(lock, seq);
2154 #if USE_ITT_BUILD
2155     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2156 #endif
2157   } else {
2158     KMP_INIT_I_LOCK(lock, seq);
2159 #if USE_ITT_BUILD
2160     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2161     __kmp_itt_lock_creating(ilk->lock, loc);
2162 #endif
2163   }
2164 }
2165 
2166 // internal nest lock initializer
2167 static __forceinline void
2168 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2169                                kmp_dyna_lockseq_t seq) {
2170 #if KMP_USE_TSX
2171   // Don't have nested lock implementation for speculative locks
2172   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2173     seq = __kmp_user_lock_seq;
2174 #endif
2175   switch (seq) {
2176   case lockseq_tas:
2177     seq = lockseq_nested_tas;
2178     break;
2179 #if KMP_USE_FUTEX
2180   case lockseq_futex:
2181     seq = lockseq_nested_futex;
2182     break;
2183 #endif
2184   case lockseq_ticket:
2185     seq = lockseq_nested_ticket;
2186     break;
2187   case lockseq_queuing:
2188     seq = lockseq_nested_queuing;
2189     break;
2190   case lockseq_drdpa:
2191     seq = lockseq_nested_drdpa;
2192     break;
2193   default:
2194     seq = lockseq_nested_queuing;
2195   }
2196   KMP_INIT_I_LOCK(lock, seq);
2197 #if USE_ITT_BUILD
2198   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2199   __kmp_itt_lock_creating(ilk->lock, loc);
2200 #endif
2201 }
2202 
2203 /* initialize the lock with a hint */
2204 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2205                                 uintptr_t hint) {
2206   KMP_DEBUG_ASSERT(__kmp_init_serial);
2207   if (__kmp_env_consistency_check && user_lock == NULL) {
2208     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2209   }
2210 
2211   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2212 
2213 #if OMPT_SUPPORT && OMPT_OPTIONAL
2214   // This is the case, if called from omp_init_lock_with_hint:
2215   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2216   if (!codeptr)
2217     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2218   if (ompt_enabled.ompt_callback_lock_init) {
2219     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2220         ompt_mutex_lock, (omp_lock_hint_t)hint,
2221         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2222         codeptr);
2223   }
2224 #endif
2225 }
2226 
/* initialize the nest lock with a hint */
2228 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2229                                      void **user_lock, uintptr_t hint) {
2230   KMP_DEBUG_ASSERT(__kmp_init_serial);
2231   if (__kmp_env_consistency_check && user_lock == NULL) {
2232     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2233   }
2234 
2235   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2236 
2237 #if OMPT_SUPPORT && OMPT_OPTIONAL
2238   // This is the case, if called from omp_init_lock_with_hint:
2239   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2240   if (!codeptr)
2241     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2242   if (ompt_enabled.ompt_callback_lock_init) {
2243     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2244         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2245         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2246         codeptr);
2247   }
2248 #endif
2249 }
2250 
2251 #endif // KMP_USE_DYNAMIC_LOCK
2252 
2253 /* initialize the lock */
2254 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2255 #if KMP_USE_DYNAMIC_LOCK
2256 
2257   KMP_DEBUG_ASSERT(__kmp_init_serial);
2258   if (__kmp_env_consistency_check && user_lock == NULL) {
2259     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2260   }
2261   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2262 
2263 #if OMPT_SUPPORT && OMPT_OPTIONAL
2264   // This is the case, if called from omp_init_lock_with_hint:
2265   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2266   if (!codeptr)
2267     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2268   if (ompt_enabled.ompt_callback_lock_init) {
2269     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2270         ompt_mutex_lock, omp_lock_hint_none,
2271         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2272         codeptr);
2273   }
2274 #endif
2275 
2276 #else // KMP_USE_DYNAMIC_LOCK
2277 
2278   static char const *const func = "omp_init_lock";
2279   kmp_user_lock_p lck;
2280   KMP_DEBUG_ASSERT(__kmp_init_serial);
2281 
2282   if (__kmp_env_consistency_check) {
2283     if (user_lock == NULL) {
2284       KMP_FATAL(LockIsUninitialized, func);
2285     }
2286   }
2287 
2288   KMP_CHECK_USER_LOCK_INIT();
2289 
2290   if ((__kmp_user_lock_kind == lk_tas) &&
2291       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2292     lck = (kmp_user_lock_p)user_lock;
2293   }
2294 #if KMP_USE_FUTEX
2295   else if ((__kmp_user_lock_kind == lk_futex) &&
2296            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2297     lck = (kmp_user_lock_p)user_lock;
2298   }
2299 #endif
2300   else {
2301     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2302   }
2303   INIT_LOCK(lck);
2304   __kmp_set_user_lock_location(lck, loc);
2305 
2306 #if OMPT_SUPPORT && OMPT_OPTIONAL
2307   // This is the case, if called from omp_init_lock_with_hint:
2308   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2309   if (!codeptr)
2310     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2311   if (ompt_enabled.ompt_callback_lock_init) {
2312     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2313         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2314         (ompt_wait_id_t)user_lock, codeptr);
2315   }
2316 #endif
2317 
2318 #if USE_ITT_BUILD
2319   __kmp_itt_lock_creating(lck);
2320 #endif /* USE_ITT_BUILD */
2321 
2322 #endif // KMP_USE_DYNAMIC_LOCK
2323 } // __kmpc_init_lock
2324 
/* initialize the nest lock */
2326 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2327 #if KMP_USE_DYNAMIC_LOCK
2328 
2329   KMP_DEBUG_ASSERT(__kmp_init_serial);
2330   if (__kmp_env_consistency_check && user_lock == NULL) {
2331     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2332   }
2333   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2334 
2335 #if OMPT_SUPPORT && OMPT_OPTIONAL
2336   // This is the case, if called from omp_init_lock_with_hint:
2337   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2338   if (!codeptr)
2339     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2340   if (ompt_enabled.ompt_callback_lock_init) {
2341     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2342         ompt_mutex_nest_lock, omp_lock_hint_none,
2343         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2344         codeptr);
2345   }
2346 #endif
2347 
2348 #else // KMP_USE_DYNAMIC_LOCK
2349 
2350   static char const *const func = "omp_init_nest_lock";
2351   kmp_user_lock_p lck;
2352   KMP_DEBUG_ASSERT(__kmp_init_serial);
2353 
2354   if (__kmp_env_consistency_check) {
2355     if (user_lock == NULL) {
2356       KMP_FATAL(LockIsUninitialized, func);
2357     }
2358   }
2359 
2360   KMP_CHECK_USER_LOCK_INIT();
2361 
2362   if ((__kmp_user_lock_kind == lk_tas) &&
2363       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2364        OMP_NEST_LOCK_T_SIZE)) {
2365     lck = (kmp_user_lock_p)user_lock;
2366   }
2367 #if KMP_USE_FUTEX
2368   else if ((__kmp_user_lock_kind == lk_futex) &&
2369            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2370             OMP_NEST_LOCK_T_SIZE)) {
2371     lck = (kmp_user_lock_p)user_lock;
2372   }
2373 #endif
2374   else {
2375     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2376   }
2377 
2378   INIT_NESTED_LOCK(lck);
2379   __kmp_set_user_lock_location(lck, loc);
2380 
2381 #if OMPT_SUPPORT && OMPT_OPTIONAL
2382   // This is the case, if called from omp_init_lock_with_hint:
2383   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2384   if (!codeptr)
2385     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2386   if (ompt_enabled.ompt_callback_lock_init) {
2387     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2388         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2389         (ompt_wait_id_t)user_lock, codeptr);
2390   }
2391 #endif
2392 
2393 #if USE_ITT_BUILD
2394   __kmp_itt_lock_creating(lck);
2395 #endif /* USE_ITT_BUILD */
2396 
2397 #endif // KMP_USE_DYNAMIC_LOCK
2398 } // __kmpc_init_nest_lock
2399 
2400 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2401 #if KMP_USE_DYNAMIC_LOCK
2402 
2403 #if USE_ITT_BUILD
2404   kmp_user_lock_p lck;
2405   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2406     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2407   } else {
2408     lck = (kmp_user_lock_p)user_lock;
2409   }
2410   __kmp_itt_lock_destroyed(lck);
2411 #endif
2412 #if OMPT_SUPPORT && OMPT_OPTIONAL
2413   // This is the case, if called from omp_init_lock_with_hint:
2414   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2415   if (!codeptr)
2416     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2417   if (ompt_enabled.ompt_callback_lock_destroy) {
2418     kmp_user_lock_p lck;
2419     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2420       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2421     } else {
2422       lck = (kmp_user_lock_p)user_lock;
2423     }
2424     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2425         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2426   }
2427 #endif
2428   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2429 #else
2430   kmp_user_lock_p lck;
2431 
2432   if ((__kmp_user_lock_kind == lk_tas) &&
2433       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2434     lck = (kmp_user_lock_p)user_lock;
2435   }
2436 #if KMP_USE_FUTEX
2437   else if ((__kmp_user_lock_kind == lk_futex) &&
2438            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2439     lck = (kmp_user_lock_p)user_lock;
2440   }
2441 #endif
2442   else {
2443     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2444   }
2445 
2446 #if OMPT_SUPPORT && OMPT_OPTIONAL
2447   // This is the case, if called from omp_init_lock_with_hint:
2448   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2449   if (!codeptr)
2450     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2451   if (ompt_enabled.ompt_callback_lock_destroy) {
2452     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2453         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2454   }
2455 #endif
2456 
2457 #if USE_ITT_BUILD
2458   __kmp_itt_lock_destroyed(lck);
2459 #endif /* USE_ITT_BUILD */
2460   DESTROY_LOCK(lck);
2461 
2462   if ((__kmp_user_lock_kind == lk_tas) &&
2463       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2464     ;
2465   }
2466 #if KMP_USE_FUTEX
2467   else if ((__kmp_user_lock_kind == lk_futex) &&
2468            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2469     ;
2470   }
2471 #endif
2472   else {
2473     __kmp_user_lock_free(user_lock, gtid, lck);
2474   }
2475 #endif // KMP_USE_DYNAMIC_LOCK
2476 } // __kmpc_destroy_lock
2477 
/* destroy the nest lock */
2479 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2480 #if KMP_USE_DYNAMIC_LOCK
2481 
2482 #if USE_ITT_BUILD
2483   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2484   __kmp_itt_lock_destroyed(ilk->lock);
2485 #endif
2486 #if OMPT_SUPPORT && OMPT_OPTIONAL
2487   // This is the case, if called from omp_init_lock_with_hint:
2488   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2489   if (!codeptr)
2490     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2491   if (ompt_enabled.ompt_callback_lock_destroy) {
2492     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2493         ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2494   }
2495 #endif
2496   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2497 
2498 #else // KMP_USE_DYNAMIC_LOCK
2499 
2500   kmp_user_lock_p lck;
2501 
2502   if ((__kmp_user_lock_kind == lk_tas) &&
2503       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2504        OMP_NEST_LOCK_T_SIZE)) {
2505     lck = (kmp_user_lock_p)user_lock;
2506   }
2507 #if KMP_USE_FUTEX
2508   else if ((__kmp_user_lock_kind == lk_futex) &&
2509            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2510             OMP_NEST_LOCK_T_SIZE)) {
2511     lck = (kmp_user_lock_p)user_lock;
2512   }
2513 #endif
2514   else {
2515     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2516   }
2517 
2518 #if OMPT_SUPPORT && OMPT_OPTIONAL
2519   // This is the case, if called from omp_init_lock_with_hint:
2520   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2521   if (!codeptr)
2522     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2523   if (ompt_enabled.ompt_callback_lock_destroy) {
2524     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2525         ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2526   }
2527 #endif
2528 
2529 #if USE_ITT_BUILD
2530   __kmp_itt_lock_destroyed(lck);
2531 #endif /* USE_ITT_BUILD */
2532 
2533   DESTROY_NESTED_LOCK(lck);
2534 
2535   if ((__kmp_user_lock_kind == lk_tas) &&
2536       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2537        OMP_NEST_LOCK_T_SIZE)) {
2538     ;
2539   }
2540 #if KMP_USE_FUTEX
2541   else if ((__kmp_user_lock_kind == lk_futex) &&
2542            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2543             OMP_NEST_LOCK_T_SIZE)) {
2544     ;
2545   }
2546 #endif
2547   else {
2548     __kmp_user_lock_free(user_lock, gtid, lck);
2549   }
2550 #endif // KMP_USE_DYNAMIC_LOCK
2551 } // __kmpc_destroy_nest_lock
2552 
2553 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2554   KMP_COUNT_BLOCK(OMP_set_lock);
2555 #if KMP_USE_DYNAMIC_LOCK
2556   int tag = KMP_EXTRACT_D_TAG(user_lock);
2557 #if USE_ITT_BUILD
2558   __kmp_itt_lock_acquiring(
2559       (kmp_user_lock_p)
2560           user_lock); // itt function will get to the right lock object.
2561 #endif
2562 #if OMPT_SUPPORT && OMPT_OPTIONAL
2563   // This is the case, if called from omp_init_lock_with_hint:
2564   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2565   if (!codeptr)
2566     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2567   if (ompt_enabled.ompt_callback_mutex_acquire) {
2568     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2569         ompt_mutex_lock, omp_lock_hint_none,
2570         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2571         codeptr);
2572   }
2573 #endif
2574 #if KMP_USE_INLINED_TAS
2575   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2576     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2577   } else
2578 #elif KMP_USE_INLINED_FUTEX
2579   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2580     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2581   } else
2582 #endif
2583   {
2584     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2585   }
2586 #if USE_ITT_BUILD
2587   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2588 #endif
2589 #if OMPT_SUPPORT && OMPT_OPTIONAL
2590   if (ompt_enabled.ompt_callback_mutex_acquired) {
2591     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2592         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2593   }
2594 #endif
2595 
2596 #else // KMP_USE_DYNAMIC_LOCK
2597 
2598   kmp_user_lock_p lck;
2599 
2600   if ((__kmp_user_lock_kind == lk_tas) &&
2601       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2602     lck = (kmp_user_lock_p)user_lock;
2603   }
2604 #if KMP_USE_FUTEX
2605   else if ((__kmp_user_lock_kind == lk_futex) &&
2606            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2607     lck = (kmp_user_lock_p)user_lock;
2608   }
2609 #endif
2610   else {
2611     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2612   }
2613 
2614 #if USE_ITT_BUILD
2615   __kmp_itt_lock_acquiring(lck);
2616 #endif /* USE_ITT_BUILD */
2617 #if OMPT_SUPPORT && OMPT_OPTIONAL
2618   // This is the case, if called from omp_init_lock_with_hint:
2619   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2620   if (!codeptr)
2621     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2622   if (ompt_enabled.ompt_callback_mutex_acquire) {
2623     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2624         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2625         (ompt_wait_id_t)lck, codeptr);
2626   }
2627 #endif
2628 
2629   ACQUIRE_LOCK(lck, gtid);
2630 
2631 #if USE_ITT_BUILD
2632   __kmp_itt_lock_acquired(lck);
2633 #endif /* USE_ITT_BUILD */
2634 
2635 #if OMPT_SUPPORT && OMPT_OPTIONAL
2636   if (ompt_enabled.ompt_callback_mutex_acquired) {
2637     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2638         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2639   }
2640 #endif
2641 
2642 #endif // KMP_USE_DYNAMIC_LOCK
2643 }
2644 
2645 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2646 #if KMP_USE_DYNAMIC_LOCK
2647 
2648 #if USE_ITT_BUILD
2649   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2650 #endif
2651 #if OMPT_SUPPORT && OMPT_OPTIONAL
2652   // This is the case, if called from omp_init_lock_with_hint:
2653   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2654   if (!codeptr)
2655     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2656   if (ompt_enabled.enabled) {
2657     if (ompt_enabled.ompt_callback_mutex_acquire) {
2658       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2659           ompt_mutex_nest_lock, omp_lock_hint_none,
2660           __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2661           codeptr);
2662     }
2663   }
2664 #endif
2665   int acquire_status =
2666       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2667   (void) acquire_status;
2668 #if USE_ITT_BUILD
2669   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2670 #endif
2671 
2672 #if OMPT_SUPPORT && OMPT_OPTIONAL
2673   if (ompt_enabled.enabled) {
2674     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2675       if (ompt_enabled.ompt_callback_mutex_acquired) {
2676         // lock_first
2677         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2678             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2679       }
2680     } else {
2681       if (ompt_enabled.ompt_callback_nest_lock) {
2682         // lock_next
2683         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2684             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
2685       }
2686     }
2687   }
2688 #endif
2689 
2690 #else // KMP_USE_DYNAMIC_LOCK
2691   int acquire_status;
2692   kmp_user_lock_p lck;
2693 
2694   if ((__kmp_user_lock_kind == lk_tas) &&
2695       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2696        OMP_NEST_LOCK_T_SIZE)) {
2697     lck = (kmp_user_lock_p)user_lock;
2698   }
2699 #if KMP_USE_FUTEX
2700   else if ((__kmp_user_lock_kind == lk_futex) &&
2701            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2702             OMP_NEST_LOCK_T_SIZE)) {
2703     lck = (kmp_user_lock_p)user_lock;
2704   }
2705 #endif
2706   else {
2707     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2708   }
2709 
2710 #if USE_ITT_BUILD
2711   __kmp_itt_lock_acquiring(lck);
2712 #endif /* USE_ITT_BUILD */
2713 #if OMPT_SUPPORT && OMPT_OPTIONAL
2714   // This is the case, if called from omp_init_lock_with_hint:
2715   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2716   if (!codeptr)
2717     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2718   if (ompt_enabled.enabled) {
2719     if (ompt_enabled.ompt_callback_mutex_acquire) {
2720       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2721           ompt_mutex_nest_lock, omp_lock_hint_none,
2722           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
2723     }
2724   }
2725 #endif
2726 
2727   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2728 
2729 #if USE_ITT_BUILD
2730   __kmp_itt_lock_acquired(lck);
2731 #endif /* USE_ITT_BUILD */
2732 
2733 #if OMPT_SUPPORT && OMPT_OPTIONAL
2734   if (ompt_enabled.enabled) {
2735     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2736       if (ompt_enabled.ompt_callback_mutex_acquired) {
2737         // lock_first
2738         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2739             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2740       }
2741     } else {
2742       if (ompt_enabled.ompt_callback_nest_lock) {
2743         // lock_next
2744         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2745             ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
2746       }
2747     }
2748   }
2749 #endif
2750 
2751 #endif // KMP_USE_DYNAMIC_LOCK
2752 }
2753 
2754 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2755 #if KMP_USE_DYNAMIC_LOCK
2756 
2757   int tag = KMP_EXTRACT_D_TAG(user_lock);
2758 #if USE_ITT_BUILD
2759   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2760 #endif
2761 #if KMP_USE_INLINED_TAS
2762   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2763     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2764   } else
2765 #elif KMP_USE_INLINED_FUTEX
2766   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2767     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2768   } else
2769 #endif
2770   {
2771     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2772   }
2773 
2774 #if OMPT_SUPPORT && OMPT_OPTIONAL
2775   // This is the case, if called from omp_init_lock_with_hint:
2776   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2777   if (!codeptr)
2778     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2779   if (ompt_enabled.ompt_callback_mutex_released) {
2780     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2781         ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
2782   }
2783 #endif
2784 
2785 #else // KMP_USE_DYNAMIC_LOCK
2786 
2787   kmp_user_lock_p lck;
2788 
2789   /* Can't use serial interval since not block structured */
2790   /* release the lock */
2791 
2792   if ((__kmp_user_lock_kind == lk_tas) &&
2793       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2794 #if KMP_OS_LINUX &&                                                            \
2795     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2796 // "fast" path implemented to fix customer performance issue
2797 #if USE_ITT_BUILD
2798     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2799 #endif /* USE_ITT_BUILD */
2800     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2801     KMP_MB();
2802 
2803 #if OMPT_SUPPORT && OMPT_OPTIONAL
2804     // This is the case, if called from omp_init_lock_with_hint:
2805     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2806     if (!codeptr)
2807       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2808     if (ompt_enabled.ompt_callback_mutex_released) {
2809       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2810           ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2811     }
2812 #endif
2813 
2814     return;
2815 #else
2816     lck = (kmp_user_lock_p)user_lock;
2817 #endif
2818   }
2819 #if KMP_USE_FUTEX
2820   else if ((__kmp_user_lock_kind == lk_futex) &&
2821            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2822     lck = (kmp_user_lock_p)user_lock;
2823   }
2824 #endif
2825   else {
2826     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2827   }
2828 
2829 #if USE_ITT_BUILD
2830   __kmp_itt_lock_releasing(lck);
2831 #endif /* USE_ITT_BUILD */
2832 
2833   RELEASE_LOCK(lck, gtid);
2834 
2835 #if OMPT_SUPPORT && OMPT_OPTIONAL
2836   // This is the case, if called from omp_init_lock_with_hint:
2837   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2838   if (!codeptr)
2839     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2840   if (ompt_enabled.ompt_callback_mutex_released) {
2841     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2842         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
2843   }
2844 #endif
2845 
2846 #endif // KMP_USE_DYNAMIC_LOCK
2847 }
2848 
/* release the nest lock */
2850 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2851 #if KMP_USE_DYNAMIC_LOCK
2852 
2853 #if USE_ITT_BUILD
2854   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2855 #endif
2856   int release_status =
2857       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2858   (void) release_status;
2859 
2860 #if OMPT_SUPPORT && OMPT_OPTIONAL
2861   // This is the case, if called from omp_init_lock_with_hint:
2862   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2863   if (!codeptr)
2864     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2865   if (ompt_enabled.enabled) {
2866     if (release_status == KMP_LOCK_RELEASED) {
2867       if (ompt_enabled.ompt_callback_mutex_released) {
2868         // release_lock_last
2869         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2870             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
2871       }
2872     } else if (ompt_enabled.ompt_callback_nest_lock) {
2873       // release_lock_prev
2874       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2875           ompt_scope_end, (ompt_wait_id_t)user_lock, codeptr);
2876     }
2877   }
2878 #endif
2879 
2880 #else // KMP_USE_DYNAMIC_LOCK
2881 
2882   kmp_user_lock_p lck;
2883 
2884   /* Can't use serial interval since not block structured */
2885 
2886   if ((__kmp_user_lock_kind == lk_tas) &&
2887       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2888        OMP_NEST_LOCK_T_SIZE)) {
2889 #if KMP_OS_LINUX &&                                                            \
2890     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2891     // "fast" path implemented to fix customer performance issue
2892     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2893 #if USE_ITT_BUILD
2894     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2895 #endif /* USE_ITT_BUILD */
2896 
2897 #if OMPT_SUPPORT && OMPT_OPTIONAL
2898     int release_status = KMP_LOCK_STILL_HELD;
2899 #endif
2900 
2901     if (--(tl->lk.depth_locked) == 0) {
2902       TCW_4(tl->lk.poll, 0);
2903 #if OMPT_SUPPORT && OMPT_OPTIONAL
2904       release_status = KMP_LOCK_RELEASED;
2905 #endif
2906     }
2907     KMP_MB();
2908 
2909 #if OMPT_SUPPORT && OMPT_OPTIONAL
2910     // This is the case, if called from omp_init_lock_with_hint:
2911     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2912     if (!codeptr)
2913       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2914     if (ompt_enabled.enabled) {
2915       if (release_status == KMP_LOCK_RELEASED) {
2916         if (ompt_enabled.ompt_callback_mutex_released) {
2917           // release_lock_last
2918           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2919               ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2920         }
2921       } else if (ompt_enabled.ompt_callback_nest_lock) {
2922         // release_lock_previous
2923         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)lck, codeptr);
2925       }
2926     }
2927 #endif
2928 
2929     return;
2930 #else
2931     lck = (kmp_user_lock_p)user_lock;
2932 #endif
2933   }
2934 #if KMP_USE_FUTEX
2935   else if ((__kmp_user_lock_kind == lk_futex) &&
2936            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2937             OMP_NEST_LOCK_T_SIZE)) {
2938     lck = (kmp_user_lock_p)user_lock;
2939   }
2940 #endif
2941   else {
2942     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2943   }
2944 
2945 #if USE_ITT_BUILD
2946   __kmp_itt_lock_releasing(lck);
2947 #endif /* USE_ITT_BUILD */
2948 
2949   int release_status;
2950   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2951 #if OMPT_SUPPORT && OMPT_OPTIONAL
2952   // This is the case, if called from omp_init_lock_with_hint:
2953   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2954   if (!codeptr)
2955     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2956   if (ompt_enabled.enabled) {
2957     if (release_status == KMP_LOCK_RELEASED) {
2958       if (ompt_enabled.ompt_callback_mutex_released) {
2959         // release_lock_last
2960         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2961             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
2962       }
2963     } else if (ompt_enabled.ompt_callback_nest_lock) {
2964       // release_lock_previous
2965       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)lck, codeptr);
2967     }
2968   }
2969 #endif
2970 
2971 #endif // KMP_USE_DYNAMIC_LOCK
2972 }
2973 
2974 /* try to acquire the lock */
2975 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2976   KMP_COUNT_BLOCK(OMP_test_lock);
2977 
2978 #if KMP_USE_DYNAMIC_LOCK
2979   int rc;
2980   int tag = KMP_EXTRACT_D_TAG(user_lock);
2981 #if USE_ITT_BUILD
2982   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2983 #endif
2984 #if OMPT_SUPPORT && OMPT_OPTIONAL
2985   // This is the case, if called from omp_init_lock_with_hint:
2986   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2987   if (!codeptr)
2988     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2989   if (ompt_enabled.ompt_callback_mutex_acquire) {
2990     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2991         ompt_mutex_lock, omp_lock_hint_none,
2992         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
2993         codeptr);
2994   }
2995 #endif
2996 #if KMP_USE_INLINED_TAS
2997   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2998     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2999   } else
3000 #elif KMP_USE_INLINED_FUTEX
3001   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3002     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3003   } else
3004 #endif
3005   {
3006     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3007   }
3008   if (rc) {
3009 #if USE_ITT_BUILD
3010     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3011 #endif
3012 #if OMPT_SUPPORT && OMPT_OPTIONAL
3013     if (ompt_enabled.ompt_callback_mutex_acquired) {
3014       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3015           ompt_mutex_lock, (ompt_wait_id_t)user_lock, codeptr);
3016     }
3017 #endif
3018     return FTN_TRUE;
3019   } else {
3020 #if USE_ITT_BUILD
3021     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3022 #endif
3023     return FTN_FALSE;
3024   }
3025 
3026 #else // KMP_USE_DYNAMIC_LOCK
3027 
3028   kmp_user_lock_p lck;
3029   int rc;
3030 
3031   if ((__kmp_user_lock_kind == lk_tas) &&
3032       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3033     lck = (kmp_user_lock_p)user_lock;
3034   }
3035 #if KMP_USE_FUTEX
3036   else if ((__kmp_user_lock_kind == lk_futex) &&
3037            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3038     lck = (kmp_user_lock_p)user_lock;
3039   }
3040 #endif
3041   else {
3042     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3043   }
3044 
3045 #if USE_ITT_BUILD
3046   __kmp_itt_lock_acquiring(lck);
3047 #endif /* USE_ITT_BUILD */
3048 #if OMPT_SUPPORT && OMPT_OPTIONAL
3049   // This is the case, if called from omp_init_lock_with_hint:
3050   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3051   if (!codeptr)
3052     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3053   if (ompt_enabled.ompt_callback_mutex_acquire) {
3054     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3055         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3056         (ompt_wait_id_t)lck, codeptr);
3057   }
3058 #endif
3059 
3060   rc = TEST_LOCK(lck, gtid);
3061 #if USE_ITT_BUILD
3062   if (rc) {
3063     __kmp_itt_lock_acquired(lck);
3064   } else {
3065     __kmp_itt_lock_cancelled(lck);
3066   }
3067 #endif /* USE_ITT_BUILD */
3068 #if OMPT_SUPPORT && OMPT_OPTIONAL
3069   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3070     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3071         ompt_mutex_lock, (ompt_wait_id_t)lck, codeptr);
3072   }
3073 #endif
3074 
3075   return (rc ? FTN_TRUE : FTN_FALSE);
3076 
3077 /* Can't use serial interval since not block structured */
3078 
3079 #endif // KMP_USE_DYNAMIC_LOCK
3080 }
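
/* Usage sketch (illustrative only, not part of this file): a compiler that
   lowers the OpenMP lock API to these entry points would route a call such as
   omp_test_lock() to __kmpc_test_lock() above; the variable names below are
   hypothetical.

     omp_lock_t lock;
     omp_init_lock(&lock);
     if (omp_test_lock(&lock)) { // non-blocking; FTN_TRUE when acquired
       // ... protected work ...
       omp_unset_lock(&lock);
     } else {
       // lock was busy; FTN_FALSE was returned without blocking
     }
     omp_destroy_lock(&lock);
*/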
3081 
3082 /* try to acquire the lock */
3083 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3084 #if KMP_USE_DYNAMIC_LOCK
3085   int rc;
3086 #if USE_ITT_BUILD
3087   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3088 #endif
3089 #if OMPT_SUPPORT && OMPT_OPTIONAL
3090   // This is the case if called from omp_init_lock_with_hint:
3091   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3092   if (!codeptr)
3093     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3094   if (ompt_enabled.ompt_callback_mutex_acquire) {
3095     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3096         ompt_mutex_nest_lock, omp_lock_hint_none,
3097         __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)user_lock,
3098         codeptr);
3099   }
3100 #endif
3101   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3102 #if USE_ITT_BUILD
3103   if (rc) {
3104     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3105   } else {
3106     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3107   }
3108 #endif
3109 #if OMPT_SUPPORT && OMPT_OPTIONAL
3110   if (ompt_enabled.enabled && rc) {
3111     if (rc == 1) {
3112       if (ompt_enabled.ompt_callback_mutex_acquired) {
3113         // lock_first
3114         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3115             ompt_mutex_nest_lock, (ompt_wait_id_t)user_lock, codeptr);
3116       }
3117     } else {
3118       if (ompt_enabled.ompt_callback_nest_lock) {
3119         // lock_next
3120         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3121             ompt_scope_begin, (ompt_wait_id_t)user_lock, codeptr);
3122       }
3123     }
3124   }
3125 #endif
3126   return rc;
3127 
3128 #else // KMP_USE_DYNAMIC_LOCK
3129 
3130   kmp_user_lock_p lck;
3131   int rc;
3132 
3133   if ((__kmp_user_lock_kind == lk_tas) &&
3134       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3135        OMP_NEST_LOCK_T_SIZE)) {
3136     lck = (kmp_user_lock_p)user_lock;
3137   }
3138 #if KMP_USE_FUTEX
3139   else if ((__kmp_user_lock_kind == lk_futex) &&
3140            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3141             OMP_NEST_LOCK_T_SIZE)) {
3142     lck = (kmp_user_lock_p)user_lock;
3143   }
3144 #endif
3145   else {
3146     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3147   }
3148 
3149 #if USE_ITT_BUILD
3150   __kmp_itt_lock_acquiring(lck);
3151 #endif /* USE_ITT_BUILD */
3152 
3153 #if OMPT_SUPPORT && OMPT_OPTIONAL
3154   // This is the case if called from omp_init_lock_with_hint:
3155   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3156   if (!codeptr)
3157     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3158   if (ompt_enabled.enabled &&
3159       ompt_enabled.ompt_callback_mutex_acquire) {
3160     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3161         ompt_mutex_nest_lock, omp_lock_hint_none,
3162         __ompt_get_mutex_impl_type(), (ompt_wait_id_t)lck, codeptr);
3163   }
3164 #endif
3165 
3166   rc = TEST_NESTED_LOCK(lck, gtid);
3167 #if USE_ITT_BUILD
3168   if (rc) {
3169     __kmp_itt_lock_acquired(lck);
3170   } else {
3171     __kmp_itt_lock_cancelled(lck);
3172   }
3173 #endif /* USE_ITT_BUILD */
3174 #if OMPT_SUPPORT && OMPT_OPTIONAL
3175   if (ompt_enabled.enabled && rc) {
3176     if (rc == 1) {
3177       if (ompt_enabled.ompt_callback_mutex_acquired) {
3178         // lock_first
3179         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3180             ompt_mutex_nest_lock, (ompt_wait_id_t)lck, codeptr);
3181       }
3182     } else {
3183       if (ompt_enabled.ompt_callback_nest_lock) {
3184         // lock_next
3185         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3186             ompt_scope_begin, (ompt_wait_id_t)lck, codeptr);
3187       }
3188     }
3189   }
3190 #endif
3191   return rc;
3192 
3193 /* Can't use serial interval since not block structured */
3194 
3195 #endif // KMP_USE_DYNAMIC_LOCK
3196 }
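
/* Note (a sketch mirroring the code above): for a nestable lock the test
   routine returns the resulting nesting count rather than just 0/1, which is
   why the OMPT block distinguishes rc == 1 (lock_first) from rc > 1
   (lock_next). A hypothetical caller:

     omp_nest_lock_t nlock;
     omp_init_nest_lock(&nlock);
     int depth = omp_test_nest_lock(&nlock); // may map to __kmpc_test_nest_lock()
     if (depth > 0) {
       // acquired; depth is the current nesting level (1 on first acquisition)
       omp_unset_nest_lock(&nlock);
     }
     omp_destroy_nest_lock(&nlock);
*/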
3197 
3198 // Interface to fast scalable reduce methods routines
3199 
3200 // Keep the selected method in a thread-local structure for cross-function
3201 // usage: it will be used in the __kmpc_end_reduce* functions.
3202 // Another solution would be to re-determine the method in the
3203 // __kmpc_end_reduce* functions (a new prototype would be required then).
3204 // AT: which solution is better?
3205 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3206   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3207 
3208 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3209   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3210 
3211 // description of the packed_reduction_method variable: look at the macros in
3212 // kmp.h
3213 
3214 // used in a critical section reduce block
3215 static __forceinline void
3216 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3217                                           kmp_critical_name *crit) {
3218 
3219   // This lock was visible to a customer and to the threading profile tool as a
3220   // serial overhead span (although it is used for an internal purpose only).
3221   //            Why was it visible in the previous implementation?
3222   //            Should we keep it visible in the new reduce block?
3223   kmp_user_lock_p lck;
3224 
3225 #if KMP_USE_DYNAMIC_LOCK
3226 
3227   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3228   // Check if it is initialized.
3229   if (*lk == 0) {
3230     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3231       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3232                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3233     } else {
3234       __kmp_init_indirect_csptr(crit, loc, global_tid,
3235                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3236     }
3237   }
3238   // Branch for accessing the actual lock object and set operation. This
3239   // branching is inevitable since this lock initialization does not follow the
3240   // normal dispatch path (lock table is not used).
3241   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3242     lck = (kmp_user_lock_p)lk;
3243     KMP_DEBUG_ASSERT(lck != NULL);
3244     if (__kmp_env_consistency_check) {
3245       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3246     }
3247     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3248   } else {
3249     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3250     lck = ilk->lock;
3251     KMP_DEBUG_ASSERT(lck != NULL);
3252     if (__kmp_env_consistency_check) {
3253       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3254     }
3255     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3256   }
3257 
3258 #else // KMP_USE_DYNAMIC_LOCK
3259 
3260   // We know that the fast reduction code is only emitted by Intel compilers
3261   // with 32 byte critical sections. If there isn't enough space, then we
3262   // have to use a pointer.
3263   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3264     lck = (kmp_user_lock_p)crit;
3265   } else {
3266     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3267   }
3268   KMP_DEBUG_ASSERT(lck != NULL);
3269 
3270   if (__kmp_env_consistency_check)
3271     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3272 
3273   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3274 
3275 #endif // KMP_USE_DYNAMIC_LOCK
3276 }
3277 
3278 // used in a critical section reduce block
3279 static __forceinline void
3280 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3281                                         kmp_critical_name *crit) {
3282 
3283   kmp_user_lock_p lck;
3284 
3285 #if KMP_USE_DYNAMIC_LOCK
3286 
3287   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3288     lck = (kmp_user_lock_p)crit;
3289     if (__kmp_env_consistency_check)
3290       __kmp_pop_sync(global_tid, ct_critical, loc);
3291     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3292   } else {
3293     kmp_indirect_lock_t *ilk =
3294         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3295     if (__kmp_env_consistency_check)
3296       __kmp_pop_sync(global_tid, ct_critical, loc);
3297     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3298   }
3299 
3300 #else // KMP_USE_DYNAMIC_LOCK
3301 
3302   // We know that the fast reduction code is only emitted by Intel compilers
3303   // with 32 byte critical sections. If there isn't enough space, then we have
3304   // to use a pointer.
3305   if (__kmp_base_user_lock_size > 32) {
3306     lck = *((kmp_user_lock_p *)crit);
3307     KMP_ASSERT(lck != NULL);
3308   } else {
3309     lck = (kmp_user_lock_p)crit;
3310   }
3311 
3312   if (__kmp_env_consistency_check)
3313     __kmp_pop_sync(global_tid, ct_critical, loc);
3314 
3315   __kmp_release_user_lock_with_checks(lck, global_tid);
3316 
3317 #endif // KMP_USE_DYNAMIC_LOCK
3318 } // __kmp_end_critical_section_reduce_block
3319 
3320 #if OMP_40_ENABLED
3321 static __forceinline int
3322 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3323                                      int *task_state) {
3324   kmp_team_t *team;
3325 
3326   // Check if we are inside a teams construct.
3327   if (th->th.th_teams_microtask) {
3328     *team_p = team = th->th.th_team;
3329     if (team->t.t_level == th->th.th_teams_level) {
3330       // This is reduction at teams construct.
3331       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3332       // Let's swap teams temporarily for the reduction.
3333       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3334       th->th.th_team = team->t.t_parent;
3335       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3336       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3337       *task_state = th->th.th_task_state;
3338       th->th.th_task_state = 0;
3339 
3340       return 1;
3341     }
3342   }
3343   return 0;
3344 }
3345 
3346 static __forceinline void
3347 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3348   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3349   th->th.th_info.ds.ds_tid = 0;
3350   th->th.th_team = team;
3351   th->th.th_team_nproc = team->t.t_nproc;
3352   th->th.th_task_team = team->t.t_task_team[task_state];
3353   th->th.th_task_state = task_state;
3354 }
3355 #endif
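
/* Illustration (not part of the runtime): the swap/restore pair above is
   exercised when a reduction is attached directly to a teams construct,
   e.g. (hypothetical user code)

     #pragma omp teams num_teams(4) reduction(+ : x)
     x += do_work();

   Each team's initial thread reaches the reduce entry points below, and its
   thread structure is temporarily rewired to the parent (league-level) team so
   that the combination is performed across teams rather than within one team;
   __kmp_restore_swapped_teams() undoes the rewiring afterwards. */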
3356 
3357 /* 2.a.i. Reduce Block without a terminating barrier */
3358 /*!
3359 @ingroup SYNCHRONIZATION
3360 @param loc source location information
3361 @param global_tid global thread number
3362 @param num_vars number of items (variables) to be reduced
3363 @param reduce_size size of data in bytes to be reduced
3364 @param reduce_data pointer to data to be reduced
3365 @param reduce_func callback function providing reduction operation on two
3366 operands and returning result of reduction in lhs_data
3367 @param lck pointer to the unique lock data structure
3368 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3369 threads if atomic reduction needed
3370 
3371 The nowait version is used for a reduce clause with the nowait argument.
3372 */
3373 kmp_int32
3374 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3375                      size_t reduce_size, void *reduce_data,
3376                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3377                      kmp_critical_name *lck) {
3378 
3379   KMP_COUNT_BLOCK(REDUCE_nowait);
3380   int retval = 0;
3381   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3382 #if OMP_40_ENABLED
3383   kmp_info_t *th;
3384   kmp_team_t *team;
3385   int teams_swapped = 0, task_state;
3386 #endif
3387   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3388 
3389   // why do we need this initialization here at all?
3390   // Reduction clause cannot be used as a stand-alone directive.
3391 
3392   // do not call __kmp_serial_initialize(), it will be called by
3393   // __kmp_parallel_initialize() if needed
3394   // possible detection of false-positive race by the threadchecker ???
3395   if (!TCR_4(__kmp_init_parallel))
3396     __kmp_parallel_initialize();
3397 
3398 #if OMP_50_ENABLED
3399   __kmp_resume_if_soft_paused();
3400 #endif
3401 
3402 // check correctness of reduce block nesting
3403 #if KMP_USE_DYNAMIC_LOCK
3404   if (__kmp_env_consistency_check)
3405     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3406 #else
3407   if (__kmp_env_consistency_check)
3408     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3409 #endif
3410 
3411 #if OMP_40_ENABLED
3412   th = __kmp_thread_from_gtid(global_tid);
3413   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3414 #endif // OMP_40_ENABLED
3415 
3416   // The packed_reduction_method value will be reused by the __kmpc_end_reduce*
3417   // functions, so it has to be kept in a variable.
3418   // The variable should be either a construct-specific or a thread-specific
3419   // property, not a team-specific property
3420   //     (a thread can reach the next reduce block on the next construct, and
3421   //     the reduce method may differ on that construct).
3422   // An ident_t "loc" parameter could be used as a construct-specific property
3423   // (but what if loc == 0?).
3424   //     (If a construct-specific or team-specific variable were shared,
3425   //     unnecessary extra syncs would be needed.)
3426   // A thread-specific variable is better with regard to the two issues above
3427   // (next construct and extra syncs); a thread-specific
3428   // "th_local.packed_reduction_method" variable is used currently.
3429   // Each thread executes the 'determine' and 'set' lines (there is no need to
3430   // have a single thread do it, which would require unnecessary extra syncs).
3431 
3432   packed_reduction_method = __kmp_determine_reduction_method(
3433       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3434   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3435 
3436   if (packed_reduction_method == critical_reduce_block) {
3437 
3438     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3439     retval = 1;
3440 
3441   } else if (packed_reduction_method == empty_reduce_block) {
3442 
3443     // usage: if team size == 1, no synchronization is required ( Intel
3444     // platforms only )
3445     retval = 1;
3446 
3447   } else if (packed_reduction_method == atomic_reduce_block) {
3448 
3449     retval = 2;
3450 
3451     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3452     // won't be called by the code gen)
3453     //     (this is not ideal: the checking block has already been closed by
3454     //     this 'pop', but the atomic operation has not been executed yet;
3455     //     it will be executed slightly later, literally on the next
3456     //     instruction)
3457     if (__kmp_env_consistency_check)
3458       __kmp_pop_sync(global_tid, ct_reduce, loc);
3459 
3460   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3461                                    tree_reduce_block)) {
3462 
3463 // AT: performance issue: a real barrier here
3464 // AT:     (if the master goes slow, other threads are blocked here waiting for
3465 // the master to come and release them)
3466 // AT:     (it's not what a customer might expect when specifying NOWAIT)
3467 // AT:     (specifying NOWAIT won't result in improved performance; it will be
3468 // confusing to a customer)
3469 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3470 // might go faster and be more in line with the sense of NOWAIT
3471 // AT: TODO: run the EPCC benchmark and compare times
3472 
3473 // this barrier should be invisible to a customer and to the threading profile
3474 // tool (it's neither a terminating barrier nor customer's code, it's
3475 // used for an internal purpose)
3476 #if OMPT_SUPPORT
3477     // JP: can this barrier potentially lead to task scheduling?
3478     // JP: as long as there is a barrier in the implementation, OMPT should and
3479     // will provide the barrier events
3480     //         so we set up the necessary frame/return addresses.
3481     ompt_frame_t *ompt_frame;
3482     if (ompt_enabled.enabled) {
3483       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3484       if (ompt_frame->enter_frame.ptr == NULL)
3485         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3486       OMPT_STORE_RETURN_ADDRESS(global_tid);
3487     }
3488 #endif
3489 #if USE_ITT_NOTIFY
3490     __kmp_threads[global_tid]->th.th_ident = loc;
3491 #endif
3492     retval =
3493         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3494                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3495     retval = (retval != 0) ? (0) : (1);
3496 #if OMPT_SUPPORT && OMPT_OPTIONAL
3497     if (ompt_enabled.enabled) {
3498       ompt_frame->enter_frame = ompt_data_none;
3499     }
3500 #endif
3501 
3502     // all workers except the master should do this pop here
3503     //     (none of the other workers will get to __kmpc_end_reduce_nowait())
3504     if (__kmp_env_consistency_check) {
3505       if (retval == 0) {
3506         __kmp_pop_sync(global_tid, ct_reduce, loc);
3507       }
3508     }
3509 
3510   } else {
3511 
3512     // should never reach this block
3513     KMP_ASSERT(0); // "unexpected method"
3514   }
3515 #if OMP_40_ENABLED
3516   if (teams_swapped) {
3517     __kmp_restore_swapped_teams(th, team, task_state);
3518   }
3519 #endif
3520   KA_TRACE(
3521       10,
3522       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3523        global_tid, packed_reduction_method, retval));
3524 
3525   return retval;
3526 }
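
/* Usage sketch (illustrative only, not generated by this file): a compiler
   lowering "#pragma omp for reduction(+ : sum) nowait" typically emits a
   sequence like the following inside the outlined region; the local names and
   the combiner add_doubles() are hypothetical.

     static kmp_critical_name red_lock = {0};
     double local_sum = 0.0; // thread-private partial result
     // ... accumulate into local_sum ...
     switch (__kmpc_reduce_nowait(loc, gtid, 1, sizeof(double), &local_sum,
                                  add_doubles, &red_lock)) {
     case 1: // this thread combines directly (critical/empty/tree method)
       shared_sum += local_sum;
       __kmpc_end_reduce_nowait(loc, gtid, &red_lock);
       break;
     case 2: // atomic method: every thread updates atomically, no "end" call
       // atomically add local_sum into shared_sum
       break;
     default: // 0: nothing left to do on this thread
       break;
     }
*/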
3527 
3528 /*!
3529 @ingroup SYNCHRONIZATION
3530 @param loc source location information
3531 @param global_tid global thread id.
3532 @param lck pointer to the unique lock data structure
3533 
3534 Finish the execution of a reduce nowait.
3535 */
3536 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3537                               kmp_critical_name *lck) {
3538 
3539   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3540 
3541   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3542 
3543   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3544 
3545   if (packed_reduction_method == critical_reduce_block) {
3546 
3547     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3548 
3549   } else if (packed_reduction_method == empty_reduce_block) {
3550 
3551     // usage: if team size == 1, no synchronization is required ( on Intel
3552     // platforms only )
3553 
3554   } else if (packed_reduction_method == atomic_reduce_block) {
3555 
3556     // neither master nor other workers should get here
3557     //     (code gen does not generate this call in case 2: atomic reduce block)
3558     // actually, it would be better to remove this else-if entirely; after
3559     // removal this value would be checked by the 'else' branch and would assert
3560 
3561   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3562                                    tree_reduce_block)) {
3563 
3564     // only master gets here
3565 
3566   } else {
3567 
3568     // should never reach this block
3569     KMP_ASSERT(0); // "unexpected method"
3570   }
3571 
3572   if (__kmp_env_consistency_check)
3573     __kmp_pop_sync(global_tid, ct_reduce, loc);
3574 
3575   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3576                 global_tid, packed_reduction_method));
3577 
3578   return;
3579 }
3580 
3581 /* 2.a.ii. Reduce Block with a terminating barrier */
3582 
3583 /*!
3584 @ingroup SYNCHRONIZATION
3585 @param loc source location information
3586 @param global_tid global thread number
3587 @param num_vars number of items (variables) to be reduced
3588 @param reduce_size size of data in bytes to be reduced
3589 @param reduce_data pointer to data to be reduced
3590 @param reduce_func callback function providing reduction operation on two
3591 operands and returning result of reduction in lhs_data
3592 @param lck pointer to the unique lock data structure
3593 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3594 threads if atomic reduction needed
3595 
3596 A blocking reduce that includes an implicit barrier.
3597 */
3598 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3599                         size_t reduce_size, void *reduce_data,
3600                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3601                         kmp_critical_name *lck) {
3602   KMP_COUNT_BLOCK(REDUCE_wait);
3603   int retval = 0;
3604   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3605 #if OMP_40_ENABLED
3606   kmp_info_t *th;
3607   kmp_team_t *team;
3608   int teams_swapped = 0, task_state;
3609 #endif
3610 
3611   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3612 
3613   // why do we need this initialization here at all?
3614   // Reduction clause cannot be a stand-alone directive.
3615 
3616   // do not call __kmp_serial_initialize(), it will be called by
3617   // __kmp_parallel_initialize() if needed
3618   // possible detection of false-positive race by the threadchecker ???
3619   if (!TCR_4(__kmp_init_parallel))
3620     __kmp_parallel_initialize();
3621 
3622 #if OMP_50_ENABLED
3623   __kmp_resume_if_soft_paused();
3624 #endif
3625 
3626 // check correctness of reduce block nesting
3627 #if KMP_USE_DYNAMIC_LOCK
3628   if (__kmp_env_consistency_check)
3629     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3630 #else
3631   if (__kmp_env_consistency_check)
3632     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3633 #endif
3634 
3635 #if OMP_40_ENABLED
3636   th = __kmp_thread_from_gtid(global_tid);
3637   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3638 #endif // OMP_40_ENABLED
3639 
3640   packed_reduction_method = __kmp_determine_reduction_method(
3641       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3642   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3643 
3644   if (packed_reduction_method == critical_reduce_block) {
3645 
3646     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3647     retval = 1;
3648 
3649   } else if (packed_reduction_method == empty_reduce_block) {
3650 
3651     // usage: if team size == 1, no synchronization is required ( Intel
3652     // platforms only )
3653     retval = 1;
3654 
3655   } else if (packed_reduction_method == atomic_reduce_block) {
3656 
3657     retval = 2;
3658 
3659   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3660                                    tree_reduce_block)) {
3661 
3662 // case tree_reduce_block:
3663 // this barrier should be visible to a customer and to the threading profile
3664 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3665 #if OMPT_SUPPORT
3666     ompt_frame_t *ompt_frame;
3667     if (ompt_enabled.enabled) {
3668       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3669       if (ompt_frame->enter_frame.ptr == NULL)
3670         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3671       OMPT_STORE_RETURN_ADDRESS(global_tid);
3672     }
3673 #endif
3674 #if USE_ITT_NOTIFY
3675     __kmp_threads[global_tid]->th.th_ident =
3676         loc; // needed for correct notification of frames
3677 #endif
3678     retval =
3679         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3680                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3681     retval = (retval != 0) ? (0) : (1);
3682 #if OMPT_SUPPORT && OMPT_OPTIONAL
3683     if (ompt_enabled.enabled) {
3684       ompt_frame->enter_frame = ompt_data_none;
3685     }
3686 #endif
3687 
3688     // all workers except the master should do this pop here
3689     // (none of the workers other than the master will enter __kmpc_end_reduce())
3690     if (__kmp_env_consistency_check) {
3691       if (retval == 0) { // 0: all other workers; 1: master
3692         __kmp_pop_sync(global_tid, ct_reduce, loc);
3693       }
3694     }
3695 
3696   } else {
3697 
3698     // should never reach this block
3699     KMP_ASSERT(0); // "unexpected method"
3700   }
3701 #if OMP_40_ENABLED
3702   if (teams_swapped) {
3703     __kmp_restore_swapped_teams(th, team, task_state);
3704   }
3705 #endif
3706 
3707   KA_TRACE(10,
3708            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3709             global_tid, packed_reduction_method, retval));
3710 
3711   return retval;
3712 }
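
/* Sketch of the blocking variant (illustrative only; names mirror the nowait
   sketch above): for "#pragma omp for reduction(+ : sum)" without nowait,
   codegen pairs __kmpc_reduce() with __kmpc_end_reduce(), and the "end" call
   is also made on the atomic path because it supplies the terminating barrier
   (see the atomic_reduce_block branch of __kmpc_end_reduce() below).

     int ret = __kmpc_reduce(loc, gtid, 1, sizeof(double), &local_sum,
                             add_doubles, &red_lock);
     if (ret == 1) {        // combine, then barrier + release via "end"
       shared_sum += local_sum;
       __kmpc_end_reduce(loc, gtid, &red_lock);
     } else if (ret == 2) { // atomic update, then barrier via "end"
       // atomically add local_sum into shared_sum
       __kmpc_end_reduce(loc, gtid, &red_lock);
     }
*/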
3713 
3714 /*!
3715 @ingroup SYNCHRONIZATION
3716 @param loc source location information
3717 @param global_tid global thread id.
3718 @param lck pointer to the unique lock data structure
3719 
3720 Finish the execution of a blocking reduce.
3721 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3722 start function.
3723 */
3724 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3725                        kmp_critical_name *lck) {
3726 
3727   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3728 #if OMP_40_ENABLED
3729   kmp_info_t *th;
3730   kmp_team_t *team;
3731   int teams_swapped = 0, task_state;
3732 #endif
3733 
3734   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3735 
3736 #if OMP_40_ENABLED
3737   th = __kmp_thread_from_gtid(global_tid);
3738   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3739 #endif // OMP_40_ENABLED
3740 
3741   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3742 
3743   // this barrier should be visible to a customer and to the threading profile
3744   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3745 
3746   if (packed_reduction_method == critical_reduce_block) {
3747 
3748     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3749 
3750 // TODO: implicit barrier: should be exposed
3751 #if OMPT_SUPPORT
3752     ompt_frame_t *ompt_frame;
3753     if (ompt_enabled.enabled) {
3754       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3755       if (ompt_frame->enter_frame.ptr == NULL)
3756         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3757       OMPT_STORE_RETURN_ADDRESS(global_tid);
3758     }
3759 #endif
3760 #if USE_ITT_NOTIFY
3761     __kmp_threads[global_tid]->th.th_ident = loc;
3762 #endif
3763     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3764 #if OMPT_SUPPORT && OMPT_OPTIONAL
3765     if (ompt_enabled.enabled) {
3766       ompt_frame->enter_frame = ompt_data_none;
3767     }
3768 #endif
3769 
3770   } else if (packed_reduction_method == empty_reduce_block) {
3771 
3772 // usage: if team size==1, no synchronization is required (Intel platforms only)
3773 
3774 // TODO: implicit barrier: should be exposed
3775 #if OMPT_SUPPORT
3776     ompt_frame_t *ompt_frame;
3777     if (ompt_enabled.enabled) {
3778       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3779       if (ompt_frame->enter_frame.ptr == NULL)
3780         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3781       OMPT_STORE_RETURN_ADDRESS(global_tid);
3782     }
3783 #endif
3784 #if USE_ITT_NOTIFY
3785     __kmp_threads[global_tid]->th.th_ident = loc;
3786 #endif
3787     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3788 #if OMPT_SUPPORT && OMPT_OPTIONAL
3789     if (ompt_enabled.enabled) {
3790       ompt_frame->enter_frame = ompt_data_none;
3791     }
3792 #endif
3793 
3794   } else if (packed_reduction_method == atomic_reduce_block) {
3795 
3796 #if OMPT_SUPPORT
3797     ompt_frame_t *ompt_frame;
3798     if (ompt_enabled.enabled) {
3799       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3800       if (ompt_frame->enter_frame.ptr == NULL)
3801         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3802       OMPT_STORE_RETURN_ADDRESS(global_tid);
3803     }
3804 #endif
3805 // TODO: implicit barrier: should be exposed
3806 #if USE_ITT_NOTIFY
3807     __kmp_threads[global_tid]->th.th_ident = loc;
3808 #endif
3809     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3810 #if OMPT_SUPPORT && OMPT_OPTIONAL
3811     if (ompt_enabled.enabled) {
3812       ompt_frame->enter_frame = ompt_data_none;
3813     }
3814 #endif
3815 
3816   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3817                                    tree_reduce_block)) {
3818 
3819     // only master executes here (master releases all other workers)
3820     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3821                             global_tid);
3822 
3823   } else {
3824 
3825     // should never reach this block
3826     KMP_ASSERT(0); // "unexpected method"
3827   }
3828 #if OMP_40_ENABLED
3829   if (teams_swapped) {
3830     __kmp_restore_swapped_teams(th, team, task_state);
3831   }
3832 #endif
3833 
3834   if (__kmp_env_consistency_check)
3835     __kmp_pop_sync(global_tid, ct_reduce, loc);
3836 
3837   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3838                 global_tid, packed_reduction_method));
3839 
3840   return;
3841 }
3842 
3843 #undef __KMP_GET_REDUCTION_METHOD
3844 #undef __KMP_SET_REDUCTION_METHOD
3845 
3846 /* end of interface to fast scalable reduce routines */
3847 
3848 kmp_uint64 __kmpc_get_taskid() {
3849 
3850   kmp_int32 gtid;
3851   kmp_info_t *thread;
3852 
3853   gtid = __kmp_get_gtid();
3854   if (gtid < 0) {
3855     return 0;
3856   }
3857   thread = __kmp_thread_from_gtid(gtid);
3858   return thread->th.th_current_task->td_task_id;
3859 
3860 } // __kmpc_get_taskid
3861 
3862 kmp_uint64 __kmpc_get_parent_taskid() {
3863 
3864   kmp_int32 gtid;
3865   kmp_info_t *thread;
3866   kmp_taskdata_t *parent_task;
3867 
3868   gtid = __kmp_get_gtid();
3869   if (gtid < 0) {
3870     return 0;
3871   }
3872   thread = __kmp_thread_from_gtid(gtid);
3873   parent_task = thread->th.th_current_task->td_parent;
3874   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3875 
3876 } // __kmpc_get_parent_taskid
3877 
3878 #if OMP_45_ENABLED
3879 /*!
3880 @ingroup WORK_SHARING
3881 @param loc  source location information.
3882 @param gtid  global thread number.
3883 @param num_dims  number of associated doacross loops.
3884 @param dims  info on loops bounds.
3885 
3886 Initialize doacross loop information.
3887 Expect the compiler to send us inclusive bounds,
3888 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3889 */
3890 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3891                           const struct kmp_dim *dims) {
3892   int j, idx;
3893   kmp_int64 last, trace_count;
3894   kmp_info_t *th = __kmp_threads[gtid];
3895   kmp_team_t *team = th->th.th_team;
3896   kmp_uint32 *flags;
3897   kmp_disp_t *pr_buf = th->th.th_dispatch;
3898   dispatch_shared_info_t *sh_buf;
3899 
3900   KA_TRACE(
3901       20,
3902       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3903        gtid, num_dims, !team->t.t_serialized));
3904   KMP_DEBUG_ASSERT(dims != NULL);
3905   KMP_DEBUG_ASSERT(num_dims > 0);
3906 
3907   if (team->t.t_serialized) {
3908     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3909     return; // no dependencies if team is serialized
3910   }
3911   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3912   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3913   // the next loop
3914   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3915 
3916   // Save bounds info into allocated private buffer
3917   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3918   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3919       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3920   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3921   pr_buf->th_doacross_info[0] =
3922       (kmp_int64)num_dims; // first element is number of dimensions
3923   // Also save the address of num_done so that it can be accessed later without
3924   // knowing the buffer index
3925   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3926   pr_buf->th_doacross_info[2] = dims[0].lo;
3927   pr_buf->th_doacross_info[3] = dims[0].up;
3928   pr_buf->th_doacross_info[4] = dims[0].st;
3929   last = 5;
3930   for (j = 1; j < num_dims; ++j) {
3931     kmp_int64
3932         range_length; // To keep ranges of all dimensions but the first dims[0]
3933     if (dims[j].st == 1) { // most common case
3934       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3935       range_length = dims[j].up - dims[j].lo + 1;
3936     } else {
3937       if (dims[j].st > 0) {
3938         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3939         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3940       } else { // negative increment
3941         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3942         range_length =
3943             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3944       }
3945     }
3946     pr_buf->th_doacross_info[last++] = range_length;
3947     pr_buf->th_doacross_info[last++] = dims[j].lo;
3948     pr_buf->th_doacross_info[last++] = dims[j].up;
3949     pr_buf->th_doacross_info[last++] = dims[j].st;
3950   }
3951 
3952   // Compute total trip count.
3953   // Start with range of dims[0] which we don't need to keep in the buffer.
3954   if (dims[0].st == 1) { // most common case
3955     trace_count = dims[0].up - dims[0].lo + 1;
3956   } else if (dims[0].st > 0) {
3957     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3958     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3959   } else { // negative increment
3960     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3961     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3962   }
3963   for (j = 1; j < num_dims; ++j) {
3964     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3965   }
3966   KMP_DEBUG_ASSERT(trace_count > 0);
3967 
3968   // Check if shared buffer is not occupied by other loop (idx -
3969   // __kmp_dispatch_num_buffers)
3970   if (idx != sh_buf->doacross_buf_idx) {
3971     // Shared buffer is occupied, wait for it to be free
3972     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3973                  __kmp_eq_4, NULL);
3974   }
3975 #if KMP_32_BIT_ARCH
3976   // Check if we are the first thread. After the CAS the first thread gets 0,
3977   // others get 1 if initialization is in progress, allocated pointer otherwise.
3978   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3979   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3980       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3981 #else
3982   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3983       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3984 #endif
3985   if (flags == NULL) {
3986     // we are the first thread, allocate the array of flags
3987     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3988     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3989     KMP_MB();
3990     sh_buf->doacross_flags = flags;
3991   } else if (flags == (kmp_uint32 *)1) {
3992 #if KMP_32_BIT_ARCH
3993     // initialization is still in progress, need to wait
3994     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3995 #else
3996     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3997 #endif
3998       KMP_YIELD(TRUE);
3999     KMP_MB();
4000   } else {
4001     KMP_MB();
4002   }
4003   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4004   pr_buf->th_doacross_flags =
4005       sh_buf->doacross_flags; // save private copy in order to not
4006   // touch shared buffer on each iteration
4007   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4008 }
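
/* Worked example (matches the doc comment above): for the loop
   "for (i = 2; i < 9; i += 2)" the compiler passes the inclusive bounds
   lo = 2, up = 8, st = 2, so the trip count computed here is
   (8 - 2) / 2 + 1 = 4 iterations, and one flag bit per iteration is reserved
   in the shared doacross_flags array. */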
4009 
4010 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4011   kmp_int32 shft, num_dims, i;
4012   kmp_uint32 flag;
4013   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4014   kmp_info_t *th = __kmp_threads[gtid];
4015   kmp_team_t *team = th->th.th_team;
4016   kmp_disp_t *pr_buf;
4017   kmp_int64 lo, up, st;
4018 
4019   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4020   if (team->t.t_serialized) {
4021     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4022     return; // no dependencies if team is serialized
4023   }
4024 
4025   // calculate sequential iteration number and check out-of-bounds condition
4026   pr_buf = th->th.th_dispatch;
4027   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4028   num_dims = pr_buf->th_doacross_info[0];
4029   lo = pr_buf->th_doacross_info[2];
4030   up = pr_buf->th_doacross_info[3];
4031   st = pr_buf->th_doacross_info[4];
4032   if (st == 1) { // most common case
4033     if (vec[0] < lo || vec[0] > up) {
4034       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4035                     "bounds [%lld,%lld]\n",
4036                     gtid, vec[0], lo, up));
4037       return;
4038     }
4039     iter_number = vec[0] - lo;
4040   } else if (st > 0) {
4041     if (vec[0] < lo || vec[0] > up) {
4042       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4043                     "bounds [%lld,%lld]\n",
4044                     gtid, vec[0], lo, up));
4045       return;
4046     }
4047     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4048   } else { // negative increment
4049     if (vec[0] > lo || vec[0] < up) {
4050       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4051                     "bounds [%lld,%lld]\n",
4052                     gtid, vec[0], lo, up));
4053       return;
4054     }
4055     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4056   }
4057   for (i = 1; i < num_dims; ++i) {
4058     kmp_int64 iter, ln;
4059     kmp_int32 j = i * 4;
4060     ln = pr_buf->th_doacross_info[j + 1];
4061     lo = pr_buf->th_doacross_info[j + 2];
4062     up = pr_buf->th_doacross_info[j + 3];
4063     st = pr_buf->th_doacross_info[j + 4];
4064     if (st == 1) {
4065       if (vec[i] < lo || vec[i] > up) {
4066         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4067                       "bounds [%lld,%lld]\n",
4068                       gtid, vec[i], lo, up));
4069         return;
4070       }
4071       iter = vec[i] - lo;
4072     } else if (st > 0) {
4073       if (vec[i] < lo || vec[i] > up) {
4074         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4075                       "bounds [%lld,%lld]\n",
4076                       gtid, vec[i], lo, up));
4077         return;
4078       }
4079       iter = (kmp_uint64)(vec[i] - lo) / st;
4080     } else { // st < 0
4081       if (vec[i] > lo || vec[i] < up) {
4082         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4083                       "bounds [%lld,%lld]\n",
4084                       gtid, vec[i], lo, up));
4085         return;
4086       }
4087       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4088     }
4089     iter_number = iter + ln * iter_number;
4090   }
4091   shft = iter_number % 32; // use 32-bit granularity
4092   iter_number >>= 5; // divided by 32
4093   flag = 1 << shft;
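  // Example: collapsed iteration number 70 maps to word 70 / 32 == 2 of the
  // flag array and to bit 70 % 32 == 6 within that word, i.e. mask 0x40.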
4094   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4095     KMP_YIELD(TRUE);
4096   }
4097   KMP_MB();
4098   KA_TRACE(20,
4099            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4100             gtid, (iter_number << 5) + shft));
4101 }
4102 
4103 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4104   kmp_int32 shft, num_dims, i;
4105   kmp_uint32 flag;
4106   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4107   kmp_info_t *th = __kmp_threads[gtid];
4108   kmp_team_t *team = th->th.th_team;
4109   kmp_disp_t *pr_buf;
4110   kmp_int64 lo, st;
4111 
4112   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4113   if (team->t.t_serialized) {
4114     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4115     return; // no dependencies if team is serialized
4116   }
4117 
4118   // calculate sequential iteration number (same as in "wait" but no
4119   // out-of-bounds checks)
4120   pr_buf = th->th.th_dispatch;
4121   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4122   num_dims = pr_buf->th_doacross_info[0];
4123   lo = pr_buf->th_doacross_info[2];
4124   st = pr_buf->th_doacross_info[4];
4125   if (st == 1) { // most common case
4126     iter_number = vec[0] - lo;
4127   } else if (st > 0) {
4128     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4129   } else { // negative increment
4130     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4131   }
4132   for (i = 1; i < num_dims; ++i) {
4133     kmp_int64 iter, ln;
4134     kmp_int32 j = i * 4;
4135     ln = pr_buf->th_doacross_info[j + 1];
4136     lo = pr_buf->th_doacross_info[j + 2];
4137     st = pr_buf->th_doacross_info[j + 4];
4138     if (st == 1) {
4139       iter = vec[i] - lo;
4140     } else if (st > 0) {
4141       iter = (kmp_uint64)(vec[i] - lo) / st;
4142     } else { // st < 0
4143       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4144     }
4145     iter_number = iter + ln * iter_number;
4146   }
4147   shft = iter_number % 32; // use 32-bit granularity
4148   iter_number >>= 5; // divided by 32
4149   flag = 1 << shft;
4150   KMP_MB();
4151   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4152     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4153   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4154                 (iter_number << 5) + shft));
4155 }
4156 
4157 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4158   kmp_int32 num_done;
4159   kmp_info_t *th = __kmp_threads[gtid];
4160   kmp_team_t *team = th->th.th_team;
4161   kmp_disp_t *pr_buf = th->th.th_dispatch;
4162 
4163   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4164   if (team->t.t_serialized) {
4165     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4166     return; // nothing to do
4167   }
4168   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4169   if (num_done == th->th.th_team_nproc) {
4170     // we are the last thread, need to free shared resources
4171     int idx = pr_buf->th_doacross_buf_idx - 1;
4172     dispatch_shared_info_t *sh_buf =
4173         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4174     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4175                      (kmp_int64)&sh_buf->doacross_num_done);
4176     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4177     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4178     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4179     sh_buf->doacross_flags = NULL;
4180     sh_buf->doacross_num_done = 0;
4181     sh_buf->doacross_buf_idx +=
4182         __kmp_dispatch_num_buffers; // free buffer for future re-use
4183   }
4184   // free private resources (need to keep buffer index forever)
4185   pr_buf->th_doacross_flags = NULL;
4186   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4187   pr_buf->th_doacross_info = NULL;
4188   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4189 }
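
/* Usage sketch for the doacross entry points above (illustrative only; the
   outlining and the names my_lb/my_ub/consume/produce are hypothetical). For

     #pragma omp for ordered(1)
     for (i = 2; i < 9; i += 2) {
       #pragma omp ordered depend(sink : i - 2)
       consume(i - 2); // uses the value produced by iteration i-2
       produce(i);
       #pragma omp ordered depend(source)
     }

   each thread of the team would execute roughly:

     struct kmp_dim dim = {2, 8, 2}; // lo, up, st -- inclusive bounds
     __kmpc_doacross_init(loc, gtid, 1, &dim);
     for (kmp_int64 i = my_lb; i <= my_ub; i += 2) { // this thread's chunk
       kmp_int64 vec = i - 2;
       __kmpc_doacross_wait(loc, gtid, &vec); // no-op if i-2 is out of bounds
       consume(i - 2);
       produce(i);
       vec = i;
       __kmpc_doacross_post(loc, gtid, &vec); // publish iteration i
     }
     __kmpc_doacross_fini(loc, gtid);
*/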
4190 #endif
4191 
4192 #if OMP_50_ENABLED
4193 int __kmpc_get_target_offload(void) {
4194   if (!__kmp_init_serial) {
4195     __kmp_serial_initialize();
4196   }
4197   return __kmp_target_offload;
4198 }
4199 
4200 int __kmpc_pause_resource(kmp_pause_status_t level) {
4201   if (!__kmp_init_serial) {
4202     return 1; // Can't pause if runtime is not initialized
4203   }
4204   return __kmp_pause_resource(level);
4205 }
4206 #endif // OMP_50_ENABLED
4207 
4208 // end of file //
4209