1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
// flags will be used in the future, e.g., to implement openmp_strict library
// restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     __kmp_assign_root_init_mask();
43     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44   } else if (__kmp_ignore_mppbeg() == FALSE) {
45     // By default __kmp_ignore_mppbeg() returns TRUE.
46     __kmp_internal_begin();
47     KC_TRACE(10, ("__kmpc_begin: called\n"));
48   }
49 }
50 
51 /*!
52  * @ingroup STARTUP_SHUTDOWN
53  * @param loc source location information
54  *
 * Shut down the runtime library. This is also optional, and even if called it
 * will not do anything unless the `KMP_IGNORE_MPPEND` environment variable is
 * set to zero.
58  */
59 void __kmpc_end(ident_t *loc) {
60   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
61   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
62   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
63   // returns FALSE and __kmpc_end() will unregister this root (it can cause
64   // library shut down).
65   if (__kmp_ignore_mppend() == FALSE) {
66     KC_TRACE(10, ("__kmpc_end: called\n"));
67     KA_TRACE(30, ("__kmpc_end\n"));
68 
69     __kmp_internal_end_thread(-1);
70   }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72   // Normal exit process on Windows does not allow worker threads of the final
73   // parallel region to finish reporting their events, so shutting down the
74   // library here fixes the issue at least for the cases where __kmpc_end() is
75   // placed properly.
76   if (ompt_enabled.enabled)
77     __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80 
81 /*!
82 @ingroup THREAD_STATES
83 @param loc Source location information.
84 @return The global thread index of the active thread.
85 
86 This function can be called in any context.
87 
If the runtime has only been entered at the outermost level from a
89 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90 that which would be returned by omp_get_thread_num() in the outermost
91 active parallel construct. (Or zero if there is no active parallel
92 construct, since the primary thread is necessarily thread zero).
93 
94 If multiple non-OpenMP threads all enter an OpenMP construct then this
95 will be a unique thread identifier among all the threads created by
96 the OpenMP runtime (but the value cannot be defined in terms of
97 OpenMP thread ids returned by omp_get_thread_num()).
98 */
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100   kmp_int32 gtid = __kmp_entry_gtid();
101 
102   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104   return gtid;
105 }
106 
107 /*!
108 @ingroup THREAD_STATES
109 @param loc Source location information.
110 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 
112 This function can be called in any context.
113 It returns the total number of threads under the control of the OpenMP runtime.
114 That is not a number that can be determined by any OpenMP standard calls, since
115 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, because the runtime maintains
underlying threads even when they are not active (since the cost of creating
and destroying OS threads is high), this call counts all such threads even if
they are not currently waiting for work.
120 */
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122   KC_TRACE(10,
123            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125   return TCR_4(__kmp_all_nth);
126 }
127 
128 /*!
129 @ingroup THREAD_STATES
130 @param loc Source location information.
131 @return The thread number of the calling thread in the innermost active parallel
132 construct.
133 */
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136   return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
139 /*!
140 @ingroup THREAD_STATES
141 @param loc Source location information.
142 @return The number of threads in the innermost active parallel construct.
143 */
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147   return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
150 /*!
151  * @ingroup DEPRECATED
152  * @param loc location description
153  *
 * This function need not be called. It returns TRUE unless parallel-range
 * filtering (the debug-build __kmp_par_range check below) excludes the
 * construct at this source location.
155  */
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159   return TRUE;
160 
161 #else
162 
163   const char *semi2;
164   const char *semi3;
165   int line_no;
166 
167   if (__kmp_par_range == 0) {
168     return TRUE;
169   }
170   semi2 = loc->psource;
171   if (semi2 == NULL) {
172     return TRUE;
173   }
174   semi2 = strchr(semi2, ';');
175   if (semi2 == NULL) {
176     return TRUE;
177   }
178   semi2 = strchr(semi2 + 1, ';');
179   if (semi2 == NULL) {
180     return TRUE;
181   }
182   if (__kmp_par_range_filename[0]) {
183     const char *name = semi2 - 1;
184     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185       name--;
186     }
187     if ((*name == '/') || (*name == ';')) {
188       name++;
189     }
190     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191       return __kmp_par_range < 0;
192     }
193   }
194   semi3 = strchr(semi2 + 1, ';');
195   if (__kmp_par_range_routine[0]) {
196     if ((semi3 != NULL) && (semi3 > semi2) &&
197         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198       return __kmp_par_range < 0;
199     }
200   }
201   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203       return __kmp_par_range > 0;
204     }
205     return __kmp_par_range < 0;
206   }
207   return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
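
// Note on the parsing above: it assumes ident_t::psource is a
// semicolon-separated string of roughly the following shape (the exact
// contents are produced by the compiler front end, so this is illustrative
// only):
//
//   ";path/to/file.c;routine_name;42;..."
//
// semi2 is left pointing at the second ';' (end of the file name), semi3 at
// the third (end of the routine name), and the line number is read from the
// field that follows; those pieces are what the __kmp_par_range* filters are
// matched against.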
211 
212 /*!
213 @ingroup THREAD_STATES
214 @param loc Source location information.
215 @return 1 if this thread is executing inside an active parallel region, zero if
216 not.
217 */
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219   return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
222 /*!
223 @ingroup PARALLEL
224 @param loc source location information
225 @param global_tid global thread number
226 @param num_threads number of threads requested for this parallel construct
227 
228 Set the number of threads to be used by the next fork spawned by this thread.
229 This call is only required if the parallel construct has a `num_threads` clause.
230 */
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232                              kmp_int32 num_threads) {
233   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234                 global_tid, num_threads));
235   __kmp_assert_valid_gtid(global_tid);
236   __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
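
// Lowering sketch (illustrative only; "loc" and "outlined" stand for the
// compiler-emitted ident_t and the outlined parallel body):
//
//   // #pragma omp parallel num_threads(4)
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_push_num_threads(&loc, gtid, 4);
//   __kmpc_fork_call(&loc, /*argc=*/0, (kmpc_micro)outlined);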
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248   __kmp_assert_valid_gtid(global_tid);
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
  // Maybe saving just thr_state would be enough here.
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       kmp_team_t *parent_team = master_th->th.th_team;
292       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
293       if (lwt)
294         ompt_frame = &(lwt->ompt_task_info.frame);
295       else {
296         int tid = __kmp_tid_from_gtid(gtid);
297         ompt_frame = &(
298             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
299       }
300       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
301     }
302     OMPT_STORE_RETURN_ADDRESS(gtid);
303 #endif
304 
305 #if INCLUDE_SSC_MARKS
306     SSC_MARK_FORKING();
307 #endif
308     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
309                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
310                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
311                     kmp_va_addr_of(ap));
312 #if INCLUDE_SSC_MARKS
313     SSC_MARK_JOINING();
314 #endif
315     __kmp_join_call(loc, gtid
316 #if OMPT_SUPPORT
317                     ,
318                     fork_context_intel
319 #endif
320     );
321 
322     va_end(ap);
323   }
324 
325 #if KMP_STATS_ENABLED
326   if (previous_state == stats_state_e::SERIAL_REGION) {
327     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
328     KMP_SET_THREAD_STATE(previous_state);
329   } else {
330     KMP_POP_PARTITIONED_TIMER();
331   }
332 #endif // KMP_STATS_ENABLED
333 }
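
// A sketch of the outlining scheme this entry point assumes (illustrative
// only; the names "loc", "outlined", and "x" are not runtime symbols). The
// microtask's first two parameters receive the global and bound thread ids;
// the remaining parameters are the shared variables passed through the
// ellipsis, one per 'argc':
//
//   static void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *x) {
//     /* body of "#pragma omp parallel shared(x)" */
//   }
//   ...
//   int x = 0;
//   __kmpc_fork_call(&loc, /*argc=*/1, (kmpc_micro)outlined, &x);
//
// Every thread of the forked team invokes outlined() with its own thread ids
// and the same &x argument.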
334 
335 /*!
336 @ingroup PARALLEL
337 @param loc source location information
338 @param global_tid global thread number
339 @param num_teams number of teams requested for the teams construct
340 @param num_threads number of threads per team requested for the teams construct
341 
342 Set the number of teams to be used by the teams construct.
343 This call is only required if the teams construct has a `num_teams` clause
344 or a `thread_limit` clause (or both).
345 */
346 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
347                            kmp_int32 num_teams, kmp_int32 num_threads) {
348   KA_TRACE(20,
349            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
350             global_tid, num_teams, num_threads));
351   __kmp_assert_valid_gtid(global_tid);
352   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
353 }
354 
355 /*!
356 @ingroup PARALLEL
357 @param loc source location information
358 @param global_tid global thread number
@param num_teams_lb lower bound on the number of teams requested for the teams
construct
@param num_teams_ub upper bound on the number of teams requested for the teams
construct
363 @param num_threads number of threads per team requested for the teams construct
364 
365 Set the number of teams to be used by the teams construct. The number of initial
teams created will be greater than or equal to the lower bound and less than or
367 equal to the upper bound.
368 This call is only required if the teams construct has a `num_teams` clause
369 or a `thread_limit` clause (or both).
370 */
371 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
372                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
373                               kmp_int32 num_threads) {
374   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
375                 " num_teams_ub=%d num_threads=%d\n",
376                 global_tid, num_teams_lb, num_teams_ub, num_threads));
377   __kmp_assert_valid_gtid(global_tid);
378   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
379                           num_threads);
380 }
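
// Sketch of how an OpenMP 5.1 num_teams clause with both bounds might reach
// this entry point (illustrative; "loc", "gtid", and "outlined_teams" are
// assumed names):
//
//   // #pragma omp teams num_teams(4 : 8) thread_limit(16)
//   __kmpc_push_num_teams_51(&loc, gtid, /*num_teams_lb=*/4,
//                            /*num_teams_ub=*/8, /*num_threads=*/16);
//   __kmpc_fork_teams(&loc, /*argc=*/0, (kmpc_micro)outlined_teams);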
381 
382 /*!
383 @ingroup PARALLEL
384 @param loc  source location information
385 @param argc  total number of arguments in the ellipsis
386 @param microtask  pointer to callback routine consisting of outlined teams
387 construct
388 @param ...  pointers to shared variables that aren't global
389 
390 Do the actual fork and call the microtask in the relevant number of threads.
391 */
392 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
393                        ...) {
394   int gtid = __kmp_entry_gtid();
395   kmp_info_t *this_thr = __kmp_threads[gtid];
396   va_list ap;
397   va_start(ap, microtask);
398 
399 #if KMP_STATS_ENABLED
400   KMP_COUNT_BLOCK(OMP_TEAMS);
401   stats_state_e previous_state = KMP_GET_THREAD_STATE();
402   if (previous_state == stats_state_e::SERIAL_REGION) {
403     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
404   } else {
405     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
406   }
407 #endif
408 
409   // remember teams entry point and nesting level
410   this_thr->th.th_teams_microtask = microtask;
411   this_thr->th.th_teams_level =
412       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
413 
414 #if OMPT_SUPPORT
415   kmp_team_t *parent_team = this_thr->th.th_team;
416   int tid = __kmp_tid_from_gtid(gtid);
417   if (ompt_enabled.enabled) {
418     parent_team->t.t_implicit_task_taskdata[tid]
419         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
420   }
421   OMPT_STORE_RETURN_ADDRESS(gtid);
422 #endif
423 
  // Check whether __kmpc_push_num_teams was called; otherwise set the default
  // number of teams.
426   if (this_thr->th.th_teams_size.nteams == 0) {
427     __kmp_push_num_teams(loc, gtid, 0, 0);
428   }
429   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
430   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
431   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
432 
433   __kmp_fork_call(
434       loc, gtid, fork_context_intel, argc,
435       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
436       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
437   __kmp_join_call(loc, gtid
438 #if OMPT_SUPPORT
439                   ,
440                   fork_context_intel
441 #endif
442   );
443 
444   // Pop current CG root off list
445   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
446   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
447   this_thr->th.th_cg_roots = tmp->up;
448   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
449                  " to node %p. cg_nthreads was %d\n",
450                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
451   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
452   int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in the CG (not always the case)
454     __kmp_free(tmp);
455   }
456   // Restore current task's thread_limit from CG root
457   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
458   this_thr->th.th_current_task->td_icvs.thread_limit =
459       this_thr->th.th_cg_roots->cg_thread_limit;
460 
461   this_thr->th.th_teams_microtask = NULL;
462   this_thr->th.th_teams_level = 0;
463   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
464   va_end(ap);
465 #if KMP_STATS_ENABLED
466   if (previous_state == stats_state_e::SERIAL_REGION) {
467     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
468     KMP_SET_THREAD_STATE(previous_state);
469   } else {
470     KMP_POP_PARTITIONED_TIMER();
471   }
472 #endif // KMP_STATS_ENABLED
473 }
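
// Lowering sketch for a teams construct, analogous to __kmpc_fork_call above
// (illustrative; the names are assumptions, not runtime symbols):
//
//   static void outlined_teams(kmp_int32 *gtid, kmp_int32 *btid) {
//     /* body of "#pragma omp teams" */
//   }
//   ...
//   __kmpc_push_num_teams(&loc, gtid, /*num_teams=*/4, /*num_threads=*/8);
//   __kmpc_fork_teams(&loc, /*argc=*/0, (kmpc_micro)outlined_teams);
//
// If __kmpc_push_num_teams was not called, a default team count is set up
// inside __kmpc_fork_teams before forking (see the check above).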
474 
475 // I don't think this function should ever have been exported.
// The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
// OpenMP code ever called it, but it's been exported from the RTL for so
478 // long that I'm afraid to remove the definition.
479 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
480 
481 /*!
482 @ingroup PARALLEL
483 @param loc  source location information
484 @param global_tid  global thread number
485 
486 Enter a serialized parallel construct. This interface is used to handle a
487 conditional parallel region, like this,
488 @code
489 #pragma omp parallel if (condition)
490 @endcode
491 when the condition is false.
492 */
493 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
494   // The implementation is now in kmp_runtime.cpp so that it can share static
495   // functions with kmp_fork_call since the tasks to be done are similar in
496   // each case.
497   __kmp_assert_valid_gtid(global_tid);
498 #if OMPT_SUPPORT
499   OMPT_STORE_RETURN_ADDRESS(global_tid);
500 #endif
501   __kmp_serialized_parallel(loc, global_tid);
502 }
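
// Sketch of the conditional-parallel lowering this supports (illustrative;
// "loc", "cond", and "outlined" are assumed names):
//
//   // #pragma omp parallel if (cond)
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   if (cond) {
//     __kmpc_fork_call(&loc, /*argc=*/0, (kmpc_micro)outlined);
//   } else {
//     __kmpc_serialized_parallel(&loc, gtid);
//     kmp_int32 btid = 0;
//     outlined(&gtid, &btid); // run the body on the encountering thread
//     __kmpc_end_serialized_parallel(&loc, gtid);
//   }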
503 
504 /*!
505 @ingroup PARALLEL
506 @param loc  source location information
507 @param global_tid  global thread number
508 
509 Leave a serialized parallel construct.
510 */
511 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
512   kmp_internal_control_t *top;
513   kmp_info_t *this_thr;
514   kmp_team_t *serial_team;
515 
516   KC_TRACE(10,
517            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
518 
519   /* skip all this code for autopar serialized loops since it results in
520      unacceptable overhead */
521   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
522     return;
523 
524   // Not autopar code
525   __kmp_assert_valid_gtid(global_tid);
526   if (!TCR_4(__kmp_init_parallel))
527     __kmp_parallel_initialize();
528 
529   __kmp_resume_if_soft_paused();
530 
531   this_thr = __kmp_threads[global_tid];
532   serial_team = this_thr->th.th_serial_team;
533 
534   kmp_task_team_t *task_team = this_thr->th.th_task_team;
535   // we need to wait for the proxy tasks before finishing the thread
536   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
537     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
538 
539   KMP_MB();
540   KMP_DEBUG_ASSERT(serial_team);
541   KMP_ASSERT(serial_team->t.t_serialized);
542   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
543   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
544   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
545   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
546 
547 #if OMPT_SUPPORT
548   if (ompt_enabled.enabled &&
549       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
550     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
551     if (ompt_enabled.ompt_callback_implicit_task) {
552       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
553           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
554           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
555     }
556 
    // clear the task id only after unlinking the task
558     ompt_data_t *parent_task_data;
559     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
560 
561     if (ompt_enabled.ompt_callback_parallel_end) {
562       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
563           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
564           ompt_parallel_invoker_program | ompt_parallel_team,
565           OMPT_LOAD_RETURN_ADDRESS(global_tid));
566     }
567     __ompt_lw_taskteam_unlink(this_thr);
568     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
569   }
570 #endif
571 
572   /* If necessary, pop the internal control stack values and replace the team
573    * values */
574   top = serial_team->t.t_control_stack_top;
575   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
576     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
577     serial_team->t.t_control_stack_top = top->next;
578     __kmp_free(top);
579   }
580 
581   /* pop dispatch buffers stack */
582   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
583   {
584     dispatch_private_info_t *disp_buffer =
585         serial_team->t.t_dispatch->th_disp_buffer;
586     serial_team->t.t_dispatch->th_disp_buffer =
587         serial_team->t.t_dispatch->th_disp_buffer->next;
588     __kmp_free(disp_buffer);
589   }
590   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
591 
592   --serial_team->t.t_serialized;
593   if (serial_team->t.t_serialized == 0) {
594 
595     /* return to the parallel section */
596 
597 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
598     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
599       __kmp_clear_x87_fpu_status_word();
600       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
601       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
602     }
603 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
604 
605     __kmp_pop_current_task_from_thread(this_thr);
606 #if OMPD_SUPPORT
607     if (ompd_state & OMPD_ENABLE_BP)
608       ompd_bp_parallel_end();
609 #endif
610 
611     this_thr->th.th_team = serial_team->t.t_parent;
612     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
613 
614     /* restore values cached in the thread */
615     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
616     this_thr->th.th_team_master =
617         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
618     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
619 
620     /* TODO the below shouldn't need to be adjusted for serialized teams */
621     this_thr->th.th_dispatch =
622         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
623 
624     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
625     this_thr->th.th_current_task->td_flags.executing = 1;
626 
627     if (__kmp_tasking_mode != tskm_immediate_exec) {
628       // Copy the task team from the new child / old parent team to the thread.
629       this_thr->th.th_task_team =
630           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
631       KA_TRACE(20,
632                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
633                 "team %p\n",
634                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
635     }
636   } else {
637     if (__kmp_tasking_mode != tskm_immediate_exec) {
638       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
639                     "depth of serial team %p to %d\n",
640                     global_tid, serial_team, serial_team->t.t_serialized));
641     }
642   }
643 
644   serial_team->t.t_level--;
645   if (__kmp_env_consistency_check)
646     __kmp_pop_parallel(global_tid, NULL);
647 #if OMPT_SUPPORT
648   if (ompt_enabled.enabled)
649     this_thr->th.ompt_thread_info.state =
650         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
651                                            : ompt_state_work_parallel);
652 #endif
653 }
654 
655 /*!
656 @ingroup SYNCHRONIZATION
657 @param loc  source location information.
658 
659 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
660 depending on the memory ordering convention obeyed by the compiler
661 even that may not be necessary).
662 */
663 void __kmpc_flush(ident_t *loc) {
664   KC_TRACE(10, ("__kmpc_flush: called\n"));
665 
  /* need an explicit __mf() here since the library uses volatile instead */
667   KMP_MB(); /* Flush all pending memory write invalidates.  */
668 
669 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
670 #if KMP_MIC
671 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
672 // We shouldn't need it, though, since the ABI rules require that
673 // * If the compiler generates NGO stores it also generates the fence
674 // * If users hand-code NGO stores they should insert the fence
675 // therefore no incomplete unordered stores should be visible.
676 #else
  // C74404
  // This is to address non-temporal store instructions (sfence needed)
  // and the clflush instruction (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
683   if (!__kmp_cpuinfo.initialized) {
684     __kmp_query_cpuid(&__kmp_cpuinfo);
685   }
686   if (!__kmp_cpuinfo.flags.sse2) {
687     // CPU cannot execute SSE2 instructions.
688   } else {
689 #if KMP_COMPILER_ICC
690     _mm_mfence();
691 #elif KMP_COMPILER_MSVC
692     MemoryBarrier();
693 #else
694     __sync_synchronize();
695 #endif // KMP_COMPILER_ICC
696   }
697 #endif // KMP_MIC
698 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
699        KMP_ARCH_RISCV64)
// Nothing to see here; move along.
701 #elif KMP_ARCH_PPC64
702 // Nothing needed here (we have a real MB above).
703 #else
704 #error Unknown or unsupported architecture
705 #endif
706 
707 #if OMPT_SUPPORT && OMPT_OPTIONAL
708   if (ompt_enabled.ompt_callback_flush) {
709     ompt_callbacks.ompt_callback(ompt_callback_flush)(
710         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
711   }
712 #endif
713 }
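
// Lowering sketch (illustrative): a flush directive reduces to a single call;
// a flush list, if present, does not change the call made here.
//
//   // #pragma omp flush
//   __kmpc_flush(&loc);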
714 
715 /* -------------------------------------------------------------------------- */
716 /*!
717 @ingroup SYNCHRONIZATION
718 @param loc source location information
719 @param global_tid thread id.
720 
721 Execute a barrier.
722 */
723 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
724   KMP_COUNT_BLOCK(OMP_BARRIER);
725   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
726   __kmp_assert_valid_gtid(global_tid);
727 
728   if (!TCR_4(__kmp_init_parallel))
729     __kmp_parallel_initialize();
730 
731   __kmp_resume_if_soft_paused();
732 
733   if (__kmp_env_consistency_check) {
734     if (loc == 0) {
735       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
736     }
737     __kmp_check_barrier(global_tid, ct_barrier, loc);
738   }
739 
740 #if OMPT_SUPPORT
741   ompt_frame_t *ompt_frame;
742   if (ompt_enabled.enabled) {
743     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
744     if (ompt_frame->enter_frame.ptr == NULL)
745       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
746   }
747   OMPT_STORE_RETURN_ADDRESS(global_tid);
748 #endif
749   __kmp_threads[global_tid]->th.th_ident = loc;
750   // TODO: explicit barrier_wait_id:
  //   this function is called when a 'barrier' directive is present or at the
  //   implicit barrier at the end of a worksharing construct.
753   // 1) better to add a per-thread barrier counter to a thread data structure
754   // 2) set to 0 when a new team is created
755   // 4) no sync is required
756 
757   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
758 #if OMPT_SUPPORT && OMPT_OPTIONAL
759   if (ompt_enabled.enabled) {
760     ompt_frame->enter_frame = ompt_data_none;
761   }
762 #endif
763 }
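
// Lowering sketch (illustrative): an explicit barrier directive, and the
// implicit barrier at the end of a worksharing construct without nowait, both
// come through this call.
//
//   // #pragma omp barrier
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_barrier(&loc, gtid);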
764 
765 /* The BARRIER for a MASTER section is always explicit   */
766 /*!
767 @ingroup WORK_SHARING
768 @param loc  source location information.
@param global_tid  global thread number.
770 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
771 */
772 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
773   int status = 0;
774 
775   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
776   __kmp_assert_valid_gtid(global_tid);
777 
778   if (!TCR_4(__kmp_init_parallel))
779     __kmp_parallel_initialize();
780 
781   __kmp_resume_if_soft_paused();
782 
783   if (KMP_MASTER_GTID(global_tid)) {
784     KMP_COUNT_BLOCK(OMP_MASTER);
785     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
786     status = 1;
787   }
788 
789 #if OMPT_SUPPORT && OMPT_OPTIONAL
790   if (status) {
791     if (ompt_enabled.ompt_callback_masked) {
792       kmp_info_t *this_thr = __kmp_threads[global_tid];
793       kmp_team_t *team = this_thr->th.th_team;
794 
795       int tid = __kmp_tid_from_gtid(global_tid);
796       ompt_callbacks.ompt_callback(ompt_callback_masked)(
797           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
798           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
799           OMPT_GET_RETURN_ADDRESS(0));
800     }
801   }
802 #endif
803 
804   if (__kmp_env_consistency_check) {
805 #if KMP_USE_DYNAMIC_LOCK
806     if (status)
807       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
808     else
809       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
810 #else
811     if (status)
812       __kmp_push_sync(global_tid, ct_master, loc, NULL);
813     else
814       __kmp_check_sync(global_tid, ct_master, loc, NULL);
815 #endif
816   }
817 
818   return status;
819 }
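
// Lowering sketch (illustrative; "loc" and "gtid" as above): only the thread
// for which __kmpc_master() returns 1 executes the block, and only that
// thread calls __kmpc_end_master().
//
//   // #pragma omp master
//   if (__kmpc_master(&loc, gtid)) {
//     /* master-only code */
//     __kmpc_end_master(&loc, gtid);
//   }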
820 
821 /*!
822 @ingroup WORK_SHARING
823 @param loc  source location information.
@param global_tid  global thread number.
825 
826 Mark the end of a <tt>master</tt> region. This should only be called by the
827 thread that executes the <tt>master</tt> region.
828 */
829 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
830   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
831   __kmp_assert_valid_gtid(global_tid);
832   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
833   KMP_POP_PARTITIONED_TIMER();
834 
835 #if OMPT_SUPPORT && OMPT_OPTIONAL
836   kmp_info_t *this_thr = __kmp_threads[global_tid];
837   kmp_team_t *team = this_thr->th.th_team;
838   if (ompt_enabled.ompt_callback_masked) {
839     int tid = __kmp_tid_from_gtid(global_tid);
840     ompt_callbacks.ompt_callback(ompt_callback_masked)(
841         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
842         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
843         OMPT_GET_RETURN_ADDRESS(0));
844   }
845 #endif
846 
847   if (__kmp_env_consistency_check) {
848     if (KMP_MASTER_GTID(global_tid))
849       __kmp_pop_sync(global_tid, ct_master, loc);
850   }
851 }
852 
853 /*!
854 @ingroup WORK_SHARING
855 @param loc  source location information.
856 @param global_tid  global thread number.
857 @param filter result of evaluating filter clause on thread global_tid, or zero
858 if no filter clause present
859 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
860 */
861 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
862   int status = 0;
863   int tid;
864   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
865   __kmp_assert_valid_gtid(global_tid);
866 
867   if (!TCR_4(__kmp_init_parallel))
868     __kmp_parallel_initialize();
869 
870   __kmp_resume_if_soft_paused();
871 
872   tid = __kmp_tid_from_gtid(global_tid);
873   if (tid == filter) {
874     KMP_COUNT_BLOCK(OMP_MASKED);
875     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
876     status = 1;
877   }
878 
879 #if OMPT_SUPPORT && OMPT_OPTIONAL
880   if (status) {
881     if (ompt_enabled.ompt_callback_masked) {
882       kmp_info_t *this_thr = __kmp_threads[global_tid];
883       kmp_team_t *team = this_thr->th.th_team;
884       ompt_callbacks.ompt_callback(ompt_callback_masked)(
885           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
886           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
887           OMPT_GET_RETURN_ADDRESS(0));
888     }
889   }
890 #endif
891 
892   if (__kmp_env_consistency_check) {
893 #if KMP_USE_DYNAMIC_LOCK
894     if (status)
895       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
896     else
897       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
898 #else
899     if (status)
900       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
901     else
902       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
903 #endif
904   }
905 
906   return status;
907 }
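
// Lowering sketch (illustrative): the filter expression is evaluated by each
// thread and passed in; a masked construct with no filter clause passes 0,
// which reproduces the master semantics.
//
//   // #pragma omp masked filter(filter_expr)
//   if (__kmpc_masked(&loc, gtid, filter_expr)) {
//     /* masked code */
//     __kmpc_end_masked(&loc, gtid);
//   }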
908 
909 /*!
910 @ingroup WORK_SHARING
911 @param loc  source location information.
@param global_tid  global thread number.
913 
914 Mark the end of a <tt>masked</tt> region. This should only be called by the
915 thread that executes the <tt>masked</tt> region.
916 */
917 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
918   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
919   __kmp_assert_valid_gtid(global_tid);
920   KMP_POP_PARTITIONED_TIMER();
921 
922 #if OMPT_SUPPORT && OMPT_OPTIONAL
923   kmp_info_t *this_thr = __kmp_threads[global_tid];
924   kmp_team_t *team = this_thr->th.th_team;
925   if (ompt_enabled.ompt_callback_masked) {
926     int tid = __kmp_tid_from_gtid(global_tid);
927     ompt_callbacks.ompt_callback(ompt_callback_masked)(
928         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
929         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
930         OMPT_GET_RETURN_ADDRESS(0));
931   }
932 #endif
933 
934   if (__kmp_env_consistency_check) {
935     __kmp_pop_sync(global_tid, ct_masked, loc);
936   }
937 }
938 
939 /*!
940 @ingroup WORK_SHARING
941 @param loc  source location information.
942 @param gtid  global thread number.
943 
944 Start execution of an <tt>ordered</tt> construct.
945 */
946 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
947   int cid = 0;
948   kmp_info_t *th;
949   KMP_DEBUG_ASSERT(__kmp_init_serial);
950 
951   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
952   __kmp_assert_valid_gtid(gtid);
953 
954   if (!TCR_4(__kmp_init_parallel))
955     __kmp_parallel_initialize();
956 
957   __kmp_resume_if_soft_paused();
958 
959 #if USE_ITT_BUILD
960   __kmp_itt_ordered_prep(gtid);
961 // TODO: ordered_wait_id
962 #endif /* USE_ITT_BUILD */
963 
964   th = __kmp_threads[gtid];
965 
966 #if OMPT_SUPPORT && OMPT_OPTIONAL
967   kmp_team_t *team;
968   ompt_wait_id_t lck;
969   void *codeptr_ra;
970   OMPT_STORE_RETURN_ADDRESS(gtid);
971   if (ompt_enabled.enabled) {
972     team = __kmp_team_from_gtid(gtid);
973     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
974     /* OMPT state update */
975     th->th.ompt_thread_info.wait_id = lck;
976     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
977 
978     /* OMPT event callback */
979     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
980     if (ompt_enabled.ompt_callback_mutex_acquire) {
981       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
982           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
983           codeptr_ra);
984     }
985   }
986 #endif
987 
988   if (th->th.th_dispatch->th_deo_fcn != 0)
989     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
990   else
991     __kmp_parallel_deo(&gtid, &cid, loc);
992 
993 #if OMPT_SUPPORT && OMPT_OPTIONAL
994   if (ompt_enabled.enabled) {
995     /* OMPT state update */
996     th->th.ompt_thread_info.state = ompt_state_work_parallel;
997     th->th.ompt_thread_info.wait_id = 0;
998 
999     /* OMPT event callback */
1000     if (ompt_enabled.ompt_callback_mutex_acquired) {
1001       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1002           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1003     }
1004   }
1005 #endif
1006 
1007 #if USE_ITT_BUILD
1008   __kmp_itt_ordered_start(gtid);
1009 #endif /* USE_ITT_BUILD */
1010 }
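
// Lowering sketch (illustrative): the body of an ordered region inside a loop
// with an ordered clause is bracketed by these calls. When the loop was
// scheduled through the runtime's dynamic dispatch path, th_deo_fcn/th_dxo_fcn
// provide the ordering; otherwise __kmp_parallel_deo()/__kmp_parallel_dxo()
// are used, as in the code above and in __kmpc_end_ordered below.
//
//   // #pragma omp ordered
//   __kmpc_ordered(&loc, gtid);
//   /* ordered body */
//   __kmpc_end_ordered(&loc, gtid);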
1011 
1012 /*!
1013 @ingroup WORK_SHARING
1014 @param loc  source location information.
1015 @param gtid  global thread number.
1016 
1017 End execution of an <tt>ordered</tt> construct.
1018 */
1019 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1020   int cid = 0;
1021   kmp_info_t *th;
1022 
1023   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1024   __kmp_assert_valid_gtid(gtid);
1025 
1026 #if USE_ITT_BUILD
1027   __kmp_itt_ordered_end(gtid);
1028 // TODO: ordered_wait_id
1029 #endif /* USE_ITT_BUILD */
1030 
1031   th = __kmp_threads[gtid];
1032 
1033   if (th->th.th_dispatch->th_dxo_fcn != 0)
1034     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1035   else
1036     __kmp_parallel_dxo(&gtid, &cid, loc);
1037 
1038 #if OMPT_SUPPORT && OMPT_OPTIONAL
1039   OMPT_STORE_RETURN_ADDRESS(gtid);
1040   if (ompt_enabled.ompt_callback_mutex_released) {
1041     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1042         ompt_mutex_ordered,
1043         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1044             ->t.t_ordered.dt.t_value,
1045         OMPT_LOAD_RETURN_ADDRESS(gtid));
1046   }
1047 #endif
1048 }
1049 
1050 #if KMP_USE_DYNAMIC_LOCK
1051 
1052 static __forceinline void
1053 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1054                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1055   // Pointer to the allocated indirect lock is written to crit, while indexing
1056   // is ignored.
1057   void *idx;
1058   kmp_indirect_lock_t **lck;
1059   lck = (kmp_indirect_lock_t **)crit;
1060   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1061   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1062   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1063   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1064   KA_TRACE(20,
1065            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1066 #if USE_ITT_BUILD
1067   __kmp_itt_critical_creating(ilk->lock, loc);
1068 #endif
1069   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1070   if (status == 0) {
1071 #if USE_ITT_BUILD
1072     __kmp_itt_critical_destroyed(ilk->lock);
1073 #endif
1074     // We don't really need to destroy the unclaimed lock here since it will be
1075     // cleaned up at program exit.
1076     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1077   }
1078   KMP_DEBUG_ASSERT(*lck != NULL);
1079 }
1080 
1081 // Fast-path acquire tas lock
1082 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1083   {                                                                            \
1084     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1085     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1086     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1087     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1088         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1089       kmp_uint32 spins;                                                        \
1090       KMP_FSYNC_PREPARE(l);                                                    \
1091       KMP_INIT_YIELD(spins);                                                   \
1092       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1093       do {                                                                     \
1094         if (TCR_4(__kmp_nth) >                                                 \
1095             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1096           KMP_YIELD(TRUE);                                                     \
1097         } else {                                                               \
1098           KMP_YIELD_SPIN(spins);                                               \
1099         }                                                                      \
1100         __kmp_spin_backoff(&backoff);                                          \
1101       } while (                                                                \
1102           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1103           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1104     }                                                                          \
1105     KMP_FSYNC_ACQUIRED(l);                                                     \
1106   }
1107 
1108 // Fast-path test tas lock
1109 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1110   {                                                                            \
1111     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1112     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1113     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1114     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1115          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1116   }
1117 
1118 // Fast-path release tas lock
1119 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1120   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1121 
1122 #if KMP_USE_FUTEX
1123 
1124 #include <sys/syscall.h>
1125 #include <unistd.h>
1126 #ifndef FUTEX_WAIT
1127 #define FUTEX_WAIT 0
1128 #endif
1129 #ifndef FUTEX_WAKE
1130 #define FUTEX_WAKE 1
1131 #endif
1132 
1133 // Fast-path acquire futex lock
1134 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1135   {                                                                            \
1136     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1137     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1138     KMP_MB();                                                                  \
1139     KMP_FSYNC_PREPARE(ftx);                                                    \
1140     kmp_int32 poll_val;                                                        \
1141     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1142                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1143                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1144       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1145       if (!cond) {                                                             \
1146         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1147                                          poll_val |                            \
1148                                              KMP_LOCK_BUSY(1, futex))) {       \
1149           continue;                                                            \
1150         }                                                                      \
1151         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1152       }                                                                        \
1153       kmp_int32 rc;                                                            \
1154       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1155                         NULL, NULL, 0)) != 0) {                                \
1156         continue;                                                              \
1157       }                                                                        \
1158       gtid_code |= 1;                                                          \
1159     }                                                                          \
1160     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1161   }
1162 
1163 // Fast-path test futex lock
1164 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1165   {                                                                            \
1166     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1167     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1168                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1169       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1170       rc = TRUE;                                                               \
1171     } else {                                                                   \
1172       rc = FALSE;                                                              \
1173     }                                                                          \
1174   }
1175 
1176 // Fast-path release futex lock
1177 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1178   {                                                                            \
1179     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1180     KMP_MB();                                                                  \
1181     KMP_FSYNC_RELEASING(ftx);                                                  \
1182     kmp_int32 poll_val =                                                       \
1183         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1184     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1185       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1186               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1187     }                                                                          \
1188     KMP_MB();                                                                  \
1189     KMP_YIELD_OVERSUB();                                                       \
1190   }
1191 
1192 #endif // KMP_USE_FUTEX
1193 
1194 #else // KMP_USE_DYNAMIC_LOCK
1195 
1196 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1197                                                       ident_t const *loc,
1198                                                       kmp_int32 gtid) {
1199   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1200 
1201   // Because of the double-check, the following load doesn't need to be volatile
1202   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1203 
1204   if (lck == NULL) {
1205     void *idx;
1206 
1207     // Allocate & initialize the lock.
1208     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1209     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1210     __kmp_init_user_lock_with_checks(lck);
1211     __kmp_set_user_lock_location(lck, loc);
1212 #if USE_ITT_BUILD
1213     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed without ever being used, but
// that is not a problem, because this is not a real event seen by the user;
// it merely sets a name for the object (lock). See more details in kmp_itt.h.
1219 #endif /* USE_ITT_BUILD */
1220 
1221     // Use a cmpxchg instruction to slam the start of the critical section with
1222     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1223     // and use the lock that the other thread allocated.
1224     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1225 
1226     if (status == 0) {
1227 // Deallocate the lock and reload the value.
1228 #if USE_ITT_BUILD
1229       __kmp_itt_critical_destroyed(lck);
1230 // Let ITT know the lock is destroyed and the same memory location may be reused
1231 // for another purpose.
1232 #endif /* USE_ITT_BUILD */
1233       __kmp_destroy_user_lock_with_checks(lck);
1234       __kmp_user_lock_free(&idx, gtid, lck);
1235       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1236       KMP_DEBUG_ASSERT(lck != NULL);
1237     }
1238   }
1239   return lck;
1240 }
1241 
1242 #endif // KMP_USE_DYNAMIC_LOCK
1243 
1244 /*!
1245 @ingroup WORK_SHARING
1246 @param loc  source location information.
1247 @param global_tid  global thread number.
1248 @param crit identity of the critical section. This could be a pointer to a lock
1249 associated with the critical section, or some other suitably unique value.
1250 
1251 Enter code protected by a `critical` construct.
1252 This function blocks until the executing thread can enter the critical section.
1253 */
1254 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1255                      kmp_critical_name *crit) {
1256 #if KMP_USE_DYNAMIC_LOCK
1257 #if OMPT_SUPPORT && OMPT_OPTIONAL
1258   OMPT_STORE_RETURN_ADDRESS(global_tid);
1259 #endif // OMPT_SUPPORT
1260   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1261 #else
1262   KMP_COUNT_BLOCK(OMP_CRITICAL);
1263 #if OMPT_SUPPORT && OMPT_OPTIONAL
1264   ompt_state_t prev_state = ompt_state_undefined;
1265   ompt_thread_info_t ti;
1266 #endif
1267   kmp_user_lock_p lck;
1268 
1269   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1270   __kmp_assert_valid_gtid(global_tid);
1271 
1272   // TODO: add THR_OVHD_STATE
1273 
1274   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1275   KMP_CHECK_USER_LOCK_INIT();
1276 
1277   if ((__kmp_user_lock_kind == lk_tas) &&
1278       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1279     lck = (kmp_user_lock_p)crit;
1280   }
1281 #if KMP_USE_FUTEX
1282   else if ((__kmp_user_lock_kind == lk_futex) &&
1283            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1284     lck = (kmp_user_lock_p)crit;
1285   }
1286 #endif
1287   else { // ticket, queuing or drdpa
1288     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1289   }
1290 
1291   if (__kmp_env_consistency_check)
1292     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1293 
    // Since the critical directive binds to all threads, not just the current
    // team, we have to check this even if we are in a serialized team.
    // Also, even if we are the uber thread, we still have to acquire the lock,
    // as we have to contend with sibling threads.
1298 
1299 #if USE_ITT_BUILD
1300   __kmp_itt_critical_acquiring(lck);
1301 #endif /* USE_ITT_BUILD */
1302 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1304   void *codeptr_ra = NULL;
1305   if (ompt_enabled.enabled) {
1306     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1307     /* OMPT state update */
1308     prev_state = ti.state;
1309     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1310     ti.state = ompt_state_wait_critical;
1311 
1312     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1314     if (ompt_enabled.ompt_callback_mutex_acquire) {
1315       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1316           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1317           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1318     }
1319   }
1320 #endif
  // The value of 'crit' should be usable as a critical_id for the critical
  // section directive.
1323   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1324 
1325 #if USE_ITT_BUILD
1326   __kmp_itt_critical_acquired(lck);
1327 #endif /* USE_ITT_BUILD */
1328 #if OMPT_SUPPORT && OMPT_OPTIONAL
1329   if (ompt_enabled.enabled) {
1330     /* OMPT state update */
1331     ti.state = prev_state;
1332     ti.wait_id = 0;
1333 
1334     /* OMPT event callback */
1335     if (ompt_enabled.ompt_callback_mutex_acquired) {
1336       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1337           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1338     }
1339   }
1340 #endif
1341   KMP_POP_PARTITIONED_TIMER();
1342 
1343   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1344   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1345 #endif // KMP_USE_DYNAMIC_LOCK
1346 }
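
// Lowering sketch (illustrative; the names are assumptions): each named
// critical section is backed by a statically allocated kmp_critical_name that
// the compiler emits, and the same object is passed to the matching
// __kmpc_end_critical() call defined later in this file.
//
//   static kmp_critical_name crit_io = {0};
//   ...
//   // #pragma omp critical (io)
//   __kmpc_critical(&loc, gtid, &crit_io);
//   /* critical body */
//   __kmpc_end_critical(&loc, gtid, &crit_io);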
1347 
1348 #if KMP_USE_DYNAMIC_LOCK
1349 
1350 // Converts the given hint to an internal lock implementation
1351 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1352 #if KMP_USE_TSX
1353 #define KMP_TSX_LOCK(seq) lockseq_##seq
1354 #else
1355 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1356 #endif
1357 
1358 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1359 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.flags.rtm)
1360 #else
1361 #define KMP_CPUINFO_RTM 0
1362 #endif
1363 
1364   // Hints that do not require further logic
1365   if (hint & kmp_lock_hint_hle)
1366     return KMP_TSX_LOCK(hle);
1367   if (hint & kmp_lock_hint_rtm)
1368     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1369   if (hint & kmp_lock_hint_adaptive)
1370     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1371 
1372   // Rule out conflicting hints first by returning the default lock
1373   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1374     return __kmp_user_lock_seq;
1375   if ((hint & omp_lock_hint_speculative) &&
1376       (hint & omp_lock_hint_nonspeculative))
1377     return __kmp_user_lock_seq;
1378 
1379   // Do not even consider speculation when it appears to be contended
1380   if (hint & omp_lock_hint_contended)
1381     return lockseq_queuing;
1382 
1383   // Uncontended lock without speculation
1384   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1385     return lockseq_tas;
1386 
1387   // Use RTM lock for speculation
1388   if (hint & omp_lock_hint_speculative)
1389     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1390 
1391   return __kmp_user_lock_seq;
1392 }
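
// Behavior summary of the mapping above, stated informally: a contended hint
// selects the queuing lock, an uncontended hint without speculation selects
// test-and-set, and a speculative hint selects an RTM-based lock only when
// KMP_USE_TSX is compiled in and the CPU reports RTM; conflicting hints fall
// back to the default __kmp_user_lock_seq. One user-level way this mapping is
// exercised (illustrative):
//
//   omp_lock_t lck;
//   omp_init_lock_with_hint(&lck, omp_lock_hint_speculative);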
1393 
1394 #if OMPT_SUPPORT && OMPT_OPTIONAL
1395 #if KMP_USE_DYNAMIC_LOCK
1396 static kmp_mutex_impl_t
1397 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1398   if (user_lock) {
1399     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1400     case 0:
1401       break;
1402 #if KMP_USE_FUTEX
1403     case locktag_futex:
1404       return kmp_mutex_impl_queuing;
1405 #endif
1406     case locktag_tas:
1407       return kmp_mutex_impl_spin;
1408 #if KMP_USE_TSX
1409     case locktag_hle:
1410     case locktag_rtm_spin:
1411       return kmp_mutex_impl_speculative;
1412 #endif
1413     default:
1414       return kmp_mutex_impl_none;
1415     }
1416     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1417   }
1418   KMP_ASSERT(ilock);
1419   switch (ilock->type) {
1420 #if KMP_USE_TSX
1421   case locktag_adaptive:
1422   case locktag_rtm_queuing:
1423     return kmp_mutex_impl_speculative;
1424 #endif
1425   case locktag_nested_tas:
1426     return kmp_mutex_impl_spin;
1427 #if KMP_USE_FUTEX
1428   case locktag_nested_futex:
1429 #endif
1430   case locktag_ticket:
1431   case locktag_queuing:
1432   case locktag_drdpa:
1433   case locktag_nested_ticket:
1434   case locktag_nested_queuing:
1435   case locktag_nested_drdpa:
1436     return kmp_mutex_impl_queuing;
1437   default:
1438     return kmp_mutex_impl_none;
1439   }
1440 }
1441 #else
1442 // For locks without dynamic binding
1443 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1444   switch (__kmp_user_lock_kind) {
1445   case lk_tas:
1446     return kmp_mutex_impl_spin;
1447 #if KMP_USE_FUTEX
1448   case lk_futex:
1449 #endif
1450   case lk_ticket:
1451   case lk_queuing:
1452   case lk_drdpa:
1453     return kmp_mutex_impl_queuing;
1454 #if KMP_USE_TSX
1455   case lk_hle:
1456   case lk_rtm_queuing:
1457   case lk_rtm_spin:
1458   case lk_adaptive:
1459     return kmp_mutex_impl_speculative;
1460 #endif
1461   default:
1462     return kmp_mutex_impl_none;
1463   }
1464 }
1465 #endif // KMP_USE_DYNAMIC_LOCK
1466 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1467 
1468 /*!
1469 @ingroup WORK_SHARING
1470 @param loc  source location information.
1471 @param global_tid  global thread number.
1472 @param crit identity of the critical section. This could be a pointer to a lock
1473 associated with the critical section, or some other suitably unique value.
1474 @param hint the lock hint.
1475 
1476 Enter code protected by a `critical` construct with a hint. The hint value is
1477 used to suggest a lock implementation. This function blocks until the executing
1478 thread can enter the critical section unless the hint suggests use of
1479 speculative execution and the hardware supports it.
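
An illustrative call pattern (a sketch of one possible lowering, not verbatim
compiler output; <tt>loc</tt> is assumed to be a compiler-generated ident_t and
<tt>crit_name</tt> a placeholder for the compiler-generated kmp_critical_name
object):
@code
static kmp_critical_name crit_name; // zero-initialized, one per critical name
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_critical_with_hint(&loc, gtid, &crit_name, omp_lock_hint_speculative);
// ... code protected by the critical construct ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode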
1480 */
1481 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1482                                kmp_critical_name *crit, uint32_t hint) {
1483   KMP_COUNT_BLOCK(OMP_CRITICAL);
1484   kmp_user_lock_p lck;
1485 #if OMPT_SUPPORT && OMPT_OPTIONAL
1486   ompt_state_t prev_state = ompt_state_undefined;
1487   ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
1489   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1490   if (!codeptr)
1491     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1492 #endif
1493 
1494   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1495   __kmp_assert_valid_gtid(global_tid);
1496 
1497   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1498   // Check if it is initialized.
1499   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1500   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1501   if (*lk == 0) {
1502     if (KMP_IS_D_LOCK(lockseq)) {
1503       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1504                                   KMP_GET_D_TAG(lockseq));
1505     } else {
1506       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1507     }
1508   }
1509   // Branch for accessing the actual lock object and set operation. This
1510   // branching is inevitable since this lock initialization does not follow the
1511   // normal dispatch path (lock table is not used).
1512   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1513     lck = (kmp_user_lock_p)lk;
1514     if (__kmp_env_consistency_check) {
1515       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1516                       __kmp_map_hint_to_lock(hint));
1517     }
1518 #if USE_ITT_BUILD
1519     __kmp_itt_critical_acquiring(lck);
1520 #endif
1521 #if OMPT_SUPPORT && OMPT_OPTIONAL
1522     if (ompt_enabled.enabled) {
1523       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1524       /* OMPT state update */
1525       prev_state = ti.state;
1526       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1527       ti.state = ompt_state_wait_critical;
1528 
1529       /* OMPT event callback */
1530       if (ompt_enabled.ompt_callback_mutex_acquire) {
1531         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1532             ompt_mutex_critical, (unsigned int)hint,
1533             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1534             codeptr);
1535       }
1536     }
1537 #endif
1538 #if KMP_USE_INLINED_TAS
1539     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1540       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1541     } else
1542 #elif KMP_USE_INLINED_FUTEX
1543     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1544       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1545     } else
1546 #endif
1547     {
1548       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1549     }
1550   } else {
1551     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1552     lck = ilk->lock;
1553     if (__kmp_env_consistency_check) {
1554       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1555                       __kmp_map_hint_to_lock(hint));
1556     }
1557 #if USE_ITT_BUILD
1558     __kmp_itt_critical_acquiring(lck);
1559 #endif
1560 #if OMPT_SUPPORT && OMPT_OPTIONAL
1561     if (ompt_enabled.enabled) {
1562       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1563       /* OMPT state update */
1564       prev_state = ti.state;
1565       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1566       ti.state = ompt_state_wait_critical;
1567 
1568       /* OMPT event callback */
1569       if (ompt_enabled.ompt_callback_mutex_acquire) {
1570         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1571             ompt_mutex_critical, (unsigned int)hint,
1572             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1573             codeptr);
1574       }
1575     }
1576 #endif
1577     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1578   }
1579   KMP_POP_PARTITIONED_TIMER();
1580 
1581 #if USE_ITT_BUILD
1582   __kmp_itt_critical_acquired(lck);
1583 #endif /* USE_ITT_BUILD */
1584 #if OMPT_SUPPORT && OMPT_OPTIONAL
1585   if (ompt_enabled.enabled) {
1586     /* OMPT state update */
1587     ti.state = prev_state;
1588     ti.wait_id = 0;
1589 
1590     /* OMPT event callback */
1591     if (ompt_enabled.ompt_callback_mutex_acquired) {
1592       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1593           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1594     }
1595   }
1596 #endif
1597 
1598   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1599   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1600 } // __kmpc_critical_with_hint
1601 
1602 #endif // KMP_USE_DYNAMIC_LOCK
1603 
1604 /*!
1605 @ingroup WORK_SHARING
1606 @param loc  source location information.
@param global_tid  global thread number.
1608 @param crit identity of the critical section. This could be a pointer to a lock
1609 associated with the critical section, or some other suitably unique value.
1610 
1611 Leave a critical section, releasing any lock that was held during its execution.
1612 */
1613 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1614                          kmp_critical_name *crit) {
1615   kmp_user_lock_p lck;
1616 
1617   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1618 
1619 #if KMP_USE_DYNAMIC_LOCK
1620   int locktag = KMP_EXTRACT_D_TAG(crit);
1621   if (locktag) {
1622     lck = (kmp_user_lock_p)crit;
1623     KMP_ASSERT(lck != NULL);
1624     if (__kmp_env_consistency_check) {
1625       __kmp_pop_sync(global_tid, ct_critical, loc);
1626     }
1627 #if USE_ITT_BUILD
1628     __kmp_itt_critical_releasing(lck);
1629 #endif
1630 #if KMP_USE_INLINED_TAS
1631     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1632       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1633     } else
1634 #elif KMP_USE_INLINED_FUTEX
1635     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1636       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1637     } else
1638 #endif
1639     {
1640       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1641     }
1642   } else {
1643     kmp_indirect_lock_t *ilk =
1644         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1645     KMP_ASSERT(ilk != NULL);
1646     lck = ilk->lock;
1647     if (__kmp_env_consistency_check) {
1648       __kmp_pop_sync(global_tid, ct_critical, loc);
1649     }
1650 #if USE_ITT_BUILD
1651     __kmp_itt_critical_releasing(lck);
1652 #endif
1653     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1654   }
1655 
1656 #else // KMP_USE_DYNAMIC_LOCK
1657 
1658   if ((__kmp_user_lock_kind == lk_tas) &&
1659       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1660     lck = (kmp_user_lock_p)crit;
1661   }
1662 #if KMP_USE_FUTEX
1663   else if ((__kmp_user_lock_kind == lk_futex) &&
1664            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1665     lck = (kmp_user_lock_p)crit;
1666   }
1667 #endif
1668   else { // ticket, queuing or drdpa
1669     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1670   }
1671 
1672   KMP_ASSERT(lck != NULL);
1673 
1674   if (__kmp_env_consistency_check)
1675     __kmp_pop_sync(global_tid, ct_critical, loc);
1676 
1677 #if USE_ITT_BUILD
1678   __kmp_itt_critical_releasing(lck);
1679 #endif /* USE_ITT_BUILD */
  // The value of 'crit' should be valid for use as the critical_id of the
  // critical section directive.
1682   __kmp_release_user_lock_with_checks(lck, global_tid);
1683 
1684 #endif // KMP_USE_DYNAMIC_LOCK
1685 
1686 #if OMPT_SUPPORT && OMPT_OPTIONAL
1687   /* OMPT release event triggers after lock is released; place here to trigger
1688    * for all #if branches */
1689   OMPT_STORE_RETURN_ADDRESS(global_tid);
1690   if (ompt_enabled.ompt_callback_mutex_released) {
1691     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1692         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1693         OMPT_LOAD_RETURN_ADDRESS(0));
1694   }
1695 #endif
1696 
1697   KMP_POP_PARTITIONED_TIMER();
1698   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1699 }
1700 
1701 /*!
1702 @ingroup SYNCHRONIZATION
1703 @param loc source location information
1704 @param global_tid thread id.
1705 @return one if the thread should execute the master block, zero otherwise
1706 
1707 Start execution of a combined barrier and master. The barrier is executed inside
1708 this function.
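
An illustrative call pattern (a sketch only; <tt>loc</tt> and <tt>gtid</tt> are
assumed to be supplied by the compiler as for the other entry points):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // code executed only by the thread chosen for the master block
  __kmpc_end_barrier_master(&loc, gtid); // releases the threads still waiting
}
@endcode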
1709 */
1710 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1711   int status;
1712   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1713   __kmp_assert_valid_gtid(global_tid);
1714 
1715   if (!TCR_4(__kmp_init_parallel))
1716     __kmp_parallel_initialize();
1717 
1718   __kmp_resume_if_soft_paused();
1719 
1720   if (__kmp_env_consistency_check)
1721     __kmp_check_barrier(global_tid, ct_barrier, loc);
1722 
1723 #if OMPT_SUPPORT
1724   ompt_frame_t *ompt_frame;
1725   if (ompt_enabled.enabled) {
1726     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1727     if (ompt_frame->enter_frame.ptr == NULL)
1728       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1729   }
1730   OMPT_STORE_RETURN_ADDRESS(global_tid);
1731 #endif
1732 #if USE_ITT_NOTIFY
1733   __kmp_threads[global_tid]->th.th_ident = loc;
1734 #endif
1735   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1736 #if OMPT_SUPPORT && OMPT_OPTIONAL
1737   if (ompt_enabled.enabled) {
1738     ompt_frame->enter_frame = ompt_data_none;
1739   }
1740 #endif
1741 
1742   return (status != 0) ? 0 : 1;
1743 }
1744 
1745 /*!
1746 @ingroup SYNCHRONIZATION
1747 @param loc source location information
1748 @param global_tid thread id.
1749 
1750 Complete the execution of a combined barrier and master. This function should
1751 only be called at the completion of the <tt>master</tt> code. Other threads will
1752 still be waiting at the barrier and this call releases them.
1753 */
1754 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1755   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1756   __kmp_assert_valid_gtid(global_tid);
1757   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1758 }
1759 
1760 /*!
1761 @ingroup SYNCHRONIZATION
1762 @param loc source location information
1763 @param global_tid thread id.
1764 @return one if the thread should execute the master block, zero otherwise
1765 
1766 Start execution of a combined barrier and master(nowait) construct.
1767 The barrier is executed inside this function.
There is no equivalent "end" function, since the construct has a nowait: the
other threads do not wait for the thread that executes the master block, so
there is nothing for an "end" call to release.
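
An illustrative call pattern (a sketch only; <tt>loc</tt> and <tt>gtid</tt> are
assumed to be supplied by the compiler):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // code executed only by the thread chosen for the master block;
  // no "end" call follows and the other threads do not wait for it
}
@endcode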
1769 */
1770 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1771   kmp_int32 ret;
1772   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1773   __kmp_assert_valid_gtid(global_tid);
1774 
1775   if (!TCR_4(__kmp_init_parallel))
1776     __kmp_parallel_initialize();
1777 
1778   __kmp_resume_if_soft_paused();
1779 
1780   if (__kmp_env_consistency_check) {
1781     if (loc == 0) {
1782       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1783     }
1784     __kmp_check_barrier(global_tid, ct_barrier, loc);
1785   }
1786 
1787 #if OMPT_SUPPORT
1788   ompt_frame_t *ompt_frame;
1789   if (ompt_enabled.enabled) {
1790     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1791     if (ompt_frame->enter_frame.ptr == NULL)
1792       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1793   }
1794   OMPT_STORE_RETURN_ADDRESS(global_tid);
1795 #endif
1796 #if USE_ITT_NOTIFY
1797   __kmp_threads[global_tid]->th.th_ident = loc;
1798 #endif
1799   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1800 #if OMPT_SUPPORT && OMPT_OPTIONAL
1801   if (ompt_enabled.enabled) {
1802     ompt_frame->enter_frame = ompt_data_none;
1803   }
1804 #endif
1805 
1806   ret = __kmpc_master(loc, global_tid);
1807 
1808   if (__kmp_env_consistency_check) {
    /* there is no __kmpc_end_master call, so the (stats) */
    /* actions of __kmpc_end_master are performed here    */
1811     if (ret) {
1812       /* only one thread should do the pop since only */
1813       /* one did the push (see __kmpc_master())       */
1814       __kmp_pop_sync(global_tid, ct_master, loc);
1815     }
1816   }
1817 
1818   return (ret);
1819 }
1820 
1821 /* The BARRIER for a SINGLE process section is always explicit   */
1822 /*!
1823 @ingroup WORK_SHARING
1824 @param loc  source location information
1825 @param global_tid  global thread number
1826 @return One if this thread should execute the single construct, zero otherwise.
1827 
1828 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if it is required.
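
An illustrative call pattern (a sketch only; <tt>loc</tt> and <tt>gtid</tt> are
assumed to be supplied by the compiler):
@code
if (__kmpc_single(&loc, gtid)) {
  // code executed by the chosen thread
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // explicit barrier unless the construct has nowait
@endcode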
1831 */
1832 
1833 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1834   __kmp_assert_valid_gtid(global_tid);
1835   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1836 
1837   if (rc) {
1838     // We are going to execute the single statement, so we should count it.
1839     KMP_COUNT_BLOCK(OMP_SINGLE);
1840     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1841   }
1842 
1843 #if OMPT_SUPPORT && OMPT_OPTIONAL
1844   kmp_info_t *this_thr = __kmp_threads[global_tid];
1845   kmp_team_t *team = this_thr->th.th_team;
1846   int tid = __kmp_tid_from_gtid(global_tid);
1847 
1848   if (ompt_enabled.enabled) {
1849     if (rc) {
1850       if (ompt_enabled.ompt_callback_work) {
1851         ompt_callbacks.ompt_callback(ompt_callback_work)(
1852             ompt_work_single_executor, ompt_scope_begin,
1853             &(team->t.ompt_team_info.parallel_data),
1854             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1855             1, OMPT_GET_RETURN_ADDRESS(0));
1856       }
1857     } else {
1858       if (ompt_enabled.ompt_callback_work) {
1859         ompt_callbacks.ompt_callback(ompt_callback_work)(
1860             ompt_work_single_other, ompt_scope_begin,
1861             &(team->t.ompt_team_info.parallel_data),
1862             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1863             1, OMPT_GET_RETURN_ADDRESS(0));
1864         ompt_callbacks.ompt_callback(ompt_callback_work)(
1865             ompt_work_single_other, ompt_scope_end,
1866             &(team->t.ompt_team_info.parallel_data),
1867             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1868             1, OMPT_GET_RETURN_ADDRESS(0));
1869       }
1870     }
1871   }
1872 #endif
1873 
1874   return rc;
1875 }
1876 
1877 /*!
1878 @ingroup WORK_SHARING
1879 @param loc  source location information
1880 @param global_tid  global thread number
1881 
Mark the end of a <tt>single</tt> construct. This function should only be
called by the thread that executed the block of code protected by the
<tt>single</tt> construct.
1885 */
1886 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1887   __kmp_assert_valid_gtid(global_tid);
1888   __kmp_exit_single(global_tid);
1889   KMP_POP_PARTITIONED_TIMER();
1890 
1891 #if OMPT_SUPPORT && OMPT_OPTIONAL
1892   kmp_info_t *this_thr = __kmp_threads[global_tid];
1893   kmp_team_t *team = this_thr->th.th_team;
1894   int tid = __kmp_tid_from_gtid(global_tid);
1895 
1896   if (ompt_enabled.ompt_callback_work) {
1897     ompt_callbacks.ompt_callback(ompt_callback_work)(
1898         ompt_work_single_executor, ompt_scope_end,
1899         &(team->t.ompt_team_info.parallel_data),
1900         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1901         OMPT_GET_RETURN_ADDRESS(0));
1902   }
1903 #endif
1904 }
1905 
1906 /*!
1907 @ingroup WORK_SHARING
1908 @param loc Source location
1909 @param global_tid Global thread id
1910 
1911 Mark the end of a statically scheduled loop.
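
An illustrative pairing with a static loop initializer (a sketch only, assuming
the __kmpc_for_static_init_4 entry point declared elsewhere in the runtime and
compiler-supplied <tt>loc</tt> and <tt>gtid</tt>):
@code
kmp_int32 lower = 0, upper = 99, stride = 1, last = 0;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1, 0);
for (kmp_int32 i = lower; i <= upper; ++i) {
  // loop body for this thread's chunk
}
__kmpc_for_static_fini(&loc, gtid);
@endcode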
1912 */
1913 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1914   KMP_POP_PARTITIONED_TIMER();
1915   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1916 
1917 #if OMPT_SUPPORT && OMPT_OPTIONAL
1918   if (ompt_enabled.ompt_callback_work) {
1919     ompt_work_t ompt_work_type = ompt_work_loop;
1920     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1921     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1922     // Determine workshare type
1923     if (loc != NULL) {
1924       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1925         ompt_work_type = ompt_work_loop;
1926       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1927         ompt_work_type = ompt_work_sections;
1928       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1929         ompt_work_type = ompt_work_distribute;
1930       } else {
1931         // use default set above.
1932         // a warning about this case is provided in __kmpc_for_static_init
1933       }
1934       KMP_DEBUG_ASSERT(ompt_work_type);
1935     }
1936     ompt_callbacks.ompt_callback(ompt_callback_work)(
1937         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1938         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1939   }
1940 #endif
1941   if (__kmp_env_consistency_check)
1942     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1943 }
1944 
1945 // User routines which take C-style arguments (call by value)
1946 // different from the Fortran equivalent routines
1947 
1948 void ompc_set_num_threads(int arg) {
1949   // !!!!! TODO: check the per-task binding
1950   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1951 }
1952 
1953 void ompc_set_dynamic(int flag) {
1954   kmp_info_t *thread;
1955 
1956   /* For the thread-private implementation of the internal controls */
1957   thread = __kmp_entry_thread();
1958 
1959   __kmp_save_internal_controls(thread);
1960 
1961   set__dynamic(thread, flag ? true : false);
1962 }
1963 
1964 void ompc_set_nested(int flag) {
1965   kmp_info_t *thread;
1966 
1967   /* For the thread-private internal controls implementation */
1968   thread = __kmp_entry_thread();
1969 
1970   __kmp_save_internal_controls(thread);
1971 
1972   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1973 }
1974 
1975 void ompc_set_max_active_levels(int max_active_levels) {
  /* TODO: we want a per-task implementation of this internal control */
1978 
1979   /* For the per-thread internal controls implementation */
1980   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1981 }
1982 
1983 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1984   // !!!!! TODO: check the per-task binding
1985   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1986 }
1987 
1988 int ompc_get_ancestor_thread_num(int level) {
1989   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1990 }
1991 
1992 int ompc_get_team_size(int level) {
1993   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1994 }
1995 
1996 /* OpenMP 5.0 Affinity Format API */
1997 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
1998   if (!__kmp_init_serial) {
1999     __kmp_serial_initialize();
2000   }
2001   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2002                          format, KMP_STRLEN(format) + 1);
2003 }
2004 
2005 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2006   size_t format_size;
2007   if (!__kmp_init_serial) {
2008     __kmp_serial_initialize();
2009   }
2010   format_size = KMP_STRLEN(__kmp_affinity_format);
2011   if (buffer && size) {
2012     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2013                            format_size + 1);
2014   }
2015   return format_size;
2016 }
2017 
2018 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2019   int gtid;
2020   if (!TCR_4(__kmp_init_middle)) {
2021     __kmp_middle_initialize();
2022   }
2023   __kmp_assign_root_init_mask();
2024   gtid = __kmp_get_gtid();
2025   __kmp_aux_display_affinity(gtid, format);
2026 }
2027 
2028 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2029                                               char const *format) {
2030   int gtid;
2031   size_t num_required;
2032   kmp_str_buf_t capture_buf;
2033   if (!TCR_4(__kmp_init_middle)) {
2034     __kmp_middle_initialize();
2035   }
2036   __kmp_assign_root_init_mask();
2037   gtid = __kmp_get_gtid();
2038   __kmp_str_buf_init(&capture_buf);
2039   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2040   if (buffer && buf_size) {
2041     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2042                            capture_buf.used + 1);
2043   }
2044   __kmp_str_buf_free(&capture_buf);
2045   return num_required;
2046 }
2047 
2048 void kmpc_set_stacksize(int arg) {
2049   // __kmp_aux_set_stacksize initializes the library if needed
2050   __kmp_aux_set_stacksize(arg);
2051 }
2052 
2053 void kmpc_set_stacksize_s(size_t arg) {
2054   // __kmp_aux_set_stacksize initializes the library if needed
2055   __kmp_aux_set_stacksize(arg);
2056 }
2057 
2058 void kmpc_set_blocktime(int arg) {
2059   int gtid, tid;
2060   kmp_info_t *thread;
2061 
2062   gtid = __kmp_entry_gtid();
2063   tid = __kmp_tid_from_gtid(gtid);
2064   thread = __kmp_thread_from_gtid(gtid);
2065 
2066   __kmp_aux_set_blocktime(arg, thread, tid);
2067 }
2068 
2069 void kmpc_set_library(int arg) {
2070   // __kmp_user_set_library initializes the library if needed
2071   __kmp_user_set_library((enum library_type)arg);
2072 }
2073 
2074 void kmpc_set_defaults(char const *str) {
2075   // __kmp_aux_set_defaults initializes the library if needed
2076   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2077 }
2078 
2079 void kmpc_set_disp_num_buffers(int arg) {
2080   // ignore after initialization because some teams have already
2081   // allocated dispatch buffers
2082   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2083       arg <= KMP_MAX_DISP_NUM_BUFF) {
2084     __kmp_dispatch_num_buffers = arg;
2085   }
2086 }
2087 
2088 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2089 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2090   return -1;
2091 #else
2092   if (!TCR_4(__kmp_init_middle)) {
2093     __kmp_middle_initialize();
2094   }
2095   __kmp_assign_root_init_mask();
2096   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2097 #endif
2098 }
2099 
2100 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2101 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2102   return -1;
2103 #else
2104   if (!TCR_4(__kmp_init_middle)) {
2105     __kmp_middle_initialize();
2106   }
2107   __kmp_assign_root_init_mask();
2108   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2109 #endif
2110 }
2111 
2112 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2113 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2114   return -1;
2115 #else
2116   if (!TCR_4(__kmp_init_middle)) {
2117     __kmp_middle_initialize();
2118   }
2119   __kmp_assign_root_init_mask();
2120   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2121 #endif
2122 }
2123 
2124 /* -------------------------------------------------------------------------- */
2125 /*!
2126 @ingroup THREADPRIVATE
2127 @param loc       source location information
2128 @param gtid      global thread number
2129 @param cpy_size  size of the cpy_data buffer
2130 @param cpy_data  pointer to data to be copied
2131 @param cpy_func  helper function to call for copying data
2132 @param didit     flag variable: 1=single thread; 0=not single thread
2133 
2134 __kmpc_copyprivate implements the interface for the private data broadcast
2135 needed for the copyprivate clause associated with a single region in an
2136 OpenMP<sup>*</sup> program (both C and Fortran).
2137 All threads participating in the parallel region call this routine.
2138 One of the threads (called the single thread) should have the <tt>didit</tt>
2139 variable set to 1 and all other threads should have that variable set to 0.
2140 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2141 
2142 The OpenMP specification forbids the use of nowait on the single region when a
2143 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2144 barrier internally to avoid race conditions, so the code generation for the
2145 single region should avoid generating a barrier after the call to @ref
2146 __kmpc_copyprivate.
2147 
2148 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2149 The <tt>loc</tt> parameter is a pointer to source location information.
2150 
Internal implementation: the single thread first copies its descriptor
address (cpy_data) to a team-private location; the other threads then each
call the function pointed to by the cpy_func parameter, which carries out the
copy using the cpy_data buffers.
2155 
2156 The cpy_func routine used for the copy and the contents of the data area defined
2157 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2158 to be done. For instance, the cpy_data buffer can hold the actual data to be
2159 copied or it may hold a list of pointers to the data. The cpy_func routine must
2160 interpret the cpy_data buffer appropriately.
2161 
2162 The interface to cpy_func is as follows:
2163 @code
2164 void cpy_func( void *destination, void *source )
2165 @endcode
2166 where void *destination is the cpy_data pointer for the thread being copied to
2167 and void *source is the cpy_data pointer for the thread being copied from.
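
An illustrative sketch (placeholder names, not generated code) for a
<tt>copyprivate(x)</tt> clause where each thread's cpy_data buffer holds a
single pointer to its private copy of an <tt>int x</tt>:
@code
typedef struct { int *x; } cpyprv_t;

// Called on each thread that did not execute the single block; it copies from
// the single thread's buffer (source) into this thread's buffer (destination).
static void cpy_func_example(void *destination, void *source) {
  cpyprv_t *dst = (cpyprv_t *)destination;
  cpyprv_t *src = (cpyprv_t *)source;
  *dst->x = *src->x;
}

// Every thread in the team then calls (didit is 1 only on the thread that
// executed the single block):
cpyprv_t my_buf = {&my_x};
__kmpc_copyprivate(&loc, gtid, sizeof(cpyprv_t), &my_buf, cpy_func_example,
                   didit);
@endcode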
2168 */
2169 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2170                         void *cpy_data, void (*cpy_func)(void *, void *),
2171                         kmp_int32 didit) {
2172   void **data_ptr;
2173   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2174   __kmp_assert_valid_gtid(gtid);
2175 
2176   KMP_MB();
2177 
2178   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2179 
2180   if (__kmp_env_consistency_check) {
2181     if (loc == 0) {
2182       KMP_WARNING(ConstructIdentInvalid);
2183     }
2184   }
2185 
  // TODO: Optimize the following two barriers into some kind of split barrier
2187 
2188   if (didit)
2189     *data_ptr = cpy_data;
2190 
2191 #if OMPT_SUPPORT
2192   ompt_frame_t *ompt_frame;
2193   if (ompt_enabled.enabled) {
2194     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2195     if (ompt_frame->enter_frame.ptr == NULL)
2196       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2197   }
2198   OMPT_STORE_RETURN_ADDRESS(gtid);
2199 #endif
2200 /* This barrier is not a barrier region boundary */
2201 #if USE_ITT_NOTIFY
2202   __kmp_threads[gtid]->th.th_ident = loc;
2203 #endif
2204   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2205 
2206   if (!didit)
2207     (*cpy_func)(cpy_data, *data_ptr);
2208 
2209   // Consider next barrier a user-visible barrier for barrier region boundaries
2210   // Nesting checks are already handled by the single construct checks
2211   {
2212 #if OMPT_SUPPORT
2213     OMPT_STORE_RETURN_ADDRESS(gtid);
2214 #endif
2215 #if USE_ITT_NOTIFY
    // TODO: check if this is needed (e.g. tasks can overwrite the location)
    __kmp_threads[gtid]->th.th_ident = loc;
2218 #endif
2219     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2220 #if OMPT_SUPPORT && OMPT_OPTIONAL
2221     if (ompt_enabled.enabled) {
2222       ompt_frame->enter_frame = ompt_data_none;
2223     }
2224 #endif
2225   }
2226 }
2227 
2228 /* -------------------------------------------------------------------------- */
2229 
2230 #define INIT_LOCK __kmp_init_user_lock_with_checks
2231 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2232 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2233 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2234 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2235 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2236   __kmp_acquire_nested_user_lock_with_checks_timed
2237 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2238 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2239 #define TEST_LOCK __kmp_test_user_lock_with_checks
2240 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2241 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2242 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2243 
2244 // TODO: Make check abort messages use location info & pass it into
2245 // with_checks routines
2246 
2247 #if KMP_USE_DYNAMIC_LOCK
2248 
2249 // internal lock initializer
2250 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2251                                                     kmp_dyna_lockseq_t seq) {
2252   if (KMP_IS_D_LOCK(seq)) {
2253     KMP_INIT_D_LOCK(lock, seq);
2254 #if USE_ITT_BUILD
2255     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2256 #endif
2257   } else {
2258     KMP_INIT_I_LOCK(lock, seq);
2259 #if USE_ITT_BUILD
2260     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2261     __kmp_itt_lock_creating(ilk->lock, loc);
2262 #endif
2263   }
2264 }
2265 
2266 // internal nest lock initializer
2267 static __forceinline void
2268 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2269                                kmp_dyna_lockseq_t seq) {
2270 #if KMP_USE_TSX
  // There is no nested lock implementation for speculative locks
2272   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2273       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2274     seq = __kmp_user_lock_seq;
2275 #endif
2276   switch (seq) {
2277   case lockseq_tas:
2278     seq = lockseq_nested_tas;
2279     break;
2280 #if KMP_USE_FUTEX
2281   case lockseq_futex:
2282     seq = lockseq_nested_futex;
2283     break;
2284 #endif
2285   case lockseq_ticket:
2286     seq = lockseq_nested_ticket;
2287     break;
2288   case lockseq_queuing:
2289     seq = lockseq_nested_queuing;
2290     break;
2291   case lockseq_drdpa:
2292     seq = lockseq_nested_drdpa;
2293     break;
2294   default:
2295     seq = lockseq_nested_queuing;
2296   }
2297   KMP_INIT_I_LOCK(lock, seq);
2298 #if USE_ITT_BUILD
2299   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2300   __kmp_itt_lock_creating(ilk->lock, loc);
2301 #endif
2302 }
2303 
2304 /* initialize the lock with a hint */
2305 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2306                                 uintptr_t hint) {
2307   KMP_DEBUG_ASSERT(__kmp_init_serial);
2308   if (__kmp_env_consistency_check && user_lock == NULL) {
2309     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2310   }
2311 
2312   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2313 
2314 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2316   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2317   if (!codeptr)
2318     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2319   if (ompt_enabled.ompt_callback_lock_init) {
2320     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2321         ompt_mutex_lock, (omp_lock_hint_t)hint,
2322         __ompt_get_mutex_impl_type(user_lock),
2323         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2324   }
2325 #endif
2326 }
2327 
/* initialize the nested lock with a hint */
2329 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2330                                      void **user_lock, uintptr_t hint) {
2331   KMP_DEBUG_ASSERT(__kmp_init_serial);
2332   if (__kmp_env_consistency_check && user_lock == NULL) {
2333     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2334   }
2335 
2336   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2337 
2338 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2340   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2341   if (!codeptr)
2342     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2343   if (ompt_enabled.ompt_callback_lock_init) {
2344     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2345         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2346         __ompt_get_mutex_impl_type(user_lock),
2347         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2348   }
2349 #endif
2350 }
2351 
2352 #endif // KMP_USE_DYNAMIC_LOCK
2353 
2354 /* initialize the lock */
2355 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2356 #if KMP_USE_DYNAMIC_LOCK
2357 
2358   KMP_DEBUG_ASSERT(__kmp_init_serial);
2359   if (__kmp_env_consistency_check && user_lock == NULL) {
2360     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2361   }
2362   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2363 
2364 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2366   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2367   if (!codeptr)
2368     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2369   if (ompt_enabled.ompt_callback_lock_init) {
2370     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2371         ompt_mutex_lock, omp_lock_hint_none,
2372         __ompt_get_mutex_impl_type(user_lock),
2373         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2374   }
2375 #endif
2376 
2377 #else // KMP_USE_DYNAMIC_LOCK
2378 
2379   static char const *const func = "omp_init_lock";
2380   kmp_user_lock_p lck;
2381   KMP_DEBUG_ASSERT(__kmp_init_serial);
2382 
2383   if (__kmp_env_consistency_check) {
2384     if (user_lock == NULL) {
2385       KMP_FATAL(LockIsUninitialized, func);
2386     }
2387   }
2388 
2389   KMP_CHECK_USER_LOCK_INIT();
2390 
2391   if ((__kmp_user_lock_kind == lk_tas) &&
2392       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2393     lck = (kmp_user_lock_p)user_lock;
2394   }
2395 #if KMP_USE_FUTEX
2396   else if ((__kmp_user_lock_kind == lk_futex) &&
2397            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2398     lck = (kmp_user_lock_p)user_lock;
2399   }
2400 #endif
2401   else {
2402     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2403   }
2404   INIT_LOCK(lck);
2405   __kmp_set_user_lock_location(lck, loc);
2406 
2407 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2409   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2410   if (!codeptr)
2411     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2412   if (ompt_enabled.ompt_callback_lock_init) {
2413     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2414         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2415         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2416   }
2417 #endif
2418 
2419 #if USE_ITT_BUILD
2420   __kmp_itt_lock_creating(lck);
2421 #endif /* USE_ITT_BUILD */
2422 
2423 #endif // KMP_USE_DYNAMIC_LOCK
2424 } // __kmpc_init_lock
2425 
/* initialize the nested lock */
2427 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2428 #if KMP_USE_DYNAMIC_LOCK
2429 
2430   KMP_DEBUG_ASSERT(__kmp_init_serial);
2431   if (__kmp_env_consistency_check && user_lock == NULL) {
2432     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2433   }
2434   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2435 
2436 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2438   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2439   if (!codeptr)
2440     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2441   if (ompt_enabled.ompt_callback_lock_init) {
2442     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2443         ompt_mutex_nest_lock, omp_lock_hint_none,
2444         __ompt_get_mutex_impl_type(user_lock),
2445         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2446   }
2447 #endif
2448 
2449 #else // KMP_USE_DYNAMIC_LOCK
2450 
2451   static char const *const func = "omp_init_nest_lock";
2452   kmp_user_lock_p lck;
2453   KMP_DEBUG_ASSERT(__kmp_init_serial);
2454 
2455   if (__kmp_env_consistency_check) {
2456     if (user_lock == NULL) {
2457       KMP_FATAL(LockIsUninitialized, func);
2458     }
2459   }
2460 
2461   KMP_CHECK_USER_LOCK_INIT();
2462 
2463   if ((__kmp_user_lock_kind == lk_tas) &&
2464       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2465        OMP_NEST_LOCK_T_SIZE)) {
2466     lck = (kmp_user_lock_p)user_lock;
2467   }
2468 #if KMP_USE_FUTEX
2469   else if ((__kmp_user_lock_kind == lk_futex) &&
2470            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2471             OMP_NEST_LOCK_T_SIZE)) {
2472     lck = (kmp_user_lock_p)user_lock;
2473   }
2474 #endif
2475   else {
2476     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2477   }
2478 
2479   INIT_NESTED_LOCK(lck);
2480   __kmp_set_user_lock_location(lck, loc);
2481 
2482 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2484   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2485   if (!codeptr)
2486     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2487   if (ompt_enabled.ompt_callback_lock_init) {
2488     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2489         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2490         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2491   }
2492 #endif
2493 
2494 #if USE_ITT_BUILD
2495   __kmp_itt_lock_creating(lck);
2496 #endif /* USE_ITT_BUILD */
2497 
2498 #endif // KMP_USE_DYNAMIC_LOCK
2499 } // __kmpc_init_nest_lock
2500 
2501 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2502 #if KMP_USE_DYNAMIC_LOCK
2503 
2504 #if USE_ITT_BUILD
2505   kmp_user_lock_p lck;
2506   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2507     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2508   } else {
2509     lck = (kmp_user_lock_p)user_lock;
2510   }
2511   __kmp_itt_lock_destroyed(lck);
2512 #endif
2513 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2515   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2516   if (!codeptr)
2517     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2518   if (ompt_enabled.ompt_callback_lock_destroy) {
2519     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2520         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2521   }
2522 #endif
2523   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2524 #else
2525   kmp_user_lock_p lck;
2526 
2527   if ((__kmp_user_lock_kind == lk_tas) &&
2528       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2529     lck = (kmp_user_lock_p)user_lock;
2530   }
2531 #if KMP_USE_FUTEX
2532   else if ((__kmp_user_lock_kind == lk_futex) &&
2533            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2534     lck = (kmp_user_lock_p)user_lock;
2535   }
2536 #endif
2537   else {
2538     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2539   }
2540 
2541 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2543   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2544   if (!codeptr)
2545     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2546   if (ompt_enabled.ompt_callback_lock_destroy) {
2547     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2548         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2549   }
2550 #endif
2551 
2552 #if USE_ITT_BUILD
2553   __kmp_itt_lock_destroyed(lck);
2554 #endif /* USE_ITT_BUILD */
2555   DESTROY_LOCK(lck);
2556 
2557   if ((__kmp_user_lock_kind == lk_tas) &&
2558       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2559     ;
2560   }
2561 #if KMP_USE_FUTEX
2562   else if ((__kmp_user_lock_kind == lk_futex) &&
2563            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2564     ;
2565   }
2566 #endif
2567   else {
2568     __kmp_user_lock_free(user_lock, gtid, lck);
2569   }
2570 #endif // KMP_USE_DYNAMIC_LOCK
2571 } // __kmpc_destroy_lock
2572 
/* destroy the nested lock */
2574 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2575 #if KMP_USE_DYNAMIC_LOCK
2576 
2577 #if USE_ITT_BUILD
2578   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2579   __kmp_itt_lock_destroyed(ilk->lock);
2580 #endif
2581 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2583   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2584   if (!codeptr)
2585     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2586   if (ompt_enabled.ompt_callback_lock_destroy) {
2587     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2588         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2589   }
2590 #endif
2591   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2592 
2593 #else // KMP_USE_DYNAMIC_LOCK
2594 
2595   kmp_user_lock_p lck;
2596 
2597   if ((__kmp_user_lock_kind == lk_tas) &&
2598       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2599        OMP_NEST_LOCK_T_SIZE)) {
2600     lck = (kmp_user_lock_p)user_lock;
2601   }
2602 #if KMP_USE_FUTEX
2603   else if ((__kmp_user_lock_kind == lk_futex) &&
2604            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2605             OMP_NEST_LOCK_T_SIZE)) {
2606     lck = (kmp_user_lock_p)user_lock;
2607   }
2608 #endif
2609   else {
2610     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2611   }
2612 
2613 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2615   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2616   if (!codeptr)
2617     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2618   if (ompt_enabled.ompt_callback_lock_destroy) {
2619     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2620         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2621   }
2622 #endif
2623 
2624 #if USE_ITT_BUILD
2625   __kmp_itt_lock_destroyed(lck);
2626 #endif /* USE_ITT_BUILD */
2627 
2628   DESTROY_NESTED_LOCK(lck);
2629 
2630   if ((__kmp_user_lock_kind == lk_tas) &&
2631       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2632        OMP_NEST_LOCK_T_SIZE)) {
2633     ;
2634   }
2635 #if KMP_USE_FUTEX
2636   else if ((__kmp_user_lock_kind == lk_futex) &&
2637            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2638             OMP_NEST_LOCK_T_SIZE)) {
2639     ;
2640   }
2641 #endif
2642   else {
2643     __kmp_user_lock_free(user_lock, gtid, lck);
2644   }
2645 #endif // KMP_USE_DYNAMIC_LOCK
2646 } // __kmpc_destroy_nest_lock
2647 
2648 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2649   KMP_COUNT_BLOCK(OMP_set_lock);
2650 #if KMP_USE_DYNAMIC_LOCK
2651   int tag = KMP_EXTRACT_D_TAG(user_lock);
2652 #if USE_ITT_BUILD
2653   __kmp_itt_lock_acquiring(
2654       (kmp_user_lock_p)
2655           user_lock); // itt function will get to the right lock object.
2656 #endif
2657 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2659   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2660   if (!codeptr)
2661     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2662   if (ompt_enabled.ompt_callback_mutex_acquire) {
2663     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2664         ompt_mutex_lock, omp_lock_hint_none,
2665         __ompt_get_mutex_impl_type(user_lock),
2666         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2667   }
2668 #endif
2669 #if KMP_USE_INLINED_TAS
2670   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2671     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2672   } else
2673 #elif KMP_USE_INLINED_FUTEX
2674   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2675     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2676   } else
2677 #endif
2678   {
2679     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2680   }
2681 #if USE_ITT_BUILD
2682   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2683 #endif
2684 #if OMPT_SUPPORT && OMPT_OPTIONAL
2685   if (ompt_enabled.ompt_callback_mutex_acquired) {
2686     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2687         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2688   }
2689 #endif
2690 
2691 #else // KMP_USE_DYNAMIC_LOCK
2692 
2693   kmp_user_lock_p lck;
2694 
2695   if ((__kmp_user_lock_kind == lk_tas) &&
2696       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2697     lck = (kmp_user_lock_p)user_lock;
2698   }
2699 #if KMP_USE_FUTEX
2700   else if ((__kmp_user_lock_kind == lk_futex) &&
2701            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2702     lck = (kmp_user_lock_p)user_lock;
2703   }
2704 #endif
2705   else {
2706     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2707   }
2708 
2709 #if USE_ITT_BUILD
2710   __kmp_itt_lock_acquiring(lck);
2711 #endif /* USE_ITT_BUILD */
2712 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2714   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2715   if (!codeptr)
2716     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2717   if (ompt_enabled.ompt_callback_mutex_acquire) {
2718     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2719         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2720         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2721   }
2722 #endif
2723 
2724   ACQUIRE_LOCK(lck, gtid);
2725 
2726 #if USE_ITT_BUILD
2727   __kmp_itt_lock_acquired(lck);
2728 #endif /* USE_ITT_BUILD */
2729 
2730 #if OMPT_SUPPORT && OMPT_OPTIONAL
2731   if (ompt_enabled.ompt_callback_mutex_acquired) {
2732     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2733         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2734   }
2735 #endif
2736 
2737 #endif // KMP_USE_DYNAMIC_LOCK
2738 }
2739 
2740 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2741 #if KMP_USE_DYNAMIC_LOCK
2742 
2743 #if USE_ITT_BUILD
2744   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2745 #endif
2746 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2748   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2749   if (!codeptr)
2750     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2751   if (ompt_enabled.enabled) {
2752     if (ompt_enabled.ompt_callback_mutex_acquire) {
2753       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2754           ompt_mutex_nest_lock, omp_lock_hint_none,
2755           __ompt_get_mutex_impl_type(user_lock),
2756           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2757     }
2758   }
2759 #endif
2760   int acquire_status =
2761       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2762   (void)acquire_status;
2763 #if USE_ITT_BUILD
2764   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2765 #endif
2766 
2767 #if OMPT_SUPPORT && OMPT_OPTIONAL
2768   if (ompt_enabled.enabled) {
2769     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2770       if (ompt_enabled.ompt_callback_mutex_acquired) {
2771         // lock_first
2772         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2773             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2774             codeptr);
2775       }
2776     } else {
2777       if (ompt_enabled.ompt_callback_nest_lock) {
2778         // lock_next
2779         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2780             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2781       }
2782     }
2783   }
2784 #endif
2785 
2786 #else // KMP_USE_DYNAMIC_LOCK
2787   int acquire_status;
2788   kmp_user_lock_p lck;
2789 
2790   if ((__kmp_user_lock_kind == lk_tas) &&
2791       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2792        OMP_NEST_LOCK_T_SIZE)) {
2793     lck = (kmp_user_lock_p)user_lock;
2794   }
2795 #if KMP_USE_FUTEX
2796   else if ((__kmp_user_lock_kind == lk_futex) &&
2797            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2798             OMP_NEST_LOCK_T_SIZE)) {
2799     lck = (kmp_user_lock_p)user_lock;
2800   }
2801 #endif
2802   else {
2803     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2804   }
2805 
2806 #if USE_ITT_BUILD
2807   __kmp_itt_lock_acquiring(lck);
2808 #endif /* USE_ITT_BUILD */
2809 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2811   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2812   if (!codeptr)
2813     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2814   if (ompt_enabled.enabled) {
2815     if (ompt_enabled.ompt_callback_mutex_acquire) {
2816       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2817           ompt_mutex_nest_lock, omp_lock_hint_none,
2818           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2819           codeptr);
2820     }
2821   }
2822 #endif
2823 
2824   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2825 
2826 #if USE_ITT_BUILD
2827   __kmp_itt_lock_acquired(lck);
2828 #endif /* USE_ITT_BUILD */
2829 
2830 #if OMPT_SUPPORT && OMPT_OPTIONAL
2831   if (ompt_enabled.enabled) {
2832     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2833       if (ompt_enabled.ompt_callback_mutex_acquired) {
2834         // lock_first
2835         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2836             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2837       }
2838     } else {
2839       if (ompt_enabled.ompt_callback_nest_lock) {
2840         // lock_next
2841         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2842             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2843       }
2844     }
2845   }
2846 #endif
2847 
2848 #endif // KMP_USE_DYNAMIC_LOCK
2849 }
2850 
2851 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2852 #if KMP_USE_DYNAMIC_LOCK
2853 
2854   int tag = KMP_EXTRACT_D_TAG(user_lock);
2855 #if USE_ITT_BUILD
2856   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2857 #endif
2858 #if KMP_USE_INLINED_TAS
2859   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2860     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2861   } else
2862 #elif KMP_USE_INLINED_FUTEX
2863   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2864     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2865   } else
2866 #endif
2867   {
2868     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2869   }
2870 
2871 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2873   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2874   if (!codeptr)
2875     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2876   if (ompt_enabled.ompt_callback_mutex_released) {
2877     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2878         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2879   }
2880 #endif
2881 
2882 #else // KMP_USE_DYNAMIC_LOCK
2883 
2884   kmp_user_lock_p lck;
2885 
2886   /* Can't use serial interval since not block structured */
2887   /* release the lock */
2888 
2889   if ((__kmp_user_lock_kind == lk_tas) &&
2890       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2891 #if KMP_OS_LINUX &&                                                            \
2892     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2893 // "fast" path implemented to fix customer performance issue
2894 #if USE_ITT_BUILD
2895     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2896 #endif /* USE_ITT_BUILD */
2897     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2898     KMP_MB();
2899 
2900 #if OMPT_SUPPORT && OMPT_OPTIONAL
    // This is the case if called from omp_init_lock_with_hint:
2902     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2903     if (!codeptr)
2904       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2905     if (ompt_enabled.ompt_callback_mutex_released) {
2906       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2907           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2908     }
2909 #endif
2910 
2911     return;
2912 #else
2913     lck = (kmp_user_lock_p)user_lock;
2914 #endif
2915   }
2916 #if KMP_USE_FUTEX
2917   else if ((__kmp_user_lock_kind == lk_futex) &&
2918            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2919     lck = (kmp_user_lock_p)user_lock;
2920   }
2921 #endif
2922   else {
2923     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2924   }
2925 
2926 #if USE_ITT_BUILD
2927   __kmp_itt_lock_releasing(lck);
2928 #endif /* USE_ITT_BUILD */
2929 
2930   RELEASE_LOCK(lck, gtid);
2931 
2932 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2934   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2935   if (!codeptr)
2936     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2937   if (ompt_enabled.ompt_callback_mutex_released) {
2938     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2939         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2940   }
2941 #endif
2942 
2943 #endif // KMP_USE_DYNAMIC_LOCK
2944 }
2945 
2946 /* release the lock */
2947 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2948 #if KMP_USE_DYNAMIC_LOCK
2949 
2950 #if USE_ITT_BUILD
2951   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2952 #endif
2953   int release_status =
2954       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2955   (void)release_status;
2956 
2957 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
2959   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2960   if (!codeptr)
2961     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2962   if (ompt_enabled.enabled) {
2963     if (release_status == KMP_LOCK_RELEASED) {
2964       if (ompt_enabled.ompt_callback_mutex_released) {
2965         // release_lock_last
2966         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2967             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2968             codeptr);
2969       }
2970     } else if (ompt_enabled.ompt_callback_nest_lock) {
2971       // release_lock_prev
2972       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2973           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2974     }
2975   }
2976 #endif
2977 
2978 #else // KMP_USE_DYNAMIC_LOCK
2979 
2980   kmp_user_lock_p lck;
2981 
2982   /* Can't use serial interval since not block structured */
2983 
2984   if ((__kmp_user_lock_kind == lk_tas) &&
2985       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2986        OMP_NEST_LOCK_T_SIZE)) {
2987 #if KMP_OS_LINUX &&                                                            \
2988     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2989     // "fast" path implemented to fix customer performance issue
2990     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2991 #if USE_ITT_BUILD
2992     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2993 #endif /* USE_ITT_BUILD */
2994 
2995 #if OMPT_SUPPORT && OMPT_OPTIONAL
2996     int release_status = KMP_LOCK_STILL_HELD;
2997 #endif
2998 
2999     if (--(tl->lk.depth_locked) == 0) {
3000       TCW_4(tl->lk.poll, 0);
3001 #if OMPT_SUPPORT && OMPT_OPTIONAL
3002       release_status = KMP_LOCK_RELEASED;
3003 #endif
3004     }
3005     KMP_MB();
3006 
3007 #if OMPT_SUPPORT && OMPT_OPTIONAL
    // This is the case if called from omp_init_lock_with_hint:
3009     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3010     if (!codeptr)
3011       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3012     if (ompt_enabled.enabled) {
3013       if (release_status == KMP_LOCK_RELEASED) {
3014         if (ompt_enabled.ompt_callback_mutex_released) {
3015           // release_lock_last
3016           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3017               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3018         }
3019       } else if (ompt_enabled.ompt_callback_nest_lock) {
3020         // release_lock_previous
3021         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3023       }
3024     }
3025 #endif
3026 
3027     return;
3028 #else
3029     lck = (kmp_user_lock_p)user_lock;
3030 #endif
3031   }
3032 #if KMP_USE_FUTEX
3033   else if ((__kmp_user_lock_kind == lk_futex) &&
3034            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3035             OMP_NEST_LOCK_T_SIZE)) {
3036     lck = (kmp_user_lock_p)user_lock;
3037   }
3038 #endif
3039   else {
3040     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3041   }
3042 
3043 #if USE_ITT_BUILD
3044   __kmp_itt_lock_releasing(lck);
3045 #endif /* USE_ITT_BUILD */
3046 
3047   int release_status;
3048   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3049 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3051   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3052   if (!codeptr)
3053     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3054   if (ompt_enabled.enabled) {
3055     if (release_status == KMP_LOCK_RELEASED) {
3056       if (ompt_enabled.ompt_callback_mutex_released) {
3057         // release_lock_last
3058         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3059             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3060       }
3061     } else if (ompt_enabled.ompt_callback_nest_lock) {
3062       // release_lock_previous
3063       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3065     }
3066   }
3067 #endif
3068 
3069 #endif // KMP_USE_DYNAMIC_LOCK
3070 }
3071 
3072 /* try to acquire the lock */
3073 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3074   KMP_COUNT_BLOCK(OMP_test_lock);
3075 
3076 #if KMP_USE_DYNAMIC_LOCK
3077   int rc;
3078   int tag = KMP_EXTRACT_D_TAG(user_lock);
3079 #if USE_ITT_BUILD
3080   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3081 #endif
3082 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3084   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3085   if (!codeptr)
3086     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3087   if (ompt_enabled.ompt_callback_mutex_acquire) {
3088     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3089         ompt_mutex_lock, omp_lock_hint_none,
3090         __ompt_get_mutex_impl_type(user_lock),
3091         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3092   }
3093 #endif
3094 #if KMP_USE_INLINED_TAS
3095   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3096     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3097   } else
3098 #elif KMP_USE_INLINED_FUTEX
3099   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3100     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3101   } else
3102 #endif
3103   {
3104     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3105   }
3106   if (rc) {
3107 #if USE_ITT_BUILD
3108     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3109 #endif
3110 #if OMPT_SUPPORT && OMPT_OPTIONAL
3111     if (ompt_enabled.ompt_callback_mutex_acquired) {
3112       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3113           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3114     }
3115 #endif
3116     return FTN_TRUE;
3117   } else {
3118 #if USE_ITT_BUILD
3119     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3120 #endif
3121     return FTN_FALSE;
3122   }
3123 
3124 #else // KMP_USE_DYNAMIC_LOCK
3125 
3126   kmp_user_lock_p lck;
3127   int rc;
3128 
3129   if ((__kmp_user_lock_kind == lk_tas) &&
3130       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3131     lck = (kmp_user_lock_p)user_lock;
3132   }
3133 #if KMP_USE_FUTEX
3134   else if ((__kmp_user_lock_kind == lk_futex) &&
3135            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3136     lck = (kmp_user_lock_p)user_lock;
3137   }
3138 #endif
3139   else {
3140     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3141   }
3142 
3143 #if USE_ITT_BUILD
3144   __kmp_itt_lock_acquiring(lck);
3145 #endif /* USE_ITT_BUILD */
3146 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3148   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3149   if (!codeptr)
3150     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3151   if (ompt_enabled.ompt_callback_mutex_acquire) {
3152     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3153         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3154         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3155   }
3156 #endif
3157 
3158   rc = TEST_LOCK(lck, gtid);
3159 #if USE_ITT_BUILD
3160   if (rc) {
3161     __kmp_itt_lock_acquired(lck);
3162   } else {
3163     __kmp_itt_lock_cancelled(lck);
3164   }
3165 #endif /* USE_ITT_BUILD */
3166 #if OMPT_SUPPORT && OMPT_OPTIONAL
3167   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3168     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3169         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3170   }
3171 #endif
3172 
3173   return (rc ? FTN_TRUE : FTN_FALSE);
3174 
3175   /* Can't use serial interval since not block structured */
3176 
3177 #endif // KMP_USE_DYNAMIC_LOCK
3178 }
3179 
3180 /* try to acquire the lock */
3181 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3182 #if KMP_USE_DYNAMIC_LOCK
3183   int rc;
3184 #if USE_ITT_BUILD
3185   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3186 #endif
3187 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3189   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3190   if (!codeptr)
3191     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3192   if (ompt_enabled.ompt_callback_mutex_acquire) {
3193     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3194         ompt_mutex_nest_lock, omp_lock_hint_none,
3195         __ompt_get_mutex_impl_type(user_lock),
3196         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3197   }
3198 #endif
3199   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3200 #if USE_ITT_BUILD
3201   if (rc) {
3202     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3203   } else {
3204     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3205   }
3206 #endif
3207 #if OMPT_SUPPORT && OMPT_OPTIONAL
3208   if (ompt_enabled.enabled && rc) {
3209     if (rc == 1) {
3210       if (ompt_enabled.ompt_callback_mutex_acquired) {
3211         // lock_first
3212         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3213             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3214             codeptr);
3215       }
3216     } else {
3217       if (ompt_enabled.ompt_callback_nest_lock) {
3218         // lock_next
3219         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3220             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3221       }
3222     }
3223   }
3224 #endif
3225   return rc;
3226 
3227 #else // KMP_USE_DYNAMIC_LOCK
3228 
3229   kmp_user_lock_p lck;
3230   int rc;
3231 
3232   if ((__kmp_user_lock_kind == lk_tas) &&
3233       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3234        OMP_NEST_LOCK_T_SIZE)) {
3235     lck = (kmp_user_lock_p)user_lock;
3236   }
3237 #if KMP_USE_FUTEX
3238   else if ((__kmp_user_lock_kind == lk_futex) &&
3239            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3240             OMP_NEST_LOCK_T_SIZE)) {
3241     lck = (kmp_user_lock_p)user_lock;
3242   }
3243 #endif
3244   else {
3245     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3246   }
3247 
3248 #if USE_ITT_BUILD
3249   __kmp_itt_lock_acquiring(lck);
3250 #endif /* USE_ITT_BUILD */
3251 
3252 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3254   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3255   if (!codeptr)
3256     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
3264 #endif
3265 
3266   rc = TEST_NESTED_LOCK(lck, gtid);
3267 #if USE_ITT_BUILD
3268   if (rc) {
3269     __kmp_itt_lock_acquired(lck);
3270   } else {
3271     __kmp_itt_lock_cancelled(lck);
3272   }
3273 #endif /* USE_ITT_BUILD */
3274 #if OMPT_SUPPORT && OMPT_OPTIONAL
3275   if (ompt_enabled.enabled && rc) {
3276     if (rc == 1) {
3277       if (ompt_enabled.ompt_callback_mutex_acquired) {
3278         // lock_first
3279         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3280             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3281       }
3282     } else {
3283       if (ompt_enabled.ompt_callback_nest_lock) {
3284         // lock_next
3285         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3287       }
3288     }
3289   }
3290 #endif
3291   return rc;
3292 
3293   /* Can't use serial interval since not block structured */
3294 
3295 #endif // KMP_USE_DYNAMIC_LOCK
3296 }
3297 
3298 // Interface to fast scalable reduce methods routines
3299 
// Keep the selected method in a thread-local structure for cross-function
// use: it will be read again in the __kmpc_end_reduce* functions.
// An alternative would be to re-determine the method in the __kmpc_end_reduce*
// functions (a new prototype would be required then).
// AT: which solution is better?
3305 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3306   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3307 
3308 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3309   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3310 
3311 // description of the packed_reduction_method variable: look at the macros in
3312 // kmp.h
3313 
3314 // used in a critical section reduce block
3315 static __forceinline void
3316 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3317                                           kmp_critical_name *crit) {
3318 
  // This lock was visible to a customer and to the threading profile tool as
  // a serial overhead span (although it is used for an internal purpose only).
  //    Why was it visible in the previous implementation?
  //    Should we keep it visible in the new reduce block?
3323   kmp_user_lock_p lck;
3324 
3325 #if KMP_USE_DYNAMIC_LOCK
3326 
3327   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3328   // Check if it is initialized.
3329   if (*lk == 0) {
3330     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3331       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3332                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3333     } else {
3334       __kmp_init_indirect_csptr(crit, loc, global_tid,
3335                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3336     }
3337   }
3338   // Branch for accessing the actual lock object and set operation. This
3339   // branching is inevitable since this lock initialization does not follow the
3340   // normal dispatch path (lock table is not used).
3341   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3342     lck = (kmp_user_lock_p)lk;
3343     KMP_DEBUG_ASSERT(lck != NULL);
3344     if (__kmp_env_consistency_check) {
3345       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3346     }
3347     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3348   } else {
3349     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3350     lck = ilk->lock;
3351     KMP_DEBUG_ASSERT(lck != NULL);
3352     if (__kmp_env_consistency_check) {
3353       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3354     }
3355     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3356   }
3357 
3358 #else // KMP_USE_DYNAMIC_LOCK
3359 
3360   // We know that the fast reduction code is only emitted by Intel compilers
3361   // with 32 byte critical sections. If there isn't enough space, then we
3362   // have to use a pointer.
3363   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3364     lck = (kmp_user_lock_p)crit;
3365   } else {
3366     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3367   }
3368   KMP_DEBUG_ASSERT(lck != NULL);
3369 
3370   if (__kmp_env_consistency_check)
3371     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3372 
3373   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3374 
3375 #endif // KMP_USE_DYNAMIC_LOCK
3376 }
3377 
3378 // used in a critical section reduce block
3379 static __forceinline void
3380 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3381                                         kmp_critical_name *crit) {
3382 
3383   kmp_user_lock_p lck;
3384 
3385 #if KMP_USE_DYNAMIC_LOCK
3386 
3387   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3388     lck = (kmp_user_lock_p)crit;
3389     if (__kmp_env_consistency_check)
3390       __kmp_pop_sync(global_tid, ct_critical, loc);
3391     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3392   } else {
3393     kmp_indirect_lock_t *ilk =
3394         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3395     if (__kmp_env_consistency_check)
3396       __kmp_pop_sync(global_tid, ct_critical, loc);
3397     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3398   }
3399 
3400 #else // KMP_USE_DYNAMIC_LOCK
3401 
3402   // We know that the fast reduction code is only emitted by Intel compilers
3403   // with 32 byte critical sections. If there isn't enough space, then we have
3404   // to use a pointer.
3405   if (__kmp_base_user_lock_size > 32) {
3406     lck = *((kmp_user_lock_p *)crit);
3407     KMP_ASSERT(lck != NULL);
3408   } else {
3409     lck = (kmp_user_lock_p)crit;
3410   }
3411 
3412   if (__kmp_env_consistency_check)
3413     __kmp_pop_sync(global_tid, ct_critical, loc);
3414 
3415   __kmp_release_user_lock_with_checks(lck, global_tid);
3416 
3417 #endif // KMP_USE_DYNAMIC_LOCK
3418 } // __kmp_end_critical_section_reduce_block
3419 
3420 static __forceinline int
3421 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3422                                      int *task_state) {
3423   kmp_team_t *team;
3424 
  // Check whether we are inside a teams construct.
3426   if (th->th.th_teams_microtask) {
3427     *team_p = team = th->th.th_team;
3428     if (team->t.t_level == th->th.th_teams_level) {
3429       // This is reduction at teams construct.
3430       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3431       // Let's swap teams temporarily for the reduction.
3432       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3433       th->th.th_team = team->t.t_parent;
3434       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3435       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3436       *task_state = th->th.th_task_state;
3437       th->th.th_task_state = 0;
3438 
3439       return 1;
3440     }
3441   }
3442   return 0;
3443 }
3444 
3445 static __forceinline void
3446 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3447   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3448   th->th.th_info.ds.ds_tid = 0;
3449   th->th.th_team = team;
3450   th->th.th_team_nproc = team->t.t_nproc;
3451   th->th.th_task_team = team->t.t_task_team[task_state];
3452   __kmp_type_convert(task_state, &(th->th.th_task_state));
3453 }
3454 
3455 /* 2.a.i. Reduce Block without a terminating barrier */
3456 /*!
3457 @ingroup SYNCHRONIZATION
3458 @param loc source location information
3459 @param global_tid global thread number
3460 @param num_vars number of items (variables) to be reduced
3461 @param reduce_size size of data in bytes to be reduced
3462 @param reduce_data pointer to data to be reduced
3463 @param reduce_func callback function providing reduction operation on two
3464 operands and returning result of reduction in lhs_data
3465 @param lck pointer to the unique lock data structure
3466 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3467 threads if atomic reduction needed
3468 
3469 The nowait version is used for a reduce clause with the nowait argument.
3470 */
3471 kmp_int32
3472 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3473                      size_t reduce_size, void *reduce_data,
3474                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3475                      kmp_critical_name *lck) {
3476 
3477   KMP_COUNT_BLOCK(REDUCE_nowait);
3478   int retval = 0;
3479   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3480   kmp_info_t *th;
3481   kmp_team_t *team;
3482   int teams_swapped = 0, task_state;
3483   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3484   __kmp_assert_valid_gtid(global_tid);
3485 
  // Why do we need this initialization here at all?
  // A reduction clause cannot be used as a stand-alone directive.

  // Do not call __kmp_serial_initialize(); it will be called by
  // __kmp_parallel_initialize() if needed.
  // Possible detection of a false-positive race by the thread checker ???
3492   if (!TCR_4(__kmp_init_parallel))
3493     __kmp_parallel_initialize();
3494 
3495   __kmp_resume_if_soft_paused();
3496 
3497 // check correctness of reduce block nesting
3498 #if KMP_USE_DYNAMIC_LOCK
3499   if (__kmp_env_consistency_check)
3500     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3501 #else
3502   if (__kmp_env_consistency_check)
3503     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3504 #endif
3505 
3506   th = __kmp_thread_from_gtid(global_tid);
3507   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3508 
  // The packed_reduction_method value will be reused by the __kmp_end_reduce*
  // function, so the value has to be kept in a variable.
  // The variable should be either a construct-specific or a thread-specific
  // property, not a team-specific property
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on the next construct).
  // An ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unnecessary extra syncs would be needed).
  // A thread-specific variable is better with regard to the two issues above
  // (next construct and extra syncs);
  // a thread-specific "th_local.packed_reduction_method" variable is used
  // currently.
  // Each thread executes the 'determine' and 'set' lines (there is no need to
  // have only one thread execute them; that would require extra syncs).
3524 
3525   packed_reduction_method = __kmp_determine_reduction_method(
3526       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3527   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3528 
3529   OMPT_REDUCTION_DECL(th, global_tid);
3530   if (packed_reduction_method == critical_reduce_block) {
3531 
3532     OMPT_REDUCTION_BEGIN;
3533 
3534     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3535     retval = 1;
3536 
3537   } else if (packed_reduction_method == empty_reduce_block) {
3538 
3539     OMPT_REDUCTION_BEGIN;
3540 
3541     // usage: if team size == 1, no synchronization is required ( Intel
3542     // platforms only )
3543     retval = 1;
3544 
3545   } else if (packed_reduction_method == atomic_reduce_block) {
3546 
3547     retval = 2;
3548 
    // All threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen).
    //     (This is not quite correct, because the checking block has been
    //     closed by this 'pop' while the atomic operation has not been
    //     executed yet; it will be executed slightly later, literally on the
    //     next instruction.)
3555     if (__kmp_env_consistency_check)
3556       __kmp_pop_sync(global_tid, ct_reduce, loc);
3557 
3558   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3559                                    tree_reduce_block)) {
3560 
// AT: performance issue: there is a real barrier here.
// AT: (If the primary thread is slow, other threads are blocked here waiting
//      for the primary thread to come and release them.)
// AT: (This is not what a customer might expect when specifying the NOWAIT
//      clause.)
// AT: (Specifying NOWAIT will not improve performance; it will just be
//      confusing to a customer.)
// AT: Another implementation of *barrier_gather*nowait() (or some other
//      design) might go faster and be more in line with the sense of NOWAIT.
// AT: TO DO: run the EPCC test and compare the times.
3570 
3571 // this barrier should be invisible to a customer and to the threading profile
3572 // tool (it's neither a terminating barrier nor customer's code, it's
3573 // used for an internal purpose)
3574 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    //     will provide the barrier events, so we set up the necessary
    //     frame/return addresses.
3579     ompt_frame_t *ompt_frame;
3580     if (ompt_enabled.enabled) {
3581       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3582       if (ompt_frame->enter_frame.ptr == NULL)
3583         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3584     }
3585     OMPT_STORE_RETURN_ADDRESS(global_tid);
3586 #endif
3587 #if USE_ITT_NOTIFY
3588     __kmp_threads[global_tid]->th.th_ident = loc;
3589 #endif
3590     retval =
3591         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3592                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3593     retval = (retval != 0) ? (0) : (1);
3594 #if OMPT_SUPPORT && OMPT_OPTIONAL
3595     if (ompt_enabled.enabled) {
3596       ompt_frame->enter_frame = ompt_data_none;
3597     }
3598 #endif
3599 
3600     // all other workers except primary thread should do this pop here
3601     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3602     if (__kmp_env_consistency_check) {
3603       if (retval == 0) {
3604         __kmp_pop_sync(global_tid, ct_reduce, loc);
3605       }
3606     }
3607 
3608   } else {
3609 
3610     // should never reach this block
3611     KMP_ASSERT(0); // "unexpected method"
3612   }
3613   if (teams_swapped) {
3614     __kmp_restore_swapped_teams(th, team, task_state);
3615   }
3616   KA_TRACE(
3617       10,
3618       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3619        global_tid, packed_reduction_method, retval));
3620 
3621   return retval;
3622 }
3623 
3624 /*!
3625 @ingroup SYNCHRONIZATION
3626 @param loc source location information
3627 @param global_tid global thread id.
3628 @param lck pointer to the unique lock data structure
3629 
3630 Finish the execution of a reduce nowait.
3631 */
3632 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3633                               kmp_critical_name *lck) {
3634 
3635   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3636 
3637   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3638   __kmp_assert_valid_gtid(global_tid);
3639 
3640   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3641 
3642   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3643 
3644   if (packed_reduction_method == critical_reduce_block) {
3645 
3646     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3647     OMPT_REDUCTION_END;
3648 
3649   } else if (packed_reduction_method == empty_reduce_block) {
3650 
3651     // usage: if team size == 1, no synchronization is required ( on Intel
3652     // platforms only )
3653 
3654     OMPT_REDUCTION_END;
3655 
3656   } else if (packed_reduction_method == atomic_reduce_block) {
3657 
    // Neither the primary thread nor other workers should get here
    //     (code gen does not generate this call in case 2: atomic reduce block).
    // Actually it would be better to remove this else-if entirely; after its
    // removal this value would be checked by the 'else' branch and would assert.
3662 
3663   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3664                                    tree_reduce_block)) {
3665 
3666     // only primary thread gets here
3667     // OMPT: tree reduction is annotated in the barrier code
3668 
3669   } else {
3670 
3671     // should never reach this block
3672     KMP_ASSERT(0); // "unexpected method"
3673   }
3674 
3675   if (__kmp_env_consistency_check)
3676     __kmp_pop_sync(global_tid, ct_reduce, loc);
3677 
3678   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3679                 global_tid, packed_reduction_method));
3680 
3681   return;
3682 }
3683 
3684 /* 2.a.ii. Reduce Block with a terminating barrier */
3685 
3686 /*!
3687 @ingroup SYNCHRONIZATION
3688 @param loc source location information
3689 @param global_tid global thread number
3690 @param num_vars number of items (variables) to be reduced
3691 @param reduce_size size of data in bytes to be reduced
3692 @param reduce_data pointer to data to be reduced
3693 @param reduce_func callback function providing reduction operation on two
3694 operands and returning result of reduction in lhs_data
3695 @param lck pointer to the unique lock data structure
3696 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3697 threads if atomic reduction needed
3698 
3699 A blocking reduce that includes an implicit barrier.
3700 */
3701 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3702                         size_t reduce_size, void *reduce_data,
3703                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3704                         kmp_critical_name *lck) {
3705   KMP_COUNT_BLOCK(REDUCE_wait);
3706   int retval = 0;
3707   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3708   kmp_info_t *th;
3709   kmp_team_t *team;
3710   int teams_swapped = 0, task_state;
3711 
3712   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3713   __kmp_assert_valid_gtid(global_tid);
3714 
  // Why do we need this initialization here at all?
  // A reduction clause cannot be a stand-alone directive.

  // Do not call __kmp_serial_initialize(); it will be called by
  // __kmp_parallel_initialize() if needed.
  // Possible detection of a false-positive race by the thread checker ???
3721   if (!TCR_4(__kmp_init_parallel))
3722     __kmp_parallel_initialize();
3723 
3724   __kmp_resume_if_soft_paused();
3725 
3726 // check correctness of reduce block nesting
3727 #if KMP_USE_DYNAMIC_LOCK
3728   if (__kmp_env_consistency_check)
3729     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3730 #else
3731   if (__kmp_env_consistency_check)
3732     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3733 #endif
3734 
3735   th = __kmp_thread_from_gtid(global_tid);
3736   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3737 
3738   packed_reduction_method = __kmp_determine_reduction_method(
3739       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3740   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3741 
3742   OMPT_REDUCTION_DECL(th, global_tid);
3743 
3744   if (packed_reduction_method == critical_reduce_block) {
3745 
3746     OMPT_REDUCTION_BEGIN;
3747     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3748     retval = 1;
3749 
3750   } else if (packed_reduction_method == empty_reduce_block) {
3751 
3752     OMPT_REDUCTION_BEGIN;
3753     // usage: if team size == 1, no synchronization is required ( Intel
3754     // platforms only )
3755     retval = 1;
3756 
3757   } else if (packed_reduction_method == atomic_reduce_block) {
3758 
3759     retval = 2;
3760 
3761   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3762                                    tree_reduce_block)) {
3763 
3764 // case tree_reduce_block:
3765 // this barrier should be visible to a customer and to the threading profile
3766 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3767 #if OMPT_SUPPORT
3768     ompt_frame_t *ompt_frame;
3769     if (ompt_enabled.enabled) {
3770       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3771       if (ompt_frame->enter_frame.ptr == NULL)
3772         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3773     }
3774     OMPT_STORE_RETURN_ADDRESS(global_tid);
3775 #endif
3776 #if USE_ITT_NOTIFY
3777     __kmp_threads[global_tid]->th.th_ident =
3778         loc; // needed for correct notification of frames
3779 #endif
3780     retval =
3781         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3782                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3783     retval = (retval != 0) ? (0) : (1);
3784 #if OMPT_SUPPORT && OMPT_OPTIONAL
3785     if (ompt_enabled.enabled) {
3786       ompt_frame->enter_frame = ompt_data_none;
3787     }
3788 #endif
3789 
3790     // all other workers except primary thread should do this pop here
3791     // (none of other workers except primary will enter __kmpc_end_reduce())
3792     if (__kmp_env_consistency_check) {
3793       if (retval == 0) { // 0: all other workers; 1: primary thread
3794         __kmp_pop_sync(global_tid, ct_reduce, loc);
3795       }
3796     }
3797 
3798   } else {
3799 
3800     // should never reach this block
3801     KMP_ASSERT(0); // "unexpected method"
3802   }
3803   if (teams_swapped) {
3804     __kmp_restore_swapped_teams(th, team, task_state);
3805   }
3806 
3807   KA_TRACE(10,
3808            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3809             global_tid, packed_reduction_method, retval));
3810   return retval;
3811 }
3812 
3813 /*!
3814 @ingroup SYNCHRONIZATION
3815 @param loc source location information
3816 @param global_tid global thread id.
3817 @param lck pointer to the unique lock data structure
3818 
3819 Finish the execution of a blocking reduce.
3820 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3821 start function.
3822 */
3823 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3824                        kmp_critical_name *lck) {
3825 
3826   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3827   kmp_info_t *th;
3828   kmp_team_t *team;
3829   int teams_swapped = 0, task_state;
3830 
3831   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3832   __kmp_assert_valid_gtid(global_tid);
3833 
3834   th = __kmp_thread_from_gtid(global_tid);
3835   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3836 
3837   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3838 
3839   // this barrier should be visible to a customer and to the threading profile
3840   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3841   OMPT_REDUCTION_DECL(th, global_tid);
3842 
3843   if (packed_reduction_method == critical_reduce_block) {
3844     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3845 
3846     OMPT_REDUCTION_END;
3847 
3848 // TODO: implicit barrier: should be exposed
3849 #if OMPT_SUPPORT
3850     ompt_frame_t *ompt_frame;
3851     if (ompt_enabled.enabled) {
3852       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3853       if (ompt_frame->enter_frame.ptr == NULL)
3854         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3855     }
3856     OMPT_STORE_RETURN_ADDRESS(global_tid);
3857 #endif
3858 #if USE_ITT_NOTIFY
3859     __kmp_threads[global_tid]->th.th_ident = loc;
3860 #endif
3861     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3862 #if OMPT_SUPPORT && OMPT_OPTIONAL
3863     if (ompt_enabled.enabled) {
3864       ompt_frame->enter_frame = ompt_data_none;
3865     }
3866 #endif
3867 
3868   } else if (packed_reduction_method == empty_reduce_block) {
3869 
3870     OMPT_REDUCTION_END;
3871 
3872 // usage: if team size==1, no synchronization is required (Intel platforms only)
3873 
3874 // TODO: implicit barrier: should be exposed
3875 #if OMPT_SUPPORT
3876     ompt_frame_t *ompt_frame;
3877     if (ompt_enabled.enabled) {
3878       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3879       if (ompt_frame->enter_frame.ptr == NULL)
3880         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3881     }
3882     OMPT_STORE_RETURN_ADDRESS(global_tid);
3883 #endif
3884 #if USE_ITT_NOTIFY
3885     __kmp_threads[global_tid]->th.th_ident = loc;
3886 #endif
3887     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3888 #if OMPT_SUPPORT && OMPT_OPTIONAL
3889     if (ompt_enabled.enabled) {
3890       ompt_frame->enter_frame = ompt_data_none;
3891     }
3892 #endif
3893 
3894   } else if (packed_reduction_method == atomic_reduce_block) {
3895 
3896 #if OMPT_SUPPORT
3897     ompt_frame_t *ompt_frame;
3898     if (ompt_enabled.enabled) {
3899       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3900       if (ompt_frame->enter_frame.ptr == NULL)
3901         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3902     }
3903     OMPT_STORE_RETURN_ADDRESS(global_tid);
3904 #endif
3905 // TODO: implicit barrier: should be exposed
3906 #if USE_ITT_NOTIFY
3907     __kmp_threads[global_tid]->th.th_ident = loc;
3908 #endif
3909     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3910 #if OMPT_SUPPORT && OMPT_OPTIONAL
3911     if (ompt_enabled.enabled) {
3912       ompt_frame->enter_frame = ompt_data_none;
3913     }
3914 #endif
3915 
3916   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3917                                    tree_reduce_block)) {
3918 
3919     // only primary thread executes here (primary releases all other workers)
3920     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3921                             global_tid);
3922 
3923   } else {
3924 
3925     // should never reach this block
3926     KMP_ASSERT(0); // "unexpected method"
3927   }
3928   if (teams_swapped) {
3929     __kmp_restore_swapped_teams(th, team, task_state);
3930   }
3931 
3932   if (__kmp_env_consistency_check)
3933     __kmp_pop_sync(global_tid, ct_reduce, loc);
3934 
3935   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3936                 global_tid, packed_reduction_method));
3937 
3938   return;
3939 }
3940 
3941 #undef __KMP_GET_REDUCTION_METHOD
3942 #undef __KMP_SET_REDUCTION_METHOD
3943 
3944 /* end of interface to fast scalable reduce routines */
3945 
3946 kmp_uint64 __kmpc_get_taskid() {
3947 
3948   kmp_int32 gtid;
3949   kmp_info_t *thread;
3950 
3951   gtid = __kmp_get_gtid();
3952   if (gtid < 0) {
3953     return 0;
3954   }
3955   thread = __kmp_thread_from_gtid(gtid);
3956   return thread->th.th_current_task->td_task_id;
3957 
3958 } // __kmpc_get_taskid
3959 
3960 kmp_uint64 __kmpc_get_parent_taskid() {
3961 
3962   kmp_int32 gtid;
3963   kmp_info_t *thread;
3964   kmp_taskdata_t *parent_task;
3965 
3966   gtid = __kmp_get_gtid();
3967   if (gtid < 0) {
3968     return 0;
3969   }
3970   thread = __kmp_thread_from_gtid(gtid);
3971   parent_task = thread->th.th_current_task->td_parent;
3972   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3973 
3974 } // __kmpc_get_parent_taskid
3975 
3976 /*!
3977 @ingroup WORK_SHARING
3978 @param loc  source location information.
3979 @param gtid  global thread number.
3980 @param num_dims  number of associated doacross loops.
3981 @param dims  info on loops bounds.
3982 
3983 Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for (i = 2; i < 9; i += 2) gives lo=2, up=8, st=2.
3986 */
3987 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3988                           const struct kmp_dim *dims) {
3989   __kmp_assert_valid_gtid(gtid);
3990   int j, idx;
3991   kmp_int64 last, trace_count;
3992   kmp_info_t *th = __kmp_threads[gtid];
3993   kmp_team_t *team = th->th.th_team;
3994   kmp_uint32 *flags;
3995   kmp_disp_t *pr_buf = th->th.th_dispatch;
3996   dispatch_shared_info_t *sh_buf;
3997 
3998   KA_TRACE(
3999       20,
4000       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4001        gtid, num_dims, !team->t.t_serialized));
4002   KMP_DEBUG_ASSERT(dims != NULL);
4003   KMP_DEBUG_ASSERT(num_dims > 0);
4004 
4005   if (team->t.t_serialized) {
4006     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4007     return; // no dependencies if team is serialized
4008   }
4009   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4010   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4011   // the next loop
4012   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4013 
4014   // Save bounds info into allocated private buffer
4015   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4016   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4017       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4018   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4019   pr_buf->th_doacross_info[0] =
4020       (kmp_int64)num_dims; // first element is number of dimensions
  // Also save the address of num_done so it can be accessed later without
  // knowing the buffer index.
4023   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4024   pr_buf->th_doacross_info[2] = dims[0].lo;
4025   pr_buf->th_doacross_info[3] = dims[0].up;
4026   pr_buf->th_doacross_info[4] = dims[0].st;
4027   last = 5;
4028   for (j = 1; j < num_dims; ++j) {
4029     kmp_int64
4030         range_length; // To keep ranges of all dimensions but the first dims[0]
4031     if (dims[j].st == 1) { // most common case
4032       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4033       range_length = dims[j].up - dims[j].lo + 1;
4034     } else {
4035       if (dims[j].st > 0) {
4036         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4037         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4038       } else { // negative increment
4039         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4040         range_length =
4041             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4042       }
4043     }
4044     pr_buf->th_doacross_info[last++] = range_length;
4045     pr_buf->th_doacross_info[last++] = dims[j].lo;
4046     pr_buf->th_doacross_info[last++] = dims[j].up;
4047     pr_buf->th_doacross_info[last++] = dims[j].st;
4048   }
4049 
4050   // Compute total trip count.
4051   // Start with range of dims[0] which we don't need to keep in the buffer.
4052   if (dims[0].st == 1) { // most common case
4053     trace_count = dims[0].up - dims[0].lo + 1;
4054   } else if (dims[0].st > 0) {
4055     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4056     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4057   } else { // negative increment
4058     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4059     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4060   }
4061   for (j = 1; j < num_dims; ++j) {
4062     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4063   }
4064   KMP_DEBUG_ASSERT(trace_count > 0);
4065 
  // Check that the shared buffer is not occupied by another loop (idx -
  // __kmp_dispatch_num_buffers)
4068   if (idx != sh_buf->doacross_buf_idx) {
4069     // Shared buffer is occupied, wait for it to be free
4070     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4071                  __kmp_eq_4, NULL);
4072   }
4073 #if KMP_32_BIT_ARCH
4074   // Check if we are the first thread. After the CAS the first thread gets 0,
4075   // others get 1 if initialization is in progress, allocated pointer otherwise.
4076   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4077   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4078       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4079 #else
4080   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4081       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4082 #endif
4083   if (flags == NULL) {
4084     // we are the first thread, allocate the array of flags
4085     size_t size =
4086         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4087     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4088     KMP_MB();
4089     sh_buf->doacross_flags = flags;
4090   } else if (flags == (kmp_uint32 *)1) {
4091 #if KMP_32_BIT_ARCH
4092     // initialization is still in progress, need to wait
4093     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4094 #else
4095     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4096 #endif
4097       KMP_YIELD(TRUE);
4098     KMP_MB();
4099   } else {
4100     KMP_MB();
4101   }
4102   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4103   pr_buf->th_doacross_flags =
4104       sh_buf->doacross_flags; // save private copy in order to not
4105   // touch shared buffer on each iteration
4106   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4107 }
4108 
4109 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4110   __kmp_assert_valid_gtid(gtid);
4111   kmp_int64 shft;
4112   size_t num_dims, i;
4113   kmp_uint32 flag;
4114   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4115   kmp_info_t *th = __kmp_threads[gtid];
4116   kmp_team_t *team = th->th.th_team;
4117   kmp_disp_t *pr_buf;
4118   kmp_int64 lo, up, st;
4119 
4120   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4121   if (team->t.t_serialized) {
4122     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4123     return; // no dependencies if team is serialized
4124   }
4125 
4126   // calculate sequential iteration number and check out-of-bounds condition
4127   pr_buf = th->th.th_dispatch;
4128   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4129   num_dims = (size_t)pr_buf->th_doacross_info[0];
4130   lo = pr_buf->th_doacross_info[2];
4131   up = pr_buf->th_doacross_info[3];
4132   st = pr_buf->th_doacross_info[4];
4133 #if OMPT_SUPPORT && OMPT_OPTIONAL
4134   ompt_dependence_t deps[num_dims];
4135 #endif
4136   if (st == 1) { // most common case
4137     if (vec[0] < lo || vec[0] > up) {
4138       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4139                     "bounds [%lld,%lld]\n",
4140                     gtid, vec[0], lo, up));
4141       return;
4142     }
4143     iter_number = vec[0] - lo;
4144   } else if (st > 0) {
4145     if (vec[0] < lo || vec[0] > up) {
4146       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4147                     "bounds [%lld,%lld]\n",
4148                     gtid, vec[0], lo, up));
4149       return;
4150     }
4151     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4152   } else { // negative increment
4153     if (vec[0] > lo || vec[0] < up) {
4154       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4155                     "bounds [%lld,%lld]\n",
4156                     gtid, vec[0], lo, up));
4157       return;
4158     }
4159     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4160   }
4161 #if OMPT_SUPPORT && OMPT_OPTIONAL
4162   deps[0].variable.value = iter_number;
4163   deps[0].dependence_type = ompt_dependence_type_sink;
4164 #endif
4165   for (i = 1; i < num_dims; ++i) {
4166     kmp_int64 iter, ln;
4167     size_t j = i * 4;
4168     ln = pr_buf->th_doacross_info[j + 1];
4169     lo = pr_buf->th_doacross_info[j + 2];
4170     up = pr_buf->th_doacross_info[j + 3];
4171     st = pr_buf->th_doacross_info[j + 4];
4172     if (st == 1) {
4173       if (vec[i] < lo || vec[i] > up) {
4174         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4175                       "bounds [%lld,%lld]\n",
4176                       gtid, vec[i], lo, up));
4177         return;
4178       }
4179       iter = vec[i] - lo;
4180     } else if (st > 0) {
4181       if (vec[i] < lo || vec[i] > up) {
4182         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4183                       "bounds [%lld,%lld]\n",
4184                       gtid, vec[i], lo, up));
4185         return;
4186       }
4187       iter = (kmp_uint64)(vec[i] - lo) / st;
4188     } else { // st < 0
4189       if (vec[i] > lo || vec[i] < up) {
4190         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4191                       "bounds [%lld,%lld]\n",
4192                       gtid, vec[i], lo, up));
4193         return;
4194       }
4195       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4196     }
4197     iter_number = iter + ln * iter_number;
4198 #if OMPT_SUPPORT && OMPT_OPTIONAL
4199     deps[i].variable.value = iter;
4200     deps[i].dependence_type = ompt_dependence_type_sink;
4201 #endif
4202   }
4203   shft = iter_number % 32; // use 32-bit granularity
4204   iter_number >>= 5; // divided by 32
4205   flag = 1 << shft;
4206   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4207     KMP_YIELD(TRUE);
4208   }
4209   KMP_MB();
4210 #if OMPT_SUPPORT && OMPT_OPTIONAL
4211   if (ompt_enabled.ompt_callback_dependences) {
4212     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4213         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4214   }
4215 #endif
4216   KA_TRACE(20,
4217            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4218             gtid, (iter_number << 5) + shft));
4219 }
4220 
4221 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4222   __kmp_assert_valid_gtid(gtid);
4223   kmp_int64 shft;
4224   size_t num_dims, i;
4225   kmp_uint32 flag;
4226   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4227   kmp_info_t *th = __kmp_threads[gtid];
4228   kmp_team_t *team = th->th.th_team;
4229   kmp_disp_t *pr_buf;
4230   kmp_int64 lo, st;
4231 
4232   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4233   if (team->t.t_serialized) {
4234     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4235     return; // no dependencies if team is serialized
4236   }
4237 
4238   // calculate sequential iteration number (same as in "wait" but no
4239   // out-of-bounds checks)
4240   pr_buf = th->th.th_dispatch;
4241   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4242   num_dims = (size_t)pr_buf->th_doacross_info[0];
4243   lo = pr_buf->th_doacross_info[2];
4244   st = pr_buf->th_doacross_info[4];
4245 #if OMPT_SUPPORT && OMPT_OPTIONAL
4246   ompt_dependence_t deps[num_dims];
4247 #endif
4248   if (st == 1) { // most common case
4249     iter_number = vec[0] - lo;
4250   } else if (st > 0) {
4251     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4252   } else { // negative increment
4253     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4254   }
4255 #if OMPT_SUPPORT && OMPT_OPTIONAL
4256   deps[0].variable.value = iter_number;
4257   deps[0].dependence_type = ompt_dependence_type_source;
4258 #endif
4259   for (i = 1; i < num_dims; ++i) {
4260     kmp_int64 iter, ln;
4261     size_t j = i * 4;
4262     ln = pr_buf->th_doacross_info[j + 1];
4263     lo = pr_buf->th_doacross_info[j + 2];
4264     st = pr_buf->th_doacross_info[j + 4];
4265     if (st == 1) {
4266       iter = vec[i] - lo;
4267     } else if (st > 0) {
4268       iter = (kmp_uint64)(vec[i] - lo) / st;
4269     } else { // st < 0
4270       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4271     }
4272     iter_number = iter + ln * iter_number;
4273 #if OMPT_SUPPORT && OMPT_OPTIONAL
4274     deps[i].variable.value = iter;
4275     deps[i].dependence_type = ompt_dependence_type_source;
4276 #endif
4277   }
4278 #if OMPT_SUPPORT && OMPT_OPTIONAL
4279   if (ompt_enabled.ompt_callback_dependences) {
4280     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4281         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4282   }
4283 #endif
4284   shft = iter_number % 32; // use 32-bit granularity
4285   iter_number >>= 5; // divided by 32
4286   flag = 1 << shft;
4287   KMP_MB();
4288   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4289     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4290   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4291                 (iter_number << 5) + shft));
4292 }
4293 
4294 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4295   __kmp_assert_valid_gtid(gtid);
4296   kmp_int32 num_done;
4297   kmp_info_t *th = __kmp_threads[gtid];
4298   kmp_team_t *team = th->th.th_team;
4299   kmp_disp_t *pr_buf = th->th.th_dispatch;
4300 
4301   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4302   if (team->t.t_serialized) {
4303     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4304     return; // nothing to do
4305   }
4306   num_done =
4307       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4308   if (num_done == th->th.th_team_nproc) {
4309     // we are the last thread, need to free shared resources
4310     int idx = pr_buf->th_doacross_buf_idx - 1;
4311     dispatch_shared_info_t *sh_buf =
4312         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4313     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4314                      (kmp_int64)&sh_buf->doacross_num_done);
4315     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4316     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4317     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4318     sh_buf->doacross_flags = NULL;
4319     sh_buf->doacross_num_done = 0;
4320     sh_buf->doacross_buf_idx +=
4321         __kmp_dispatch_num_buffers; // free buffer for future re-use
4322   }
4323   // free private resources (need to keep buffer index forever)
4324   pr_buf->th_doacross_flags = NULL;
4325   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4326   pr_buf->th_doacross_info = NULL;
4327   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4328 }
4329 
4330 /* OpenMP 5.1 Memory Management routines */
4331 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4332   return __kmp_alloc(__kmp_entry_gtid(), 0, size, allocator);
4333 }
4334 
4335 void *omp_aligned_alloc(size_t align, size_t size,
4336                         omp_allocator_handle_t allocator) {
4337   return __kmp_alloc(__kmp_entry_gtid(), align, size, allocator);
4338 }
4339 
4340 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4341   return __kmp_calloc(__kmp_entry_gtid(), 0, nmemb, size, allocator);
4342 }
4343 
4344 void *omp_aligned_calloc(size_t align, size_t nmemb, size_t size,
4345                          omp_allocator_handle_t allocator) {
4346   return __kmp_calloc(__kmp_entry_gtid(), align, nmemb, size, allocator);
4347 }
4348 
4349 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4350                   omp_allocator_handle_t free_allocator) {
4351   return __kmp_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4352                         free_allocator);
4353 }
4354 
4355 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4356   ___kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4357 }
4358 /* end of OpenMP 5.1 Memory Management routines */
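
/* Usage sketch (illustration only): these entry points back the OpenMP 5.1
   allocator API, so a user-level sequence such as the following ends up in
   the __kmp_alloc/__kmp_calloc/___kmpc_free wrappers above.
   omp_default_mem_alloc is a predefined allocator from the specification;
   the alignment and size below are arbitrary.

     #include <omp.h>

     void allocator_demo(void) {
       double *buf = (double *)omp_aligned_alloc(64, 1024 * sizeof(double),
                                                 omp_default_mem_alloc);
       if (buf) {
         buf[0] = 42.0;
         omp_free(buf, omp_default_mem_alloc);
       }
     }
*/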
4359 
4360 int __kmpc_get_target_offload(void) {
4361   if (!__kmp_init_serial) {
4362     __kmp_serial_initialize();
4363   }
4364   return __kmp_target_offload;
4365 }
4366 
4367 int __kmpc_pause_resource(kmp_pause_status_t level) {
4368   if (!__kmp_init_serial) {
4369     return 1; // Can't pause if runtime is not initialized
4370   }
4371   return __kmp_pause_resource(level);
4372 }
4373 
4374 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4375   if (!__kmp_init_serial)
4376     __kmp_serial_initialize();
4377 
4378   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4379 
4380 #if OMPT_SUPPORT
4381   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4382     ompt_callbacks.ompt_callback(ompt_callback_error)(
4383         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4384         OMPT_GET_RETURN_ADDRESS(0));
4385   }
4386 #endif // OMPT_SUPPORT
4387 
4388   char *src_loc;
4389   if (loc && loc->psource) {
4390     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
4391     src_loc =
4392         __kmp_str_format("%s:%s:%s", str_loc.file, str_loc.line, str_loc.col);
4393     __kmp_str_loc_free(&str_loc);
4394   } else {
4395     src_loc = __kmp_str_format("unknown");
4396   }
4397 
4398   if (severity == severity_warning)
4399     KMP_WARNING(UserDirectedWarning, src_loc, message);
4400   else
4401     KMP_FATAL(UserDirectedError, src_loc, message);
4402 
4403   __kmp_str_free(&src_loc);
4404 }
4405 
4406 // Mark begin of scope directive.
4407 void __kmpc_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4408 // reserved is for extension of scope directive and not used.
4409 #if OMPT_SUPPORT && OMPT_OPTIONAL
4410   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4411     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4412     int tid = __kmp_tid_from_gtid(gtid);
4413     ompt_callbacks.ompt_callback(ompt_callback_work)(
4414         ompt_work_scope, ompt_scope_begin,
4415         &(team->t.ompt_team_info.parallel_data),
4416         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4417         OMPT_GET_RETURN_ADDRESS(0));
4418   }
4419 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4420 }
4421 
4422 // Mark end of scope directive
4423 void __kmpc_end_scope(ident_t *loc, kmp_int32 gtid, void *reserved) {
4424 // reserved is for extension of scope directive and not used.
4425 #if OMPT_SUPPORT && OMPT_OPTIONAL
4426   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
4427     kmp_team_t *team = __kmp_threads[gtid]->th.th_team;
4428     int tid = __kmp_tid_from_gtid(gtid);
4429     ompt_callbacks.ompt_callback(ompt_callback_work)(
4430         ompt_work_scope, ompt_scope_end,
4431         &(team->t.ompt_team_info.parallel_data),
4432         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
4433         OMPT_GET_RETURN_ADDRESS(0));
4434   }
4435 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
4436 }
4437 
4438 #ifdef KMP_USE_VERSION_SYMBOLS
4439 // For GOMP compatibility there are two versions of each omp_* API.
4440 // One is the plain C symbol and one is the Fortran symbol with an appended
4441 // underscore. When we implement a specific ompc_* version of an omp_*
4442 // function, we want the plain GOMP versioned symbol to alias the ompc_* version
4443 // instead of the Fortran versions in kmp_ftn_entry.h
4444 extern "C" {
4445 // Have to undef these from omp.h so they aren't translated into
4446 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4447 #ifdef omp_set_affinity_format
4448 #undef omp_set_affinity_format
4449 #endif
4450 #ifdef omp_get_affinity_format
4451 #undef omp_get_affinity_format
4452 #endif
4453 #ifdef omp_display_affinity
4454 #undef omp_display_affinity
4455 #endif
4456 #ifdef omp_capture_affinity
4457 #undef omp_capture_affinity
4458 #endif
4459 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4460                         "OMP_5.0");
4461 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4462                         "OMP_5.0");
4463 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4464                         "OMP_5.0");
4465 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4466                         "OMP_5.0");
4467 } // extern "C"
4468 #endif
4469