1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43   } else if (__kmp_ignore_mppbeg() == FALSE) {
44     // By default __kmp_ignore_mppbeg() returns TRUE.
45     __kmp_internal_begin();
46     KC_TRACE(10, ("__kmpc_begin: called\n"));
47   }
48 }
49 
50 /*!
51  * @ingroup STARTUP_SHUTDOWN
52  * @param loc source location information
53  *
 * Shut down the runtime library. This is also optional; even if called, it will
55  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
56  * zero.
57  */
58 void __kmpc_end(ident_t *loc) {
59   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
60   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
61   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62   // returns FALSE and __kmpc_end() will unregister this root (it can cause
63   // library shut down).
64   if (__kmp_ignore_mppend() == FALSE) {
65     KC_TRACE(10, ("__kmpc_end: called\n"));
66     KA_TRACE(30, ("__kmpc_end\n"));
67 
68     __kmp_internal_end_thread(-1);
69   }
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71   // Normal exit process on Windows does not allow worker threads of the final
72   // parallel region to finish reporting their events, so shutting down the
73   // library here fixes the issue at least for the cases where __kmpc_end() is
74   // placed properly.
75   if (ompt_enabled.enabled)
76     __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
78 }
79 
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84 
85 This function can be called in any context.
86 
If the runtime has only been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the primary thread is necessarily thread zero).
92 
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
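
As an illustrative sketch (not a prescribed usage), compiler-generated code
typically obtains this value once and passes it to later entry points:
@code
// loc is assumed to be a compiler-generated ident_t describing the call site.
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_barrier(&loc, gtid); // reuse the cached gtid for subsequent calls
@endcode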
97 */
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99   kmp_int32 gtid = __kmp_entry_gtid();
100 
101   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102 
103   return gtid;
104 }
105 
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110 
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, because the runtime keeps
underlying threads alive even when they are not active (since the cost of
creating and destroying OS threads is high), this call counts all such threads,
whether or not they are currently waiting for work.
119 */
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121   KC_TRACE(10,
122            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123 
124   return TCR_4(__kmp_all_nth);
125 }
126 
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135   return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137 
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145 
146   return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148 
149 /*!
150  * @ingroup DEPRECATED
151  * @param loc location description
152  *
153  * This function need not be called. It always returns TRUE.
154  */
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157 
158   return TRUE;
159 
160 #else
161 
162   const char *semi2;
163   const char *semi3;
164   int line_no;
165 
166   if (__kmp_par_range == 0) {
167     return TRUE;
168   }
169   semi2 = loc->psource;
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2 + 1, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   if (__kmp_par_range_filename[0]) {
182     const char *name = semi2 - 1;
183     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184       name--;
185     }
186     if ((*name == '/') || (*name == ';')) {
187       name++;
188     }
189     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190       return __kmp_par_range < 0;
191     }
192   }
193   semi3 = strchr(semi2 + 1, ';');
194   if (__kmp_par_range_routine[0]) {
195     if ((semi3 != NULL) && (semi3 > semi2) &&
196         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197       return __kmp_par_range < 0;
198     }
199   }
200   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202       return __kmp_par_range > 0;
203     }
204     return __kmp_par_range < 0;
205   }
206   return TRUE;
207 
208 #endif /* KMP_DEBUG */
209 }
210 
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218   return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220 
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226 
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
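
For example, a construct such as
@code
#pragma omp parallel num_threads(4)
{ work(); }
@endcode
is typically lowered to roughly the following sequence (an illustrative sketch;
outlined_fn is a hypothetical compiler-generated microtask):
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_fn);
@endcode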
229 */
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                              kmp_int32 num_threads) {
232   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                 global_tid, num_threads));
234   __kmp_assert_valid_gtid(global_tid);
235   __kmp_push_num_threads(loc, global_tid, num_threads);
236 }
237 
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240   /* the num_threads are automatically popped */
241 }
242 
243 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
244                            kmp_int32 proc_bind) {
245   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
246                 proc_bind));
247   __kmp_assert_valid_gtid(global_tid);
248   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
249 }
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
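
An illustrative sketch of the lowering (compiler-specific; outlined_fn is a
hypothetical name for the compiler-generated microtask):
@code
// Microtask signature: the first two parameters are the global and bound
// thread ids, followed by one pointer per shared variable (argc of them).
void outlined_fn(kmp_int32 *gtid, kmp_int32 *btid, int *x) {
  // parallel region body, with x passed by reference
}

// At the parallel construct, with one shared variable x:
__kmpc_fork_call(&loc, 1, (kmpc_micro)outlined_fn, &x);
@endcode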
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   // If we were in a serial region, then stop the serial timer, record
266   // the event, and start parallel region timer
267   stats_state_e previous_state = KMP_GET_THREAD_STATE();
268   if (previous_state == stats_state_e::SERIAL_REGION) {
269     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
270   } else {
271     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
272   }
273   int inParallel = __kmpc_in_parallel(loc);
274   if (inParallel) {
275     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
276   } else {
277     KMP_COUNT_BLOCK(OMP_PARALLEL);
278   }
279 #endif
280 
  // maybe saving thr_state is enough here
282   {
283     va_list ap;
284     va_start(ap, microtask);
285 
286 #if OMPT_SUPPORT
287     ompt_frame_t *ompt_frame;
288     if (ompt_enabled.enabled) {
289       kmp_info_t *master_th = __kmp_threads[gtid];
290       kmp_team_t *parent_team = master_th->th.th_team;
291       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
292       if (lwt)
293         ompt_frame = &(lwt->ompt_task_info.frame);
294       else {
295         int tid = __kmp_tid_from_gtid(gtid);
296         ompt_frame = &(
297             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
298       }
299       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
300     }
301     OMPT_STORE_RETURN_ADDRESS(gtid);
302 #endif
303 
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_FORKING();
306 #endif
307     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
308                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
309                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
310                     kmp_va_addr_of(ap));
311 #if INCLUDE_SSC_MARKS
312     SSC_MARK_JOINING();
313 #endif
314     __kmp_join_call(loc, gtid
315 #if OMPT_SUPPORT
316                     ,
317                     fork_context_intel
318 #endif
319     );
320 
321     va_end(ap);
322   }
323 
324 #if KMP_STATS_ENABLED
325   if (previous_state == stats_state_e::SERIAL_REGION) {
326     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
327     KMP_SET_THREAD_STATE(previous_state);
328   } else {
329     KMP_POP_PARTITIONED_TIMER();
330   }
331 #endif // KMP_STATS_ENABLED
332 }
333 
334 /*!
335 @ingroup PARALLEL
336 @param loc source location information
337 @param global_tid global thread number
338 @param num_teams number of teams requested for the teams construct
339 @param num_threads number of threads per team requested for the teams construct
340 
341 Set the number of teams to be used by the teams construct.
342 This call is only required if the teams construct has a `num_teams` clause
343 or a `thread_limit` clause (or both).
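
For example, a construct such as
@code
#pragma omp teams num_teams(4) thread_limit(8)
{ work(); }
@endcode
is typically preceded by a call of roughly the form
__kmpc_push_num_teams(&loc, gtid, 4, 8) before __kmpc_fork_teams() is invoked
(an illustrative sketch; the exact lowering is compiler-specific).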
344 */
345 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
346                            kmp_int32 num_teams, kmp_int32 num_threads) {
347   KA_TRACE(20,
348            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
349             global_tid, num_teams, num_threads));
350   __kmp_assert_valid_gtid(global_tid);
351   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
352 }
353 
354 /*!
355 @ingroup PARALLEL
356 @param loc source location information
357 @param global_tid global thread number
@param num_teams_lb lower bound on the number of teams requested for the teams
construct
@param num_teams_ub upper bound on the number of teams requested for the teams
construct
362 @param num_threads number of threads per team requested for the teams construct
363 
364 Set the number of teams to be used by the teams construct. The number of initial
teams created will be greater than or equal to the lower bound and less than or
366 equal to the upper bound.
367 This call is only required if the teams construct has a `num_teams` clause
368 or a `thread_limit` clause (or both).
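
For example, the OpenMP 5.1 lower:upper form of the clause,
@code
#pragma omp teams num_teams(4:8)
{ work(); }
@endcode
would roughly be lowered to __kmpc_push_num_teams_51(&loc, gtid, 4, 8, 0)
ahead of the teams fork, with 0 standing in for an absent thread_limit clause
(an illustrative sketch; the exact lowering is compiler-specific).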
369 */
370 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
371                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
372                               kmp_int32 num_threads) {
373   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
374                 " num_teams_ub=%d num_threads=%d\n",
375                 global_tid, num_teams_lb, num_teams_ub, num_threads));
376   __kmp_assert_valid_gtid(global_tid);
377   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
378                           num_threads);
379 }
380 
381 /*!
382 @ingroup PARALLEL
383 @param loc  source location information
384 @param argc  total number of arguments in the ellipsis
385 @param microtask  pointer to callback routine consisting of outlined teams
386 construct
387 @param ...  pointers to shared variables that aren't global
388 
389 Do the actual fork and call the microtask in the relevant number of threads.
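
An illustrative sketch of the lowering (compiler-specific; outlined_teams_fn is
a hypothetical name for the compiler-generated teams microtask):
@code
__kmpc_push_num_teams(&loc, gtid, 4, 0); // only if num_teams/thread_limit used
__kmpc_fork_teams(&loc, 1, (kmpc_micro)outlined_teams_fn, &x);
@endcode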
390 */
391 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
392                        ...) {
393   int gtid = __kmp_entry_gtid();
394   kmp_info_t *this_thr = __kmp_threads[gtid];
395   va_list ap;
396   va_start(ap, microtask);
397 
398 #if KMP_STATS_ENABLED
399   KMP_COUNT_BLOCK(OMP_TEAMS);
400   stats_state_e previous_state = KMP_GET_THREAD_STATE();
401   if (previous_state == stats_state_e::SERIAL_REGION) {
402     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
403   } else {
404     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
405   }
406 #endif
407 
408   // remember teams entry point and nesting level
409   this_thr->th.th_teams_microtask = microtask;
410   this_thr->th.th_teams_level =
411       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
412 
413 #if OMPT_SUPPORT
414   kmp_team_t *parent_team = this_thr->th.th_team;
415   int tid = __kmp_tid_from_gtid(gtid);
416   if (ompt_enabled.enabled) {
417     parent_team->t.t_implicit_task_taskdata[tid]
418         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
419   }
420   OMPT_STORE_RETURN_ADDRESS(gtid);
421 #endif
422 
  // Check whether __kmpc_push_num_teams was called; if not, set the default
  // number of teams.
425   if (this_thr->th.th_teams_size.nteams == 0) {
426     __kmp_push_num_teams(loc, gtid, 0, 0);
427   }
428   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
429   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
430   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
431 
432   __kmp_fork_call(
433       loc, gtid, fork_context_intel, argc,
434       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
435       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
436   __kmp_join_call(loc, gtid
437 #if OMPT_SUPPORT
438                   ,
439                   fork_context_intel
440 #endif
441   );
442 
443   // Pop current CG root off list
444   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
445   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
446   this_thr->th.th_cg_roots = tmp->up;
447   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
448                  " to node %p. cg_nthreads was %d\n",
449                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
450   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
451   int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
453     __kmp_free(tmp);
454   }
455   // Restore current task's thread_limit from CG root
456   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
457   this_thr->th.th_current_task->td_icvs.thread_limit =
458       this_thr->th.th_cg_roots->cg_thread_limit;
459 
460   this_thr->th.th_teams_microtask = NULL;
461   this_thr->th.th_teams_level = 0;
462   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
463   va_end(ap);
464 #if KMP_STATS_ENABLED
465   if (previous_state == stats_state_e::SERIAL_REGION) {
466     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
467     KMP_SET_THREAD_STATE(previous_state);
468   } else {
469     KMP_POP_PARTITIONED_TIMER();
470   }
471 #endif // KMP_STATS_ENABLED
472 }
473 
474 // I don't think this function should ever have been exported.
475 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
476 // openmp code ever called it, but it's been exported from the RTL for so
477 // long that I'm afraid to remove the definition.
478 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
479 
480 /*!
481 @ingroup PARALLEL
482 @param loc  source location information
483 @param global_tid  global thread number
484 
485 Enter a serialized parallel construct. This interface is used to handle a
486 conditional parallel region, like this,
487 @code
488 #pragma omp parallel if (condition)
489 @endcode
490 when the condition is false.
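In that case the compiler typically emits, roughly,
@code
__kmpc_serialized_parallel(&loc, gtid);
outlined_fn(&gtid, &bound_tid); // run the outlined body on this thread
__kmpc_end_serialized_parallel(&loc, gtid);
@endcode
so the region body executes on the encountering thread (an illustrative
sketch; outlined_fn and bound_tid are hypothetical names).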
491 */
492 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
493   // The implementation is now in kmp_runtime.cpp so that it can share static
494   // functions with kmp_fork_call since the tasks to be done are similar in
495   // each case.
496   __kmp_assert_valid_gtid(global_tid);
497 #if OMPT_SUPPORT
498   OMPT_STORE_RETURN_ADDRESS(global_tid);
499 #endif
500   __kmp_serialized_parallel(loc, global_tid);
501 }
502 
503 /*!
504 @ingroup PARALLEL
505 @param loc  source location information
506 @param global_tid  global thread number
507 
508 Leave a serialized parallel construct.
509 */
510 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
511   kmp_internal_control_t *top;
512   kmp_info_t *this_thr;
513   kmp_team_t *serial_team;
514 
515   KC_TRACE(10,
516            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
517 
518   /* skip all this code for autopar serialized loops since it results in
519      unacceptable overhead */
520   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
521     return;
522 
523   // Not autopar code
524   __kmp_assert_valid_gtid(global_tid);
525   if (!TCR_4(__kmp_init_parallel))
526     __kmp_parallel_initialize();
527 
528   __kmp_resume_if_soft_paused();
529 
530   this_thr = __kmp_threads[global_tid];
531   serial_team = this_thr->th.th_serial_team;
532 
533   kmp_task_team_t *task_team = this_thr->th.th_task_team;
534   // we need to wait for the proxy tasks before finishing the thread
535   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
536     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
537 
538   KMP_MB();
539   KMP_DEBUG_ASSERT(serial_team);
540   KMP_ASSERT(serial_team->t.t_serialized);
541   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
542   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
543   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
544   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
545 
546 #if OMPT_SUPPORT
547   if (ompt_enabled.enabled &&
548       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
549     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
550     if (ompt_enabled.ompt_callback_implicit_task) {
551       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
552           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
553           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
554     }
555 
    // reset/clear the task id only after unlinking the task
557     ompt_data_t *parent_task_data;
558     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
559 
560     if (ompt_enabled.ompt_callback_parallel_end) {
561       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
562           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
563           ompt_parallel_invoker_program | ompt_parallel_team,
564           OMPT_LOAD_RETURN_ADDRESS(global_tid));
565     }
566     __ompt_lw_taskteam_unlink(this_thr);
567     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
568   }
569 #endif
570 
571   /* If necessary, pop the internal control stack values and replace the team
572    * values */
573   top = serial_team->t.t_control_stack_top;
574   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
575     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
576     serial_team->t.t_control_stack_top = top->next;
577     __kmp_free(top);
578   }
579 
580   // if( serial_team -> t.t_serialized > 1 )
581   serial_team->t.t_level--;
582 
583   /* pop dispatch buffers stack */
584   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
585   {
586     dispatch_private_info_t *disp_buffer =
587         serial_team->t.t_dispatch->th_disp_buffer;
588     serial_team->t.t_dispatch->th_disp_buffer =
589         serial_team->t.t_dispatch->th_disp_buffer->next;
590     __kmp_free(disp_buffer);
591   }
592   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
593 
594   --serial_team->t.t_serialized;
595   if (serial_team->t.t_serialized == 0) {
596 
597     /* return to the parallel section */
598 
599 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
600     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
601       __kmp_clear_x87_fpu_status_word();
602       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
603       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
604     }
605 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
606 
607 #if OMPD_SUPPORT
608     if (ompd_state & OMPD_ENABLE_BP)
609       ompd_bp_parallel_end();
610 #endif
611 
612     this_thr->th.th_team = serial_team->t.t_parent;
613     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
614 
615     /* restore values cached in the thread */
616     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
617     this_thr->th.th_team_master =
618         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
619     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
620 
621     /* TODO the below shouldn't need to be adjusted for serialized teams */
622     this_thr->th.th_dispatch =
623         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
624 
625     __kmp_pop_current_task_from_thread(this_thr);
626 
627     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
628     this_thr->th.th_current_task->td_flags.executing = 1;
629 
630     if (__kmp_tasking_mode != tskm_immediate_exec) {
631       // Copy the task team from the new child / old parent team to the thread.
632       this_thr->th.th_task_team =
633           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
634       KA_TRACE(20,
635                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
636                 "team %p\n",
637                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
638     }
639   } else {
640     if (__kmp_tasking_mode != tskm_immediate_exec) {
641       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
642                     "depth of serial team %p to %d\n",
643                     global_tid, serial_team, serial_team->t.t_serialized));
644     }
645   }
646 
647   if (__kmp_env_consistency_check)
648     __kmp_pop_parallel(global_tid, NULL);
649 #if OMPT_SUPPORT
650   if (ompt_enabled.enabled)
651     this_thr->th.ompt_thread_info.state =
652         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
653                                            : ompt_state_work_parallel);
654 #endif
655 }
656 
657 /*!
658 @ingroup SYNCHRONIZATION
659 @param loc  source location information.
660 
Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though,
depending on the memory ordering convention obeyed by the compiler, even that
may not be necessary.)
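
For example,
@code
#pragma omp flush
@endcode
is typically lowered to a single call of roughly the form __kmpc_flush(&loc)
(an illustrative sketch; the exact lowering is compiler-specific).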
664 */
665 void __kmpc_flush(ident_t *loc) {
666   KC_TRACE(10, ("__kmpc_flush: called\n"));
667 
  /* need an explicit __mf() here since the library uses volatile instead */
669   KMP_MB(); /* Flush all pending memory write invalidates.  */
670 
671 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
672 #if KMP_MIC
673 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
674 // We shouldn't need it, though, since the ABI rules require that
675 // * If the compiler generates NGO stores it also generates the fence
676 // * If users hand-code NGO stores they should insert the fence
677 // therefore no incomplete unordered stores should be visible.
678 #else
679   // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed here (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction; do not execute it if the CPU lacks SSE2.
685   if (!__kmp_cpuinfo.initialized) {
686     __kmp_query_cpuid(&__kmp_cpuinfo);
687   }
688   if (!__kmp_cpuinfo.sse2) {
689     // CPU cannot execute SSE2 instructions.
690   } else {
691 #if KMP_COMPILER_ICC
692     _mm_mfence();
693 #elif KMP_COMPILER_MSVC
694     MemoryBarrier();
695 #else
696     __sync_synchronize();
697 #endif // KMP_COMPILER_ICC
698   }
699 #endif // KMP_MIC
700 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
701        KMP_ARCH_RISCV64)
// Nothing to see here; move along
703 #elif KMP_ARCH_PPC64
704 // Nothing needed here (we have a real MB above).
705 #else
706 #error Unknown or unsupported architecture
707 #endif
708 
709 #if OMPT_SUPPORT && OMPT_OPTIONAL
710   if (ompt_enabled.ompt_callback_flush) {
711     ompt_callbacks.ompt_callback(ompt_callback_flush)(
712         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
713   }
714 #endif
715 }
716 
717 /* -------------------------------------------------------------------------- */
718 /*!
719 @ingroup SYNCHRONIZATION
720 @param loc source location information
721 @param global_tid thread id.
722 
723 Execute a barrier.
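
For example,
@code
#pragma omp barrier
@endcode
is typically lowered to __kmpc_barrier(&loc, gtid), with gtid obtained earlier
via __kmpc_global_thread_num() (an illustrative sketch; compiler-specific).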
724 */
725 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
726   KMP_COUNT_BLOCK(OMP_BARRIER);
727   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
728   __kmp_assert_valid_gtid(global_tid);
729 
730   if (!TCR_4(__kmp_init_parallel))
731     __kmp_parallel_initialize();
732 
733   __kmp_resume_if_soft_paused();
734 
735   if (__kmp_env_consistency_check) {
736     if (loc == 0) {
737       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
738     }
739     __kmp_check_barrier(global_tid, ct_barrier, loc);
740   }
741 
742 #if OMPT_SUPPORT
743   ompt_frame_t *ompt_frame;
744   if (ompt_enabled.enabled) {
745     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
746     if (ompt_frame->enter_frame.ptr == NULL)
747       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
748   }
749   OMPT_STORE_RETURN_ADDRESS(global_tid);
750 #endif
751   __kmp_threads[global_tid]->th.th_ident = loc;
752   // TODO: explicit barrier_wait_id:
  //   this function is called when the 'barrier' directive is present or at
  //   the implicit barrier at the end of a worksharing construct.
755   // 1) better to add a per-thread barrier counter to a thread data structure
756   // 2) set to 0 when a new team is created
  // 3) no sync is required
758 
759   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
760 #if OMPT_SUPPORT && OMPT_OPTIONAL
761   if (ompt_enabled.enabled) {
762     ompt_frame->enter_frame = ompt_data_none;
763   }
764 #endif
765 }
766 
767 /* The BARRIER for a MASTER section is always explicit   */
768 /*!
769 @ingroup WORK_SHARING
770 @param loc  source location information.
@param global_tid  global thread number.
772 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
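
Typical compiler-generated usage pairs this call with __kmpc_end_master(),
guarding the region body with the return value (an illustrative sketch; the
exact lowering is compiler-specific):
@code
if (__kmpc_master(&loc, gtid)) {
  // body of the master region
  __kmpc_end_master(&loc, gtid);
}
@endcode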
773 */
774 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
775   int status = 0;
776 
777   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
778   __kmp_assert_valid_gtid(global_tid);
779 
780   if (!TCR_4(__kmp_init_parallel))
781     __kmp_parallel_initialize();
782 
783   __kmp_resume_if_soft_paused();
784 
785   if (KMP_MASTER_GTID(global_tid)) {
786     KMP_COUNT_BLOCK(OMP_MASTER);
787     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
788     status = 1;
789   }
790 
791 #if OMPT_SUPPORT && OMPT_OPTIONAL
792   if (status) {
793     if (ompt_enabled.ompt_callback_masked) {
794       kmp_info_t *this_thr = __kmp_threads[global_tid];
795       kmp_team_t *team = this_thr->th.th_team;
796 
797       int tid = __kmp_tid_from_gtid(global_tid);
798       ompt_callbacks.ompt_callback(ompt_callback_masked)(
799           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
800           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
801           OMPT_GET_RETURN_ADDRESS(0));
802     }
803   }
804 #endif
805 
806   if (__kmp_env_consistency_check) {
807 #if KMP_USE_DYNAMIC_LOCK
808     if (status)
809       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
810     else
811       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
812 #else
813     if (status)
814       __kmp_push_sync(global_tid, ct_master, loc, NULL);
815     else
816       __kmp_check_sync(global_tid, ct_master, loc, NULL);
817 #endif
818   }
819 
820   return status;
821 }
822 
823 /*!
824 @ingroup WORK_SHARING
825 @param loc  source location information.
@param global_tid  global thread number.
827 
828 Mark the end of a <tt>master</tt> region. This should only be called by the
829 thread that executes the <tt>master</tt> region.
830 */
831 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
832   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
833   __kmp_assert_valid_gtid(global_tid);
834   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
835   KMP_POP_PARTITIONED_TIMER();
836 
837 #if OMPT_SUPPORT && OMPT_OPTIONAL
838   kmp_info_t *this_thr = __kmp_threads[global_tid];
839   kmp_team_t *team = this_thr->th.th_team;
840   if (ompt_enabled.ompt_callback_masked) {
841     int tid = __kmp_tid_from_gtid(global_tid);
842     ompt_callbacks.ompt_callback(ompt_callback_masked)(
843         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
844         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
845         OMPT_GET_RETURN_ADDRESS(0));
846   }
847 #endif
848 
849   if (__kmp_env_consistency_check) {
850     if (KMP_MASTER_GTID(global_tid))
851       __kmp_pop_sync(global_tid, ct_master, loc);
852   }
853 }
854 
855 /*!
856 @ingroup WORK_SHARING
857 @param loc  source location information.
858 @param global_tid  global thread number.
859 @param filter result of evaluating filter clause on thread global_tid, or zero
860 if no filter clause present
861 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
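
Typical usage mirrors the master construct, with the evaluated filter value
passed through (an illustrative sketch; the exact lowering is
compiler-specific):
@code
if (__kmpc_masked(&loc, gtid, filter_value)) {
  // body of the masked region
  __kmpc_end_masked(&loc, gtid);
}
@endcode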
862 */
863 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
864   int status = 0;
865   int tid;
866   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
867   __kmp_assert_valid_gtid(global_tid);
868 
869   if (!TCR_4(__kmp_init_parallel))
870     __kmp_parallel_initialize();
871 
872   __kmp_resume_if_soft_paused();
873 
874   tid = __kmp_tid_from_gtid(global_tid);
875   if (tid == filter) {
876     KMP_COUNT_BLOCK(OMP_MASKED);
877     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
878     status = 1;
879   }
880 
881 #if OMPT_SUPPORT && OMPT_OPTIONAL
882   if (status) {
883     if (ompt_enabled.ompt_callback_masked) {
884       kmp_info_t *this_thr = __kmp_threads[global_tid];
885       kmp_team_t *team = this_thr->th.th_team;
886       ompt_callbacks.ompt_callback(ompt_callback_masked)(
887           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
888           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
889           OMPT_GET_RETURN_ADDRESS(0));
890     }
891   }
892 #endif
893 
894   if (__kmp_env_consistency_check) {
895 #if KMP_USE_DYNAMIC_LOCK
896     if (status)
897       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
898     else
899       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
900 #else
901     if (status)
902       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
903     else
904       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
905 #endif
906   }
907 
908   return status;
909 }
910 
911 /*!
912 @ingroup WORK_SHARING
913 @param loc  source location information.
@param global_tid  global thread number.
915 
916 Mark the end of a <tt>masked</tt> region. This should only be called by the
917 thread that executes the <tt>masked</tt> region.
918 */
919 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
920   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
921   __kmp_assert_valid_gtid(global_tid);
922   KMP_POP_PARTITIONED_TIMER();
923 
924 #if OMPT_SUPPORT && OMPT_OPTIONAL
925   kmp_info_t *this_thr = __kmp_threads[global_tid];
926   kmp_team_t *team = this_thr->th.th_team;
927   if (ompt_enabled.ompt_callback_masked) {
928     int tid = __kmp_tid_from_gtid(global_tid);
929     ompt_callbacks.ompt_callback(ompt_callback_masked)(
930         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
931         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
932         OMPT_GET_RETURN_ADDRESS(0));
933   }
934 #endif
935 
936   if (__kmp_env_consistency_check) {
937     __kmp_pop_sync(global_tid, ct_masked, loc);
938   }
939 }
940 
941 /*!
942 @ingroup WORK_SHARING
943 @param loc  source location information.
944 @param gtid  global thread number.
945 
946 Start execution of an <tt>ordered</tt> construct.
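
Within a loop that has an <tt>ordered</tt> clause, the body of the ordered
block is typically bracketed as follows (an illustrative sketch; the exact
lowering is compiler-specific):
@code
__kmpc_ordered(&loc, gtid);
// body of the ordered region
__kmpc_end_ordered(&loc, gtid);
@endcode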
947 */
948 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
949   int cid = 0;
950   kmp_info_t *th;
951   KMP_DEBUG_ASSERT(__kmp_init_serial);
952 
953   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
954   __kmp_assert_valid_gtid(gtid);
955 
956   if (!TCR_4(__kmp_init_parallel))
957     __kmp_parallel_initialize();
958 
959   __kmp_resume_if_soft_paused();
960 
961 #if USE_ITT_BUILD
962   __kmp_itt_ordered_prep(gtid);
963 // TODO: ordered_wait_id
964 #endif /* USE_ITT_BUILD */
965 
966   th = __kmp_threads[gtid];
967 
968 #if OMPT_SUPPORT && OMPT_OPTIONAL
969   kmp_team_t *team;
970   ompt_wait_id_t lck;
971   void *codeptr_ra;
972   OMPT_STORE_RETURN_ADDRESS(gtid);
973   if (ompt_enabled.enabled) {
974     team = __kmp_team_from_gtid(gtid);
975     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
976     /* OMPT state update */
977     th->th.ompt_thread_info.wait_id = lck;
978     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
979 
980     /* OMPT event callback */
981     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
982     if (ompt_enabled.ompt_callback_mutex_acquire) {
983       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
984           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
985           codeptr_ra);
986     }
987   }
988 #endif
989 
990   if (th->th.th_dispatch->th_deo_fcn != 0)
991     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
992   else
993     __kmp_parallel_deo(&gtid, &cid, loc);
994 
995 #if OMPT_SUPPORT && OMPT_OPTIONAL
996   if (ompt_enabled.enabled) {
997     /* OMPT state update */
998     th->th.ompt_thread_info.state = ompt_state_work_parallel;
999     th->th.ompt_thread_info.wait_id = 0;
1000 
1001     /* OMPT event callback */
1002     if (ompt_enabled.ompt_callback_mutex_acquired) {
1003       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1004           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1005     }
1006   }
1007 #endif
1008 
1009 #if USE_ITT_BUILD
1010   __kmp_itt_ordered_start(gtid);
1011 #endif /* USE_ITT_BUILD */
1012 }
1013 
1014 /*!
1015 @ingroup WORK_SHARING
1016 @param loc  source location information.
1017 @param gtid  global thread number.
1018 
1019 End execution of an <tt>ordered</tt> construct.
1020 */
1021 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1022   int cid = 0;
1023   kmp_info_t *th;
1024 
1025   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1026   __kmp_assert_valid_gtid(gtid);
1027 
1028 #if USE_ITT_BUILD
1029   __kmp_itt_ordered_end(gtid);
1030 // TODO: ordered_wait_id
1031 #endif /* USE_ITT_BUILD */
1032 
1033   th = __kmp_threads[gtid];
1034 
1035   if (th->th.th_dispatch->th_dxo_fcn != 0)
1036     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1037   else
1038     __kmp_parallel_dxo(&gtid, &cid, loc);
1039 
1040 #if OMPT_SUPPORT && OMPT_OPTIONAL
1041   OMPT_STORE_RETURN_ADDRESS(gtid);
1042   if (ompt_enabled.ompt_callback_mutex_released) {
1043     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1044         ompt_mutex_ordered,
1045         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1046             ->t.t_ordered.dt.t_value,
1047         OMPT_LOAD_RETURN_ADDRESS(gtid));
1048   }
1049 #endif
1050 }
1051 
1052 #if KMP_USE_DYNAMIC_LOCK
1053 
1054 static __forceinline void
1055 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1056                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1057   // Pointer to the allocated indirect lock is written to crit, while indexing
1058   // is ignored.
1059   void *idx;
1060   kmp_indirect_lock_t **lck;
1061   lck = (kmp_indirect_lock_t **)crit;
1062   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1063   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1064   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1065   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1066   KA_TRACE(20,
1067            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1068 #if USE_ITT_BUILD
1069   __kmp_itt_critical_creating(ilk->lock, loc);
1070 #endif
1071   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1072   if (status == 0) {
1073 #if USE_ITT_BUILD
1074     __kmp_itt_critical_destroyed(ilk->lock);
1075 #endif
1076     // We don't really need to destroy the unclaimed lock here since it will be
1077     // cleaned up at program exit.
1078     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1079   }
1080   KMP_DEBUG_ASSERT(*lck != NULL);
1081 }
1082 
1083 // Fast-path acquire tas lock
1084 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1085   {                                                                            \
1086     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1087     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1088     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1089     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1090         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1091       kmp_uint32 spins;                                                        \
1092       KMP_FSYNC_PREPARE(l);                                                    \
1093       KMP_INIT_YIELD(spins);                                                   \
1094       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1095       do {                                                                     \
1096         if (TCR_4(__kmp_nth) >                                                 \
1097             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1098           KMP_YIELD(TRUE);                                                     \
1099         } else {                                                               \
1100           KMP_YIELD_SPIN(spins);                                               \
1101         }                                                                      \
1102         __kmp_spin_backoff(&backoff);                                          \
1103       } while (                                                                \
1104           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1105           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1106     }                                                                          \
1107     KMP_FSYNC_ACQUIRED(l);                                                     \
1108   }
1109 
1110 // Fast-path test tas lock
1111 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1112   {                                                                            \
1113     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1114     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1115     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1116     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1117          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1118   }
1119 
1120 // Fast-path release tas lock
1121 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1122   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1123 
1124 #if KMP_USE_FUTEX
1125 
1126 #include <sys/syscall.h>
1127 #include <unistd.h>
1128 #ifndef FUTEX_WAIT
1129 #define FUTEX_WAIT 0
1130 #endif
1131 #ifndef FUTEX_WAKE
1132 #define FUTEX_WAKE 1
1133 #endif
1134 
1135 // Fast-path acquire futex lock
1136 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1137   {                                                                            \
1138     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1139     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1140     KMP_MB();                                                                  \
1141     KMP_FSYNC_PREPARE(ftx);                                                    \
1142     kmp_int32 poll_val;                                                        \
1143     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1144                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1145                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1146       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1147       if (!cond) {                                                             \
1148         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1149                                          poll_val |                            \
1150                                              KMP_LOCK_BUSY(1, futex))) {       \
1151           continue;                                                            \
1152         }                                                                      \
1153         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1154       }                                                                        \
1155       kmp_int32 rc;                                                            \
1156       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1157                         NULL, NULL, 0)) != 0) {                                \
1158         continue;                                                              \
1159       }                                                                        \
1160       gtid_code |= 1;                                                          \
1161     }                                                                          \
1162     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1163   }
1164 
1165 // Fast-path test futex lock
1166 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1167   {                                                                            \
1168     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1169     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1170                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1171       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1172       rc = TRUE;                                                               \
1173     } else {                                                                   \
1174       rc = FALSE;                                                              \
1175     }                                                                          \
1176   }
1177 
1178 // Fast-path release futex lock
1179 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1180   {                                                                            \
1181     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1182     KMP_MB();                                                                  \
1183     KMP_FSYNC_RELEASING(ftx);                                                  \
1184     kmp_int32 poll_val =                                                       \
1185         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1186     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1187       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1188               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1189     }                                                                          \
1190     KMP_MB();                                                                  \
1191     KMP_YIELD_OVERSUB();                                                       \
1192   }
1193 
1194 #endif // KMP_USE_FUTEX
1195 
1196 #else // KMP_USE_DYNAMIC_LOCK
1197 
1198 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1199                                                       ident_t const *loc,
1200                                                       kmp_int32 gtid) {
1201   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1202 
1203   // Because of the double-check, the following load doesn't need to be volatile
1204   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1205 
1206   if (lck == NULL) {
1207     void *idx;
1208 
1209     // Allocate & initialize the lock.
1210     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1211     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1212     __kmp_init_user_lock_with_checks(lck);
1213     __kmp_set_user_lock_location(lck, loc);
1214 #if USE_ITT_BUILD
1215     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it.
// There is a chance the lock will be destroyed with no usage, but it is not a
// problem, because this is not a real event seen by the user but rather the
// setting of a name for the object (lock). See more details in kmp_itt.h.
1221 #endif /* USE_ITT_BUILD */
1222 
1223     // Use a cmpxchg instruction to slam the start of the critical section with
1224     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1225     // and use the lock that the other thread allocated.
1226     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1227 
1228     if (status == 0) {
1229 // Deallocate the lock and reload the value.
1230 #if USE_ITT_BUILD
1231       __kmp_itt_critical_destroyed(lck);
1232 // Let ITT know the lock is destroyed and the same memory location may be reused
1233 // for another purpose.
1234 #endif /* USE_ITT_BUILD */
1235       __kmp_destroy_user_lock_with_checks(lck);
1236       __kmp_user_lock_free(&idx, gtid, lck);
1237       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1238       KMP_DEBUG_ASSERT(lck != NULL);
1239     }
1240   }
1241   return lck;
1242 }
1243 
1244 #endif // KMP_USE_DYNAMIC_LOCK
1245 
1246 /*!
1247 @ingroup WORK_SHARING
1248 @param loc  source location information.
1249 @param global_tid  global thread number.
1250 @param crit identity of the critical section. This could be a pointer to a lock
1251 associated with the critical section, or some other suitably unique value.
1252 
1253 Enter code protected by a `critical` construct.
1254 This function blocks until the executing thread can enter the critical section.
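
For example,
@code
#pragma omp critical(name)
{ update(); }
@endcode
is typically lowered to a __kmpc_critical()/__kmpc_end_critical() pair that
shares a compiler-generated kmp_critical_name object for <tt>name</tt>
(an illustrative sketch; the exact lowering is compiler-specific).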
1255 */
1256 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1257                      kmp_critical_name *crit) {
1258 #if KMP_USE_DYNAMIC_LOCK
1259 #if OMPT_SUPPORT && OMPT_OPTIONAL
1260   OMPT_STORE_RETURN_ADDRESS(global_tid);
1261 #endif // OMPT_SUPPORT
1262   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1263 #else
1264   KMP_COUNT_BLOCK(OMP_CRITICAL);
1265 #if OMPT_SUPPORT && OMPT_OPTIONAL
1266   ompt_state_t prev_state = ompt_state_undefined;
1267   ompt_thread_info_t ti;
1268 #endif
1269   kmp_user_lock_p lck;
1270 
1271   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1272   __kmp_assert_valid_gtid(global_tid);
1273 
1274   // TODO: add THR_OVHD_STATE
1275 
1276   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1277   KMP_CHECK_USER_LOCK_INIT();
1278 
1279   if ((__kmp_user_lock_kind == lk_tas) &&
1280       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1281     lck = (kmp_user_lock_p)crit;
1282   }
1283 #if KMP_USE_FUTEX
1284   else if ((__kmp_user_lock_kind == lk_futex) &&
1285            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1286     lck = (kmp_user_lock_p)crit;
1287   }
1288 #endif
1289   else { // ticket, queuing or drdpa
1290     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1291   }
1292 
1293   if (__kmp_env_consistency_check)
1294     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1295 
    // Since the critical directive binds to all threads, not just the current
    // team, we have to check this even if we are in a serialized team.
    // Also, even if we are the uber thread, we still have to acquire the lock,
    // as we have to contend with sibling threads.
1300 
1301 #if USE_ITT_BUILD
1302   __kmp_itt_critical_acquiring(lck);
1303 #endif /* USE_ITT_BUILD */
1304 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1306   void *codeptr_ra = NULL;
1307   if (ompt_enabled.enabled) {
1308     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1309     /* OMPT state update */
1310     prev_state = ti.state;
1311     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1312     ti.state = ompt_state_wait_critical;
1313 
1314     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1316     if (ompt_enabled.ompt_callback_mutex_acquire) {
1317       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1318           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1319           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1320     }
1321   }
1322 #endif
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1325   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1326 
1327 #if USE_ITT_BUILD
1328   __kmp_itt_critical_acquired(lck);
1329 #endif /* USE_ITT_BUILD */
1330 #if OMPT_SUPPORT && OMPT_OPTIONAL
1331   if (ompt_enabled.enabled) {
1332     /* OMPT state update */
1333     ti.state = prev_state;
1334     ti.wait_id = 0;
1335 
1336     /* OMPT event callback */
1337     if (ompt_enabled.ompt_callback_mutex_acquired) {
1338       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1339           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1340     }
1341   }
1342 #endif
1343   KMP_POP_PARTITIONED_TIMER();
1344 
1345   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1346   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1347 #endif // KMP_USE_DYNAMIC_LOCK
1348 }
1349 
1350 #if KMP_USE_DYNAMIC_LOCK
1351 
1352 // Converts the given hint to an internal lock implementation
1353 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1354 #if KMP_USE_TSX
1355 #define KMP_TSX_LOCK(seq) lockseq_##seq
1356 #else
1357 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1358 #endif
1359 
1360 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1361 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1362 #else
1363 #define KMP_CPUINFO_RTM 0
1364 #endif
1365 
1366   // Hints that do not require further logic
1367   if (hint & kmp_lock_hint_hle)
1368     return KMP_TSX_LOCK(hle);
1369   if (hint & kmp_lock_hint_rtm)
1370     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1371   if (hint & kmp_lock_hint_adaptive)
1372     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1373 
1374   // Rule out conflicting hints first by returning the default lock
1375   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1376     return __kmp_user_lock_seq;
1377   if ((hint & omp_lock_hint_speculative) &&
1378       (hint & omp_lock_hint_nonspeculative))
1379     return __kmp_user_lock_seq;
1380 
1381   // Do not even consider speculation when it appears to be contended
1382   if (hint & omp_lock_hint_contended)
1383     return lockseq_queuing;
1384 
1385   // Uncontended lock without speculation
1386   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1387     return lockseq_tas;
1388 
1389   // Use RTM lock for speculation
1390   if (hint & omp_lock_hint_speculative)
1391     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1392 
1393   return __kmp_user_lock_seq;
1394 }
1395 
1396 #if OMPT_SUPPORT && OMPT_OPTIONAL
1397 #if KMP_USE_DYNAMIC_LOCK
1398 static kmp_mutex_impl_t
1399 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1400   if (user_lock) {
1401     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1402     case 0:
1403       break;
1404 #if KMP_USE_FUTEX
1405     case locktag_futex:
1406       return kmp_mutex_impl_queuing;
1407 #endif
1408     case locktag_tas:
1409       return kmp_mutex_impl_spin;
1410 #if KMP_USE_TSX
1411     case locktag_hle:
1412     case locktag_rtm_spin:
1413       return kmp_mutex_impl_speculative;
1414 #endif
1415     default:
1416       return kmp_mutex_impl_none;
1417     }
1418     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1419   }
1420   KMP_ASSERT(ilock);
1421   switch (ilock->type) {
1422 #if KMP_USE_TSX
1423   case locktag_adaptive:
1424   case locktag_rtm_queuing:
1425     return kmp_mutex_impl_speculative;
1426 #endif
1427   case locktag_nested_tas:
1428     return kmp_mutex_impl_spin;
1429 #if KMP_USE_FUTEX
1430   case locktag_nested_futex:
1431 #endif
1432   case locktag_ticket:
1433   case locktag_queuing:
1434   case locktag_drdpa:
1435   case locktag_nested_ticket:
1436   case locktag_nested_queuing:
1437   case locktag_nested_drdpa:
1438     return kmp_mutex_impl_queuing;
1439   default:
1440     return kmp_mutex_impl_none;
1441   }
1442 }
1443 #else
1444 // For locks without dynamic binding
1445 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1446   switch (__kmp_user_lock_kind) {
1447   case lk_tas:
1448     return kmp_mutex_impl_spin;
1449 #if KMP_USE_FUTEX
1450   case lk_futex:
1451 #endif
1452   case lk_ticket:
1453   case lk_queuing:
1454   case lk_drdpa:
1455     return kmp_mutex_impl_queuing;
1456 #if KMP_USE_TSX
1457   case lk_hle:
1458   case lk_rtm_queuing:
1459   case lk_rtm_spin:
1460   case lk_adaptive:
1461     return kmp_mutex_impl_speculative;
1462 #endif
1463   default:
1464     return kmp_mutex_impl_none;
1465   }
1466 }
1467 #endif // KMP_USE_DYNAMIC_LOCK
1468 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1469 
1470 /*!
1471 @ingroup WORK_SHARING
1472 @param loc  source location information.
1473 @param global_tid  global thread number.
1474 @param crit identity of the critical section. This could be a pointer to a lock
1475 associated with the critical section, or some other suitably unique value.
1476 @param hint the lock hint.
1477 
1478 Enter code protected by a `critical` construct with a hint. The hint value is
1479 used to suggest a lock implementation. This function blocks until the executing
1480 thread can enter the critical section unless the hint suggests use of
1481 speculative execution and the hardware supports it.
1482 */
1483 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1484                                kmp_critical_name *crit, uint32_t hint) {
1485   KMP_COUNT_BLOCK(OMP_CRITICAL);
1486   kmp_user_lock_p lck;
1487 #if OMPT_SUPPORT && OMPT_OPTIONAL
1488   ompt_state_t prev_state = ompt_state_undefined;
1489   ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
1491   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1492   if (!codeptr)
1493     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1494 #endif
1495 
1496   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1497   __kmp_assert_valid_gtid(global_tid);
1498 
1499   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1500   // Check if it is initialized.
1501   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1502   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1503   if (*lk == 0) {
1504     if (KMP_IS_D_LOCK(lockseq)) {
1505       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1506                                   KMP_GET_D_TAG(lockseq));
1507     } else {
1508       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1509     }
1510   }
1511   // Branch for accessing the actual lock object and set operation. This
1512   // branching is inevitable since this lock initialization does not follow the
1513   // normal dispatch path (lock table is not used).
1514   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1515     lck = (kmp_user_lock_p)lk;
1516     if (__kmp_env_consistency_check) {
1517       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1518                       __kmp_map_hint_to_lock(hint));
1519     }
1520 #if USE_ITT_BUILD
1521     __kmp_itt_critical_acquiring(lck);
1522 #endif
1523 #if OMPT_SUPPORT && OMPT_OPTIONAL
1524     if (ompt_enabled.enabled) {
1525       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1526       /* OMPT state update */
1527       prev_state = ti.state;
1528       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1529       ti.state = ompt_state_wait_critical;
1530 
1531       /* OMPT event callback */
1532       if (ompt_enabled.ompt_callback_mutex_acquire) {
1533         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1534             ompt_mutex_critical, (unsigned int)hint,
1535             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1536             codeptr);
1537       }
1538     }
1539 #endif
1540 #if KMP_USE_INLINED_TAS
1541     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1542       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1543     } else
1544 #elif KMP_USE_INLINED_FUTEX
1545     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1546       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1547     } else
1548 #endif
1549     {
1550       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1551     }
1552   } else {
1553     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1554     lck = ilk->lock;
1555     if (__kmp_env_consistency_check) {
1556       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1557                       __kmp_map_hint_to_lock(hint));
1558     }
1559 #if USE_ITT_BUILD
1560     __kmp_itt_critical_acquiring(lck);
1561 #endif
1562 #if OMPT_SUPPORT && OMPT_OPTIONAL
1563     if (ompt_enabled.enabled) {
1564       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1565       /* OMPT state update */
1566       prev_state = ti.state;
1567       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1568       ti.state = ompt_state_wait_critical;
1569 
1570       /* OMPT event callback */
1571       if (ompt_enabled.ompt_callback_mutex_acquire) {
1572         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1573             ompt_mutex_critical, (unsigned int)hint,
1574             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1575             codeptr);
1576       }
1577     }
1578 #endif
1579     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1580   }
1581   KMP_POP_PARTITIONED_TIMER();
1582 
1583 #if USE_ITT_BUILD
1584   __kmp_itt_critical_acquired(lck);
1585 #endif /* USE_ITT_BUILD */
1586 #if OMPT_SUPPORT && OMPT_OPTIONAL
1587   if (ompt_enabled.enabled) {
1588     /* OMPT state update */
1589     ti.state = prev_state;
1590     ti.wait_id = 0;
1591 
1592     /* OMPT event callback */
1593     if (ompt_enabled.ompt_callback_mutex_acquired) {
1594       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1595           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1596     }
1597   }
1598 #endif
1599 
1600   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1601   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1602 } // __kmpc_critical_with_hint
1603 
1604 #endif // KMP_USE_DYNAMIC_LOCK
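
/* For reference, a sketch (not the exact compiler output) of how a compiler
   might lower a hinted critical construct such as

     #pragma omp critical (name) hint(omp_sync_hint_speculative)
     { ... }

   onto these entry points. "crit_name_var" stands for the compiler-generated
   kmp_critical_name object and is a hypothetical identifier:

     __kmpc_critical_with_hint(&loc, gtid, &crit_name_var,
                               omp_sync_hint_speculative);
     ... protected code ...
     __kmpc_end_critical(&loc, gtid, &crit_name_var);
*/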
1605 
1606 /*!
1607 @ingroup WORK_SHARING
1608 @param loc  source location information.
@param global_tid  global thread number.
1610 @param crit identity of the critical section. This could be a pointer to a lock
1611 associated with the critical section, or some other suitably unique value.
1612 
1613 Leave a critical section, releasing any lock that was held during its execution.
1614 */
1615 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1616                          kmp_critical_name *crit) {
1617   kmp_user_lock_p lck;
1618 
1619   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1620 
1621 #if KMP_USE_DYNAMIC_LOCK
1622   int locktag = KMP_EXTRACT_D_TAG(crit);
1623   if (locktag) {
1624     lck = (kmp_user_lock_p)crit;
1625     KMP_ASSERT(lck != NULL);
1626     if (__kmp_env_consistency_check) {
1627       __kmp_pop_sync(global_tid, ct_critical, loc);
1628     }
1629 #if USE_ITT_BUILD
1630     __kmp_itt_critical_releasing(lck);
1631 #endif
1632 #if KMP_USE_INLINED_TAS
1633     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1634       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1635     } else
1636 #elif KMP_USE_INLINED_FUTEX
1637     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1638       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1639     } else
1640 #endif
1641     {
1642       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1643     }
1644   } else {
1645     kmp_indirect_lock_t *ilk =
1646         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1647     KMP_ASSERT(ilk != NULL);
1648     lck = ilk->lock;
1649     if (__kmp_env_consistency_check) {
1650       __kmp_pop_sync(global_tid, ct_critical, loc);
1651     }
1652 #if USE_ITT_BUILD
1653     __kmp_itt_critical_releasing(lck);
1654 #endif
1655     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1656   }
1657 
1658 #else // KMP_USE_DYNAMIC_LOCK
1659 
1660   if ((__kmp_user_lock_kind == lk_tas) &&
1661       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1662     lck = (kmp_user_lock_p)crit;
1663   }
1664 #if KMP_USE_FUTEX
1665   else if ((__kmp_user_lock_kind == lk_futex) &&
1666            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1667     lck = (kmp_user_lock_p)crit;
1668   }
1669 #endif
1670   else { // ticket, queuing or drdpa
1671     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1672   }
1673 
1674   KMP_ASSERT(lck != NULL);
1675 
1676   if (__kmp_env_consistency_check)
1677     __kmp_pop_sync(global_tid, ct_critical, loc);
1678 
1679 #if USE_ITT_BUILD
1680   __kmp_itt_critical_releasing(lck);
1681 #endif /* USE_ITT_BUILD */
  // The value of 'crit' is usable as the critical_id of the critical section
  // directive.
1684   __kmp_release_user_lock_with_checks(lck, global_tid);
1685 
1686 #endif // KMP_USE_DYNAMIC_LOCK
1687 
1688 #if OMPT_SUPPORT && OMPT_OPTIONAL
1689   /* OMPT release event triggers after lock is released; place here to trigger
1690    * for all #if branches */
1691   OMPT_STORE_RETURN_ADDRESS(global_tid);
1692   if (ompt_enabled.ompt_callback_mutex_released) {
1693     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1694         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1695         OMPT_LOAD_RETURN_ADDRESS(0));
1696   }
1697 #endif
1698 
1699   KMP_POP_PARTITIONED_TIMER();
1700   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1701 }
1702 
1703 /*!
1704 @ingroup SYNCHRONIZATION
1705 @param loc source location information
1706 @param global_tid thread id.
1707 @return one if the thread should execute the master block, zero otherwise
1708 
1709 Start execution of a combined barrier and master. The barrier is executed inside
1710 this function.
1711 */
1712 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1713   int status;
1714   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1715   __kmp_assert_valid_gtid(global_tid);
1716 
1717   if (!TCR_4(__kmp_init_parallel))
1718     __kmp_parallel_initialize();
1719 
1720   __kmp_resume_if_soft_paused();
1721 
1722   if (__kmp_env_consistency_check)
1723     __kmp_check_barrier(global_tid, ct_barrier, loc);
1724 
1725 #if OMPT_SUPPORT
1726   ompt_frame_t *ompt_frame;
1727   if (ompt_enabled.enabled) {
1728     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1729     if (ompt_frame->enter_frame.ptr == NULL)
1730       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1731   }
1732   OMPT_STORE_RETURN_ADDRESS(global_tid);
1733 #endif
1734 #if USE_ITT_NOTIFY
1735   __kmp_threads[global_tid]->th.th_ident = loc;
1736 #endif
1737   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1738 #if OMPT_SUPPORT && OMPT_OPTIONAL
1739   if (ompt_enabled.enabled) {
1740     ompt_frame->enter_frame = ompt_data_none;
1741   }
1742 #endif
1743 
1744   return (status != 0) ? 0 : 1;
1745 }
1746 
1747 /*!
1748 @ingroup SYNCHRONIZATION
1749 @param loc source location information
1750 @param global_tid thread id.
1751 
1752 Complete the execution of a combined barrier and master. This function should
1753 only be called at the completion of the <tt>master</tt> code. Other threads will
1754 still be waiting at the barrier and this call releases them.
1755 */
1756 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1757   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1758   __kmp_assert_valid_gtid(global_tid);
1759   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1760 }
1761 
1762 /*!
1763 @ingroup SYNCHRONIZATION
1764 @param loc source location information
1765 @param global_tid thread id.
1766 @return one if the thread should execute the master block, zero otherwise
1767 
1768 Start execution of a combined barrier and master(nowait) construct.
1769 The barrier is executed inside this function.
There is no equivalent "end" function, since the other threads do not wait for
the thread executing the master block; the consistency-check and statistics
actions that an "end" call would otherwise perform are done inside this
function.
1771 */
1772 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1773   kmp_int32 ret;
1774   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1775   __kmp_assert_valid_gtid(global_tid);
1776 
1777   if (!TCR_4(__kmp_init_parallel))
1778     __kmp_parallel_initialize();
1779 
1780   __kmp_resume_if_soft_paused();
1781 
1782   if (__kmp_env_consistency_check) {
1783     if (loc == 0) {
1784       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1785     }
1786     __kmp_check_barrier(global_tid, ct_barrier, loc);
1787   }
1788 
1789 #if OMPT_SUPPORT
1790   ompt_frame_t *ompt_frame;
1791   if (ompt_enabled.enabled) {
1792     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1793     if (ompt_frame->enter_frame.ptr == NULL)
1794       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1795   }
1796   OMPT_STORE_RETURN_ADDRESS(global_tid);
1797 #endif
1798 #if USE_ITT_NOTIFY
1799   __kmp_threads[global_tid]->th.th_ident = loc;
1800 #endif
1801   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1802 #if OMPT_SUPPORT && OMPT_OPTIONAL
1803   if (ompt_enabled.enabled) {
1804     ompt_frame->enter_frame = ompt_data_none;
1805   }
1806 #endif
1807 
1808   ret = __kmpc_master(loc, global_tid);
1809 
1810   if (__kmp_env_consistency_check) {
1811     /*  there's no __kmpc_end_master called; so the (stats) */
1812     /*  actions of __kmpc_end_master are done here          */
1813     if (ret) {
1814       /* only one thread should do the pop since only */
1815       /* one did the push (see __kmpc_master())       */
1816       __kmp_pop_sync(global_tid, ct_master, loc);
1817     }
1818   }
1819 
1820   return (ret);
1821 }
1822 
/* The BARRIER for a SINGLE construct is always explicit */
1824 /*!
1825 @ingroup WORK_SHARING
1826 @param loc  source location information
1827 @param global_tid  global thread number
1828 @return One if this thread should execute the single construct, zero otherwise.
1829 
1830 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in either of the two "single" calls; rather, the
compiler should introduce an explicit barrier if one is required.
1833 */
1834 
1835 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1836   __kmp_assert_valid_gtid(global_tid);
1837   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1838 
1839   if (rc) {
1840     // We are going to execute the single statement, so we should count it.
1841     KMP_COUNT_BLOCK(OMP_SINGLE);
1842     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1843   }
1844 
1845 #if OMPT_SUPPORT && OMPT_OPTIONAL
1846   kmp_info_t *this_thr = __kmp_threads[global_tid];
1847   kmp_team_t *team = this_thr->th.th_team;
1848   int tid = __kmp_tid_from_gtid(global_tid);
1849 
1850   if (ompt_enabled.enabled) {
1851     if (rc) {
1852       if (ompt_enabled.ompt_callback_work) {
1853         ompt_callbacks.ompt_callback(ompt_callback_work)(
1854             ompt_work_single_executor, ompt_scope_begin,
1855             &(team->t.ompt_team_info.parallel_data),
1856             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1857             1, OMPT_GET_RETURN_ADDRESS(0));
1858       }
1859     } else {
1860       if (ompt_enabled.ompt_callback_work) {
1861         ompt_callbacks.ompt_callback(ompt_callback_work)(
1862             ompt_work_single_other, ompt_scope_begin,
1863             &(team->t.ompt_team_info.parallel_data),
1864             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1865             1, OMPT_GET_RETURN_ADDRESS(0));
1866         ompt_callbacks.ompt_callback(ompt_callback_work)(
1867             ompt_work_single_other, ompt_scope_end,
1868             &(team->t.ompt_team_info.parallel_data),
1869             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1870             1, OMPT_GET_RETURN_ADDRESS(0));
1871       }
1872     }
1873   }
1874 #endif
1875 
1876   return rc;
1877 }
1878 
1879 /*!
1880 @ingroup WORK_SHARING
1881 @param loc  source location information
1882 @param global_tid  global thread number
1883 
1884 Mark the end of a <tt>single</tt> construct.  This function should
1885 only be called by the thread that executed the block of code protected
1886 by the `single` construct.
1887 */
1888 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1889   __kmp_assert_valid_gtid(global_tid);
1890   __kmp_exit_single(global_tid);
1891   KMP_POP_PARTITIONED_TIMER();
1892 
1893 #if OMPT_SUPPORT && OMPT_OPTIONAL
1894   kmp_info_t *this_thr = __kmp_threads[global_tid];
1895   kmp_team_t *team = this_thr->th.th_team;
1896   int tid = __kmp_tid_from_gtid(global_tid);
1897 
1898   if (ompt_enabled.ompt_callback_work) {
1899     ompt_callbacks.ompt_callback(ompt_callback_work)(
1900         ompt_work_single_executor, ompt_scope_end,
1901         &(team->t.ompt_team_info.parallel_data),
1902         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1903         OMPT_GET_RETURN_ADDRESS(0));
1904   }
1905 #endif
1906 }
1907 
1908 /*!
1909 @ingroup WORK_SHARING
1910 @param loc Source location
1911 @param global_tid Global thread id
1912 
1913 Mark the end of a statically scheduled loop.
1914 */
1915 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1916   KMP_POP_PARTITIONED_TIMER();
1917   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1918 
1919 #if OMPT_SUPPORT && OMPT_OPTIONAL
1920   if (ompt_enabled.ompt_callback_work) {
1921     ompt_work_t ompt_work_type = ompt_work_loop;
1922     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1923     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1924     // Determine workshare type
1925     if (loc != NULL) {
1926       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1927         ompt_work_type = ompt_work_loop;
1928       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1929         ompt_work_type = ompt_work_sections;
1930       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1931         ompt_work_type = ompt_work_distribute;
1932       } else {
1933         // use default set above.
1934         // a warning about this case is provided in __kmpc_for_static_init
1935       }
1936       KMP_DEBUG_ASSERT(ompt_work_type);
1937     }
1938     ompt_callbacks.ompt_callback(ompt_callback_work)(
1939         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1940         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1941   }
1942 #endif
1943   if (__kmp_env_consistency_check)
1944     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1945 }
1946 
1947 // User routines which take C-style arguments (call by value)
1948 // different from the Fortran equivalent routines
1949 
1950 void ompc_set_num_threads(int arg) {
1951   // !!!!! TODO: check the per-task binding
1952   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1953 }
1954 
1955 void ompc_set_dynamic(int flag) {
1956   kmp_info_t *thread;
1957 
1958   /* For the thread-private implementation of the internal controls */
1959   thread = __kmp_entry_thread();
1960 
1961   __kmp_save_internal_controls(thread);
1962 
1963   set__dynamic(thread, flag ? true : false);
1964 }
1965 
1966 void ompc_set_nested(int flag) {
1967   kmp_info_t *thread;
1968 
1969   /* For the thread-private internal controls implementation */
1970   thread = __kmp_entry_thread();
1971 
1972   __kmp_save_internal_controls(thread);
1973 
1974   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1975 }
1976 
1977 void ompc_set_max_active_levels(int max_active_levels) {
1978   /* TO DO */
1979   /* we want per-task implementation of this internal control */
1980 
1981   /* For the per-thread internal controls implementation */
1982   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1983 }
1984 
1985 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1986   // !!!!! TODO: check the per-task binding
1987   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1988 }
1989 
1990 int ompc_get_ancestor_thread_num(int level) {
1991   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1992 }
1993 
1994 int ompc_get_team_size(int level) {
1995   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1996 }
1997 
1998 /* OpenMP 5.0 Affinity Format API */
1999 
2000 void ompc_set_affinity_format(char const *format) {
2001   if (!__kmp_init_serial) {
2002     __kmp_serial_initialize();
2003   }
2004   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2005                          format, KMP_STRLEN(format) + 1);
2006 }
2007 
2008 size_t ompc_get_affinity_format(char *buffer, size_t size) {
2009   size_t format_size;
2010   if (!__kmp_init_serial) {
2011     __kmp_serial_initialize();
2012   }
2013   format_size = KMP_STRLEN(__kmp_affinity_format);
2014   if (buffer && size) {
2015     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2016                            format_size + 1);
2017   }
2018   return format_size;
2019 }
2020 
2021 void ompc_display_affinity(char const *format) {
2022   int gtid;
2023   if (!TCR_4(__kmp_init_middle)) {
2024     __kmp_middle_initialize();
2025   }
2026   gtid = __kmp_get_gtid();
2027   __kmp_aux_display_affinity(gtid, format);
2028 }
2029 
2030 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
2031                              char const *format) {
2032   int gtid;
2033   size_t num_required;
2034   kmp_str_buf_t capture_buf;
2035   if (!TCR_4(__kmp_init_middle)) {
2036     __kmp_middle_initialize();
2037   }
2038   gtid = __kmp_get_gtid();
2039   __kmp_str_buf_init(&capture_buf);
2040   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2041   if (buffer && buf_size) {
2042     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2043                            capture_buf.used + 1);
2044   }
2045   __kmp_str_buf_free(&capture_buf);
2046   return num_required;
2047 }
2048 
2049 void kmpc_set_stacksize(int arg) {
2050   // __kmp_aux_set_stacksize initializes the library if needed
2051   __kmp_aux_set_stacksize(arg);
2052 }
2053 
2054 void kmpc_set_stacksize_s(size_t arg) {
2055   // __kmp_aux_set_stacksize initializes the library if needed
2056   __kmp_aux_set_stacksize(arg);
2057 }
2058 
2059 void kmpc_set_blocktime(int arg) {
2060   int gtid, tid;
2061   kmp_info_t *thread;
2062 
2063   gtid = __kmp_entry_gtid();
2064   tid = __kmp_tid_from_gtid(gtid);
2065   thread = __kmp_thread_from_gtid(gtid);
2066 
2067   __kmp_aux_set_blocktime(arg, thread, tid);
2068 }
2069 
2070 void kmpc_set_library(int arg) {
2071   // __kmp_user_set_library initializes the library if needed
2072   __kmp_user_set_library((enum library_type)arg);
2073 }
2074 
2075 void kmpc_set_defaults(char const *str) {
2076   // __kmp_aux_set_defaults initializes the library if needed
2077   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2078 }
2079 
2080 void kmpc_set_disp_num_buffers(int arg) {
2081   // ignore after initialization because some teams have already
2082   // allocated dispatch buffers
2083   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2084       arg <= KMP_MAX_DISP_NUM_BUFF) {
2085     __kmp_dispatch_num_buffers = arg;
2086   }
2087 }
2088 
2089 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2090 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2091   return -1;
2092 #else
2093   if (!TCR_4(__kmp_init_middle)) {
2094     __kmp_middle_initialize();
2095   }
2096   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2097 #endif
2098 }
2099 
2100 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2101 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2102   return -1;
2103 #else
2104   if (!TCR_4(__kmp_init_middle)) {
2105     __kmp_middle_initialize();
2106   }
2107   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2108 #endif
2109 }
2110 
2111 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2112 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2113   return -1;
2114 #else
2115   if (!TCR_4(__kmp_init_middle)) {
2116     __kmp_middle_initialize();
2117   }
2118   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2119 #endif
2120 }
2121 
2122 /* -------------------------------------------------------------------------- */
2123 /*!
2124 @ingroup THREADPRIVATE
2125 @param loc       source location information
2126 @param gtid      global thread number
2127 @param cpy_size  size of the cpy_data buffer
2128 @param cpy_data  pointer to data to be copied
2129 @param cpy_func  helper function to call for copying data
2130 @param didit     flag variable: 1=single thread; 0=not single thread
2131 
2132 __kmpc_copyprivate implements the interface for the private data broadcast
2133 needed for the copyprivate clause associated with a single region in an
2134 OpenMP<sup>*</sup> program (both C and Fortran).
2135 All threads participating in the parallel region call this routine.
2136 One of the threads (called the single thread) should have the <tt>didit</tt>
2137 variable set to 1 and all other threads should have that variable set to 0.
2138 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2139 
2140 The OpenMP specification forbids the use of nowait on the single region when a
2141 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2142 barrier internally to avoid race conditions, so the code generation for the
2143 single region should avoid generating a barrier after the call to @ref
2144 __kmpc_copyprivate.
2145 
2146 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2147 The <tt>loc</tt> parameter is a pointer to source location information.
2148 
Internal implementation: the single thread first copies its descriptor address
(cpy_data) to a team-private location; each of the other threads then calls the
function pointed to by cpy_func, passing its own cpy_data pointer as the
destination and the single thread's stored pointer as the source.
2153 
2154 The cpy_func routine used for the copy and the contents of the data area defined
2155 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2156 to be done. For instance, the cpy_data buffer can hold the actual data to be
2157 copied or it may hold a list of pointers to the data. The cpy_func routine must
2158 interpret the cpy_data buffer appropriately.
2159 
2160 The interface to cpy_func is as follows:
2161 @code
2162 void cpy_func( void *destination, void *source )
2163 @endcode
2164 where void *destination is the cpy_data pointer for the thread being copied to
2165 and void *source is the cpy_data pointer for the thread being copied from.
2166 */
2167 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2168                         void *cpy_data, void (*cpy_func)(void *, void *),
2169                         kmp_int32 didit) {
2170   void **data_ptr;
2171   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2172   __kmp_assert_valid_gtid(gtid);
2173 
2174   KMP_MB();
2175 
2176   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2177 
2178   if (__kmp_env_consistency_check) {
2179     if (loc == 0) {
2180       KMP_WARNING(ConstructIdentInvalid);
2181     }
2182   }
2183 
2184   // ToDo: Optimize the following two barriers into some kind of split barrier
2185 
2186   if (didit)
2187     *data_ptr = cpy_data;
2188 
2189 #if OMPT_SUPPORT
2190   ompt_frame_t *ompt_frame;
2191   if (ompt_enabled.enabled) {
2192     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2193     if (ompt_frame->enter_frame.ptr == NULL)
2194       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2195   }
2196   OMPT_STORE_RETURN_ADDRESS(gtid);
2197 #endif
2198 /* This barrier is not a barrier region boundary */
2199 #if USE_ITT_NOTIFY
2200   __kmp_threads[gtid]->th.th_ident = loc;
2201 #endif
2202   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2203 
2204   if (!didit)
2205     (*cpy_func)(cpy_data, *data_ptr);
2206 
2207   // Consider next barrier a user-visible barrier for barrier region boundaries
2208   // Nesting checks are already handled by the single construct checks
2209   {
2210 #if OMPT_SUPPORT
2211     OMPT_STORE_RETURN_ADDRESS(gtid);
2212 #endif
2213 #if USE_ITT_NOTIFY
    // TODO: check if it is needed (e.g. tasks can overwrite the location)
    __kmp_threads[gtid]->th.th_ident = loc;
2216 #endif
2217     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2218 #if OMPT_SUPPORT && OMPT_OPTIONAL
2219     if (ompt_enabled.enabled) {
2220       ompt_frame->enter_frame = ompt_data_none;
2221     }
2222 #endif
2223   }
2224 }
2225 
2226 /* -------------------------------------------------------------------------- */
2227 
2228 #define INIT_LOCK __kmp_init_user_lock_with_checks
2229 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2230 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2231 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2232 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2233 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2234   __kmp_acquire_nested_user_lock_with_checks_timed
2235 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2236 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2237 #define TEST_LOCK __kmp_test_user_lock_with_checks
2238 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2239 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2240 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2241 
2242 // TODO: Make check abort messages use location info & pass it into
2243 // with_checks routines
2244 
2245 #if KMP_USE_DYNAMIC_LOCK
2246 
2247 // internal lock initializer
2248 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2249                                                     kmp_dyna_lockseq_t seq) {
2250   if (KMP_IS_D_LOCK(seq)) {
2251     KMP_INIT_D_LOCK(lock, seq);
2252 #if USE_ITT_BUILD
2253     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2254 #endif
2255   } else {
2256     KMP_INIT_I_LOCK(lock, seq);
2257 #if USE_ITT_BUILD
2258     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2259     __kmp_itt_lock_creating(ilk->lock, loc);
2260 #endif
2261   }
2262 }
2263 
2264 // internal nest lock initializer
2265 static __forceinline void
2266 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2267                                kmp_dyna_lockseq_t seq) {
2268 #if KMP_USE_TSX
2269   // Don't have nested lock implementation for speculative locks
2270   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2271       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2272     seq = __kmp_user_lock_seq;
2273 #endif
2274   switch (seq) {
2275   case lockseq_tas:
2276     seq = lockseq_nested_tas;
2277     break;
2278 #if KMP_USE_FUTEX
2279   case lockseq_futex:
2280     seq = lockseq_nested_futex;
2281     break;
2282 #endif
2283   case lockseq_ticket:
2284     seq = lockseq_nested_ticket;
2285     break;
2286   case lockseq_queuing:
2287     seq = lockseq_nested_queuing;
2288     break;
2289   case lockseq_drdpa:
2290     seq = lockseq_nested_drdpa;
2291     break;
2292   default:
2293     seq = lockseq_nested_queuing;
2294   }
2295   KMP_INIT_I_LOCK(lock, seq);
2296 #if USE_ITT_BUILD
2297   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2298   __kmp_itt_lock_creating(ilk->lock, loc);
2299 #endif
2300 }
2301 
2302 /* initialize the lock with a hint */
2303 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2304                                 uintptr_t hint) {
2305   KMP_DEBUG_ASSERT(__kmp_init_serial);
2306   if (__kmp_env_consistency_check && user_lock == NULL) {
2307     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2308   }
2309 
2310   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2311 
2312 #if OMPT_SUPPORT && OMPT_OPTIONAL
2313   // This is the case, if called from omp_init_lock_with_hint:
2314   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2315   if (!codeptr)
2316     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2317   if (ompt_enabled.ompt_callback_lock_init) {
2318     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2319         ompt_mutex_lock, (omp_lock_hint_t)hint,
2320         __ompt_get_mutex_impl_type(user_lock),
2321         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2322   }
2323 #endif
2324 }
2325 
/* initialize a nestable lock with a hint */
2327 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2328                                      void **user_lock, uintptr_t hint) {
2329   KMP_DEBUG_ASSERT(__kmp_init_serial);
2330   if (__kmp_env_consistency_check && user_lock == NULL) {
2331     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2332   }
2333 
2334   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2335 
2336 #if OMPT_SUPPORT && OMPT_OPTIONAL
2337   // This is the case, if called from omp_init_lock_with_hint:
2338   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2339   if (!codeptr)
2340     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2341   if (ompt_enabled.ompt_callback_lock_init) {
2342     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2343         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2344         __ompt_get_mutex_impl_type(user_lock),
2345         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2346   }
2347 #endif
2348 }
2349 
2350 #endif // KMP_USE_DYNAMIC_LOCK
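
/* User code reaches the hinted initializers above through
   omp_init_lock_with_hint / omp_init_nest_lock_with_hint; a minimal usage
   sketch:

     omp_lock_t l;
     omp_init_lock_with_hint(&l, omp_sync_hint_speculative);
     omp_set_lock(&l);
     ... critical work ...
     omp_unset_lock(&l);
     omp_destroy_lock(&l);
*/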
2351 
2352 /* initialize the lock */
2353 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2354 #if KMP_USE_DYNAMIC_LOCK
2355 
2356   KMP_DEBUG_ASSERT(__kmp_init_serial);
2357   if (__kmp_env_consistency_check && user_lock == NULL) {
2358     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2359   }
2360   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2361 
2362 #if OMPT_SUPPORT && OMPT_OPTIONAL
2363   // This is the case, if called from omp_init_lock_with_hint:
2364   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2365   if (!codeptr)
2366     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2367   if (ompt_enabled.ompt_callback_lock_init) {
2368     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2369         ompt_mutex_lock, omp_lock_hint_none,
2370         __ompt_get_mutex_impl_type(user_lock),
2371         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2372   }
2373 #endif
2374 
2375 #else // KMP_USE_DYNAMIC_LOCK
2376 
2377   static char const *const func = "omp_init_lock";
2378   kmp_user_lock_p lck;
2379   KMP_DEBUG_ASSERT(__kmp_init_serial);
2380 
2381   if (__kmp_env_consistency_check) {
2382     if (user_lock == NULL) {
2383       KMP_FATAL(LockIsUninitialized, func);
2384     }
2385   }
2386 
2387   KMP_CHECK_USER_LOCK_INIT();
2388 
2389   if ((__kmp_user_lock_kind == lk_tas) &&
2390       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2391     lck = (kmp_user_lock_p)user_lock;
2392   }
2393 #if KMP_USE_FUTEX
2394   else if ((__kmp_user_lock_kind == lk_futex) &&
2395            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2396     lck = (kmp_user_lock_p)user_lock;
2397   }
2398 #endif
2399   else {
2400     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2401   }
2402   INIT_LOCK(lck);
2403   __kmp_set_user_lock_location(lck, loc);
2404 
2405 #if OMPT_SUPPORT && OMPT_OPTIONAL
2406   // This is the case, if called from omp_init_lock_with_hint:
2407   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2408   if (!codeptr)
2409     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2410   if (ompt_enabled.ompt_callback_lock_init) {
2411     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2412         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2413         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2414   }
2415 #endif
2416 
2417 #if USE_ITT_BUILD
2418   __kmp_itt_lock_creating(lck);
2419 #endif /* USE_ITT_BUILD */
2420 
2421 #endif // KMP_USE_DYNAMIC_LOCK
2422 } // __kmpc_init_lock
2423 
/* initialize a nestable lock */
2425 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2426 #if KMP_USE_DYNAMIC_LOCK
2427 
2428   KMP_DEBUG_ASSERT(__kmp_init_serial);
2429   if (__kmp_env_consistency_check && user_lock == NULL) {
2430     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2431   }
2432   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2433 
2434 #if OMPT_SUPPORT && OMPT_OPTIONAL
2435   // This is the case, if called from omp_init_lock_with_hint:
2436   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2437   if (!codeptr)
2438     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2439   if (ompt_enabled.ompt_callback_lock_init) {
2440     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2441         ompt_mutex_nest_lock, omp_lock_hint_none,
2442         __ompt_get_mutex_impl_type(user_lock),
2443         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2444   }
2445 #endif
2446 
2447 #else // KMP_USE_DYNAMIC_LOCK
2448 
2449   static char const *const func = "omp_init_nest_lock";
2450   kmp_user_lock_p lck;
2451   KMP_DEBUG_ASSERT(__kmp_init_serial);
2452 
2453   if (__kmp_env_consistency_check) {
2454     if (user_lock == NULL) {
2455       KMP_FATAL(LockIsUninitialized, func);
2456     }
2457   }
2458 
2459   KMP_CHECK_USER_LOCK_INIT();
2460 
2461   if ((__kmp_user_lock_kind == lk_tas) &&
2462       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2463        OMP_NEST_LOCK_T_SIZE)) {
2464     lck = (kmp_user_lock_p)user_lock;
2465   }
2466 #if KMP_USE_FUTEX
2467   else if ((__kmp_user_lock_kind == lk_futex) &&
2468            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2469             OMP_NEST_LOCK_T_SIZE)) {
2470     lck = (kmp_user_lock_p)user_lock;
2471   }
2472 #endif
2473   else {
2474     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2475   }
2476 
2477   INIT_NESTED_LOCK(lck);
2478   __kmp_set_user_lock_location(lck, loc);
2479 
2480 #if OMPT_SUPPORT && OMPT_OPTIONAL
2481   // This is the case, if called from omp_init_lock_with_hint:
2482   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2483   if (!codeptr)
2484     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2485   if (ompt_enabled.ompt_callback_lock_init) {
2486     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2487         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2488         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2489   }
2490 #endif
2491 
2492 #if USE_ITT_BUILD
2493   __kmp_itt_lock_creating(lck);
2494 #endif /* USE_ITT_BUILD */
2495 
2496 #endif // KMP_USE_DYNAMIC_LOCK
2497 } // __kmpc_init_nest_lock
2498 
2499 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2500 #if KMP_USE_DYNAMIC_LOCK
2501 
2502 #if USE_ITT_BUILD
2503   kmp_user_lock_p lck;
2504   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2505     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2506   } else {
2507     lck = (kmp_user_lock_p)user_lock;
2508   }
2509   __kmp_itt_lock_destroyed(lck);
2510 #endif
2511 #if OMPT_SUPPORT && OMPT_OPTIONAL
2512   // This is the case, if called from omp_init_lock_with_hint:
2513   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2514   if (!codeptr)
2515     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2516   if (ompt_enabled.ompt_callback_lock_destroy) {
2517     kmp_user_lock_p lck;
2518     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2519       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2520     } else {
2521       lck = (kmp_user_lock_p)user_lock;
2522     }
2523     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2524         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2525   }
2526 #endif
2527   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2528 #else
2529   kmp_user_lock_p lck;
2530 
2531   if ((__kmp_user_lock_kind == lk_tas) &&
2532       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2533     lck = (kmp_user_lock_p)user_lock;
2534   }
2535 #if KMP_USE_FUTEX
2536   else if ((__kmp_user_lock_kind == lk_futex) &&
2537            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2538     lck = (kmp_user_lock_p)user_lock;
2539   }
2540 #endif
2541   else {
2542     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2543   }
2544 
2545 #if OMPT_SUPPORT && OMPT_OPTIONAL
2546   // This is the case, if called from omp_init_lock_with_hint:
2547   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2548   if (!codeptr)
2549     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2550   if (ompt_enabled.ompt_callback_lock_destroy) {
2551     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2552         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2553   }
2554 #endif
2555 
2556 #if USE_ITT_BUILD
2557   __kmp_itt_lock_destroyed(lck);
2558 #endif /* USE_ITT_BUILD */
2559   DESTROY_LOCK(lck);
2560 
2561   if ((__kmp_user_lock_kind == lk_tas) &&
2562       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2563     ;
2564   }
2565 #if KMP_USE_FUTEX
2566   else if ((__kmp_user_lock_kind == lk_futex) &&
2567            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2568     ;
2569   }
2570 #endif
2571   else {
2572     __kmp_user_lock_free(user_lock, gtid, lck);
2573   }
2574 #endif // KMP_USE_DYNAMIC_LOCK
2575 } // __kmpc_destroy_lock
2576 
/* destroy a nestable lock */
2578 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2579 #if KMP_USE_DYNAMIC_LOCK
2580 
2581 #if USE_ITT_BUILD
2582   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2583   __kmp_itt_lock_destroyed(ilk->lock);
2584 #endif
2585 #if OMPT_SUPPORT && OMPT_OPTIONAL
2586   // This is the case, if called from omp_init_lock_with_hint:
2587   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2588   if (!codeptr)
2589     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2590   if (ompt_enabled.ompt_callback_lock_destroy) {
2591     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2592         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2593   }
2594 #endif
2595   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2596 
2597 #else // KMP_USE_DYNAMIC_LOCK
2598 
2599   kmp_user_lock_p lck;
2600 
2601   if ((__kmp_user_lock_kind == lk_tas) &&
2602       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2603        OMP_NEST_LOCK_T_SIZE)) {
2604     lck = (kmp_user_lock_p)user_lock;
2605   }
2606 #if KMP_USE_FUTEX
2607   else if ((__kmp_user_lock_kind == lk_futex) &&
2608            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2609             OMP_NEST_LOCK_T_SIZE)) {
2610     lck = (kmp_user_lock_p)user_lock;
2611   }
2612 #endif
2613   else {
2614     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2615   }
2616 
2617 #if OMPT_SUPPORT && OMPT_OPTIONAL
2618   // This is the case, if called from omp_init_lock_with_hint:
2619   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2620   if (!codeptr)
2621     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2622   if (ompt_enabled.ompt_callback_lock_destroy) {
2623     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2624         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2625   }
2626 #endif
2627 
2628 #if USE_ITT_BUILD
2629   __kmp_itt_lock_destroyed(lck);
2630 #endif /* USE_ITT_BUILD */
2631 
2632   DESTROY_NESTED_LOCK(lck);
2633 
2634   if ((__kmp_user_lock_kind == lk_tas) &&
2635       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2636        OMP_NEST_LOCK_T_SIZE)) {
2637     ;
2638   }
2639 #if KMP_USE_FUTEX
2640   else if ((__kmp_user_lock_kind == lk_futex) &&
2641            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2642             OMP_NEST_LOCK_T_SIZE)) {
2643     ;
2644   }
2645 #endif
2646   else {
2647     __kmp_user_lock_free(user_lock, gtid, lck);
2648   }
2649 #endif // KMP_USE_DYNAMIC_LOCK
2650 } // __kmpc_destroy_nest_lock
2651 
2652 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2653   KMP_COUNT_BLOCK(OMP_set_lock);
2654 #if KMP_USE_DYNAMIC_LOCK
2655   int tag = KMP_EXTRACT_D_TAG(user_lock);
2656 #if USE_ITT_BUILD
2657   __kmp_itt_lock_acquiring(
2658       (kmp_user_lock_p)
2659           user_lock); // itt function will get to the right lock object.
2660 #endif
2661 #if OMPT_SUPPORT && OMPT_OPTIONAL
2662   // This is the case, if called from omp_init_lock_with_hint:
2663   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2664   if (!codeptr)
2665     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2666   if (ompt_enabled.ompt_callback_mutex_acquire) {
2667     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2668         ompt_mutex_lock, omp_lock_hint_none,
2669         __ompt_get_mutex_impl_type(user_lock),
2670         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2671   }
2672 #endif
2673 #if KMP_USE_INLINED_TAS
2674   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2675     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2676   } else
2677 #elif KMP_USE_INLINED_FUTEX
2678   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2679     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2680   } else
2681 #endif
2682   {
2683     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2684   }
2685 #if USE_ITT_BUILD
2686   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2687 #endif
2688 #if OMPT_SUPPORT && OMPT_OPTIONAL
2689   if (ompt_enabled.ompt_callback_mutex_acquired) {
2690     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2691         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2692   }
2693 #endif
2694 
2695 #else // KMP_USE_DYNAMIC_LOCK
2696 
2697   kmp_user_lock_p lck;
2698 
2699   if ((__kmp_user_lock_kind == lk_tas) &&
2700       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2701     lck = (kmp_user_lock_p)user_lock;
2702   }
2703 #if KMP_USE_FUTEX
2704   else if ((__kmp_user_lock_kind == lk_futex) &&
2705            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2706     lck = (kmp_user_lock_p)user_lock;
2707   }
2708 #endif
2709   else {
2710     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2711   }
2712 
2713 #if USE_ITT_BUILD
2714   __kmp_itt_lock_acquiring(lck);
2715 #endif /* USE_ITT_BUILD */
2716 #if OMPT_SUPPORT && OMPT_OPTIONAL
2717   // This is the case, if called from omp_init_lock_with_hint:
2718   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2719   if (!codeptr)
2720     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2721   if (ompt_enabled.ompt_callback_mutex_acquire) {
2722     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2723         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2724         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2725   }
2726 #endif
2727 
2728   ACQUIRE_LOCK(lck, gtid);
2729 
2730 #if USE_ITT_BUILD
2731   __kmp_itt_lock_acquired(lck);
2732 #endif /* USE_ITT_BUILD */
2733 
2734 #if OMPT_SUPPORT && OMPT_OPTIONAL
2735   if (ompt_enabled.ompt_callback_mutex_acquired) {
2736     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2737         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2738   }
2739 #endif
2740 
2741 #endif // KMP_USE_DYNAMIC_LOCK
2742 }
2743 
2744 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2745 #if KMP_USE_DYNAMIC_LOCK
2746 
2747 #if USE_ITT_BUILD
2748   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2749 #endif
2750 #if OMPT_SUPPORT && OMPT_OPTIONAL
2751   // This is the case, if called from omp_init_lock_with_hint:
2752   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2753   if (!codeptr)
2754     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2755   if (ompt_enabled.enabled) {
2756     if (ompt_enabled.ompt_callback_mutex_acquire) {
2757       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2758           ompt_mutex_nest_lock, omp_lock_hint_none,
2759           __ompt_get_mutex_impl_type(user_lock),
2760           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2761     }
2762   }
2763 #endif
2764   int acquire_status =
2765       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2766   (void)acquire_status;
2767 #if USE_ITT_BUILD
2768   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2769 #endif
2770 
2771 #if OMPT_SUPPORT && OMPT_OPTIONAL
2772   if (ompt_enabled.enabled) {
2773     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2774       if (ompt_enabled.ompt_callback_mutex_acquired) {
2775         // lock_first
2776         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2777             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2778             codeptr);
2779       }
2780     } else {
2781       if (ompt_enabled.ompt_callback_nest_lock) {
2782         // lock_next
2783         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2784             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2785       }
2786     }
2787   }
2788 #endif
2789 
2790 #else // KMP_USE_DYNAMIC_LOCK
2791   int acquire_status;
2792   kmp_user_lock_p lck;
2793 
2794   if ((__kmp_user_lock_kind == lk_tas) &&
2795       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2796        OMP_NEST_LOCK_T_SIZE)) {
2797     lck = (kmp_user_lock_p)user_lock;
2798   }
2799 #if KMP_USE_FUTEX
2800   else if ((__kmp_user_lock_kind == lk_futex) &&
2801            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2802             OMP_NEST_LOCK_T_SIZE)) {
2803     lck = (kmp_user_lock_p)user_lock;
2804   }
2805 #endif
2806   else {
2807     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2808   }
2809 
2810 #if USE_ITT_BUILD
2811   __kmp_itt_lock_acquiring(lck);
2812 #endif /* USE_ITT_BUILD */
2813 #if OMPT_SUPPORT && OMPT_OPTIONAL
2814   // This is the case, if called from omp_init_lock_with_hint:
2815   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2816   if (!codeptr)
2817     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2818   if (ompt_enabled.enabled) {
2819     if (ompt_enabled.ompt_callback_mutex_acquire) {
2820       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2821           ompt_mutex_nest_lock, omp_lock_hint_none,
2822           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2823           codeptr);
2824     }
2825   }
2826 #endif
2827 
2828   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2829 
2830 #if USE_ITT_BUILD
2831   __kmp_itt_lock_acquired(lck);
2832 #endif /* USE_ITT_BUILD */
2833 
2834 #if OMPT_SUPPORT && OMPT_OPTIONAL
2835   if (ompt_enabled.enabled) {
2836     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2837       if (ompt_enabled.ompt_callback_mutex_acquired) {
2838         // lock_first
2839         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2840             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2841       }
2842     } else {
2843       if (ompt_enabled.ompt_callback_nest_lock) {
2844         // lock_next
2845         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2846             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2847       }
2848     }
2849   }
2850 #endif
2851 
2852 #endif // KMP_USE_DYNAMIC_LOCK
2853 }
2854 
2855 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2856 #if KMP_USE_DYNAMIC_LOCK
2857 
2858   int tag = KMP_EXTRACT_D_TAG(user_lock);
2859 #if USE_ITT_BUILD
2860   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2861 #endif
2862 #if KMP_USE_INLINED_TAS
2863   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2864     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2865   } else
2866 #elif KMP_USE_INLINED_FUTEX
2867   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2868     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2869   } else
2870 #endif
2871   {
2872     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2873   }
2874 
2875 #if OMPT_SUPPORT && OMPT_OPTIONAL
2876   // This is the case, if called from omp_init_lock_with_hint:
2877   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2878   if (!codeptr)
2879     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2880   if (ompt_enabled.ompt_callback_mutex_released) {
2881     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2882         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2883   }
2884 #endif
2885 
2886 #else // KMP_USE_DYNAMIC_LOCK
2887 
2888   kmp_user_lock_p lck;
2889 
2890   /* Can't use serial interval since not block structured */
2891   /* release the lock */
2892 
2893   if ((__kmp_user_lock_kind == lk_tas) &&
2894       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2895 #if KMP_OS_LINUX &&                                                            \
2896     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2897 // "fast" path implemented to fix customer performance issue
2898 #if USE_ITT_BUILD
2899     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2900 #endif /* USE_ITT_BUILD */
2901     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2902     KMP_MB();
2903 
2904 #if OMPT_SUPPORT && OMPT_OPTIONAL
2905     // This is the case, if called from omp_init_lock_with_hint:
2906     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2907     if (!codeptr)
2908       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2909     if (ompt_enabled.ompt_callback_mutex_released) {
2910       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2911           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2912     }
2913 #endif
2914 
2915     return;
2916 #else
2917     lck = (kmp_user_lock_p)user_lock;
2918 #endif
2919   }
2920 #if KMP_USE_FUTEX
2921   else if ((__kmp_user_lock_kind == lk_futex) &&
2922            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2923     lck = (kmp_user_lock_p)user_lock;
2924   }
2925 #endif
2926   else {
2927     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2928   }
2929 
2930 #if USE_ITT_BUILD
2931   __kmp_itt_lock_releasing(lck);
2932 #endif /* USE_ITT_BUILD */
2933 
2934   RELEASE_LOCK(lck, gtid);
2935 
2936 #if OMPT_SUPPORT && OMPT_OPTIONAL
2937   // This is the case, if called from omp_init_lock_with_hint:
2938   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2939   if (!codeptr)
2940     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2941   if (ompt_enabled.ompt_callback_mutex_released) {
2942     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2943         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2944   }
2945 #endif
2946 
2947 #endif // KMP_USE_DYNAMIC_LOCK
2948 }
2949 
/* release a nestable lock */
2951 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2952 #if KMP_USE_DYNAMIC_LOCK
2953 
2954 #if USE_ITT_BUILD
2955   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2956 #endif
2957   int release_status =
2958       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2959   (void)release_status;
2960 
2961 #if OMPT_SUPPORT && OMPT_OPTIONAL
2962   // This is the case, if called from omp_init_lock_with_hint:
2963   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2964   if (!codeptr)
2965     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2966   if (ompt_enabled.enabled) {
2967     if (release_status == KMP_LOCK_RELEASED) {
2968       if (ompt_enabled.ompt_callback_mutex_released) {
2969         // release_lock_last
2970         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2971             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2972             codeptr);
2973       }
2974     } else if (ompt_enabled.ompt_callback_nest_lock) {
2975       // release_lock_prev
2976       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2977           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2978     }
2979   }
2980 #endif
2981 
2982 #else // KMP_USE_DYNAMIC_LOCK
2983 
2984   kmp_user_lock_p lck;
2985 
2986   /* Can't use serial interval since not block structured */
2987 
2988   if ((__kmp_user_lock_kind == lk_tas) &&
2989       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2990        OMP_NEST_LOCK_T_SIZE)) {
2991 #if KMP_OS_LINUX &&                                                            \
2992     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2993     // "fast" path implemented to fix customer performance issue
2994     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2995 #if USE_ITT_BUILD
2996     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2997 #endif /* USE_ITT_BUILD */
2998 
2999 #if OMPT_SUPPORT && OMPT_OPTIONAL
3000     int release_status = KMP_LOCK_STILL_HELD;
3001 #endif
3002 
3003     if (--(tl->lk.depth_locked) == 0) {
3004       TCW_4(tl->lk.poll, 0);
3005 #if OMPT_SUPPORT && OMPT_OPTIONAL
3006       release_status = KMP_LOCK_RELEASED;
3007 #endif
3008     }
3009     KMP_MB();
3010 
3011 #if OMPT_SUPPORT && OMPT_OPTIONAL
3012     // This is the case, if called from omp_init_lock_with_hint:
3013     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3014     if (!codeptr)
3015       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3016     if (ompt_enabled.enabled) {
3017       if (release_status == KMP_LOCK_RELEASED) {
3018         if (ompt_enabled.ompt_callback_mutex_released) {
3019           // release_lock_last
3020           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3021               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3022         }
3023       } else if (ompt_enabled.ompt_callback_nest_lock) {
3024         // release_lock_previous
3025         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3027       }
3028     }
3029 #endif
3030 
3031     return;
3032 #else
3033     lck = (kmp_user_lock_p)user_lock;
3034 #endif
3035   }
3036 #if KMP_USE_FUTEX
3037   else if ((__kmp_user_lock_kind == lk_futex) &&
3038            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3039             OMP_NEST_LOCK_T_SIZE)) {
3040     lck = (kmp_user_lock_p)user_lock;
3041   }
3042 #endif
3043   else {
3044     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3045   }
3046 
3047 #if USE_ITT_BUILD
3048   __kmp_itt_lock_releasing(lck);
3049 #endif /* USE_ITT_BUILD */
3050 
3051   int release_status;
3052   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3053 #if OMPT_SUPPORT && OMPT_OPTIONAL
3054   // This is the case, if called from omp_init_lock_with_hint:
3055   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3056   if (!codeptr)
3057     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3058   if (ompt_enabled.enabled) {
3059     if (release_status == KMP_LOCK_RELEASED) {
3060       if (ompt_enabled.ompt_callback_mutex_released) {
3061         // release_lock_last
3062         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3063             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3064       }
3065     } else if (ompt_enabled.ompt_callback_nest_lock) {
3066       // release_lock_previous
3067       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3069     }
3070   }
3071 #endif
3072 
3073 #endif // KMP_USE_DYNAMIC_LOCK
3074 }
3075 
3076 /* try to acquire the lock */
3077 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3078   KMP_COUNT_BLOCK(OMP_test_lock);
3079 
3080 #if KMP_USE_DYNAMIC_LOCK
3081   int rc;
3082   int tag = KMP_EXTRACT_D_TAG(user_lock);
3083 #if USE_ITT_BUILD
3084   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3085 #endif
3086 #if OMPT_SUPPORT && OMPT_OPTIONAL
3087   // This is the case, if called from omp_init_lock_with_hint:
3088   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3089   if (!codeptr)
3090     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3091   if (ompt_enabled.ompt_callback_mutex_acquire) {
3092     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3093         ompt_mutex_lock, omp_lock_hint_none,
3094         __ompt_get_mutex_impl_type(user_lock),
3095         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3096   }
3097 #endif
3098 #if KMP_USE_INLINED_TAS
3099   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3100     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3101   } else
3102 #elif KMP_USE_INLINED_FUTEX
3103   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3104     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3105   } else
3106 #endif
3107   {
3108     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3109   }
3110   if (rc) {
3111 #if USE_ITT_BUILD
3112     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3113 #endif
3114 #if OMPT_SUPPORT && OMPT_OPTIONAL
3115     if (ompt_enabled.ompt_callback_mutex_acquired) {
3116       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3117           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3118     }
3119 #endif
3120     return FTN_TRUE;
3121   } else {
3122 #if USE_ITT_BUILD
3123     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3124 #endif
3125     return FTN_FALSE;
3126   }
3127 
3128 #else // KMP_USE_DYNAMIC_LOCK
3129 
3130   kmp_user_lock_p lck;
3131   int rc;
3132 
3133   if ((__kmp_user_lock_kind == lk_tas) &&
3134       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3135     lck = (kmp_user_lock_p)user_lock;
3136   }
3137 #if KMP_USE_FUTEX
3138   else if ((__kmp_user_lock_kind == lk_futex) &&
3139            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3140     lck = (kmp_user_lock_p)user_lock;
3141   }
3142 #endif
3143   else {
3144     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3145   }
3146 
3147 #if USE_ITT_BUILD
3148   __kmp_itt_lock_acquiring(lck);
3149 #endif /* USE_ITT_BUILD */
3150 #if OMPT_SUPPORT && OMPT_OPTIONAL
3151   // This is the case, if called from omp_init_lock_with_hint:
3152   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3153   if (!codeptr)
3154     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3155   if (ompt_enabled.ompt_callback_mutex_acquire) {
3156     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3157         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3158         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3159   }
3160 #endif
3161 
3162   rc = TEST_LOCK(lck, gtid);
3163 #if USE_ITT_BUILD
3164   if (rc) {
3165     __kmp_itt_lock_acquired(lck);
3166   } else {
3167     __kmp_itt_lock_cancelled(lck);
3168   }
3169 #endif /* USE_ITT_BUILD */
3170 #if OMPT_SUPPORT && OMPT_OPTIONAL
3171   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3172     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3173         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3174   }
3175 #endif
3176 
3177   return (rc ? FTN_TRUE : FTN_FALSE);
3178 
3179   /* Can't use serial interval since not block structured */
3180 
3181 #endif // KMP_USE_DYNAMIC_LOCK
3182 }
3183 
3184 /* try to acquire the lock */
3185 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3186 #if KMP_USE_DYNAMIC_LOCK
3187   int rc;
3188 #if USE_ITT_BUILD
3189   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3190 #endif
3191 #if OMPT_SUPPORT && OMPT_OPTIONAL
3192   // This is the case, if called from omp_init_lock_with_hint:
3193   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3194   if (!codeptr)
3195     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3196   if (ompt_enabled.ompt_callback_mutex_acquire) {
3197     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3198         ompt_mutex_nest_lock, omp_lock_hint_none,
3199         __ompt_get_mutex_impl_type(user_lock),
3200         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3201   }
3202 #endif
3203   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3204 #if USE_ITT_BUILD
3205   if (rc) {
3206     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3207   } else {
3208     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3209   }
3210 #endif
3211 #if OMPT_SUPPORT && OMPT_OPTIONAL
3212   if (ompt_enabled.enabled && rc) {
3213     if (rc == 1) {
3214       if (ompt_enabled.ompt_callback_mutex_acquired) {
3215         // lock_first
3216         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3217             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3218             codeptr);
3219       }
3220     } else {
3221       if (ompt_enabled.ompt_callback_nest_lock) {
3222         // lock_next
3223         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3224             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3225       }
3226     }
3227   }
3228 #endif
3229   return rc;
3230 
3231 #else // KMP_USE_DYNAMIC_LOCK
3232 
3233   kmp_user_lock_p lck;
3234   int rc;
3235 
3236   if ((__kmp_user_lock_kind == lk_tas) &&
3237       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3238        OMP_NEST_LOCK_T_SIZE)) {
3239     lck = (kmp_user_lock_p)user_lock;
3240   }
3241 #if KMP_USE_FUTEX
3242   else if ((__kmp_user_lock_kind == lk_futex) &&
3243            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3244             OMP_NEST_LOCK_T_SIZE)) {
3245     lck = (kmp_user_lock_p)user_lock;
3246   }
3247 #endif
3248   else {
3249     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3250   }
3251 
3252 #if USE_ITT_BUILD
3253   __kmp_itt_lock_acquiring(lck);
3254 #endif /* USE_ITT_BUILD */
3255 
3256 #if OMPT_SUPPORT && OMPT_OPTIONAL
3257   // This is the case, if called from omp_init_lock_with_hint:
3258   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3259   if (!codeptr)
3260     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
3268 #endif
3269 
3270   rc = TEST_NESTED_LOCK(lck, gtid);
3271 #if USE_ITT_BUILD
3272   if (rc) {
3273     __kmp_itt_lock_acquired(lck);
3274   } else {
3275     __kmp_itt_lock_cancelled(lck);
3276   }
3277 #endif /* USE_ITT_BUILD */
3278 #if OMPT_SUPPORT && OMPT_OPTIONAL
3279   if (ompt_enabled.enabled && rc) {
3280     if (rc == 1) {
3281       if (ompt_enabled.ompt_callback_mutex_acquired) {
3282         // lock_first
3283         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3284             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3285       }
3286     } else {
3287       if (ompt_enabled.ompt_callback_nest_lock) {
3288         // lock_next
3289         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3291       }
3292     }
3293   }
3294 #endif
3295   return rc;
3296 
3297   /* Can't use serial interval since not block structured */
3298 
3299 #endif // KMP_USE_DYNAMIC_LOCK
3300 }
3301 
3302 // Interface to fast scalable reduce methods routines
3303 
3304 // keep the selected method in a thread local structure for cross-function
3305 // usage: will be used in __kmpc_end_reduce* functions;
3306 // another solution: to re-determine the method one more time in
3307 // __kmpc_end_reduce* functions (new prototype required then)
3308 // AT: which solution is better?
3309 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3310   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3311 
3312 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3313   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3314 
3315 // description of the packed_reduction_method variable: look at the macros in
3316 // kmp.h
3317 
3318 // used in a critical section reduce block
3319 static __forceinline void
3320 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3321                                           kmp_critical_name *crit) {
3322 
3323   // this lock was visible to a customer and to the threading profile tool as a
3324   // serial overhead span (although it's used for an internal purpose only)
3325   //            why was it visible in previous implementation?
3326   //            should we keep it visible in new reduce block?
3327   kmp_user_lock_p lck;
3328 
3329 #if KMP_USE_DYNAMIC_LOCK
3330 
3331   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3332   // Check if it is initialized.
3333   if (*lk == 0) {
3334     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3335       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3336                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3337     } else {
3338       __kmp_init_indirect_csptr(crit, loc, global_tid,
3339                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3340     }
3341   }
3342   // Branch for accessing the actual lock object and set operation. This
3343   // branching is inevitable since this lock initialization does not follow the
3344   // normal dispatch path (lock table is not used).
3345   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3346     lck = (kmp_user_lock_p)lk;
3347     KMP_DEBUG_ASSERT(lck != NULL);
3348     if (__kmp_env_consistency_check) {
3349       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3350     }
3351     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3352   } else {
3353     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3354     lck = ilk->lock;
3355     KMP_DEBUG_ASSERT(lck != NULL);
3356     if (__kmp_env_consistency_check) {
3357       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3358     }
3359     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3360   }
3361 
3362 #else // KMP_USE_DYNAMIC_LOCK
3363 
3364   // We know that the fast reduction code is only emitted by Intel compilers
3365   // with 32 byte critical sections. If there isn't enough space, then we
3366   // have to use a pointer.
3367   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3368     lck = (kmp_user_lock_p)crit;
3369   } else {
3370     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3371   }
3372   KMP_DEBUG_ASSERT(lck != NULL);
3373 
3374   if (__kmp_env_consistency_check)
3375     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3376 
3377   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3378 
3379 #endif // KMP_USE_DYNAMIC_LOCK
3380 }
3381 
3382 // used in a critical section reduce block
3383 static __forceinline void
3384 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3385                                         kmp_critical_name *crit) {
3386 
3387   kmp_user_lock_p lck;
3388 
3389 #if KMP_USE_DYNAMIC_LOCK
3390 
3391   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3392     lck = (kmp_user_lock_p)crit;
3393     if (__kmp_env_consistency_check)
3394       __kmp_pop_sync(global_tid, ct_critical, loc);
3395     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3396   } else {
3397     kmp_indirect_lock_t *ilk =
3398         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3399     if (__kmp_env_consistency_check)
3400       __kmp_pop_sync(global_tid, ct_critical, loc);
3401     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3402   }
3403 
3404 #else // KMP_USE_DYNAMIC_LOCK
3405 
3406   // We know that the fast reduction code is only emitted by Intel compilers
3407   // with 32 byte critical sections. If there isn't enough space, then we have
3408   // to use a pointer.
3409   if (__kmp_base_user_lock_size > 32) {
3410     lck = *((kmp_user_lock_p *)crit);
3411     KMP_ASSERT(lck != NULL);
3412   } else {
3413     lck = (kmp_user_lock_p)crit;
3414   }
3415 
3416   if (__kmp_env_consistency_check)
3417     __kmp_pop_sync(global_tid, ct_critical, loc);
3418 
3419   __kmp_release_user_lock_with_checks(lck, global_tid);
3420 
3421 #endif // KMP_USE_DYNAMIC_LOCK
3422 } // __kmp_end_critical_section_reduce_block
3423 
3424 static __forceinline int
3425 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3426                                      int *task_state) {
3427   kmp_team_t *team;
3428 
  // Check if we are inside a teams construct.
3430   if (th->th.th_teams_microtask) {
3431     *team_p = team = th->th.th_team;
3432     if (team->t.t_level == th->th.th_teams_level) {
3433       // This is reduction at teams construct.
3434       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3435       // Let's swap teams temporarily for the reduction.
3436       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3437       th->th.th_team = team->t.t_parent;
3438       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3439       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3440       *task_state = th->th.th_task_state;
3441       th->th.th_task_state = 0;
3442 
3443       return 1;
3444     }
3445   }
3446   return 0;
3447 }
3448 
3449 static __forceinline void
3450 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3451   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3452   th->th.th_info.ds.ds_tid = 0;
3453   th->th.th_team = team;
3454   th->th.th_team_nproc = team->t.t_nproc;
3455   th->th.th_task_team = team->t.t_task_team[task_state];
3456   __kmp_type_convert(task_state, &(th->th.th_task_state));
3457 }
3458 
3459 /* 2.a.i. Reduce Block without a terminating barrier */
3460 /*!
3461 @ingroup SYNCHRONIZATION
3462 @param loc source location information
3463 @param global_tid global thread number
3464 @param num_vars number of items (variables) to be reduced
3465 @param reduce_size size of data in bytes to be reduced
3466 @param reduce_data pointer to data to be reduced
3467 @param reduce_func callback function providing reduction operation on two
3468 operands and returning result of reduction in lhs_data
3469 @param lck pointer to the unique lock data structure
3470 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3471 threads if atomic reduction needed
3472 
3473 The nowait version is used for a reduce clause with the nowait argument.
3474 */
3475 kmp_int32
3476 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3477                      size_t reduce_size, void *reduce_data,
3478                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3479                      kmp_critical_name *lck) {
3480 
3481   KMP_COUNT_BLOCK(REDUCE_nowait);
3482   int retval = 0;
3483   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3484   kmp_info_t *th;
3485   kmp_team_t *team;
3486   int teams_swapped = 0, task_state;
3487   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3488   __kmp_assert_valid_gtid(global_tid);
3489 
3490   // why do we need this initialization here at all?
3491   // Reduction clause can not be used as a stand-alone directive.
3492 
3493   // do not call __kmp_serial_initialize(), it will be called by
3494   // __kmp_parallel_initialize() if needed
3495   // possible detection of false-positive race by the threadchecker ???
3496   if (!TCR_4(__kmp_init_parallel))
3497     __kmp_parallel_initialize();
3498 
3499   __kmp_resume_if_soft_paused();
3500 
3501 // check correctness of reduce block nesting
3502 #if KMP_USE_DYNAMIC_LOCK
3503   if (__kmp_env_consistency_check)
3504     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3505 #else
3506   if (__kmp_env_consistency_check)
3507     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3508 #endif
3509 
3510   th = __kmp_thread_from_gtid(global_tid);
3511   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3512 
3513   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3514   // the value should be kept in a variable
3515   // the variable should be either a construct-specific or thread-specific
3516   // property, not a team specific property
3517   //     (a thread can reach the next reduce block on the next construct, reduce
3518   //     method may differ on the next construct)
3519   // an ident_t "loc" parameter could be used as a construct-specific property
3520   // (what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unnecessary extra syncs would be needed)
3523   // a thread-specific variable is better regarding two issues above (next
3524   // construct and extra syncs)
3525   // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes the 'determine' and 'set' lines (no need to restrict
  // this to one thread; doing it everywhere avoids unnecessary extra syncs)
3528 
3529   packed_reduction_method = __kmp_determine_reduction_method(
3530       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3531   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3532 
3533   OMPT_REDUCTION_DECL(th, global_tid);
3534   if (packed_reduction_method == critical_reduce_block) {
3535 
3536     OMPT_REDUCTION_BEGIN;
3537 
3538     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3539     retval = 1;
3540 
3541   } else if (packed_reduction_method == empty_reduce_block) {
3542 
3543     OMPT_REDUCTION_BEGIN;
3544 
3545     // usage: if team size == 1, no synchronization is required ( Intel
3546     // platforms only )
3547     retval = 1;
3548 
3549   } else if (packed_reduction_method == atomic_reduce_block) {
3550 
3551     retval = 2;
3552 
3553     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3554     // won't be called by the code gen)
    //     (this is not ideal: the checking block has already been closed by
    //      this 'pop', but the atomic operation has not executed yet; it runs
    //      slightly later, literally on the next instruction)
3559     if (__kmp_env_consistency_check)
3560       __kmp_pop_sync(global_tid, ct_reduce, loc);
3561 
3562   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3563                                    tree_reduce_block)) {
3564 
3565 // AT: performance issue: a real barrier here
3566 // AT: (if primary thread is slow, other threads are blocked here waiting for
3567 //      the primary thread to come and release them)
3568 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3569 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3570 //      be confusing to a customer)
3571 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3572 // might go faster and be more in line with sense of NOWAIT
3573 // AT: TO DO: do epcc test and compare times
3574 
3575 // this barrier should be invisible to a customer and to the threading profile
3576 // tool (it's neither a terminating barrier nor customer's code, it's
3577 // used for an internal purpose)
3578 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
3580     // JP: as long as there is a barrier in the implementation, OMPT should and
3581     // will provide the barrier events
    //         so we set up the necessary frame/return addresses.
3583     ompt_frame_t *ompt_frame;
3584     if (ompt_enabled.enabled) {
3585       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3586       if (ompt_frame->enter_frame.ptr == NULL)
3587         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3588     }
3589     OMPT_STORE_RETURN_ADDRESS(global_tid);
3590 #endif
3591 #if USE_ITT_NOTIFY
3592     __kmp_threads[global_tid]->th.th_ident = loc;
3593 #endif
3594     retval =
3595         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3596                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3597     retval = (retval != 0) ? (0) : (1);
3598 #if OMPT_SUPPORT && OMPT_OPTIONAL
3599     if (ompt_enabled.enabled) {
3600       ompt_frame->enter_frame = ompt_data_none;
3601     }
3602 #endif
3603 
3604     // all other workers except primary thread should do this pop here
3605     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3606     if (__kmp_env_consistency_check) {
3607       if (retval == 0) {
3608         __kmp_pop_sync(global_tid, ct_reduce, loc);
3609       }
3610     }
3611 
3612   } else {
3613 
3614     // should never reach this block
3615     KMP_ASSERT(0); // "unexpected method"
3616   }
3617   if (teams_swapped) {
3618     __kmp_restore_swapped_teams(th, team, task_state);
3619   }
3620   KA_TRACE(
3621       10,
3622       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3623        global_tid, packed_reduction_method, retval));
3624 
3625   return retval;
3626 }
3627 
3628 /*!
3629 @ingroup SYNCHRONIZATION
3630 @param loc source location information
3631 @param global_tid global thread id.
3632 @param lck pointer to the unique lock data structure
3633 
3634 Finish the execution of a reduce nowait.
3635 */
3636 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3637                               kmp_critical_name *lck) {
3638 
3639   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3640 
3641   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3642   __kmp_assert_valid_gtid(global_tid);
3643 
3644   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3645 
3646   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3647 
3648   if (packed_reduction_method == critical_reduce_block) {
3649 
3650     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3651     OMPT_REDUCTION_END;
3652 
3653   } else if (packed_reduction_method == empty_reduce_block) {
3654 
3655     // usage: if team size == 1, no synchronization is required ( on Intel
3656     // platforms only )
3657 
3658     OMPT_REDUCTION_END;
3659 
3660   } else if (packed_reduction_method == atomic_reduce_block) {
3661 
3662     // neither primary thread nor other workers should get here
3663     //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely;
    // after removal this value would be caught by the 'else' branch and assert
3666 
3667   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3668                                    tree_reduce_block)) {
3669 
3670     // only primary thread gets here
3671     // OMPT: tree reduction is annotated in the barrier code
3672 
3673   } else {
3674 
3675     // should never reach this block
3676     KMP_ASSERT(0); // "unexpected method"
3677   }
3678 
3679   if (__kmp_env_consistency_check)
3680     __kmp_pop_sync(global_tid, ct_reduce, loc);
3681 
3682   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3683                 global_tid, packed_reduction_method));
3684 
3685   return;
3686 }
3687 
3688 /* 2.a.ii. Reduce Block with a terminating barrier */
3689 
3690 /*!
3691 @ingroup SYNCHRONIZATION
3692 @param loc source location information
3693 @param global_tid global thread number
3694 @param num_vars number of items (variables) to be reduced
3695 @param reduce_size size of data in bytes to be reduced
3696 @param reduce_data pointer to data to be reduced
3697 @param reduce_func callback function providing reduction operation on two
3698 operands and returning result of reduction in lhs_data
3699 @param lck pointer to the unique lock data structure
3700 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3701 threads if atomic reduction needed
3702 
3703 A blocking reduce that includes an implicit barrier.
3704 */
3705 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3706                         size_t reduce_size, void *reduce_data,
3707                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3708                         kmp_critical_name *lck) {
3709   KMP_COUNT_BLOCK(REDUCE_wait);
3710   int retval = 0;
3711   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3712   kmp_info_t *th;
3713   kmp_team_t *team;
3714   int teams_swapped = 0, task_state;
3715 
3716   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3717   __kmp_assert_valid_gtid(global_tid);
3718 
3719   // why do we need this initialization here at all?
3720   // Reduction clause can not be a stand-alone directive.
3721 
3722   // do not call __kmp_serial_initialize(), it will be called by
3723   // __kmp_parallel_initialize() if needed
3724   // possible detection of false-positive race by the threadchecker ???
3725   if (!TCR_4(__kmp_init_parallel))
3726     __kmp_parallel_initialize();
3727 
3728   __kmp_resume_if_soft_paused();
3729 
3730 // check correctness of reduce block nesting
3731 #if KMP_USE_DYNAMIC_LOCK
3732   if (__kmp_env_consistency_check)
3733     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3734 #else
3735   if (__kmp_env_consistency_check)
3736     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3737 #endif
3738 
3739   th = __kmp_thread_from_gtid(global_tid);
3740   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3741 
3742   packed_reduction_method = __kmp_determine_reduction_method(
3743       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3744   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3745 
3746   OMPT_REDUCTION_DECL(th, global_tid);
3747 
3748   if (packed_reduction_method == critical_reduce_block) {
3749 
3750     OMPT_REDUCTION_BEGIN;
3751     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3752     retval = 1;
3753 
3754   } else if (packed_reduction_method == empty_reduce_block) {
3755 
3756     OMPT_REDUCTION_BEGIN;
3757     // usage: if team size == 1, no synchronization is required ( Intel
3758     // platforms only )
3759     retval = 1;
3760 
3761   } else if (packed_reduction_method == atomic_reduce_block) {
3762 
3763     retval = 2;
3764 
3765   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3766                                    tree_reduce_block)) {
3767 
3768 // case tree_reduce_block:
3769 // this barrier should be visible to a customer and to the threading profile
3770 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3771 #if OMPT_SUPPORT
3772     ompt_frame_t *ompt_frame;
3773     if (ompt_enabled.enabled) {
3774       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3775       if (ompt_frame->enter_frame.ptr == NULL)
3776         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3777     }
3778     OMPT_STORE_RETURN_ADDRESS(global_tid);
3779 #endif
3780 #if USE_ITT_NOTIFY
3781     __kmp_threads[global_tid]->th.th_ident =
3782         loc; // needed for correct notification of frames
3783 #endif
3784     retval =
3785         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3786                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3787     retval = (retval != 0) ? (0) : (1);
3788 #if OMPT_SUPPORT && OMPT_OPTIONAL
3789     if (ompt_enabled.enabled) {
3790       ompt_frame->enter_frame = ompt_data_none;
3791     }
3792 #endif
3793 
3794     // all other workers except primary thread should do this pop here
3795     // (none of other workers except primary will enter __kmpc_end_reduce())
3796     if (__kmp_env_consistency_check) {
3797       if (retval == 0) { // 0: all other workers; 1: primary thread
3798         __kmp_pop_sync(global_tid, ct_reduce, loc);
3799       }
3800     }
3801 
3802   } else {
3803 
3804     // should never reach this block
3805     KMP_ASSERT(0); // "unexpected method"
3806   }
3807   if (teams_swapped) {
3808     __kmp_restore_swapped_teams(th, team, task_state);
3809   }
3810 
3811   KA_TRACE(10,
3812            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3813             global_tid, packed_reduction_method, retval));
3814   return retval;
3815 }
3816 
3817 /*!
3818 @ingroup SYNCHRONIZATION
3819 @param loc source location information
3820 @param global_tid global thread id.
3821 @param lck pointer to the unique lock data structure
3822 
3823 Finish the execution of a blocking reduce.
3824 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3825 start function.
3826 */
3827 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3828                        kmp_critical_name *lck) {
3829 
3830   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3831   kmp_info_t *th;
3832   kmp_team_t *team;
3833   int teams_swapped = 0, task_state;
3834 
3835   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3836   __kmp_assert_valid_gtid(global_tid);
3837 
3838   th = __kmp_thread_from_gtid(global_tid);
3839   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3840 
3841   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3842 
3843   // this barrier should be visible to a customer and to the threading profile
3844   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3845   OMPT_REDUCTION_DECL(th, global_tid);
3846 
3847   if (packed_reduction_method == critical_reduce_block) {
3848     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3849 
3850     OMPT_REDUCTION_END;
3851 
3852 // TODO: implicit barrier: should be exposed
3853 #if OMPT_SUPPORT
3854     ompt_frame_t *ompt_frame;
3855     if (ompt_enabled.enabled) {
3856       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3857       if (ompt_frame->enter_frame.ptr == NULL)
3858         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3859     }
3860     OMPT_STORE_RETURN_ADDRESS(global_tid);
3861 #endif
3862 #if USE_ITT_NOTIFY
3863     __kmp_threads[global_tid]->th.th_ident = loc;
3864 #endif
3865     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3866 #if OMPT_SUPPORT && OMPT_OPTIONAL
3867     if (ompt_enabled.enabled) {
3868       ompt_frame->enter_frame = ompt_data_none;
3869     }
3870 #endif
3871 
3872   } else if (packed_reduction_method == empty_reduce_block) {
3873 
3874     OMPT_REDUCTION_END;
3875 
3876 // usage: if team size==1, no synchronization is required (Intel platforms only)
3877 
3878 // TODO: implicit barrier: should be exposed
3879 #if OMPT_SUPPORT
3880     ompt_frame_t *ompt_frame;
3881     if (ompt_enabled.enabled) {
3882       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3883       if (ompt_frame->enter_frame.ptr == NULL)
3884         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3885     }
3886     OMPT_STORE_RETURN_ADDRESS(global_tid);
3887 #endif
3888 #if USE_ITT_NOTIFY
3889     __kmp_threads[global_tid]->th.th_ident = loc;
3890 #endif
3891     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3892 #if OMPT_SUPPORT && OMPT_OPTIONAL
3893     if (ompt_enabled.enabled) {
3894       ompt_frame->enter_frame = ompt_data_none;
3895     }
3896 #endif
3897 
3898   } else if (packed_reduction_method == atomic_reduce_block) {
3899 
3900 #if OMPT_SUPPORT
3901     ompt_frame_t *ompt_frame;
3902     if (ompt_enabled.enabled) {
3903       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3904       if (ompt_frame->enter_frame.ptr == NULL)
3905         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3906     }
3907     OMPT_STORE_RETURN_ADDRESS(global_tid);
3908 #endif
3909 // TODO: implicit barrier: should be exposed
3910 #if USE_ITT_NOTIFY
3911     __kmp_threads[global_tid]->th.th_ident = loc;
3912 #endif
3913     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3914 #if OMPT_SUPPORT && OMPT_OPTIONAL
3915     if (ompt_enabled.enabled) {
3916       ompt_frame->enter_frame = ompt_data_none;
3917     }
3918 #endif
3919 
3920   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3921                                    tree_reduce_block)) {
3922 
3923     // only primary thread executes here (primary releases all other workers)
3924     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3925                             global_tid);
3926 
3927   } else {
3928 
3929     // should never reach this block
3930     KMP_ASSERT(0); // "unexpected method"
3931   }
3932   if (teams_swapped) {
3933     __kmp_restore_swapped_teams(th, team, task_state);
3934   }
3935 
3936   if (__kmp_env_consistency_check)
3937     __kmp_pop_sync(global_tid, ct_reduce, loc);
3938 
3939   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3940                 global_tid, packed_reduction_method));
3941 
3942   return;
3943 }
3944 
3945 #undef __KMP_GET_REDUCTION_METHOD
3946 #undef __KMP_SET_REDUCTION_METHOD
3947 
3948 /* end of interface to fast scalable reduce routines */
3949 
3950 kmp_uint64 __kmpc_get_taskid() {
3951 
3952   kmp_int32 gtid;
3953   kmp_info_t *thread;
3954 
3955   gtid = __kmp_get_gtid();
3956   if (gtid < 0) {
3957     return 0;
3958   }
3959   thread = __kmp_thread_from_gtid(gtid);
3960   return thread->th.th_current_task->td_task_id;
3961 
3962 } // __kmpc_get_taskid
3963 
3964 kmp_uint64 __kmpc_get_parent_taskid() {
3965 
3966   kmp_int32 gtid;
3967   kmp_info_t *thread;
3968   kmp_taskdata_t *parent_task;
3969 
3970   gtid = __kmp_get_gtid();
3971   if (gtid < 0) {
3972     return 0;
3973   }
3974   thread = __kmp_thread_from_gtid(gtid);
3975   parent_task = thread->th.th_current_task->td_parent;
3976   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3977 
3978 } // __kmpc_get_parent_taskid
3979 
3980 /*!
3981 @ingroup WORK_SHARING
3982 @param loc  source location information.
3983 @param gtid  global thread number.
3984 @param num_dims  number of associated doacross loops.
@param dims  info on loop bounds.
3986 
3987 Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for (i=2; i<9; i+=2): lo=2, up=8, st=2.
3990 */
3991 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3992                           const struct kmp_dim *dims) {
3993   __kmp_assert_valid_gtid(gtid);
3994   int j, idx;
3995   kmp_int64 last, trace_count;
3996   kmp_info_t *th = __kmp_threads[gtid];
3997   kmp_team_t *team = th->th.th_team;
3998   kmp_uint32 *flags;
3999   kmp_disp_t *pr_buf = th->th.th_dispatch;
4000   dispatch_shared_info_t *sh_buf;
4001 
4002   KA_TRACE(
4003       20,
4004       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4005        gtid, num_dims, !team->t.t_serialized));
4006   KMP_DEBUG_ASSERT(dims != NULL);
4007   KMP_DEBUG_ASSERT(num_dims > 0);
4008 
4009   if (team->t.t_serialized) {
4010     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4011     return; // no dependencies if team is serialized
4012   }
4013   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4014   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4015   // the next loop
4016   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4017 
4018   // Save bounds info into allocated private buffer
4019   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4020   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4021       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4022   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4023   pr_buf->th_doacross_info[0] =
4024       (kmp_int64)num_dims; // first element is number of dimensions
4025   // Save also address of num_done in order to access it later without knowing
4026   // the buffer index
4027   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4028   pr_buf->th_doacross_info[2] = dims[0].lo;
4029   pr_buf->th_doacross_info[3] = dims[0].up;
4030   pr_buf->th_doacross_info[4] = dims[0].st;
4031   last = 5;
4032   for (j = 1; j < num_dims; ++j) {
4033     kmp_int64
4034         range_length; // To keep ranges of all dimensions but the first dims[0]
4035     if (dims[j].st == 1) { // most common case
      // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
4037       range_length = dims[j].up - dims[j].lo + 1;
4038     } else {
4039       if (dims[j].st > 0) {
4040         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4041         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4042       } else { // negative increment
4043         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4044         range_length =
4045             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4046       }
4047     }
4048     pr_buf->th_doacross_info[last++] = range_length;
4049     pr_buf->th_doacross_info[last++] = dims[j].lo;
4050     pr_buf->th_doacross_info[last++] = dims[j].up;
4051     pr_buf->th_doacross_info[last++] = dims[j].st;
4052   }
4053 
4054   // Compute total trip count.
4055   // Start with range of dims[0] which we don't need to keep in the buffer.
4056   if (dims[0].st == 1) { // most common case
4057     trace_count = dims[0].up - dims[0].lo + 1;
4058   } else if (dims[0].st > 0) {
4059     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4060     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4061   } else { // negative increment
4062     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4063     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4064   }
4065   for (j = 1; j < num_dims; ++j) {
4066     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4067   }
4068   KMP_DEBUG_ASSERT(trace_count > 0);
4069 
4070   // Check if shared buffer is not occupied by other loop (idx -
4071   // __kmp_dispatch_num_buffers)
4072   if (idx != sh_buf->doacross_buf_idx) {
4073     // Shared buffer is occupied, wait for it to be free
4074     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4075                  __kmp_eq_4, NULL);
4076   }
4077 #if KMP_32_BIT_ARCH
4078   // Check if we are the first thread. After the CAS the first thread gets 0,
4079   // others get 1 if initialization is in progress, allocated pointer otherwise.
4080   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4081   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4082       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4083 #else
4084   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4085       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4086 #endif
4087   if (flags == NULL) {
4088     // we are the first thread, allocate the array of flags
4089     size_t size =
4090         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4091     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4092     KMP_MB();
4093     sh_buf->doacross_flags = flags;
4094   } else if (flags == (kmp_uint32 *)1) {
4095 #if KMP_32_BIT_ARCH
4096     // initialization is still in progress, need to wait
4097     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4098 #else
4099     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4100 #endif
4101       KMP_YIELD(TRUE);
4102     KMP_MB();
4103   } else {
4104     KMP_MB();
4105   }
4106   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4107   pr_buf->th_doacross_flags =
4108       sh_buf->doacross_flags; // save private copy in order to not
4109   // touch shared buffer on each iteration
4110   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4111 }
4112 
4113 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4114   __kmp_assert_valid_gtid(gtid);
4115   kmp_int64 shft;
4116   size_t num_dims, i;
4117   kmp_uint32 flag;
4118   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4119   kmp_info_t *th = __kmp_threads[gtid];
4120   kmp_team_t *team = th->th.th_team;
4121   kmp_disp_t *pr_buf;
4122   kmp_int64 lo, up, st;
4123 
4124   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4125   if (team->t.t_serialized) {
4126     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4127     return; // no dependencies if team is serialized
4128   }
4129 
4130   // calculate sequential iteration number and check out-of-bounds condition
4131   pr_buf = th->th.th_dispatch;
4132   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4133   num_dims = (size_t)pr_buf->th_doacross_info[0];
4134   lo = pr_buf->th_doacross_info[2];
4135   up = pr_buf->th_doacross_info[3];
4136   st = pr_buf->th_doacross_info[4];
4137 #if OMPT_SUPPORT && OMPT_OPTIONAL
4138   ompt_dependence_t deps[num_dims];
4139 #endif
4140   if (st == 1) { // most common case
4141     if (vec[0] < lo || vec[0] > up) {
4142       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4143                     "bounds [%lld,%lld]\n",
4144                     gtid, vec[0], lo, up));
4145       return;
4146     }
4147     iter_number = vec[0] - lo;
4148   } else if (st > 0) {
4149     if (vec[0] < lo || vec[0] > up) {
4150       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4151                     "bounds [%lld,%lld]\n",
4152                     gtid, vec[0], lo, up));
4153       return;
4154     }
4155     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4156   } else { // negative increment
4157     if (vec[0] > lo || vec[0] < up) {
4158       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4159                     "bounds [%lld,%lld]\n",
4160                     gtid, vec[0], lo, up));
4161       return;
4162     }
4163     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4164   }
4165 #if OMPT_SUPPORT && OMPT_OPTIONAL
4166   deps[0].variable.value = iter_number;
4167   deps[0].dependence_type = ompt_dependence_type_sink;
4168 #endif
4169   for (i = 1; i < num_dims; ++i) {
4170     kmp_int64 iter, ln;
4171     size_t j = i * 4;
4172     ln = pr_buf->th_doacross_info[j + 1];
4173     lo = pr_buf->th_doacross_info[j + 2];
4174     up = pr_buf->th_doacross_info[j + 3];
4175     st = pr_buf->th_doacross_info[j + 4];
4176     if (st == 1) {
4177       if (vec[i] < lo || vec[i] > up) {
4178         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4179                       "bounds [%lld,%lld]\n",
4180                       gtid, vec[i], lo, up));
4181         return;
4182       }
4183       iter = vec[i] - lo;
4184     } else if (st > 0) {
4185       if (vec[i] < lo || vec[i] > up) {
4186         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4187                       "bounds [%lld,%lld]\n",
4188                       gtid, vec[i], lo, up));
4189         return;
4190       }
4191       iter = (kmp_uint64)(vec[i] - lo) / st;
4192     } else { // st < 0
4193       if (vec[i] > lo || vec[i] < up) {
4194         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4195                       "bounds [%lld,%lld]\n",
4196                       gtid, vec[i], lo, up));
4197         return;
4198       }
4199       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4200     }
4201     iter_number = iter + ln * iter_number;
4202 #if OMPT_SUPPORT && OMPT_OPTIONAL
4203     deps[i].variable.value = iter;
4204     deps[i].dependence_type = ompt_dependence_type_sink;
4205 #endif
4206   }
4207   shft = iter_number % 32; // use 32-bit granularity
4208   iter_number >>= 5; // divided by 32
4209   flag = 1 << shft;
4210   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4211     KMP_YIELD(TRUE);
4212   }
4213   KMP_MB();
4214 #if OMPT_SUPPORT && OMPT_OPTIONAL
4215   if (ompt_enabled.ompt_callback_dependences) {
4216     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4217         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4218   }
4219 #endif
4220   KA_TRACE(20,
4221            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4222             gtid, (iter_number << 5) + shft));
4223 }
4224 
4225 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4226   __kmp_assert_valid_gtid(gtid);
4227   kmp_int64 shft;
4228   size_t num_dims, i;
4229   kmp_uint32 flag;
4230   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4231   kmp_info_t *th = __kmp_threads[gtid];
4232   kmp_team_t *team = th->th.th_team;
4233   kmp_disp_t *pr_buf;
4234   kmp_int64 lo, st;
4235 
4236   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4237   if (team->t.t_serialized) {
4238     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4239     return; // no dependencies if team is serialized
4240   }
4241 
4242   // calculate sequential iteration number (same as in "wait" but no
4243   // out-of-bounds checks)
4244   pr_buf = th->th.th_dispatch;
4245   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4246   num_dims = (size_t)pr_buf->th_doacross_info[0];
4247   lo = pr_buf->th_doacross_info[2];
4248   st = pr_buf->th_doacross_info[4];
4249 #if OMPT_SUPPORT && OMPT_OPTIONAL
4250   ompt_dependence_t deps[num_dims];
4251 #endif
4252   if (st == 1) { // most common case
4253     iter_number = vec[0] - lo;
4254   } else if (st > 0) {
4255     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4256   } else { // negative increment
4257     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4258   }
4259 #if OMPT_SUPPORT && OMPT_OPTIONAL
4260   deps[0].variable.value = iter_number;
4261   deps[0].dependence_type = ompt_dependence_type_source;
4262 #endif
4263   for (i = 1; i < num_dims; ++i) {
4264     kmp_int64 iter, ln;
4265     size_t j = i * 4;
4266     ln = pr_buf->th_doacross_info[j + 1];
4267     lo = pr_buf->th_doacross_info[j + 2];
4268     st = pr_buf->th_doacross_info[j + 4];
4269     if (st == 1) {
4270       iter = vec[i] - lo;
4271     } else if (st > 0) {
4272       iter = (kmp_uint64)(vec[i] - lo) / st;
4273     } else { // st < 0
4274       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4275     }
4276     iter_number = iter + ln * iter_number;
4277 #if OMPT_SUPPORT && OMPT_OPTIONAL
4278     deps[i].variable.value = iter;
4279     deps[i].dependence_type = ompt_dependence_type_source;
4280 #endif
4281   }
4282 #if OMPT_SUPPORT && OMPT_OPTIONAL
4283   if (ompt_enabled.ompt_callback_dependences) {
4284     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4285         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4286   }
4287 #endif
4288   shft = iter_number % 32; // use 32-bit granularity
4289   iter_number >>= 5; // divided by 32
4290   flag = 1 << shft;
4291   KMP_MB();
4292   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4293     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4294   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4295                 (iter_number << 5) + shft));
4296 }
4297 
4298 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4299   __kmp_assert_valid_gtid(gtid);
4300   kmp_int32 num_done;
4301   kmp_info_t *th = __kmp_threads[gtid];
4302   kmp_team_t *team = th->th.th_team;
4303   kmp_disp_t *pr_buf = th->th.th_dispatch;
4304 
4305   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4306   if (team->t.t_serialized) {
4307     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4308     return; // nothing to do
4309   }
4310   num_done =
4311       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4312   if (num_done == th->th.th_team_nproc) {
4313     // we are the last thread, need to free shared resources
4314     int idx = pr_buf->th_doacross_buf_idx - 1;
4315     dispatch_shared_info_t *sh_buf =
4316         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4317     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4318                      (kmp_int64)&sh_buf->doacross_num_done);
4319     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4320     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4321     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4322     sh_buf->doacross_flags = NULL;
4323     sh_buf->doacross_num_done = 0;
4324     sh_buf->doacross_buf_idx +=
4325         __kmp_dispatch_num_buffers; // free buffer for future re-use
4326   }
4327   // free private resources (need to keep buffer index forever)
4328   pr_buf->th_doacross_flags = NULL;
4329   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4330   pr_buf->th_doacross_info = NULL;
4331   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4332 }
4333 
4334 /* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */
4335 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4336   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4337 }
4338 
4339 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4340   return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
4341 }
4342 
4343 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4344                   omp_allocator_handle_t free_allocator) {
4345   return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4346                         free_allocator);
4347 }
4348 
4349 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4350   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4351 }
4352 
4353 int __kmpc_get_target_offload(void) {
4354   if (!__kmp_init_serial) {
4355     __kmp_serial_initialize();
4356   }
4357   return __kmp_target_offload;
4358 }
4359 
4360 int __kmpc_pause_resource(kmp_pause_status_t level) {
4361   if (!__kmp_init_serial) {
4362     return 1; // Can't pause if runtime is not initialized
4363   }
4364   return __kmp_pause_resource(level);
4365 }
4366 
4367 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4368   if (!__kmp_init_serial)
4369     __kmp_serial_initialize();
4370 
4371   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4372 
4373 #if OMPT_SUPPORT
4374   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4375     ompt_callbacks.ompt_callback(ompt_callback_error)(
4376         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4377         OMPT_GET_RETURN_ADDRESS(0));
4378   }
4379 #endif // OMPT_SUPPORT
4380 
4381   char *src_loc;
4382   if (loc && loc->psource) {
4383     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
    src_loc =
        __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
4386     __kmp_str_loc_free(&str_loc);
4387   } else {
4388     src_loc = __kmp_str_format("unknown");
4389   }
4390 
4391   if (severity == severity_warning)
4392     KMP_WARNING(UserDirectedWarning, src_loc, message);
4393   else
4394     KMP_FATAL(UserDirectedError, src_loc, message);
4395 
4396   __kmp_str_free(&src_loc);
4397 }
4398