1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     __kmp_assign_root_init_mask();
43     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
44   } else if (__kmp_ignore_mppbeg() == FALSE) {
45     // By default __kmp_ignore_mppbeg() returns TRUE.
46     __kmp_internal_begin();
47     KC_TRACE(10, ("__kmpc_begin: called\n"));
48   }
49 }
50 
51 /*!
52  * @ingroup STARTUP_SHUTDOWN
53  * @param loc source location information
54  *
55  * Shutdown the runtime library. This is also optional, and even if called will
56  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
57  * zero.
58  */
59 void __kmpc_end(ident_t *loc) {
60   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
61   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
62   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
63   // returns FALSE and __kmpc_end() will unregister this root (it can cause
64   // library shut down).
65   if (__kmp_ignore_mppend() == FALSE) {
66     KC_TRACE(10, ("__kmpc_end: called\n"));
67     KA_TRACE(30, ("__kmpc_end\n"));
68 
69     __kmp_internal_end_thread(-1);
70   }
71 #if KMP_OS_WINDOWS && OMPT_SUPPORT
72   // Normal exit process on Windows does not allow worker threads of the final
73   // parallel region to finish reporting their events, so shutting down the
74   // library here fixes the issue at least for the cases where __kmpc_end() is
75   // placed properly.
76   if (ompt_enabled.enabled)
77     __kmp_internal_end_library(__kmp_gtid_get_specific());
78 #endif
79 }
80 
81 /*!
82 @ingroup THREAD_STATES
83 @param loc Source location information.
84 @return The global thread index of the active thread.
85 
86 This function can be called in any context.
87 
If the runtime has only been entered at the outermost level from a
89 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
90 that which would be returned by omp_get_thread_num() in the outermost
91 active parallel construct. (Or zero if there is no active parallel
92 construct, since the primary thread is necessarily thread zero).
93 
94 If multiple non-OpenMP threads all enter an OpenMP construct then this
95 will be a unique thread identifier among all the threads created by
96 the OpenMP runtime (but the value cannot be defined in terms of
97 OpenMP thread ids returned by omp_get_thread_num()).
98 */
99 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
100   kmp_int32 gtid = __kmp_entry_gtid();
101 
102   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
103 
104   return gtid;
105 }
106 
107 /*!
108 @ingroup THREAD_STATES
109 @param loc Source location information.
110 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
111 
112 This function can be called in any context.
113 It returns the total number of threads under the control of the OpenMP runtime.
114 That is not a number that can be determined by any OpenMP standard calls, since
115 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, since the runtime maintains
underlying threads even when they are not active (because the cost of creating
and destroying OS threads is high), this call counts all such threads even if
they are not waiting for work.
120 */
121 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
122   KC_TRACE(10,
123            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
124 
125   return TCR_4(__kmp_all_nth);
126 }
127 
128 /*!
129 @ingroup THREAD_STATES
130 @param loc Source location information.
131 @return The thread number of the calling thread in the innermost active parallel
132 construct.
133 */
134 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
135   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
136   return __kmp_tid_from_gtid(__kmp_entry_gtid());
137 }
138 
139 /*!
140 @ingroup THREAD_STATES
141 @param loc Source location information.
142 @return The number of threads in the innermost active parallel construct.
143 */
144 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
145   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
146 
147   return __kmp_entry_thread()->th.th_team->t.t_nproc;
148 }
149 
150 /*!
151  * @ingroup DEPRECATED
152  * @param loc location description
153  *
154  * This function need not be called. It always returns TRUE.
155  */
156 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
157 #ifndef KMP_DEBUG
158 
159   return TRUE;
160 
161 #else
162 
163   const char *semi2;
164   const char *semi3;
165   int line_no;
166 
167   if (__kmp_par_range == 0) {
168     return TRUE;
169   }
170   semi2 = loc->psource;
171   if (semi2 == NULL) {
172     return TRUE;
173   }
174   semi2 = strchr(semi2, ';');
175   if (semi2 == NULL) {
176     return TRUE;
177   }
178   semi2 = strchr(semi2 + 1, ';');
179   if (semi2 == NULL) {
180     return TRUE;
181   }
182   if (__kmp_par_range_filename[0]) {
183     const char *name = semi2 - 1;
184     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
185       name--;
186     }
187     if ((*name == '/') || (*name == ';')) {
188       name++;
189     }
190     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
191       return __kmp_par_range < 0;
192     }
193   }
194   semi3 = strchr(semi2 + 1, ';');
195   if (__kmp_par_range_routine[0]) {
196     if ((semi3 != NULL) && (semi3 > semi2) &&
197         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
198       return __kmp_par_range < 0;
199     }
200   }
201   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
202     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
203       return __kmp_par_range > 0;
204     }
205     return __kmp_par_range < 0;
206   }
207   return TRUE;
208 
209 #endif /* KMP_DEBUG */
210 }
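
/* Note on the parsing above (an informal sketch, not a format guarantee): the
   loc->psource string is a semicolon-separated location record along the lines
   of

       ";path/to/file.c;routine_name;42;...;;"

   The filename test compares the basename of the second field with
   __kmp_par_range_filename, the routine test compares the third field with
   __kmp_par_range_routine, and the line number parsed from the next field is
   checked against [__kmp_par_range_lb, __kmp_par_range_ub]. */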
211 
212 /*!
213 @ingroup THREAD_STATES
214 @param loc Source location information.
215 @return 1 if this thread is executing inside an active parallel region, zero if
216 not.
217 */
218 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
219   return __kmp_entry_thread()->th.th_root->r.r_active;
220 }
221 
222 /*!
223 @ingroup PARALLEL
224 @param loc source location information
225 @param global_tid global thread number
226 @param num_threads number of threads requested for this parallel construct
227 
228 Set the number of threads to be used by the next fork spawned by this thread.
229 This call is only required if the parallel construct has a `num_threads` clause.
230 */
231 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
232                              kmp_int32 num_threads) {
233   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
234                 global_tid, num_threads));
235   __kmp_assert_valid_gtid(global_tid);
236   __kmp_push_num_threads(loc, global_tid, num_threads);
237 }
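
/* Illustrative lowering (a sketch only; exact code generation is up to the
   compiler): for a construct such as

       #pragma omp parallel num_threads(4)

   a compiler typically emits something like

       kmp_int32 gtid = __kmpc_global_thread_num(&loc);
       __kmpc_push_num_threads(&loc, gtid, 4);
       __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);

   where `loc` is a compiler-generated ident_t and `outlined_body` is the
   outlined parallel region (both names are illustrative). */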
238 
239 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
240   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
241   /* the num_threads are automatically popped */
242 }
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248   __kmp_assert_valid_gtid(global_tid);
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 /*!
253 @ingroup PARALLEL
254 @param loc  source location information
255 @param argc  total number of arguments in the ellipsis
256 @param microtask  pointer to callback routine consisting of outlined parallel
257 construct
258 @param ...  pointers to shared variables that aren't global
259 
260 Do the actual fork and call the microtask in the relevant number of threads.
261 */
262 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
263   int gtid = __kmp_entry_gtid();
264 
265 #if (KMP_STATS_ENABLED)
266   // If we were in a serial region, then stop the serial timer, record
267   // the event, and start parallel region timer
268   stats_state_e previous_state = KMP_GET_THREAD_STATE();
269   if (previous_state == stats_state_e::SERIAL_REGION) {
270     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
271   } else {
272     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
273   }
274   int inParallel = __kmpc_in_parallel(loc);
275   if (inParallel) {
276     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
277   } else {
278     KMP_COUNT_BLOCK(OMP_PARALLEL);
279   }
280 #endif
281 
  // maybe saving thr_state is enough here
283   {
284     va_list ap;
285     va_start(ap, microtask);
286 
287 #if OMPT_SUPPORT
288     ompt_frame_t *ompt_frame;
289     if (ompt_enabled.enabled) {
290       kmp_info_t *master_th = __kmp_threads[gtid];
291       kmp_team_t *parent_team = master_th->th.th_team;
292       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
293       if (lwt)
294         ompt_frame = &(lwt->ompt_task_info.frame);
295       else {
296         int tid = __kmp_tid_from_gtid(gtid);
297         ompt_frame = &(
298             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
299       }
300       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
301     }
302     OMPT_STORE_RETURN_ADDRESS(gtid);
303 #endif
304 
305 #if INCLUDE_SSC_MARKS
306     SSC_MARK_FORKING();
307 #endif
308     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
309                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
310                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
311                     kmp_va_addr_of(ap));
312 #if INCLUDE_SSC_MARKS
313     SSC_MARK_JOINING();
314 #endif
315     __kmp_join_call(loc, gtid
316 #if OMPT_SUPPORT
317                     ,
318                     fork_context_intel
319 #endif
320     );
321 
322     va_end(ap);
323   }
324 
325 #if KMP_STATS_ENABLED
326   if (previous_state == stats_state_e::SERIAL_REGION) {
327     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
328     KMP_SET_THREAD_STATE(previous_state);
329   } else {
330     KMP_POP_PARTITIONED_TIMER();
331   }
332 #endif // KMP_STATS_ENABLED
333 }
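
/* Illustrative use (a sketch, not the output of any particular compiler): a
   region such as

       #pragma omp parallel shared(a)
       { use(&a); }

   is outlined into a microtask that receives the global and bound thread ids
   followed by pointers to the shared variables, and the region itself becomes
   a call along the lines of

       void outlined_body(kmp_int32 *gtid, kmp_int32 *btid, int *a);

       __kmpc_fork_call(&loc, /*argc=*/1, (kmpc_micro)outlined_body, &a);

   where argc counts the shared-variable pointers passed in the ellipsis and
   `loc`, `outlined_body`, and `a` are illustrative names. */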
334 
335 /*!
336 @ingroup PARALLEL
337 @param loc source location information
338 @param global_tid global thread number
339 @param num_teams number of teams requested for the teams construct
340 @param num_threads number of threads per team requested for the teams construct
341 
342 Set the number of teams to be used by the teams construct.
343 This call is only required if the teams construct has a `num_teams` clause
344 or a `thread_limit` clause (or both).
345 */
346 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
347                            kmp_int32 num_teams, kmp_int32 num_threads) {
348   KA_TRACE(20,
349            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
350             global_tid, num_teams, num_threads));
351   __kmp_assert_valid_gtid(global_tid);
352   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
353 }
354 
355 /*!
356 @ingroup PARALLEL
357 @param loc source location information
358 @param global_tid global thread number
@param num_teams_lb lower bound on the number of teams requested for the teams
construct
@param num_teams_ub upper bound on the number of teams requested for the teams
construct
363 @param num_threads number of threads per team requested for the teams construct
364 
365 Set the number of teams to be used by the teams construct. The number of initial
teams created will be greater than or equal to the lower bound and less than or
367 equal to the upper bound.
368 This call is only required if the teams construct has a `num_teams` clause
369 or a `thread_limit` clause (or both).
370 */
371 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
372                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
373                               kmp_int32 num_threads) {
374   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
375                 " num_teams_ub=%d num_threads=%d\n",
376                 global_tid, num_teams_lb, num_teams_ub, num_threads));
377   __kmp_assert_valid_gtid(global_tid);
378   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
379                           num_threads);
380 }
381 
382 /*!
383 @ingroup PARALLEL
384 @param loc  source location information
385 @param argc  total number of arguments in the ellipsis
386 @param microtask  pointer to callback routine consisting of outlined teams
387 construct
388 @param ...  pointers to shared variables that aren't global
389 
390 Do the actual fork and call the microtask in the relevant number of threads.
391 */
392 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
393                        ...) {
394   int gtid = __kmp_entry_gtid();
395   kmp_info_t *this_thr = __kmp_threads[gtid];
396   va_list ap;
397   va_start(ap, microtask);
398 
399 #if KMP_STATS_ENABLED
400   KMP_COUNT_BLOCK(OMP_TEAMS);
401   stats_state_e previous_state = KMP_GET_THREAD_STATE();
402   if (previous_state == stats_state_e::SERIAL_REGION) {
403     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
404   } else {
405     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
406   }
407 #endif
408 
409   // remember teams entry point and nesting level
410   this_thr->th.th_teams_microtask = microtask;
411   this_thr->th.th_teams_level =
412       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
413 
414 #if OMPT_SUPPORT
415   kmp_team_t *parent_team = this_thr->th.th_team;
416   int tid = __kmp_tid_from_gtid(gtid);
417   if (ompt_enabled.enabled) {
418     parent_team->t.t_implicit_task_taskdata[tid]
419         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
420   }
421   OMPT_STORE_RETURN_ADDRESS(gtid);
422 #endif
423 
424   // check if __kmpc_push_num_teams called, set default number of teams
425   // otherwise
426   if (this_thr->th.th_teams_size.nteams == 0) {
427     __kmp_push_num_teams(loc, gtid, 0, 0);
428   }
429   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
430   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
431   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
432 
433   __kmp_fork_call(
434       loc, gtid, fork_context_intel, argc,
435       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
436       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
437   __kmp_join_call(loc, gtid
438 #if OMPT_SUPPORT
439                   ,
440                   fork_context_intel
441 #endif
442   );
443 
444   // Pop current CG root off list
445   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
446   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
447   this_thr->th.th_cg_roots = tmp->up;
448   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
449                  " to node %p. cg_nthreads was %d\n",
450                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
451   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
452   int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
454     __kmp_free(tmp);
455   }
456   // Restore current task's thread_limit from CG root
457   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
458   this_thr->th.th_current_task->td_icvs.thread_limit =
459       this_thr->th.th_cg_roots->cg_thread_limit;
460 
461   this_thr->th.th_teams_microtask = NULL;
462   this_thr->th.th_teams_level = 0;
463   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
464   va_end(ap);
465 #if KMP_STATS_ENABLED
466   if (previous_state == stats_state_e::SERIAL_REGION) {
467     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
468     KMP_SET_THREAD_STATE(previous_state);
469   } else {
470     KMP_POP_PARTITIONED_TIMER();
471   }
472 #endif // KMP_STATS_ENABLED
473 }
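
/* Illustrative lowering (sketch only): for

       #pragma omp teams num_teams(8) thread_limit(4)

   a compiler typically emits

       kmp_int32 gtid = __kmpc_global_thread_num(&loc);
       __kmpc_push_num_teams(&loc, gtid, /*num_teams=*/8, /*num_threads=*/4);
       __kmpc_fork_teams(&loc, 1, (kmpc_micro)outlined_teams_body, &a);

   with `outlined_teams_body` taking the same (gtid, btid, shared...) shape as
   a parallel microtask; the names here are illustrative. */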
474 
475 // I don't think this function should ever have been exported.
476 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
477 // openmp code ever called it, but it's been exported from the RTL for so
478 // long that I'm afraid to remove the definition.
479 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
480 
481 /*!
482 @ingroup PARALLEL
483 @param loc  source location information
484 @param global_tid  global thread number
485 
486 Enter a serialized parallel construct. This interface is used to handle a
487 conditional parallel region, like this,
488 @code
489 #pragma omp parallel if (condition)
490 @endcode
491 when the condition is false.
492 */
493 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
494   // The implementation is now in kmp_runtime.cpp so that it can share static
495   // functions with kmp_fork_call since the tasks to be done are similar in
496   // each case.
497   __kmp_assert_valid_gtid(global_tid);
498 #if OMPT_SUPPORT
499   OMPT_STORE_RETURN_ADDRESS(global_tid);
500 #endif
501   __kmp_serialized_parallel(loc, global_tid);
502 }
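
/* Illustrative lowering of a conditional parallel region (sketch only):

       #pragma omp parallel if (cond)

   may be emitted as a runtime branch such as

       kmp_int32 gtid = __kmpc_global_thread_num(&loc);
       if (cond) {
         __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);
       } else {
         __kmpc_serialized_parallel(&loc, gtid);
         kmp_int32 btid = 0;
         outlined_body(&gtid, &btid);
         __kmpc_end_serialized_parallel(&loc, gtid);
       }

   so that the region body still runs, on the encountering thread only, when
   cond is false. Names are illustrative. */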
503 
504 /*!
505 @ingroup PARALLEL
506 @param loc  source location information
507 @param global_tid  global thread number
508 
509 Leave a serialized parallel construct.
510 */
511 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
512   kmp_internal_control_t *top;
513   kmp_info_t *this_thr;
514   kmp_team_t *serial_team;
515 
516   KC_TRACE(10,
517            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
518 
519   /* skip all this code for autopar serialized loops since it results in
520      unacceptable overhead */
521   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
522     return;
523 
524   // Not autopar code
525   __kmp_assert_valid_gtid(global_tid);
526   if (!TCR_4(__kmp_init_parallel))
527     __kmp_parallel_initialize();
528 
529   __kmp_resume_if_soft_paused();
530 
531   this_thr = __kmp_threads[global_tid];
532   serial_team = this_thr->th.th_serial_team;
533 
534   kmp_task_team_t *task_team = this_thr->th.th_task_team;
535   // we need to wait for the proxy tasks before finishing the thread
536   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
537     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
538 
539   KMP_MB();
540   KMP_DEBUG_ASSERT(serial_team);
541   KMP_ASSERT(serial_team->t.t_serialized);
542   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
543   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
544   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
545   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
546 
547 #if OMPT_SUPPORT
548   if (ompt_enabled.enabled &&
549       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
550     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
551     if (ompt_enabled.ompt_callback_implicit_task) {
552       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
553           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
554           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
555     }
556 
    // reset/clear the task id only after unlinking the task
558     ompt_data_t *parent_task_data;
559     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
560 
561     if (ompt_enabled.ompt_callback_parallel_end) {
562       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
563           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
564           ompt_parallel_invoker_program | ompt_parallel_team,
565           OMPT_LOAD_RETURN_ADDRESS(global_tid));
566     }
567     __ompt_lw_taskteam_unlink(this_thr);
568     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
569   }
570 #endif
571 
572   /* If necessary, pop the internal control stack values and replace the team
573    * values */
574   top = serial_team->t.t_control_stack_top;
575   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
576     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
577     serial_team->t.t_control_stack_top = top->next;
578     __kmp_free(top);
579   }
580 
581   // if( serial_team -> t.t_serialized > 1 )
582   serial_team->t.t_level--;
583 
584   /* pop dispatch buffers stack */
585   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
586   {
587     dispatch_private_info_t *disp_buffer =
588         serial_team->t.t_dispatch->th_disp_buffer;
589     serial_team->t.t_dispatch->th_disp_buffer =
590         serial_team->t.t_dispatch->th_disp_buffer->next;
591     __kmp_free(disp_buffer);
592   }
593   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
594 
595   --serial_team->t.t_serialized;
596   if (serial_team->t.t_serialized == 0) {
597 
598     /* return to the parallel section */
599 
600 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
601     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
602       __kmp_clear_x87_fpu_status_word();
603       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
604       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
605     }
606 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
607 
608 #if OMPD_SUPPORT
609     if (ompd_state & OMPD_ENABLE_BP)
610       ompd_bp_parallel_end();
611 #endif
612 
613     this_thr->th.th_team = serial_team->t.t_parent;
614     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
615 
616     /* restore values cached in the thread */
617     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
618     this_thr->th.th_team_master =
619         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
620     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
621 
622     /* TODO the below shouldn't need to be adjusted for serialized teams */
623     this_thr->th.th_dispatch =
624         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
625 
626     __kmp_pop_current_task_from_thread(this_thr);
627 
628     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
629     this_thr->th.th_current_task->td_flags.executing = 1;
630 
631     if (__kmp_tasking_mode != tskm_immediate_exec) {
632       // Copy the task team from the new child / old parent team to the thread.
633       this_thr->th.th_task_team =
634           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
635       KA_TRACE(20,
636                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
637                 "team %p\n",
638                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
639     }
640   } else {
641     if (__kmp_tasking_mode != tskm_immediate_exec) {
642       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
643                     "depth of serial team %p to %d\n",
644                     global_tid, serial_team, serial_team->t.t_serialized));
645     }
646   }
647 
648   if (__kmp_env_consistency_check)
649     __kmp_pop_parallel(global_tid, NULL);
650 #if OMPT_SUPPORT
651   if (ompt_enabled.enabled)
652     this_thr->th.ompt_thread_info.state =
653         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
654                                            : ompt_state_work_parallel);
655 #endif
656 }
657 
658 /*!
659 @ingroup SYNCHRONIZATION
660 @param loc  source location information.
661 
662 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
663 depending on the memory ordering convention obeyed by the compiler
664 even that may not be necessary).
665 */
666 void __kmpc_flush(ident_t *loc) {
667   KC_TRACE(10, ("__kmpc_flush: called\n"));
668 
  /* need explicit __mf() here since the library uses volatile instead of fences */
670   KMP_MB(); /* Flush all pending memory write invalidates.  */
671 
672 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
673 #if KMP_MIC
674 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
675 // We shouldn't need it, though, since the ABI rules require that
676 // * If the compiler generates NGO stores it also generates the fence
677 // * If users hand-code NGO stores they should insert the fence
678 // therefore no incomplete unordered stores should be visible.
679 #else
680   // C74404
681   // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
684   // addressed.
685   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
686   if (!__kmp_cpuinfo.initialized) {
687     __kmp_query_cpuid(&__kmp_cpuinfo);
688   }
689   if (!__kmp_cpuinfo.sse2) {
690     // CPU cannot execute SSE2 instructions.
691   } else {
692 #if KMP_COMPILER_ICC
693     _mm_mfence();
694 #elif KMP_COMPILER_MSVC
695     MemoryBarrier();
696 #else
697     __sync_synchronize();
698 #endif // KMP_COMPILER_ICC
699   }
700 #endif // KMP_MIC
701 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
702        KMP_ARCH_RISCV64)
// Nothing to see here, move along
704 #elif KMP_ARCH_PPC64
705 // Nothing needed here (we have a real MB above).
706 #else
707 #error Unknown or unsupported architecture
708 #endif
709 
710 #if OMPT_SUPPORT && OMPT_OPTIONAL
711   if (ompt_enabled.ompt_callback_flush) {
712     ompt_callbacks.ompt_callback(ompt_callback_flush)(
713         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
714   }
715 #endif
716 }
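
/* Illustrative lowering (sketch only): a

       #pragma omp flush

   directive is typically emitted simply as

       __kmpc_flush(&loc);

   with `loc` a compiler-generated ident_t describing the source location. */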
717 
718 /* -------------------------------------------------------------------------- */
719 /*!
720 @ingroup SYNCHRONIZATION
721 @param loc source location information
722 @param global_tid thread id.
723 
724 Execute a barrier.
725 */
726 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
727   KMP_COUNT_BLOCK(OMP_BARRIER);
728   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
729   __kmp_assert_valid_gtid(global_tid);
730 
731   if (!TCR_4(__kmp_init_parallel))
732     __kmp_parallel_initialize();
733 
734   __kmp_resume_if_soft_paused();
735 
736   if (__kmp_env_consistency_check) {
737     if (loc == 0) {
738       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
739     }
740     __kmp_check_barrier(global_tid, ct_barrier, loc);
741   }
742 
743 #if OMPT_SUPPORT
744   ompt_frame_t *ompt_frame;
745   if (ompt_enabled.enabled) {
746     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
747     if (ompt_frame->enter_frame.ptr == NULL)
748       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
749   }
750   OMPT_STORE_RETURN_ADDRESS(global_tid);
751 #endif
752   __kmp_threads[global_tid]->th.th_ident = loc;
753   // TODO: explicit barrier_wait_id:
754   //   this function is called when 'barrier' directive is present or
755   //   implicit barrier at the end of a worksharing construct.
756   // 1) better to add a per-thread barrier counter to a thread data structure
757   // 2) set to 0 when a new team is created
// 3) no sync is required
759 
760   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
761 #if OMPT_SUPPORT && OMPT_OPTIONAL
762   if (ompt_enabled.enabled) {
763     ompt_frame->enter_frame = ompt_data_none;
764   }
765 #endif
766 }
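
/* Illustrative lowering (sketch only): an explicit

       #pragma omp barrier

   becomes

       __kmpc_barrier(&loc, __kmpc_global_thread_num(&loc));

   and, as noted above, the same entry point serves the implicit barrier at
   the end of worksharing constructs that are not nowait. */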
767 
768 /* The BARRIER for a MASTER section is always explicit   */
769 /*!
770 @ingroup WORK_SHARING
771 @param loc  source location information.
@param global_tid  global thread number.
773 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
774 */
775 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
776   int status = 0;
777 
778   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
779   __kmp_assert_valid_gtid(global_tid);
780 
781   if (!TCR_4(__kmp_init_parallel))
782     __kmp_parallel_initialize();
783 
784   __kmp_resume_if_soft_paused();
785 
786   if (KMP_MASTER_GTID(global_tid)) {
787     KMP_COUNT_BLOCK(OMP_MASTER);
788     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
789     status = 1;
790   }
791 
792 #if OMPT_SUPPORT && OMPT_OPTIONAL
793   if (status) {
794     if (ompt_enabled.ompt_callback_masked) {
795       kmp_info_t *this_thr = __kmp_threads[global_tid];
796       kmp_team_t *team = this_thr->th.th_team;
797 
798       int tid = __kmp_tid_from_gtid(global_tid);
799       ompt_callbacks.ompt_callback(ompt_callback_masked)(
800           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
801           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
802           OMPT_GET_RETURN_ADDRESS(0));
803     }
804   }
805 #endif
806 
807   if (__kmp_env_consistency_check) {
808 #if KMP_USE_DYNAMIC_LOCK
809     if (status)
810       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
811     else
812       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
813 #else
814     if (status)
815       __kmp_push_sync(global_tid, ct_master, loc, NULL);
816     else
817       __kmp_check_sync(global_tid, ct_master, loc, NULL);
818 #endif
819   }
820 
821   return status;
822 }
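
/* Illustrative lowering (sketch only):

       #pragma omp master
       { body(); }

   is typically emitted as

       if (__kmpc_master(&loc, gtid)) {
         body();
         __kmpc_end_master(&loc, gtid);
       }

   i.e. only the thread for which __kmpc_master() returns 1 executes the body
   and then closes the region with __kmpc_end_master(). */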
823 
824 /*!
825 @ingroup WORK_SHARING
826 @param loc  source location information.
@param global_tid  global thread number.
828 
829 Mark the end of a <tt>master</tt> region. This should only be called by the
830 thread that executes the <tt>master</tt> region.
831 */
832 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
833   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
834   __kmp_assert_valid_gtid(global_tid);
835   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
836   KMP_POP_PARTITIONED_TIMER();
837 
838 #if OMPT_SUPPORT && OMPT_OPTIONAL
839   kmp_info_t *this_thr = __kmp_threads[global_tid];
840   kmp_team_t *team = this_thr->th.th_team;
841   if (ompt_enabled.ompt_callback_masked) {
842     int tid = __kmp_tid_from_gtid(global_tid);
843     ompt_callbacks.ompt_callback(ompt_callback_masked)(
844         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
845         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
846         OMPT_GET_RETURN_ADDRESS(0));
847   }
848 #endif
849 
850   if (__kmp_env_consistency_check) {
851     if (KMP_MASTER_GTID(global_tid))
852       __kmp_pop_sync(global_tid, ct_master, loc);
853   }
854 }
855 
856 /*!
857 @ingroup WORK_SHARING
858 @param loc  source location information.
859 @param global_tid  global thread number.
860 @param filter result of evaluating filter clause on thread global_tid, or zero
861 if no filter clause present
862 @return 1 if this thread should execute the <tt>masked</tt> block, 0 otherwise.
863 */
864 kmp_int32 __kmpc_masked(ident_t *loc, kmp_int32 global_tid, kmp_int32 filter) {
865   int status = 0;
866   int tid;
867   KC_TRACE(10, ("__kmpc_masked: called T#%d\n", global_tid));
868   __kmp_assert_valid_gtid(global_tid);
869 
870   if (!TCR_4(__kmp_init_parallel))
871     __kmp_parallel_initialize();
872 
873   __kmp_resume_if_soft_paused();
874 
875   tid = __kmp_tid_from_gtid(global_tid);
876   if (tid == filter) {
877     KMP_COUNT_BLOCK(OMP_MASKED);
878     KMP_PUSH_PARTITIONED_TIMER(OMP_masked);
879     status = 1;
880   }
881 
882 #if OMPT_SUPPORT && OMPT_OPTIONAL
883   if (status) {
884     if (ompt_enabled.ompt_callback_masked) {
885       kmp_info_t *this_thr = __kmp_threads[global_tid];
886       kmp_team_t *team = this_thr->th.th_team;
887       ompt_callbacks.ompt_callback(ompt_callback_masked)(
888           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
889           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
890           OMPT_GET_RETURN_ADDRESS(0));
891     }
892   }
893 #endif
894 
895   if (__kmp_env_consistency_check) {
896 #if KMP_USE_DYNAMIC_LOCK
897     if (status)
898       __kmp_push_sync(global_tid, ct_masked, loc, NULL, 0);
899     else
900       __kmp_check_sync(global_tid, ct_masked, loc, NULL, 0);
901 #else
902     if (status)
903       __kmp_push_sync(global_tid, ct_masked, loc, NULL);
904     else
905       __kmp_check_sync(global_tid, ct_masked, loc, NULL);
906 #endif
907   }
908 
909   return status;
910 }
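
/* Illustrative lowering (sketch only):

       #pragma omp masked filter(tid_expr)
       { body(); }

   may be emitted as

       if (__kmpc_masked(&loc, gtid, tid_expr)) {
         body();
         __kmpc_end_masked(&loc, gtid);
       }

   where tid_expr evaluates to 0 when no filter clause is present. */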
911 
912 /*!
913 @ingroup WORK_SHARING
914 @param loc  source location information.
@param global_tid  global thread number.
916 
917 Mark the end of a <tt>masked</tt> region. This should only be called by the
918 thread that executes the <tt>masked</tt> region.
919 */
920 void __kmpc_end_masked(ident_t *loc, kmp_int32 global_tid) {
921   KC_TRACE(10, ("__kmpc_end_masked: called T#%d\n", global_tid));
922   __kmp_assert_valid_gtid(global_tid);
923   KMP_POP_PARTITIONED_TIMER();
924 
925 #if OMPT_SUPPORT && OMPT_OPTIONAL
926   kmp_info_t *this_thr = __kmp_threads[global_tid];
927   kmp_team_t *team = this_thr->th.th_team;
928   if (ompt_enabled.ompt_callback_masked) {
929     int tid = __kmp_tid_from_gtid(global_tid);
930     ompt_callbacks.ompt_callback(ompt_callback_masked)(
931         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
932         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
933         OMPT_GET_RETURN_ADDRESS(0));
934   }
935 #endif
936 
937   if (__kmp_env_consistency_check) {
938     __kmp_pop_sync(global_tid, ct_masked, loc);
939   }
940 }
941 
942 /*!
943 @ingroup WORK_SHARING
944 @param loc  source location information.
945 @param gtid  global thread number.
946 
947 Start execution of an <tt>ordered</tt> construct.
948 */
949 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
950   int cid = 0;
951   kmp_info_t *th;
952   KMP_DEBUG_ASSERT(__kmp_init_serial);
953 
954   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
955   __kmp_assert_valid_gtid(gtid);
956 
957   if (!TCR_4(__kmp_init_parallel))
958     __kmp_parallel_initialize();
959 
960   __kmp_resume_if_soft_paused();
961 
962 #if USE_ITT_BUILD
963   __kmp_itt_ordered_prep(gtid);
964 // TODO: ordered_wait_id
965 #endif /* USE_ITT_BUILD */
966 
967   th = __kmp_threads[gtid];
968 
969 #if OMPT_SUPPORT && OMPT_OPTIONAL
970   kmp_team_t *team;
971   ompt_wait_id_t lck;
972   void *codeptr_ra;
973   OMPT_STORE_RETURN_ADDRESS(gtid);
974   if (ompt_enabled.enabled) {
975     team = __kmp_team_from_gtid(gtid);
976     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
977     /* OMPT state update */
978     th->th.ompt_thread_info.wait_id = lck;
979     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
980 
981     /* OMPT event callback */
982     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
983     if (ompt_enabled.ompt_callback_mutex_acquire) {
984       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
985           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
986           codeptr_ra);
987     }
988   }
989 #endif
990 
991   if (th->th.th_dispatch->th_deo_fcn != 0)
992     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
993   else
994     __kmp_parallel_deo(&gtid, &cid, loc);
995 
996 #if OMPT_SUPPORT && OMPT_OPTIONAL
997   if (ompt_enabled.enabled) {
998     /* OMPT state update */
999     th->th.ompt_thread_info.state = ompt_state_work_parallel;
1000     th->th.ompt_thread_info.wait_id = 0;
1001 
1002     /* OMPT event callback */
1003     if (ompt_enabled.ompt_callback_mutex_acquired) {
1004       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1005           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1006     }
1007   }
1008 #endif
1009 
1010 #if USE_ITT_BUILD
1011   __kmp_itt_ordered_start(gtid);
1012 #endif /* USE_ITT_BUILD */
1013 }
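
/* Illustrative lowering (sketch only): inside a loop with an ordered clause,

       #pragma omp ordered
       { body(i); }

   is typically emitted as

       __kmpc_ordered(&loc, gtid);
       body(i);
       __kmpc_end_ordered(&loc, gtid);

   so that iterations enter the region in sequential iteration order. */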
1014 
1015 /*!
1016 @ingroup WORK_SHARING
1017 @param loc  source location information.
1018 @param gtid  global thread number.
1019 
1020 End execution of an <tt>ordered</tt> construct.
1021 */
1022 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
1023   int cid = 0;
1024   kmp_info_t *th;
1025 
1026   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
1027   __kmp_assert_valid_gtid(gtid);
1028 
1029 #if USE_ITT_BUILD
1030   __kmp_itt_ordered_end(gtid);
1031 // TODO: ordered_wait_id
1032 #endif /* USE_ITT_BUILD */
1033 
1034   th = __kmp_threads[gtid];
1035 
1036   if (th->th.th_dispatch->th_dxo_fcn != 0)
1037     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
1038   else
1039     __kmp_parallel_dxo(&gtid, &cid, loc);
1040 
1041 #if OMPT_SUPPORT && OMPT_OPTIONAL
1042   OMPT_STORE_RETURN_ADDRESS(gtid);
1043   if (ompt_enabled.ompt_callback_mutex_released) {
1044     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1045         ompt_mutex_ordered,
1046         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
1047             ->t.t_ordered.dt.t_value,
1048         OMPT_LOAD_RETURN_ADDRESS(gtid));
1049   }
1050 #endif
1051 }
1052 
1053 #if KMP_USE_DYNAMIC_LOCK
1054 
1055 static __forceinline void
1056 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
1057                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
1058   // Pointer to the allocated indirect lock is written to crit, while indexing
1059   // is ignored.
1060   void *idx;
1061   kmp_indirect_lock_t **lck;
1062   lck = (kmp_indirect_lock_t **)crit;
1063   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
1064   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
1065   KMP_SET_I_LOCK_LOCATION(ilk, loc);
1066   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
1067   KA_TRACE(20,
1068            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
1069 #if USE_ITT_BUILD
1070   __kmp_itt_critical_creating(ilk->lock, loc);
1071 #endif
1072   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
1073   if (status == 0) {
1074 #if USE_ITT_BUILD
1075     __kmp_itt_critical_destroyed(ilk->lock);
1076 #endif
1077     // We don't really need to destroy the unclaimed lock here since it will be
1078     // cleaned up at program exit.
1079     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
1080   }
1081   KMP_DEBUG_ASSERT(*lck != NULL);
1082 }
1083 
1084 // Fast-path acquire tas lock
1085 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
1086   {                                                                            \
1087     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1088     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1089     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1090     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
1091         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1092       kmp_uint32 spins;                                                        \
1093       KMP_FSYNC_PREPARE(l);                                                    \
1094       KMP_INIT_YIELD(spins);                                                   \
1095       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1096       do {                                                                     \
1097         if (TCR_4(__kmp_nth) >                                                 \
1098             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1099           KMP_YIELD(TRUE);                                                     \
1100         } else {                                                               \
1101           KMP_YIELD_SPIN(spins);                                               \
1102         }                                                                      \
1103         __kmp_spin_backoff(&backoff);                                          \
1104       } while (                                                                \
1105           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1106           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1107     }                                                                          \
1108     KMP_FSYNC_ACQUIRED(l);                                                     \
1109   }
1110 
1111 // Fast-path test tas lock
1112 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1113   {                                                                            \
1114     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1115     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1116     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1117     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1118          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1119   }
1120 
1121 // Fast-path release tas lock
1122 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1123   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1124 
1125 #if KMP_USE_FUTEX
1126 
1127 #include <sys/syscall.h>
1128 #include <unistd.h>
1129 #ifndef FUTEX_WAIT
1130 #define FUTEX_WAIT 0
1131 #endif
1132 #ifndef FUTEX_WAKE
1133 #define FUTEX_WAKE 1
1134 #endif
1135 
1136 // Fast-path acquire futex lock
1137 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1138   {                                                                            \
1139     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1140     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1141     KMP_MB();                                                                  \
1142     KMP_FSYNC_PREPARE(ftx);                                                    \
1143     kmp_int32 poll_val;                                                        \
1144     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1145                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1146                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1147       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1148       if (!cond) {                                                             \
1149         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1150                                          poll_val |                            \
1151                                              KMP_LOCK_BUSY(1, futex))) {       \
1152           continue;                                                            \
1153         }                                                                      \
1154         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1155       }                                                                        \
1156       kmp_int32 rc;                                                            \
1157       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1158                         NULL, NULL, 0)) != 0) {                                \
1159         continue;                                                              \
1160       }                                                                        \
1161       gtid_code |= 1;                                                          \
1162     }                                                                          \
1163     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1164   }
1165 
1166 // Fast-path test futex lock
1167 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1168   {                                                                            \
1169     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1170     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1171                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1172       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1173       rc = TRUE;                                                               \
1174     } else {                                                                   \
1175       rc = FALSE;                                                              \
1176     }                                                                          \
1177   }
1178 
1179 // Fast-path release futex lock
1180 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1181   {                                                                            \
1182     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1183     KMP_MB();                                                                  \
1184     KMP_FSYNC_RELEASING(ftx);                                                  \
1185     kmp_int32 poll_val =                                                       \
1186         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1187     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1188       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1189               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1190     }                                                                          \
1191     KMP_MB();                                                                  \
1192     KMP_YIELD_OVERSUB();                                                       \
1193   }
1194 
1195 #endif // KMP_USE_FUTEX
1196 
1197 #else // KMP_USE_DYNAMIC_LOCK
1198 
1199 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1200                                                       ident_t const *loc,
1201                                                       kmp_int32 gtid) {
1202   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1203 
1204   // Because of the double-check, the following load doesn't need to be volatile
1205   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1206 
1207   if (lck == NULL) {
1208     void *idx;
1209 
1210     // Allocate & initialize the lock.
1211     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1212     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1213     __kmp_init_user_lock_with_checks(lck);
1214     __kmp_set_user_lock_location(lck, loc);
1215 #if USE_ITT_BUILD
1216     __kmp_itt_critical_creating(lck);
1217 // __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it. There
// is a chance the lock will be destroyed with no usage, but that is not a
// problem, because this is not a real event seen by the user but rather a way
// of setting a name for the object (lock). See more details in kmp_itt.h.
1222 #endif /* USE_ITT_BUILD */
1223 
1224     // Use a cmpxchg instruction to slam the start of the critical section with
1225     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1226     // and use the lock that the other thread allocated.
1227     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1228 
1229     if (status == 0) {
1230 // Deallocate the lock and reload the value.
1231 #if USE_ITT_BUILD
1232       __kmp_itt_critical_destroyed(lck);
1233 // Let ITT know the lock is destroyed and the same memory location may be reused
1234 // for another purpose.
1235 #endif /* USE_ITT_BUILD */
1236       __kmp_destroy_user_lock_with_checks(lck);
1237       __kmp_user_lock_free(&idx, gtid, lck);
1238       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1239       KMP_DEBUG_ASSERT(lck != NULL);
1240     }
1241   }
1242   return lck;
1243 }
1244 
1245 #endif // KMP_USE_DYNAMIC_LOCK
1246 
1247 /*!
1248 @ingroup WORK_SHARING
1249 @param loc  source location information.
1250 @param global_tid  global thread number.
1251 @param crit identity of the critical section. This could be a pointer to a lock
1252 associated with the critical section, or some other suitably unique value.
1253 
1254 Enter code protected by a `critical` construct.
1255 This function blocks until the executing thread can enter the critical section.
1256 */
1257 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1258                      kmp_critical_name *crit) {
1259 #if KMP_USE_DYNAMIC_LOCK
1260 #if OMPT_SUPPORT && OMPT_OPTIONAL
1261   OMPT_STORE_RETURN_ADDRESS(global_tid);
1262 #endif // OMPT_SUPPORT
1263   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1264 #else
1265   KMP_COUNT_BLOCK(OMP_CRITICAL);
1266 #if OMPT_SUPPORT && OMPT_OPTIONAL
1267   ompt_state_t prev_state = ompt_state_undefined;
1268   ompt_thread_info_t ti;
1269 #endif
1270   kmp_user_lock_p lck;
1271 
1272   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1273   __kmp_assert_valid_gtid(global_tid);
1274 
1275   // TODO: add THR_OVHD_STATE
1276 
1277   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1278   KMP_CHECK_USER_LOCK_INIT();
1279 
1280   if ((__kmp_user_lock_kind == lk_tas) &&
1281       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1282     lck = (kmp_user_lock_p)crit;
1283   }
1284 #if KMP_USE_FUTEX
1285   else if ((__kmp_user_lock_kind == lk_futex) &&
1286            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1287     lck = (kmp_user_lock_p)crit;
1288   }
1289 #endif
1290   else { // ticket, queuing or drdpa
1291     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1292   }
1293 
1294   if (__kmp_env_consistency_check)
1295     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1296 
    // since the critical directive binds to all threads, not just the current
    // team, we have to check this even if we are in a serialized team.
    // also, even if we are the uber thread, we still have to acquire the lock,
    // as we have to contend with sibling threads.
1301 
1302 #if USE_ITT_BUILD
1303   __kmp_itt_critical_acquiring(lck);
1304 #endif /* USE_ITT_BUILD */
1305 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1307   void *codeptr_ra = NULL;
1308   if (ompt_enabled.enabled) {
1309     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1310     /* OMPT state update */
1311     prev_state = ti.state;
1312     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1313     ti.state = ompt_state_wait_critical;
1314 
1315     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1317     if (ompt_enabled.ompt_callback_mutex_acquire) {
1318       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1319           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1320           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1321     }
1322   }
1323 #endif
1324   // Value of 'crit' should be good for using as a critical_id of the critical
1325   // section directive.
1326   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1327 
1328 #if USE_ITT_BUILD
1329   __kmp_itt_critical_acquired(lck);
1330 #endif /* USE_ITT_BUILD */
1331 #if OMPT_SUPPORT && OMPT_OPTIONAL
1332   if (ompt_enabled.enabled) {
1333     /* OMPT state update */
1334     ti.state = prev_state;
1335     ti.wait_id = 0;
1336 
1337     /* OMPT event callback */
1338     if (ompt_enabled.ompt_callback_mutex_acquired) {
1339       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1340           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1341     }
1342   }
1343 #endif
1344   KMP_POP_PARTITIONED_TIMER();
1345 
1346   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1347   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1348 #endif // KMP_USE_DYNAMIC_LOCK
1349 }
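
/* Illustrative lowering (sketch only): a named critical section

       #pragma omp critical (name)
       { body(); }

   is typically emitted against a zero-initialized, program-wide
   kmp_critical_name object, e.g.

       static kmp_critical_name crit_name_lock; // illustrative variable

       __kmpc_critical(&loc, gtid, &crit_name_lock);
       body();
       __kmpc_end_critical(&loc, gtid, &crit_name_lock);

   All critical constructs that share a name share the same lock object. */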
1350 
1351 #if KMP_USE_DYNAMIC_LOCK
1352 
1353 // Converts the given hint to an internal lock implementation
1354 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1355 #if KMP_USE_TSX
1356 #define KMP_TSX_LOCK(seq) lockseq_##seq
1357 #else
1358 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1359 #endif
1360 
1361 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1362 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1363 #else
1364 #define KMP_CPUINFO_RTM 0
1365 #endif
1366 
1367   // Hints that do not require further logic
1368   if (hint & kmp_lock_hint_hle)
1369     return KMP_TSX_LOCK(hle);
1370   if (hint & kmp_lock_hint_rtm)
1371     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1372   if (hint & kmp_lock_hint_adaptive)
1373     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1374 
1375   // Rule out conflicting hints first by returning the default lock
1376   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1377     return __kmp_user_lock_seq;
1378   if ((hint & omp_lock_hint_speculative) &&
1379       (hint & omp_lock_hint_nonspeculative))
1380     return __kmp_user_lock_seq;
1381 
1382   // Do not even consider speculation when it appears to be contended
1383   if (hint & omp_lock_hint_contended)
1384     return lockseq_queuing;
1385 
1386   // Uncontended lock without speculation
1387   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1388     return lockseq_tas;
1389 
1390   // Use RTM lock for speculation
1391   if (hint & omp_lock_hint_speculative)
1392     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1393 
1394   return __kmp_user_lock_seq;
1395 }
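
/* For illustration, with the default user lock and TSX/RTM available the
   mapping above resolves roughly as follows (a summary of the code above, not
   an external contract):

       omp_lock_hint_contended                      -> lockseq_queuing
       omp_lock_hint_uncontended                    -> lockseq_tas
       omp_lock_hint_speculative                    -> lockseq_rtm_spin
       omp_lock_hint_contended | omp_lock_hint_uncontended
                                                    -> __kmp_user_lock_seq
       kmp_lock_hint_hle                            -> lockseq_hle */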
1396 
1397 #if OMPT_SUPPORT && OMPT_OPTIONAL
1398 #if KMP_USE_DYNAMIC_LOCK
1399 static kmp_mutex_impl_t
1400 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1401   if (user_lock) {
1402     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1403     case 0:
1404       break;
1405 #if KMP_USE_FUTEX
1406     case locktag_futex:
1407       return kmp_mutex_impl_queuing;
1408 #endif
1409     case locktag_tas:
1410       return kmp_mutex_impl_spin;
1411 #if KMP_USE_TSX
1412     case locktag_hle:
1413     case locktag_rtm_spin:
1414       return kmp_mutex_impl_speculative;
1415 #endif
1416     default:
1417       return kmp_mutex_impl_none;
1418     }
1419     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1420   }
1421   KMP_ASSERT(ilock);
1422   switch (ilock->type) {
1423 #if KMP_USE_TSX
1424   case locktag_adaptive:
1425   case locktag_rtm_queuing:
1426     return kmp_mutex_impl_speculative;
1427 #endif
1428   case locktag_nested_tas:
1429     return kmp_mutex_impl_spin;
1430 #if KMP_USE_FUTEX
1431   case locktag_nested_futex:
1432 #endif
1433   case locktag_ticket:
1434   case locktag_queuing:
1435   case locktag_drdpa:
1436   case locktag_nested_ticket:
1437   case locktag_nested_queuing:
1438   case locktag_nested_drdpa:
1439     return kmp_mutex_impl_queuing;
1440   default:
1441     return kmp_mutex_impl_none;
1442   }
1443 }
1444 #else
1445 // For locks without dynamic binding
1446 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1447   switch (__kmp_user_lock_kind) {
1448   case lk_tas:
1449     return kmp_mutex_impl_spin;
1450 #if KMP_USE_FUTEX
1451   case lk_futex:
1452 #endif
1453   case lk_ticket:
1454   case lk_queuing:
1455   case lk_drdpa:
1456     return kmp_mutex_impl_queuing;
1457 #if KMP_USE_TSX
1458   case lk_hle:
1459   case lk_rtm_queuing:
1460   case lk_rtm_spin:
1461   case lk_adaptive:
1462     return kmp_mutex_impl_speculative;
1463 #endif
1464   default:
1465     return kmp_mutex_impl_none;
1466   }
1467 }
1468 #endif // KMP_USE_DYNAMIC_LOCK
1469 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1470 
1471 /*!
1472 @ingroup WORK_SHARING
1473 @param loc  source location information.
1474 @param global_tid  global thread number.
1475 @param crit identity of the critical section. This could be a pointer to a lock
1476 associated with the critical section, or some other suitably unique value.
1477 @param hint the lock hint.
1478 
1479 Enter code protected by a `critical` construct with a hint. The hint value is
1480 used to suggest a lock implementation. This function blocks until the executing
1481 thread can enter the critical section unless the hint suggests use of
1482 speculative execution and the hardware supports it.
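A user-level construct that a compiler may lower onto this entry point
(illustrative only; the exact lowering is compiler-dependent and
update_shared_table() is just a placeholder):
@code
#pragma omp critical(table_update) hint(omp_sync_hint_speculative)
{ update_shared_table(); }
@endcode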
1483 */
1484 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1485                                kmp_critical_name *crit, uint32_t hint) {
1486   KMP_COUNT_BLOCK(OMP_CRITICAL);
1487   kmp_user_lock_p lck;
1488 #if OMPT_SUPPORT && OMPT_OPTIONAL
1489   ompt_state_t prev_state = ompt_state_undefined;
1490   ompt_thread_info_t ti;
1491   // This is the case, if called from __kmpc_critical:
1492   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1493   if (!codeptr)
1494     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1495 #endif
1496 
1497   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1498   __kmp_assert_valid_gtid(global_tid);
1499 
1500   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1501   // Check if it is initialized.
1502   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1503   kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
1504   if (*lk == 0) {
1505     if (KMP_IS_D_LOCK(lockseq)) {
1506       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1507                                   KMP_GET_D_TAG(lockseq));
1508     } else {
1509       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
1510     }
1511   }
1512   // Branch for accessing the actual lock object and set operation. This
1513   // branching is inevitable since this lock initialization does not follow the
1514   // normal dispatch path (lock table is not used).
1515   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1516     lck = (kmp_user_lock_p)lk;
1517     if (__kmp_env_consistency_check) {
1518       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1519                       __kmp_map_hint_to_lock(hint));
1520     }
1521 #if USE_ITT_BUILD
1522     __kmp_itt_critical_acquiring(lck);
1523 #endif
1524 #if OMPT_SUPPORT && OMPT_OPTIONAL
1525     if (ompt_enabled.enabled) {
1526       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1527       /* OMPT state update */
1528       prev_state = ti.state;
1529       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1530       ti.state = ompt_state_wait_critical;
1531 
1532       /* OMPT event callback */
1533       if (ompt_enabled.ompt_callback_mutex_acquire) {
1534         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1535             ompt_mutex_critical, (unsigned int)hint,
1536             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1537             codeptr);
1538       }
1539     }
1540 #endif
1541 #if KMP_USE_INLINED_TAS
1542     if (lockseq == lockseq_tas && !__kmp_env_consistency_check) {
1543       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1544     } else
1545 #elif KMP_USE_INLINED_FUTEX
1546     if (lockseq == lockseq_futex && !__kmp_env_consistency_check) {
1547       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1548     } else
1549 #endif
1550     {
1551       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1552     }
1553   } else {
1554     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1555     lck = ilk->lock;
1556     if (__kmp_env_consistency_check) {
1557       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1558                       __kmp_map_hint_to_lock(hint));
1559     }
1560 #if USE_ITT_BUILD
1561     __kmp_itt_critical_acquiring(lck);
1562 #endif
1563 #if OMPT_SUPPORT && OMPT_OPTIONAL
1564     if (ompt_enabled.enabled) {
1565       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1566       /* OMPT state update */
1567       prev_state = ti.state;
1568       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1569       ti.state = ompt_state_wait_critical;
1570 
1571       /* OMPT event callback */
1572       if (ompt_enabled.ompt_callback_mutex_acquire) {
1573         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1574             ompt_mutex_critical, (unsigned int)hint,
1575             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1576             codeptr);
1577       }
1578     }
1579 #endif
1580     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1581   }
1582   KMP_POP_PARTITIONED_TIMER();
1583 
1584 #if USE_ITT_BUILD
1585   __kmp_itt_critical_acquired(lck);
1586 #endif /* USE_ITT_BUILD */
1587 #if OMPT_SUPPORT && OMPT_OPTIONAL
1588   if (ompt_enabled.enabled) {
1589     /* OMPT state update */
1590     ti.state = prev_state;
1591     ti.wait_id = 0;
1592 
1593     /* OMPT event callback */
1594     if (ompt_enabled.ompt_callback_mutex_acquired) {
1595       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1596           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1597     }
1598   }
1599 #endif
1600 
1601   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1602   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1603 } // __kmpc_critical_with_hint
1604 
1605 #endif // KMP_USE_DYNAMIC_LOCK
1606 
1607 /*!
1608 @ingroup WORK_SHARING
1609 @param loc  source location information.
@param global_tid  global thread number.
1611 @param crit identity of the critical section. This could be a pointer to a lock
1612 associated with the critical section, or some other suitably unique value.
1613 
1614 Leave a critical section, releasing any lock that was held during its execution.
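A typical compiler-generated pairing, shown only for illustration (crit_name
stands for the compiler-emitted kmp_critical_name object):
@code
__kmpc_critical(&loc, gtid, &crit_name);
// ... code protected by the critical construct ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode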
1615 */
1616 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1617                          kmp_critical_name *crit) {
1618   kmp_user_lock_p lck;
1619 
1620   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1621 
1622 #if KMP_USE_DYNAMIC_LOCK
1623   int locktag = KMP_EXTRACT_D_TAG(crit);
1624   if (locktag) {
1625     lck = (kmp_user_lock_p)crit;
1626     KMP_ASSERT(lck != NULL);
1627     if (__kmp_env_consistency_check) {
1628       __kmp_pop_sync(global_tid, ct_critical, loc);
1629     }
1630 #if USE_ITT_BUILD
1631     __kmp_itt_critical_releasing(lck);
1632 #endif
1633 #if KMP_USE_INLINED_TAS
1634     if (locktag == locktag_tas && !__kmp_env_consistency_check) {
1635       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1636     } else
1637 #elif KMP_USE_INLINED_FUTEX
1638     if (locktag == locktag_futex && !__kmp_env_consistency_check) {
1639       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1640     } else
1641 #endif
1642     {
1643       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1644     }
1645   } else {
1646     kmp_indirect_lock_t *ilk =
1647         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1648     KMP_ASSERT(ilk != NULL);
1649     lck = ilk->lock;
1650     if (__kmp_env_consistency_check) {
1651       __kmp_pop_sync(global_tid, ct_critical, loc);
1652     }
1653 #if USE_ITT_BUILD
1654     __kmp_itt_critical_releasing(lck);
1655 #endif
1656     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1657   }
1658 
1659 #else // KMP_USE_DYNAMIC_LOCK
1660 
1661   if ((__kmp_user_lock_kind == lk_tas) &&
1662       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1663     lck = (kmp_user_lock_p)crit;
1664   }
1665 #if KMP_USE_FUTEX
1666   else if ((__kmp_user_lock_kind == lk_futex) &&
1667            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1668     lck = (kmp_user_lock_p)crit;
1669   }
1670 #endif
1671   else { // ticket, queuing or drdpa
1672     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1673   }
1674 
1675   KMP_ASSERT(lck != NULL);
1676 
1677   if (__kmp_env_consistency_check)
1678     __kmp_pop_sync(global_tid, ct_critical, loc);
1679 
1680 #if USE_ITT_BUILD
1681   __kmp_itt_critical_releasing(lck);
1682 #endif /* USE_ITT_BUILD */
1683   // Value of 'crit' should be good for using as a critical_id of the critical
1684   // section directive.
1685   __kmp_release_user_lock_with_checks(lck, global_tid);
1686 
1687 #endif // KMP_USE_DYNAMIC_LOCK
1688 
1689 #if OMPT_SUPPORT && OMPT_OPTIONAL
1690   /* OMPT release event triggers after lock is released; place here to trigger
1691    * for all #if branches */
1692   OMPT_STORE_RETURN_ADDRESS(global_tid);
1693   if (ompt_enabled.ompt_callback_mutex_released) {
1694     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1695         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1696         OMPT_LOAD_RETURN_ADDRESS(0));
1697   }
1698 #endif
1699 
1700   KMP_POP_PARTITIONED_TIMER();
1701   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1702 }
1703 
1704 /*!
1705 @ingroup SYNCHRONIZATION
1706 @param loc source location information
1707 @param global_tid thread id.
1708 @return one if the thread should execute the master block, zero otherwise
1709 
1710 Start execution of a combined barrier and master. The barrier is executed inside
1711 this function.
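An illustrative, compiler-dependent call sequence:
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // executed only by the thread selected at the barrier
  __kmpc_end_barrier_master(&loc, gtid);
}
@endcode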
1712 */
1713 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1714   int status;
1715   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1716   __kmp_assert_valid_gtid(global_tid);
1717 
1718   if (!TCR_4(__kmp_init_parallel))
1719     __kmp_parallel_initialize();
1720 
1721   __kmp_resume_if_soft_paused();
1722 
1723   if (__kmp_env_consistency_check)
1724     __kmp_check_barrier(global_tid, ct_barrier, loc);
1725 
1726 #if OMPT_SUPPORT
1727   ompt_frame_t *ompt_frame;
1728   if (ompt_enabled.enabled) {
1729     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1730     if (ompt_frame->enter_frame.ptr == NULL)
1731       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1732   }
1733   OMPT_STORE_RETURN_ADDRESS(global_tid);
1734 #endif
1735 #if USE_ITT_NOTIFY
1736   __kmp_threads[global_tid]->th.th_ident = loc;
1737 #endif
1738   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1739 #if OMPT_SUPPORT && OMPT_OPTIONAL
1740   if (ompt_enabled.enabled) {
1741     ompt_frame->enter_frame = ompt_data_none;
1742   }
1743 #endif
1744 
1745   return (status != 0) ? 0 : 1;
1746 }
1747 
1748 /*!
1749 @ingroup SYNCHRONIZATION
1750 @param loc source location information
1751 @param global_tid thread id.
1752 
1753 Complete the execution of a combined barrier and master. This function should
1754 only be called at the completion of the <tt>master</tt> code. Other threads will
1755 still be waiting at the barrier and this call releases them.
1756 */
1757 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1758   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1759   __kmp_assert_valid_gtid(global_tid);
1760   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1761 }
1762 
1763 /*!
1764 @ingroup SYNCHRONIZATION
1765 @param loc source location information
1766 @param global_tid thread id.
1767 @return one if the thread should execute the master block, zero otherwise
1768 
1769 Start execution of a combined barrier and master(nowait) construct.
1770 The barrier is executed inside this function.
There is no equivalent "end" function, since the bookkeeping normally done by
__kmpc_end_master is performed inside this call and no releasing barrier
follows the master block.
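An illustrative, compiler-dependent call sequence:
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // executed only by the selected thread; there is no matching end call
}
@endcode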
1772 */
1773 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1774   kmp_int32 ret;
1775   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1776   __kmp_assert_valid_gtid(global_tid);
1777 
1778   if (!TCR_4(__kmp_init_parallel))
1779     __kmp_parallel_initialize();
1780 
1781   __kmp_resume_if_soft_paused();
1782 
1783   if (__kmp_env_consistency_check) {
1784     if (loc == 0) {
1785       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1786     }
1787     __kmp_check_barrier(global_tid, ct_barrier, loc);
1788   }
1789 
1790 #if OMPT_SUPPORT
1791   ompt_frame_t *ompt_frame;
1792   if (ompt_enabled.enabled) {
1793     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1794     if (ompt_frame->enter_frame.ptr == NULL)
1795       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1796   }
1797   OMPT_STORE_RETURN_ADDRESS(global_tid);
1798 #endif
1799 #if USE_ITT_NOTIFY
1800   __kmp_threads[global_tid]->th.th_ident = loc;
1801 #endif
1802   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1803 #if OMPT_SUPPORT && OMPT_OPTIONAL
1804   if (ompt_enabled.enabled) {
1805     ompt_frame->enter_frame = ompt_data_none;
1806   }
1807 #endif
1808 
1809   ret = __kmpc_master(loc, global_tid);
1810 
1811   if (__kmp_env_consistency_check) {
1812     /*  there's no __kmpc_end_master called; so the (stats) */
1813     /*  actions of __kmpc_end_master are done here          */
1814     if (ret) {
1815       /* only one thread should do the pop since only */
1816       /* one did the push (see __kmpc_master())       */
1817       __kmp_pop_sync(global_tid, ct_master, loc);
1818     }
1819   }
1820 
1821   return (ret);
1822 }
1823 
1824 /* The BARRIER for a SINGLE process section is always explicit   */
1825 /*!
1826 @ingroup WORK_SHARING
1827 @param loc  source location information
1828 @param global_tid  global thread number
1829 @return One if this thread should execute the single construct, zero otherwise.
1830 
1831 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in either of the two "single" calls; rather,
the compiler should introduce an explicit barrier if one is required.
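An illustrative, compiler-dependent lowering of a single construct:
@code
if (__kmpc_single(&loc, gtid)) {
  // body of the single region
  __kmpc_end_single(&loc, gtid);
}
// The compiler emits an explicit barrier here unless nowait was specified.
@endcode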
1834 */
1835 
1836 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1837   __kmp_assert_valid_gtid(global_tid);
1838   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1839 
1840   if (rc) {
1841     // We are going to execute the single statement, so we should count it.
1842     KMP_COUNT_BLOCK(OMP_SINGLE);
1843     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1844   }
1845 
1846 #if OMPT_SUPPORT && OMPT_OPTIONAL
1847   kmp_info_t *this_thr = __kmp_threads[global_tid];
1848   kmp_team_t *team = this_thr->th.th_team;
1849   int tid = __kmp_tid_from_gtid(global_tid);
1850 
1851   if (ompt_enabled.enabled) {
1852     if (rc) {
1853       if (ompt_enabled.ompt_callback_work) {
1854         ompt_callbacks.ompt_callback(ompt_callback_work)(
1855             ompt_work_single_executor, ompt_scope_begin,
1856             &(team->t.ompt_team_info.parallel_data),
1857             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1858             1, OMPT_GET_RETURN_ADDRESS(0));
1859       }
1860     } else {
1861       if (ompt_enabled.ompt_callback_work) {
1862         ompt_callbacks.ompt_callback(ompt_callback_work)(
1863             ompt_work_single_other, ompt_scope_begin,
1864             &(team->t.ompt_team_info.parallel_data),
1865             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1866             1, OMPT_GET_RETURN_ADDRESS(0));
1867         ompt_callbacks.ompt_callback(ompt_callback_work)(
1868             ompt_work_single_other, ompt_scope_end,
1869             &(team->t.ompt_team_info.parallel_data),
1870             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1871             1, OMPT_GET_RETURN_ADDRESS(0));
1872       }
1873     }
1874   }
1875 #endif
1876 
1877   return rc;
1878 }
1879 
1880 /*!
1881 @ingroup WORK_SHARING
1882 @param loc  source location information
1883 @param global_tid  global thread number
1884 
1885 Mark the end of a <tt>single</tt> construct.  This function should
1886 only be called by the thread that executed the block of code protected
1887 by the `single` construct.
1888 */
1889 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1890   __kmp_assert_valid_gtid(global_tid);
1891   __kmp_exit_single(global_tid);
1892   KMP_POP_PARTITIONED_TIMER();
1893 
1894 #if OMPT_SUPPORT && OMPT_OPTIONAL
1895   kmp_info_t *this_thr = __kmp_threads[global_tid];
1896   kmp_team_t *team = this_thr->th.th_team;
1897   int tid = __kmp_tid_from_gtid(global_tid);
1898 
1899   if (ompt_enabled.ompt_callback_work) {
1900     ompt_callbacks.ompt_callback(ompt_callback_work)(
1901         ompt_work_single_executor, ompt_scope_end,
1902         &(team->t.ompt_team_info.parallel_data),
1903         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1904         OMPT_GET_RETURN_ADDRESS(0));
1905   }
1906 #endif
1907 }
1908 
1909 /*!
1910 @ingroup WORK_SHARING
1911 @param loc Source location
1912 @param global_tid Global thread id
1913 
1914 Mark the end of a statically scheduled loop.
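An illustrative pairing with the matching init call (compiler-dependent; the
_4 suffix, the bounds, and body() are only an example):
@code
kmp_int32 lower = 0, upper = 99, stride = 1, last = 0;
__kmpc_for_static_init_4(&loc, gtid, kmp_sch_static, &last, &lower, &upper,
                         &stride, 1, 0);
for (kmp_int32 i = lower; i <= upper; ++i)
  body(i);
__kmpc_for_static_fini(&loc, gtid);
@endcode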
1915 */
1916 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1917   KMP_POP_PARTITIONED_TIMER();
1918   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1919 
1920 #if OMPT_SUPPORT && OMPT_OPTIONAL
1921   if (ompt_enabled.ompt_callback_work) {
1922     ompt_work_t ompt_work_type = ompt_work_loop;
1923     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1924     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1925     // Determine workshare type
1926     if (loc != NULL) {
1927       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1928         ompt_work_type = ompt_work_loop;
1929       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1930         ompt_work_type = ompt_work_sections;
1931       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1932         ompt_work_type = ompt_work_distribute;
1933       } else {
1934         // use default set above.
1935         // a warning about this case is provided in __kmpc_for_static_init
1936       }
1937       KMP_DEBUG_ASSERT(ompt_work_type);
1938     }
1939     ompt_callbacks.ompt_callback(ompt_callback_work)(
1940         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1941         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1942   }
1943 #endif
1944   if (__kmp_env_consistency_check)
1945     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1946 }
1947 
1948 // User routines which take C-style arguments (call by value)
1949 // different from the Fortran equivalent routines
1950 
1951 void ompc_set_num_threads(int arg) {
1952   // !!!!! TODO: check the per-task binding
1953   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1954 }
1955 
1956 void ompc_set_dynamic(int flag) {
1957   kmp_info_t *thread;
1958 
1959   /* For the thread-private implementation of the internal controls */
1960   thread = __kmp_entry_thread();
1961 
1962   __kmp_save_internal_controls(thread);
1963 
1964   set__dynamic(thread, flag ? true : false);
1965 }
1966 
1967 void ompc_set_nested(int flag) {
1968   kmp_info_t *thread;
1969 
1970   /* For the thread-private internal controls implementation */
1971   thread = __kmp_entry_thread();
1972 
1973   __kmp_save_internal_controls(thread);
1974 
1975   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1976 }
1977 
1978 void ompc_set_max_active_levels(int max_active_levels) {
1979   /* TO DO */
1980   /* we want per-task implementation of this internal control */
1981 
1982   /* For the per-thread internal controls implementation */
1983   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1984 }
1985 
1986 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1987   // !!!!! TODO: check the per-task binding
1988   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1989 }
1990 
1991 int ompc_get_ancestor_thread_num(int level) {
1992   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1993 }
1994 
1995 int ompc_get_team_size(int level) {
1996   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1997 }
1998 
1999 /* OpenMP 5.0 Affinity Format API */
2000 void KMP_EXPAND_NAME(ompc_set_affinity_format)(char const *format) {
2001   if (!__kmp_init_serial) {
2002     __kmp_serial_initialize();
2003   }
2004   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
2005                          format, KMP_STRLEN(format) + 1);
2006 }
2007 
2008 size_t KMP_EXPAND_NAME(ompc_get_affinity_format)(char *buffer, size_t size) {
2009   size_t format_size;
2010   if (!__kmp_init_serial) {
2011     __kmp_serial_initialize();
2012   }
2013   format_size = KMP_STRLEN(__kmp_affinity_format);
2014   if (buffer && size) {
2015     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
2016                            format_size + 1);
2017   }
2018   return format_size;
2019 }
2020 
2021 void KMP_EXPAND_NAME(ompc_display_affinity)(char const *format) {
2022   int gtid;
2023   if (!TCR_4(__kmp_init_middle)) {
2024     __kmp_middle_initialize();
2025   }
2026   __kmp_assign_root_init_mask();
2027   gtid = __kmp_get_gtid();
2028   __kmp_aux_display_affinity(gtid, format);
2029 }
2030 
2031 size_t KMP_EXPAND_NAME(ompc_capture_affinity)(char *buffer, size_t buf_size,
2032                                               char const *format) {
2033   int gtid;
2034   size_t num_required;
2035   kmp_str_buf_t capture_buf;
2036   if (!TCR_4(__kmp_init_middle)) {
2037     __kmp_middle_initialize();
2038   }
2039   __kmp_assign_root_init_mask();
2040   gtid = __kmp_get_gtid();
2041   __kmp_str_buf_init(&capture_buf);
2042   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
2043   if (buffer && buf_size) {
2044     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
2045                            capture_buf.used + 1);
2046   }
2047   __kmp_str_buf_free(&capture_buf);
2048   return num_required;
2049 }
2050 
2051 void kmpc_set_stacksize(int arg) {
2052   // __kmp_aux_set_stacksize initializes the library if needed
2053   __kmp_aux_set_stacksize(arg);
2054 }
2055 
2056 void kmpc_set_stacksize_s(size_t arg) {
2057   // __kmp_aux_set_stacksize initializes the library if needed
2058   __kmp_aux_set_stacksize(arg);
2059 }
2060 
2061 void kmpc_set_blocktime(int arg) {
2062   int gtid, tid;
2063   kmp_info_t *thread;
2064 
2065   gtid = __kmp_entry_gtid();
2066   tid = __kmp_tid_from_gtid(gtid);
2067   thread = __kmp_thread_from_gtid(gtid);
2068 
2069   __kmp_aux_set_blocktime(arg, thread, tid);
2070 }
2071 
2072 void kmpc_set_library(int arg) {
2073   // __kmp_user_set_library initializes the library if needed
2074   __kmp_user_set_library((enum library_type)arg);
2075 }
2076 
2077 void kmpc_set_defaults(char const *str) {
2078   // __kmp_aux_set_defaults initializes the library if needed
2079   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
2080 }
2081 
2082 void kmpc_set_disp_num_buffers(int arg) {
2083   // ignore after initialization because some teams have already
2084   // allocated dispatch buffers
2085   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
2086       arg <= KMP_MAX_DISP_NUM_BUFF) {
2087     __kmp_dispatch_num_buffers = arg;
2088   }
2089 }
2090 
2091 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
2092 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2093   return -1;
2094 #else
2095   if (!TCR_4(__kmp_init_middle)) {
2096     __kmp_middle_initialize();
2097   }
2098   __kmp_assign_root_init_mask();
2099   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2100 #endif
2101 }
2102 
2103 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2104 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2105   return -1;
2106 #else
2107   if (!TCR_4(__kmp_init_middle)) {
2108     __kmp_middle_initialize();
2109   }
2110   __kmp_assign_root_init_mask();
2111   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2112 #endif
2113 }
2114 
2115 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2116 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2117   return -1;
2118 #else
2119   if (!TCR_4(__kmp_init_middle)) {
2120     __kmp_middle_initialize();
2121   }
2122   __kmp_assign_root_init_mask();
2123   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2124 #endif
2125 }
2126 
2127 /* -------------------------------------------------------------------------- */
2128 /*!
2129 @ingroup THREADPRIVATE
2130 @param loc       source location information
2131 @param gtid      global thread number
2132 @param cpy_size  size of the cpy_data buffer
2133 @param cpy_data  pointer to data to be copied
2134 @param cpy_func  helper function to call for copying data
2135 @param didit     flag variable: 1=single thread; 0=not single thread
2136 
2137 __kmpc_copyprivate implements the interface for the private data broadcast
2138 needed for the copyprivate clause associated with a single region in an
2139 OpenMP<sup>*</sup> program (both C and Fortran).
2140 All threads participating in the parallel region call this routine.
2141 One of the threads (called the single thread) should have the <tt>didit</tt>
2142 variable set to 1 and all other threads should have that variable set to 0.
2143 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2144 
2145 The OpenMP specification forbids the use of nowait on the single region when a
2146 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2147 barrier internally to avoid race conditions, so the code generation for the
2148 single region should avoid generating a barrier after the call to @ref
2149 __kmpc_copyprivate.
2150 
2151 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2152 The <tt>loc</tt> parameter is a pointer to source location information.
2153 
Internal implementation: the single thread first copies its descriptor address
(cpy_data) to a team-private location; each of the other threads then calls
the function pointed to by cpy_func, passing its own cpy_data buffer as the
destination and the stored buffer of the single thread as the source.
2158 
2159 The cpy_func routine used for the copy and the contents of the data area defined
2160 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2161 to be done. For instance, the cpy_data buffer can hold the actual data to be
2162 copied or it may hold a list of pointers to the data. The cpy_func routine must
2163 interpret the cpy_data buffer appropriately.
2164 
2165 The interface to cpy_func is as follows:
2166 @code
2167 void cpy_func( void *destination, void *source )
2168 @endcode
2169 where void *destination is the cpy_data pointer for the thread being copied to
2170 and void *source is the cpy_data pointer for the thread being copied from.
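For example, a hypothetical cpy_func generated for a single int variable, where
each thread's cpy_data points directly at its copy of the variable (real
compilers typically pass a structure of pointers instead), could be:
@code
static void copy_one_int(void *destination, void *source) {
  *(int *)destination = *(int *)source;
}
@endcode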
2171 */
2172 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2173                         void *cpy_data, void (*cpy_func)(void *, void *),
2174                         kmp_int32 didit) {
2175   void **data_ptr;
2176   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2177   __kmp_assert_valid_gtid(gtid);
2178 
2179   KMP_MB();
2180 
2181   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2182 
2183   if (__kmp_env_consistency_check) {
2184     if (loc == 0) {
2185       KMP_WARNING(ConstructIdentInvalid);
2186     }
2187   }
2188 
2189   // ToDo: Optimize the following two barriers into some kind of split barrier
2190 
2191   if (didit)
2192     *data_ptr = cpy_data;
2193 
2194 #if OMPT_SUPPORT
2195   ompt_frame_t *ompt_frame;
2196   if (ompt_enabled.enabled) {
2197     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2198     if (ompt_frame->enter_frame.ptr == NULL)
2199       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2200   }
2201   OMPT_STORE_RETURN_ADDRESS(gtid);
2202 #endif
2203 /* This barrier is not a barrier region boundary */
2204 #if USE_ITT_NOTIFY
2205   __kmp_threads[gtid]->th.th_ident = loc;
2206 #endif
2207   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2208 
2209   if (!didit)
2210     (*cpy_func)(cpy_data, *data_ptr);
2211 
2212   // Consider next barrier a user-visible barrier for barrier region boundaries
2213   // Nesting checks are already handled by the single construct checks
2214   {
2215 #if OMPT_SUPPORT
2216     OMPT_STORE_RETURN_ADDRESS(gtid);
2217 #endif
2218 #if USE_ITT_NOTIFY
    // TODO: check if it is needed (e.g. tasks can overwrite the location)
    __kmp_threads[gtid]->th.th_ident = loc;
2221 #endif
2222     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2223 #if OMPT_SUPPORT && OMPT_OPTIONAL
2224     if (ompt_enabled.enabled) {
2225       ompt_frame->enter_frame = ompt_data_none;
2226     }
2227 #endif
2228   }
2229 }
2230 
2231 /* -------------------------------------------------------------------------- */
2232 
2233 #define INIT_LOCK __kmp_init_user_lock_with_checks
2234 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2235 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2236 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2237 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2238 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2239   __kmp_acquire_nested_user_lock_with_checks_timed
2240 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2241 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2242 #define TEST_LOCK __kmp_test_user_lock_with_checks
2243 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2244 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2245 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2246 
2247 // TODO: Make check abort messages use location info & pass it into
2248 // with_checks routines
2249 
2250 #if KMP_USE_DYNAMIC_LOCK
2251 
2252 // internal lock initializer
2253 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2254                                                     kmp_dyna_lockseq_t seq) {
2255   if (KMP_IS_D_LOCK(seq)) {
2256     KMP_INIT_D_LOCK(lock, seq);
2257 #if USE_ITT_BUILD
2258     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2259 #endif
2260   } else {
2261     KMP_INIT_I_LOCK(lock, seq);
2262 #if USE_ITT_BUILD
2263     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2264     __kmp_itt_lock_creating(ilk->lock, loc);
2265 #endif
2266   }
2267 }
2268 
2269 // internal nest lock initializer
2270 static __forceinline void
2271 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2272                                kmp_dyna_lockseq_t seq) {
2273 #if KMP_USE_TSX
  // There is no nested lock implementation for speculative locks
2275   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2276       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2277     seq = __kmp_user_lock_seq;
2278 #endif
2279   switch (seq) {
2280   case lockseq_tas:
2281     seq = lockseq_nested_tas;
2282     break;
2283 #if KMP_USE_FUTEX
2284   case lockseq_futex:
2285     seq = lockseq_nested_futex;
2286     break;
2287 #endif
2288   case lockseq_ticket:
2289     seq = lockseq_nested_ticket;
2290     break;
2291   case lockseq_queuing:
2292     seq = lockseq_nested_queuing;
2293     break;
2294   case lockseq_drdpa:
2295     seq = lockseq_nested_drdpa;
2296     break;
2297   default:
2298     seq = lockseq_nested_queuing;
2299   }
2300   KMP_INIT_I_LOCK(lock, seq);
2301 #if USE_ITT_BUILD
2302   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2303   __kmp_itt_lock_creating(ilk->lock, loc);
2304 #endif
2305 }
2306 
2307 /* initialize the lock with a hint */
2308 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2309                                 uintptr_t hint) {
2310   KMP_DEBUG_ASSERT(__kmp_init_serial);
2311   if (__kmp_env_consistency_check && user_lock == NULL) {
2312     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2313   }
2314 
2315   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2316 
2317 #if OMPT_SUPPORT && OMPT_OPTIONAL
2318   // This is the case, if called from omp_init_lock_with_hint:
2319   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2320   if (!codeptr)
2321     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2322   if (ompt_enabled.ompt_callback_lock_init) {
2323     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2324         ompt_mutex_lock, (omp_lock_hint_t)hint,
2325         __ompt_get_mutex_impl_type(user_lock),
2326         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2327   }
2328 #endif
2329 }
2330 
2331 /* initialize the lock with a hint */
2332 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2333                                      void **user_lock, uintptr_t hint) {
2334   KMP_DEBUG_ASSERT(__kmp_init_serial);
2335   if (__kmp_env_consistency_check && user_lock == NULL) {
2336     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2337   }
2338 
2339   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2340 
2341 #if OMPT_SUPPORT && OMPT_OPTIONAL
2342   // This is the case, if called from omp_init_lock_with_hint:
2343   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2344   if (!codeptr)
2345     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2346   if (ompt_enabled.ompt_callback_lock_init) {
2347     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2348         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2349         __ompt_get_mutex_impl_type(user_lock),
2350         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2351   }
2352 #endif
2353 }
2354 
2355 #endif // KMP_USE_DYNAMIC_LOCK
2356 
2357 /* initialize the lock */
2358 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2359 #if KMP_USE_DYNAMIC_LOCK
2360 
2361   KMP_DEBUG_ASSERT(__kmp_init_serial);
2362   if (__kmp_env_consistency_check && user_lock == NULL) {
2363     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2364   }
2365   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2366 
2367 #if OMPT_SUPPORT && OMPT_OPTIONAL
2368   // This is the case, if called from omp_init_lock_with_hint:
2369   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2370   if (!codeptr)
2371     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2372   if (ompt_enabled.ompt_callback_lock_init) {
2373     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2374         ompt_mutex_lock, omp_lock_hint_none,
2375         __ompt_get_mutex_impl_type(user_lock),
2376         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2377   }
2378 #endif
2379 
2380 #else // KMP_USE_DYNAMIC_LOCK
2381 
2382   static char const *const func = "omp_init_lock";
2383   kmp_user_lock_p lck;
2384   KMP_DEBUG_ASSERT(__kmp_init_serial);
2385 
2386   if (__kmp_env_consistency_check) {
2387     if (user_lock == NULL) {
2388       KMP_FATAL(LockIsUninitialized, func);
2389     }
2390   }
2391 
2392   KMP_CHECK_USER_LOCK_INIT();
2393 
2394   if ((__kmp_user_lock_kind == lk_tas) &&
2395       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2396     lck = (kmp_user_lock_p)user_lock;
2397   }
2398 #if KMP_USE_FUTEX
2399   else if ((__kmp_user_lock_kind == lk_futex) &&
2400            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2401     lck = (kmp_user_lock_p)user_lock;
2402   }
2403 #endif
2404   else {
2405     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2406   }
2407   INIT_LOCK(lck);
2408   __kmp_set_user_lock_location(lck, loc);
2409 
2410 #if OMPT_SUPPORT && OMPT_OPTIONAL
2411   // This is the case, if called from omp_init_lock_with_hint:
2412   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2413   if (!codeptr)
2414     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2415   if (ompt_enabled.ompt_callback_lock_init) {
2416     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2417         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2418         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2419   }
2420 #endif
2421 
2422 #if USE_ITT_BUILD
2423   __kmp_itt_lock_creating(lck);
2424 #endif /* USE_ITT_BUILD */
2425 
2426 #endif // KMP_USE_DYNAMIC_LOCK
2427 } // __kmpc_init_lock
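
// Illustrative user-level lifecycle of a plain lock (standard OpenMP API);
// depending on the compiler, such calls may be lowered onto the __kmpc_*_lock
// entry points in this file:
//   omp_lock_t l;
//   omp_init_lock(&l);
//   omp_set_lock(&l);
//   // ... exclusive work ...
//   omp_unset_lock(&l);
//   omp_destroy_lock(&l);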
2428 
2429 /* initialize the lock */
2430 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2431 #if KMP_USE_DYNAMIC_LOCK
2432 
2433   KMP_DEBUG_ASSERT(__kmp_init_serial);
2434   if (__kmp_env_consistency_check && user_lock == NULL) {
2435     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2436   }
2437   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2438 
2439 #if OMPT_SUPPORT && OMPT_OPTIONAL
2440   // This is the case, if called from omp_init_lock_with_hint:
2441   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2442   if (!codeptr)
2443     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2444   if (ompt_enabled.ompt_callback_lock_init) {
2445     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2446         ompt_mutex_nest_lock, omp_lock_hint_none,
2447         __ompt_get_mutex_impl_type(user_lock),
2448         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2449   }
2450 #endif
2451 
2452 #else // KMP_USE_DYNAMIC_LOCK
2453 
2454   static char const *const func = "omp_init_nest_lock";
2455   kmp_user_lock_p lck;
2456   KMP_DEBUG_ASSERT(__kmp_init_serial);
2457 
2458   if (__kmp_env_consistency_check) {
2459     if (user_lock == NULL) {
2460       KMP_FATAL(LockIsUninitialized, func);
2461     }
2462   }
2463 
2464   KMP_CHECK_USER_LOCK_INIT();
2465 
2466   if ((__kmp_user_lock_kind == lk_tas) &&
2467       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2468        OMP_NEST_LOCK_T_SIZE)) {
2469     lck = (kmp_user_lock_p)user_lock;
2470   }
2471 #if KMP_USE_FUTEX
2472   else if ((__kmp_user_lock_kind == lk_futex) &&
2473            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2474             OMP_NEST_LOCK_T_SIZE)) {
2475     lck = (kmp_user_lock_p)user_lock;
2476   }
2477 #endif
2478   else {
2479     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2480   }
2481 
2482   INIT_NESTED_LOCK(lck);
2483   __kmp_set_user_lock_location(lck, loc);
2484 
2485 #if OMPT_SUPPORT && OMPT_OPTIONAL
2486   // This is the case, if called from omp_init_lock_with_hint:
2487   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2488   if (!codeptr)
2489     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2490   if (ompt_enabled.ompt_callback_lock_init) {
2491     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2492         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2493         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2494   }
2495 #endif
2496 
2497 #if USE_ITT_BUILD
2498   __kmp_itt_lock_creating(lck);
2499 #endif /* USE_ITT_BUILD */
2500 
2501 #endif // KMP_USE_DYNAMIC_LOCK
2502 } // __kmpc_init_nest_lock
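
// Illustrative user-level lifecycle of a nestable lock (standard OpenMP API);
// the owning thread may re-acquire the lock, and it becomes available again
// only when the nesting count returns to zero:
//   omp_nest_lock_t nl;
//   omp_init_nest_lock(&nl);
//   omp_set_nest_lock(&nl);
//   omp_set_nest_lock(&nl);   // re-acquisition by the owning thread
//   omp_unset_nest_lock(&nl);
//   omp_unset_nest_lock(&nl); // lock released here
//   omp_destroy_nest_lock(&nl);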
2503 
2504 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2505 #if KMP_USE_DYNAMIC_LOCK
2506 
2507 #if USE_ITT_BUILD
2508   kmp_user_lock_p lck;
2509   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2510     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2511   } else {
2512     lck = (kmp_user_lock_p)user_lock;
2513   }
2514   __kmp_itt_lock_destroyed(lck);
2515 #endif
2516 #if OMPT_SUPPORT && OMPT_OPTIONAL
2517   // This is the case, if called from omp_init_lock_with_hint:
2518   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2519   if (!codeptr)
2520     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2521   if (ompt_enabled.ompt_callback_lock_destroy) {
2522     kmp_user_lock_p lck;
2523     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2524       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2525     } else {
2526       lck = (kmp_user_lock_p)user_lock;
2527     }
2528     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2529         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2530   }
2531 #endif
2532   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2533 #else
2534   kmp_user_lock_p lck;
2535 
2536   if ((__kmp_user_lock_kind == lk_tas) &&
2537       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2538     lck = (kmp_user_lock_p)user_lock;
2539   }
2540 #if KMP_USE_FUTEX
2541   else if ((__kmp_user_lock_kind == lk_futex) &&
2542            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2543     lck = (kmp_user_lock_p)user_lock;
2544   }
2545 #endif
2546   else {
2547     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2548   }
2549 
2550 #if OMPT_SUPPORT && OMPT_OPTIONAL
2551   // This is the case, if called from omp_init_lock_with_hint:
2552   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2553   if (!codeptr)
2554     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2555   if (ompt_enabled.ompt_callback_lock_destroy) {
2556     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2557         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2558   }
2559 #endif
2560 
2561 #if USE_ITT_BUILD
2562   __kmp_itt_lock_destroyed(lck);
2563 #endif /* USE_ITT_BUILD */
2564   DESTROY_LOCK(lck);
2565 
2566   if ((__kmp_user_lock_kind == lk_tas) &&
2567       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2568     ;
2569   }
2570 #if KMP_USE_FUTEX
2571   else if ((__kmp_user_lock_kind == lk_futex) &&
2572            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2573     ;
2574   }
2575 #endif
2576   else {
2577     __kmp_user_lock_free(user_lock, gtid, lck);
2578   }
2579 #endif // KMP_USE_DYNAMIC_LOCK
2580 } // __kmpc_destroy_lock
2581 
2582 /* destroy the lock */
2583 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2584 #if KMP_USE_DYNAMIC_LOCK
2585 
2586 #if USE_ITT_BUILD
2587   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2588   __kmp_itt_lock_destroyed(ilk->lock);
2589 #endif
2590 #if OMPT_SUPPORT && OMPT_OPTIONAL
2591   // This is the case, if called from omp_init_lock_with_hint:
2592   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2593   if (!codeptr)
2594     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2595   if (ompt_enabled.ompt_callback_lock_destroy) {
2596     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2597         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2598   }
2599 #endif
2600   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2601 
2602 #else // KMP_USE_DYNAMIC_LOCK
2603 
2604   kmp_user_lock_p lck;
2605 
2606   if ((__kmp_user_lock_kind == lk_tas) &&
2607       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2608        OMP_NEST_LOCK_T_SIZE)) {
2609     lck = (kmp_user_lock_p)user_lock;
2610   }
2611 #if KMP_USE_FUTEX
2612   else if ((__kmp_user_lock_kind == lk_futex) &&
2613            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2614             OMP_NEST_LOCK_T_SIZE)) {
2615     lck = (kmp_user_lock_p)user_lock;
2616   }
2617 #endif
2618   else {
2619     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2620   }
2621 
2622 #if OMPT_SUPPORT && OMPT_OPTIONAL
2623   // This is the case, if called from omp_init_lock_with_hint:
2624   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2625   if (!codeptr)
2626     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2627   if (ompt_enabled.ompt_callback_lock_destroy) {
2628     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2629         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2630   }
2631 #endif
2632 
2633 #if USE_ITT_BUILD
2634   __kmp_itt_lock_destroyed(lck);
2635 #endif /* USE_ITT_BUILD */
2636 
2637   DESTROY_NESTED_LOCK(lck);
2638 
2639   if ((__kmp_user_lock_kind == lk_tas) &&
2640       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2641        OMP_NEST_LOCK_T_SIZE)) {
2642     ;
2643   }
2644 #if KMP_USE_FUTEX
2645   else if ((__kmp_user_lock_kind == lk_futex) &&
2646            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2647             OMP_NEST_LOCK_T_SIZE)) {
2648     ;
2649   }
2650 #endif
2651   else {
2652     __kmp_user_lock_free(user_lock, gtid, lck);
2653   }
2654 #endif // KMP_USE_DYNAMIC_LOCK
2655 } // __kmpc_destroy_nest_lock
2656 
2657 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2658   KMP_COUNT_BLOCK(OMP_set_lock);
2659 #if KMP_USE_DYNAMIC_LOCK
2660   int tag = KMP_EXTRACT_D_TAG(user_lock);
2661 #if USE_ITT_BUILD
2662   __kmp_itt_lock_acquiring(
2663       (kmp_user_lock_p)
2664           user_lock); // itt function will get to the right lock object.
2665 #endif
2666 #if OMPT_SUPPORT && OMPT_OPTIONAL
2667   // This is the case, if called from omp_init_lock_with_hint:
2668   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2669   if (!codeptr)
2670     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2671   if (ompt_enabled.ompt_callback_mutex_acquire) {
2672     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2673         ompt_mutex_lock, omp_lock_hint_none,
2674         __ompt_get_mutex_impl_type(user_lock),
2675         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2676   }
2677 #endif
2678 #if KMP_USE_INLINED_TAS
2679   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2680     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2681   } else
2682 #elif KMP_USE_INLINED_FUTEX
2683   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2684     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2685   } else
2686 #endif
2687   {
2688     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2689   }
2690 #if USE_ITT_BUILD
2691   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2692 #endif
2693 #if OMPT_SUPPORT && OMPT_OPTIONAL
2694   if (ompt_enabled.ompt_callback_mutex_acquired) {
2695     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2696         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2697   }
2698 #endif
2699 
2700 #else // KMP_USE_DYNAMIC_LOCK
2701 
2702   kmp_user_lock_p lck;
2703 
2704   if ((__kmp_user_lock_kind == lk_tas) &&
2705       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2706     lck = (kmp_user_lock_p)user_lock;
2707   }
2708 #if KMP_USE_FUTEX
2709   else if ((__kmp_user_lock_kind == lk_futex) &&
2710            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2711     lck = (kmp_user_lock_p)user_lock;
2712   }
2713 #endif
2714   else {
2715     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2716   }
2717 
2718 #if USE_ITT_BUILD
2719   __kmp_itt_lock_acquiring(lck);
2720 #endif /* USE_ITT_BUILD */
2721 #if OMPT_SUPPORT && OMPT_OPTIONAL
2722   // This is the case, if called from omp_init_lock_with_hint:
2723   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2724   if (!codeptr)
2725     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2726   if (ompt_enabled.ompt_callback_mutex_acquire) {
2727     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2728         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2729         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2730   }
2731 #endif
2732 
2733   ACQUIRE_LOCK(lck, gtid);
2734 
2735 #if USE_ITT_BUILD
2736   __kmp_itt_lock_acquired(lck);
2737 #endif /* USE_ITT_BUILD */
2738 
2739 #if OMPT_SUPPORT && OMPT_OPTIONAL
2740   if (ompt_enabled.ompt_callback_mutex_acquired) {
2741     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2742         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2743   }
2744 #endif
2745 
2746 #endif // KMP_USE_DYNAMIC_LOCK
2747 }
2748 
2749 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2750 #if KMP_USE_DYNAMIC_LOCK
2751 
2752 #if USE_ITT_BUILD
2753   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2754 #endif
2755 #if OMPT_SUPPORT && OMPT_OPTIONAL
2756   // This is the case, if called from omp_init_lock_with_hint:
2757   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2758   if (!codeptr)
2759     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2760   if (ompt_enabled.enabled) {
2761     if (ompt_enabled.ompt_callback_mutex_acquire) {
2762       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2763           ompt_mutex_nest_lock, omp_lock_hint_none,
2764           __ompt_get_mutex_impl_type(user_lock),
2765           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2766     }
2767   }
2768 #endif
2769   int acquire_status =
2770       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2771   (void)acquire_status;
2772 #if USE_ITT_BUILD
2773   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2774 #endif
2775 
2776 #if OMPT_SUPPORT && OMPT_OPTIONAL
2777   if (ompt_enabled.enabled) {
2778     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2779       if (ompt_enabled.ompt_callback_mutex_acquired) {
2780         // lock_first
2781         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2782             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2783             codeptr);
2784       }
2785     } else {
2786       if (ompt_enabled.ompt_callback_nest_lock) {
2787         // lock_next
2788         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2789             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2790       }
2791     }
2792   }
2793 #endif
2794 
2795 #else // KMP_USE_DYNAMIC_LOCK
2796   int acquire_status;
2797   kmp_user_lock_p lck;
2798 
2799   if ((__kmp_user_lock_kind == lk_tas) &&
2800       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2801        OMP_NEST_LOCK_T_SIZE)) {
2802     lck = (kmp_user_lock_p)user_lock;
2803   }
2804 #if KMP_USE_FUTEX
2805   else if ((__kmp_user_lock_kind == lk_futex) &&
2806            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2807             OMP_NEST_LOCK_T_SIZE)) {
2808     lck = (kmp_user_lock_p)user_lock;
2809   }
2810 #endif
2811   else {
2812     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2813   }
2814 
2815 #if USE_ITT_BUILD
2816   __kmp_itt_lock_acquiring(lck);
2817 #endif /* USE_ITT_BUILD */
2818 #if OMPT_SUPPORT && OMPT_OPTIONAL
2819   // This is the case, if called from omp_init_lock_with_hint:
2820   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2821   if (!codeptr)
2822     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2823   if (ompt_enabled.enabled) {
2824     if (ompt_enabled.ompt_callback_mutex_acquire) {
2825       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2826           ompt_mutex_nest_lock, omp_lock_hint_none,
2827           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2828           codeptr);
2829     }
2830   }
2831 #endif
2832 
2833   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2834 
2835 #if USE_ITT_BUILD
2836   __kmp_itt_lock_acquired(lck);
2837 #endif /* USE_ITT_BUILD */
2838 
2839 #if OMPT_SUPPORT && OMPT_OPTIONAL
2840   if (ompt_enabled.enabled) {
2841     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2842       if (ompt_enabled.ompt_callback_mutex_acquired) {
2843         // lock_first
2844         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2845             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2846       }
2847     } else {
2848       if (ompt_enabled.ompt_callback_nest_lock) {
2849         // lock_next
2850         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2851             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2852       }
2853     }
2854   }
2855 #endif
2856 
2857 #endif // KMP_USE_DYNAMIC_LOCK
2858 }
2859 
2860 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2861 #if KMP_USE_DYNAMIC_LOCK
2862 
2863   int tag = KMP_EXTRACT_D_TAG(user_lock);
2864 #if USE_ITT_BUILD
2865   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2866 #endif
2867 #if KMP_USE_INLINED_TAS
2868   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2869     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2870   } else
2871 #elif KMP_USE_INLINED_FUTEX
2872   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2873     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2874   } else
2875 #endif
2876   {
2877     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2878   }
2879 
2880 #if OMPT_SUPPORT && OMPT_OPTIONAL
2881   // This is the case, if called from omp_init_lock_with_hint:
2882   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2883   if (!codeptr)
2884     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2885   if (ompt_enabled.ompt_callback_mutex_released) {
2886     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2887         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2888   }
2889 #endif
2890 
2891 #else // KMP_USE_DYNAMIC_LOCK
2892 
2893   kmp_user_lock_p lck;
2894 
2895   /* Can't use serial interval since not block structured */
2896   /* release the lock */
2897 
2898   if ((__kmp_user_lock_kind == lk_tas) &&
2899       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2900 #if KMP_OS_LINUX &&                                                            \
2901     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2902 // "fast" path implemented to fix customer performance issue
2903 #if USE_ITT_BUILD
2904     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2905 #endif /* USE_ITT_BUILD */
2906     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2907     KMP_MB();
2908 
2909 #if OMPT_SUPPORT && OMPT_OPTIONAL
2910     // This is the case, if called from omp_init_lock_with_hint:
2911     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2912     if (!codeptr)
2913       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2914     if (ompt_enabled.ompt_callback_mutex_released) {
2915       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2917     }
2918 #endif
2919 
2920     return;
2921 #else
2922     lck = (kmp_user_lock_p)user_lock;
2923 #endif
2924   }
2925 #if KMP_USE_FUTEX
2926   else if ((__kmp_user_lock_kind == lk_futex) &&
2927            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2928     lck = (kmp_user_lock_p)user_lock;
2929   }
2930 #endif
2931   else {
2932     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2933   }
2934 
2935 #if USE_ITT_BUILD
2936   __kmp_itt_lock_releasing(lck);
2937 #endif /* USE_ITT_BUILD */
2938 
2939   RELEASE_LOCK(lck, gtid);
2940 
2941 #if OMPT_SUPPORT && OMPT_OPTIONAL
2942   // This is the case, if called from omp_init_lock_with_hint:
2943   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2944   if (!codeptr)
2945     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2946   if (ompt_enabled.ompt_callback_mutex_released) {
2947     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2948         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2949   }
2950 #endif
2951 
2952 #endif // KMP_USE_DYNAMIC_LOCK
2953 }
2954 
2955 /* release the lock */
2956 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2957 #if KMP_USE_DYNAMIC_LOCK
2958 
2959 #if USE_ITT_BUILD
2960   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2961 #endif
2962   int release_status =
2963       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2964   (void)release_status;
2965 
2966 #if OMPT_SUPPORT && OMPT_OPTIONAL
2967   // This is the case, if called from omp_init_lock_with_hint:
2968   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2969   if (!codeptr)
2970     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2971   if (ompt_enabled.enabled) {
2972     if (release_status == KMP_LOCK_RELEASED) {
2973       if (ompt_enabled.ompt_callback_mutex_released) {
2974         // release_lock_last
2975         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2976             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2977             codeptr);
2978       }
2979     } else if (ompt_enabled.ompt_callback_nest_lock) {
2980       // release_lock_prev
2981       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2982           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2983     }
2984   }
2985 #endif
2986 
2987 #else // KMP_USE_DYNAMIC_LOCK
2988 
2989   kmp_user_lock_p lck;
2990 
2991   /* Can't use serial interval since not block structured */
2992 
2993   if ((__kmp_user_lock_kind == lk_tas) &&
2994       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2995        OMP_NEST_LOCK_T_SIZE)) {
2996 #if KMP_OS_LINUX &&                                                            \
2997     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2998     // "fast" path implemented to fix customer performance issue
2999     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
3000 #if USE_ITT_BUILD
3001     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
3002 #endif /* USE_ITT_BUILD */
3003 
3004 #if OMPT_SUPPORT && OMPT_OPTIONAL
3005     int release_status = KMP_LOCK_STILL_HELD;
3006 #endif
3007 
3008     if (--(tl->lk.depth_locked) == 0) {
3009       TCW_4(tl->lk.poll, 0);
3010 #if OMPT_SUPPORT && OMPT_OPTIONAL
3011       release_status = KMP_LOCK_RELEASED;
3012 #endif
3013     }
3014     KMP_MB();
3015 
3016 #if OMPT_SUPPORT && OMPT_OPTIONAL
3017     // This is the case, if called from omp_init_lock_with_hint:
3018     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3019     if (!codeptr)
3020       codeptr = OMPT_GET_RETURN_ADDRESS(0);
3021     if (ompt_enabled.enabled) {
3022       if (release_status == KMP_LOCK_RELEASED) {
3023         if (ompt_enabled.ompt_callback_mutex_released) {
3024           // release_lock_last
3025           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3026               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3027         }
3028       } else if (ompt_enabled.ompt_callback_nest_lock) {
3029         // release_lock_previous
3030         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3032       }
3033     }
3034 #endif
3035 
3036     return;
3037 #else
3038     lck = (kmp_user_lock_p)user_lock;
3039 #endif
3040   }
3041 #if KMP_USE_FUTEX
3042   else if ((__kmp_user_lock_kind == lk_futex) &&
3043            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3044             OMP_NEST_LOCK_T_SIZE)) {
3045     lck = (kmp_user_lock_p)user_lock;
3046   }
3047 #endif
3048   else {
3049     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
3050   }
3051 
3052 #if USE_ITT_BUILD
3053   __kmp_itt_lock_releasing(lck);
3054 #endif /* USE_ITT_BUILD */
3055 
3056   int release_status;
3057   release_status = RELEASE_NESTED_LOCK(lck, gtid);
3058 #if OMPT_SUPPORT && OMPT_OPTIONAL
3059   // This is the case, if called from omp_init_lock_with_hint:
3060   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3061   if (!codeptr)
3062     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3063   if (ompt_enabled.enabled) {
3064     if (release_status == KMP_LOCK_RELEASED) {
3065       if (ompt_enabled.ompt_callback_mutex_released) {
3066         // release_lock_last
3067         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
3068             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3069       }
3070     } else if (ompt_enabled.ompt_callback_nest_lock) {
3071       // release_lock_previous
3072       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3074     }
3075   }
3076 #endif
3077 
3078 #endif // KMP_USE_DYNAMIC_LOCK
3079 }
3080 
3081 /* try to acquire the lock */
3082 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3083   KMP_COUNT_BLOCK(OMP_test_lock);
3084 
3085 #if KMP_USE_DYNAMIC_LOCK
3086   int rc;
3087   int tag = KMP_EXTRACT_D_TAG(user_lock);
3088 #if USE_ITT_BUILD
3089   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3090 #endif
3091 #if OMPT_SUPPORT && OMPT_OPTIONAL
3092   // This is the case, if called from omp_init_lock_with_hint:
3093   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3094   if (!codeptr)
3095     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3096   if (ompt_enabled.ompt_callback_mutex_acquire) {
3097     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3098         ompt_mutex_lock, omp_lock_hint_none,
3099         __ompt_get_mutex_impl_type(user_lock),
3100         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3101   }
3102 #endif
3103 #if KMP_USE_INLINED_TAS
3104   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3105     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3106   } else
3107 #elif KMP_USE_INLINED_FUTEX
3108   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3109     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3110   } else
3111 #endif
3112   {
3113     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3114   }
3115   if (rc) {
3116 #if USE_ITT_BUILD
3117     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3118 #endif
3119 #if OMPT_SUPPORT && OMPT_OPTIONAL
3120     if (ompt_enabled.ompt_callback_mutex_acquired) {
3121       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3122           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3123     }
3124 #endif
3125     return FTN_TRUE;
3126   } else {
3127 #if USE_ITT_BUILD
3128     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3129 #endif
3130     return FTN_FALSE;
3131   }
3132 
3133 #else // KMP_USE_DYNAMIC_LOCK
3134 
3135   kmp_user_lock_p lck;
3136   int rc;
3137 
3138   if ((__kmp_user_lock_kind == lk_tas) &&
3139       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3140     lck = (kmp_user_lock_p)user_lock;
3141   }
3142 #if KMP_USE_FUTEX
3143   else if ((__kmp_user_lock_kind == lk_futex) &&
3144            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3145     lck = (kmp_user_lock_p)user_lock;
3146   }
3147 #endif
3148   else {
3149     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3150   }
3151 
3152 #if USE_ITT_BUILD
3153   __kmp_itt_lock_acquiring(lck);
3154 #endif /* USE_ITT_BUILD */
3155 #if OMPT_SUPPORT && OMPT_OPTIONAL
3156   // This is the case, if called from omp_init_lock_with_hint:
3157   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3158   if (!codeptr)
3159     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3160   if (ompt_enabled.ompt_callback_mutex_acquire) {
3161     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3162         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3163         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3164   }
3165 #endif
3166 
3167   rc = TEST_LOCK(lck, gtid);
3168 #if USE_ITT_BUILD
3169   if (rc) {
3170     __kmp_itt_lock_acquired(lck);
3171   } else {
3172     __kmp_itt_lock_cancelled(lck);
3173   }
3174 #endif /* USE_ITT_BUILD */
3175 #if OMPT_SUPPORT && OMPT_OPTIONAL
3176   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3177     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3178         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3179   }
3180 #endif
3181 
3182   return (rc ? FTN_TRUE : FTN_FALSE);
3183 
3184   /* Can't use serial interval since not block structured */
3185 
3186 #endif // KMP_USE_DYNAMIC_LOCK
3187 }
3188 
3189 /* try to acquire the lock */
3190 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3191 #if KMP_USE_DYNAMIC_LOCK
3192   int rc;
3193 #if USE_ITT_BUILD
3194   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3195 #endif
3196 #if OMPT_SUPPORT && OMPT_OPTIONAL
3197   // This is the case, if called from omp_init_lock_with_hint:
3198   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3199   if (!codeptr)
3200     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3201   if (ompt_enabled.ompt_callback_mutex_acquire) {
3202     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3203         ompt_mutex_nest_lock, omp_lock_hint_none,
3204         __ompt_get_mutex_impl_type(user_lock),
3205         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3206   }
3207 #endif
3208   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3209 #if USE_ITT_BUILD
3210   if (rc) {
3211     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3212   } else {
3213     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3214   }
3215 #endif
3216 #if OMPT_SUPPORT && OMPT_OPTIONAL
3217   if (ompt_enabled.enabled && rc) {
3218     if (rc == 1) {
3219       if (ompt_enabled.ompt_callback_mutex_acquired) {
3220         // lock_first
3221         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3222             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3223             codeptr);
3224       }
3225     } else {
3226       if (ompt_enabled.ompt_callback_nest_lock) {
3227         // lock_next
3228         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3229             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3230       }
3231     }
3232   }
3233 #endif
3234   return rc;
3235 
3236 #else // KMP_USE_DYNAMIC_LOCK
3237 
3238   kmp_user_lock_p lck;
3239   int rc;
3240 
3241   if ((__kmp_user_lock_kind == lk_tas) &&
3242       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3243        OMP_NEST_LOCK_T_SIZE)) {
3244     lck = (kmp_user_lock_p)user_lock;
3245   }
3246 #if KMP_USE_FUTEX
3247   else if ((__kmp_user_lock_kind == lk_futex) &&
3248            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3249             OMP_NEST_LOCK_T_SIZE)) {
3250     lck = (kmp_user_lock_p)user_lock;
3251   }
3252 #endif
3253   else {
3254     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3255   }
3256 
3257 #if USE_ITT_BUILD
3258   __kmp_itt_lock_acquiring(lck);
3259 #endif /* USE_ITT_BUILD */
3260 
3261 #if OMPT_SUPPORT && OMPT_OPTIONAL
3262   // This is the case, if called from omp_init_lock_with_hint:
3263   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3264   if (!codeptr)
3265     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
3273 #endif
3274 
3275   rc = TEST_NESTED_LOCK(lck, gtid);
3276 #if USE_ITT_BUILD
3277   if (rc) {
3278     __kmp_itt_lock_acquired(lck);
3279   } else {
3280     __kmp_itt_lock_cancelled(lck);
3281   }
3282 #endif /* USE_ITT_BUILD */
3283 #if OMPT_SUPPORT && OMPT_OPTIONAL
3284   if (ompt_enabled.enabled && rc) {
3285     if (rc == 1) {
3286       if (ompt_enabled.ompt_callback_mutex_acquired) {
3287         // lock_first
3288         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3289             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3290       }
3291     } else {
3292       if (ompt_enabled.ompt_callback_nest_lock) {
3293         // lock_next
3294         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3296       }
3297     }
3298   }
3299 #endif
3300   return rc;
3301 
3302   /* Can't use serial interval since not block structured */
3303 
3304 #endif // KMP_USE_DYNAMIC_LOCK
3305 }
3306 
3307 // Interface to fast scalable reduce methods routines
3308 
3309 // keep the selected method in a thread local structure for cross-function
3310 // usage: will be used in __kmpc_end_reduce* functions;
3311 // another solution: to re-determine the method one more time in
3312 // __kmpc_end_reduce* functions (new prototype required then)
3313 // AT: which solution is better?
3314 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3315   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3316 
3317 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3318   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3319 
3320 // description of the packed_reduction_method variable: look at the macros in
3321 // kmp.h
3322 
3323 // used in a critical section reduce block
3324 static __forceinline void
3325 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3326                                           kmp_critical_name *crit) {
3327 
3328   // this lock was visible to a customer and to the threading profile tool as a
3329   // serial overhead span (although it's used for an internal purpose only)
3330   //            why was it visible in previous implementation?
3331   //            should we keep it visible in new reduce block?
3332   kmp_user_lock_p lck;
3333 
3334 #if KMP_USE_DYNAMIC_LOCK
3335 
3336   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3337   // Check if it is initialized.
3338   if (*lk == 0) {
3339     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3340       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3341                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3342     } else {
3343       __kmp_init_indirect_csptr(crit, loc, global_tid,
3344                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3345     }
3346   }
3347   // Branch for accessing the actual lock object and set operation. This
3348   // branching is inevitable since this lock initialization does not follow the
3349   // normal dispatch path (lock table is not used).
3350   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3351     lck = (kmp_user_lock_p)lk;
3352     KMP_DEBUG_ASSERT(lck != NULL);
3353     if (__kmp_env_consistency_check) {
3354       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3355     }
3356     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3357   } else {
3358     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3359     lck = ilk->lock;
3360     KMP_DEBUG_ASSERT(lck != NULL);
3361     if (__kmp_env_consistency_check) {
3362       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3363     }
3364     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3365   }
3366 
3367 #else // KMP_USE_DYNAMIC_LOCK
3368 
3369   // We know that the fast reduction code is only emitted by Intel compilers
3370   // with 32 byte critical sections. If there isn't enough space, then we
3371   // have to use a pointer.
3372   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3373     lck = (kmp_user_lock_p)crit;
3374   } else {
3375     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3376   }
3377   KMP_DEBUG_ASSERT(lck != NULL);
3378 
3379   if (__kmp_env_consistency_check)
3380     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3381 
3382   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3383 
3384 #endif // KMP_USE_DYNAMIC_LOCK
3385 }
3386 
3387 // used in a critical section reduce block
3388 static __forceinline void
3389 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3390                                         kmp_critical_name *crit) {
3391 
3392   kmp_user_lock_p lck;
3393 
3394 #if KMP_USE_DYNAMIC_LOCK
3395 
3396   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3397     lck = (kmp_user_lock_p)crit;
3398     if (__kmp_env_consistency_check)
3399       __kmp_pop_sync(global_tid, ct_critical, loc);
3400     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3401   } else {
3402     kmp_indirect_lock_t *ilk =
3403         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3404     if (__kmp_env_consistency_check)
3405       __kmp_pop_sync(global_tid, ct_critical, loc);
3406     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3407   }
3408 
3409 #else // KMP_USE_DYNAMIC_LOCK
3410 
3411   // We know that the fast reduction code is only emitted by Intel compilers
3412   // with 32 byte critical sections. If there isn't enough space, then we have
3413   // to use a pointer.
3414   if (__kmp_base_user_lock_size > 32) {
3415     lck = *((kmp_user_lock_p *)crit);
3416     KMP_ASSERT(lck != NULL);
3417   } else {
3418     lck = (kmp_user_lock_p)crit;
3419   }
3420 
3421   if (__kmp_env_consistency_check)
3422     __kmp_pop_sync(global_tid, ct_critical, loc);
3423 
3424   __kmp_release_user_lock_with_checks(lck, global_tid);
3425 
3426 #endif // KMP_USE_DYNAMIC_LOCK
3427 } // __kmp_end_critical_section_reduce_block
3428 
3429 static __forceinline int
3430 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3431                                      int *task_state) {
3432   kmp_team_t *team;
3433 
  // Check if we are inside a teams construct.
3435   if (th->th.th_teams_microtask) {
3436     *team_p = team = th->th.th_team;
3437     if (team->t.t_level == th->th.th_teams_level) {
3438       // This is reduction at teams construct.
3439       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3440       // Let's swap teams temporarily for the reduction.
3441       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3442       th->th.th_team = team->t.t_parent;
3443       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3444       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3445       *task_state = th->th.th_task_state;
3446       th->th.th_task_state = 0;
3447 
3448       return 1;
3449     }
3450   }
3451   return 0;
3452 }
3453 
3454 static __forceinline void
3455 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3456   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3457   th->th.th_info.ds.ds_tid = 0;
3458   th->th.th_team = team;
3459   th->th.th_team_nproc = team->t.t_nproc;
3460   th->th.th_task_team = team->t.t_task_team[task_state];
3461   __kmp_type_convert(task_state, &(th->th.th_task_state));
3462 }
3463 
3464 /* 2.a.i. Reduce Block without a terminating barrier */
3465 /*!
3466 @ingroup SYNCHRONIZATION
3467 @param loc source location information
3468 @param global_tid global thread number
3469 @param num_vars number of items (variables) to be reduced
3470 @param reduce_size size of data in bytes to be reduced
3471 @param reduce_data pointer to data to be reduced
3472 @param reduce_func callback function providing reduction operation on two
3473 operands and returning result of reduction in lhs_data
3474 @param lck pointer to the unique lock data structure
3475 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3476 threads if atomic reduction needed
3477 
3478 The nowait version is used for a reduce clause with the nowait argument.
3479 */
3480 kmp_int32
3481 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3482                      size_t reduce_size, void *reduce_data,
3483                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3484                      kmp_critical_name *lck) {
3485 
3486   KMP_COUNT_BLOCK(REDUCE_nowait);
3487   int retval = 0;
3488   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3489   kmp_info_t *th;
3490   kmp_team_t *team;
3491   int teams_swapped = 0, task_state;
3492   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3493   __kmp_assert_valid_gtid(global_tid);
3494 
3495   // why do we need this initialization here at all?
3496   // Reduction clause can not be used as a stand-alone directive.
3497 
3498   // do not call __kmp_serial_initialize(), it will be called by
3499   // __kmp_parallel_initialize() if needed
3500   // possible detection of false-positive race by the threadchecker ???
3501   if (!TCR_4(__kmp_init_parallel))
3502     __kmp_parallel_initialize();
3503 
3504   __kmp_resume_if_soft_paused();
3505 
3506 // check correctness of reduce block nesting
3507 #if KMP_USE_DYNAMIC_LOCK
3508   if (__kmp_env_consistency_check)
3509     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3510 #else
3511   if (__kmp_env_consistency_check)
3512     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3513 #endif
3514 
3515   th = __kmp_thread_from_gtid(global_tid);
3516   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3517 
  // The packed_reduction_method value is reused by the __kmpc_end_reduce*
  // functions, so it must be kept in a variable. The variable should be a
  // construct-specific or thread-specific property, not a team-specific one
  // (a thread can reach the next reduce block of the next construct, where the
  // reduce method may differ).
  // An ident_t "loc" parameter could serve as a construct-specific property,
  // but what if loc == 0? And if a construct-specific or team-specific
  // variable were shared, unnecessary extra synchronization would be needed.
  // A thread-specific variable avoids both issues (next construct and extra
  // syncs), so the thread-specific "th_local.packed_reduction_method" variable
  // is used: each thread executes the 'determine' and 'set' lines itself (no
  // need to restrict this to one thread, which would only add extra syncs).
3533 
3534   packed_reduction_method = __kmp_determine_reduction_method(
3535       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3536   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3537 
3538   OMPT_REDUCTION_DECL(th, global_tid);
3539   if (packed_reduction_method == critical_reduce_block) {
3540 
3541     OMPT_REDUCTION_BEGIN;
3542 
3543     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3544     retval = 1;
3545 
3546   } else if (packed_reduction_method == empty_reduce_block) {
3547 
3548     OMPT_REDUCTION_BEGIN;
3549 
3550     // usage: if team size == 1, no synchronization is required ( Intel
3551     // platforms only )
3552     retval = 1;
3553 
3554   } else if (packed_reduction_method == atomic_reduce_block) {
3555 
3556     retval = 2;
3557 
3558     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3559     // won't be called by the code gen)
    //     (not ideal: the checking block is closed by this 'pop' even though
    //      the atomic operation has not executed yet; it executes slightly
    //      later, literally on the next instruction)
3564     if (__kmp_env_consistency_check)
3565       __kmp_pop_sync(global_tid, ct_reduce, loc);
3566 
3567   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3568                                    tree_reduce_block)) {
3569 
3570 // AT: performance issue: a real barrier here
3571 // AT: (if primary thread is slow, other threads are blocked here waiting for
3572 //      the primary thread to come and release them)
3573 // AT: (it's not what a customer might expect specifying NOWAIT clause)
3574 // AT: (specifying NOWAIT won't result in improvement of performance, it'll
3575 //      be confusing to a customer)
3576 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3577 // might go faster and be more in line with sense of NOWAIT
3578 // AT: TO DO: do epcc test and compare times
3579 
3580 // this barrier should be invisible to a customer and to the threading profile
3581 // tool (it's neither a terminating barrier nor customer's code, it's
3582 // used for an internal purpose)
3583 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events, so we set up the necessary frame/return
    // addresses.
3588     ompt_frame_t *ompt_frame;
3589     if (ompt_enabled.enabled) {
3590       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3591       if (ompt_frame->enter_frame.ptr == NULL)
3592         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3593     }
3594     OMPT_STORE_RETURN_ADDRESS(global_tid);
3595 #endif
3596 #if USE_ITT_NOTIFY
3597     __kmp_threads[global_tid]->th.th_ident = loc;
3598 #endif
3599     retval =
3600         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3601                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3602     retval = (retval != 0) ? (0) : (1);
3603 #if OMPT_SUPPORT && OMPT_OPTIONAL
3604     if (ompt_enabled.enabled) {
3605       ompt_frame->enter_frame = ompt_data_none;
3606     }
3607 #endif
3608 
3609     // all other workers except primary thread should do this pop here
3610     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3611     if (__kmp_env_consistency_check) {
3612       if (retval == 0) {
3613         __kmp_pop_sync(global_tid, ct_reduce, loc);
3614       }
3615     }
3616 
3617   } else {
3618 
3619     // should never reach this block
3620     KMP_ASSERT(0); // "unexpected method"
3621   }
3622   if (teams_swapped) {
3623     __kmp_restore_swapped_teams(th, team, task_state);
3624   }
3625   KA_TRACE(
3626       10,
3627       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3628        global_tid, packed_reduction_method, retval));
3629 
3630   return retval;
3631 }
3632 
3633 /*!
3634 @ingroup SYNCHRONIZATION
3635 @param loc source location information
3636 @param global_tid global thread id.
3637 @param lck pointer to the unique lock data structure
3638 
3639 Finish the execution of a reduce nowait.
3640 */
3641 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3642                               kmp_critical_name *lck) {
3643 
3644   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3645 
3646   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3647   __kmp_assert_valid_gtid(global_tid);
3648 
3649   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3650 
3651   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3652 
3653   if (packed_reduction_method == critical_reduce_block) {
3654 
3655     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3656     OMPT_REDUCTION_END;
3657 
3658   } else if (packed_reduction_method == empty_reduce_block) {
3659 
3660     // usage: if team size == 1, no synchronization is required ( on Intel
3661     // platforms only )
3662 
3663     OMPT_REDUCTION_END;
3664 
3665   } else if (packed_reduction_method == atomic_reduce_block) {
3666 
3667     // neither primary thread nor other workers should get here
3668     //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if entirely;
    // after removal this value would be caught by the 'else' and would assert
3671 
3672   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3673                                    tree_reduce_block)) {
3674 
3675     // only primary thread gets here
3676     // OMPT: tree reduction is annotated in the barrier code
3677 
3678   } else {
3679 
3680     // should never reach this block
3681     KMP_ASSERT(0); // "unexpected method"
3682   }
3683 
3684   if (__kmp_env_consistency_check)
3685     __kmp_pop_sync(global_tid, ct_reduce, loc);
3686 
3687   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3688                 global_tid, packed_reduction_method));
3689 
3690   return;
3691 }
3692 
3693 /* 2.a.ii. Reduce Block with a terminating barrier */
3694 
3695 /*!
3696 @ingroup SYNCHRONIZATION
3697 @param loc source location information
3698 @param global_tid global thread number
3699 @param num_vars number of items (variables) to be reduced
3700 @param reduce_size size of data in bytes to be reduced
3701 @param reduce_data pointer to data to be reduced
3702 @param reduce_func callback function providing reduction operation on two
3703 operands and returning result of reduction in lhs_data
3704 @param lck pointer to the unique lock data structure
3705 @result 1 for the primary thread, 0 for all other team threads, 2 for all team
3706 threads if atomic reduction needed
3707 
3708 A blocking reduce that includes an implicit barrier.
3709 */
3710 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3711                         size_t reduce_size, void *reduce_data,
3712                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3713                         kmp_critical_name *lck) {
3714   KMP_COUNT_BLOCK(REDUCE_wait);
3715   int retval = 0;
3716   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3717   kmp_info_t *th;
3718   kmp_team_t *team;
3719   int teams_swapped = 0, task_state;
3720 
3721   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3722   __kmp_assert_valid_gtid(global_tid);
3723 
3724   // why do we need this initialization here at all?
3725   // Reduction clause can not be a stand-alone directive.
3726 
3727   // do not call __kmp_serial_initialize(), it will be called by
3728   // __kmp_parallel_initialize() if needed
3729   // possible detection of false-positive race by the threadchecker ???
3730   if (!TCR_4(__kmp_init_parallel))
3731     __kmp_parallel_initialize();
3732 
3733   __kmp_resume_if_soft_paused();
3734 
3735 // check correctness of reduce block nesting
3736 #if KMP_USE_DYNAMIC_LOCK
3737   if (__kmp_env_consistency_check)
3738     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3739 #else
3740   if (__kmp_env_consistency_check)
3741     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3742 #endif
3743 
3744   th = __kmp_thread_from_gtid(global_tid);
3745   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3746 
3747   packed_reduction_method = __kmp_determine_reduction_method(
3748       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3749   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3750 
3751   OMPT_REDUCTION_DECL(th, global_tid);
3752 
3753   if (packed_reduction_method == critical_reduce_block) {
3754 
3755     OMPT_REDUCTION_BEGIN;
3756     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3757     retval = 1;
3758 
3759   } else if (packed_reduction_method == empty_reduce_block) {
3760 
3761     OMPT_REDUCTION_BEGIN;
3762     // usage: if team size == 1, no synchronization is required ( Intel
3763     // platforms only )
3764     retval = 1;
3765 
3766   } else if (packed_reduction_method == atomic_reduce_block) {
3767 
3768     retval = 2;
3769 
3770   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3771                                    tree_reduce_block)) {
3772 
3773 // case tree_reduce_block:
3774 // this barrier should be visible to a customer and to the threading profile
3775 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3776 #if OMPT_SUPPORT
3777     ompt_frame_t *ompt_frame;
3778     if (ompt_enabled.enabled) {
3779       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3780       if (ompt_frame->enter_frame.ptr == NULL)
3781         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3782     }
3783     OMPT_STORE_RETURN_ADDRESS(global_tid);
3784 #endif
3785 #if USE_ITT_NOTIFY
3786     __kmp_threads[global_tid]->th.th_ident =
3787         loc; // needed for correct notification of frames
3788 #endif
3789     retval =
3790         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3791                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3792     retval = (retval != 0) ? (0) : (1);
3793 #if OMPT_SUPPORT && OMPT_OPTIONAL
3794     if (ompt_enabled.enabled) {
3795       ompt_frame->enter_frame = ompt_data_none;
3796     }
3797 #endif
3798 
3799     // all other workers except primary thread should do this pop here
3800     // (none of other workers except primary will enter __kmpc_end_reduce())
3801     if (__kmp_env_consistency_check) {
3802       if (retval == 0) { // 0: all other workers; 1: primary thread
3803         __kmp_pop_sync(global_tid, ct_reduce, loc);
3804       }
3805     }
3806 
3807   } else {
3808 
3809     // should never reach this block
3810     KMP_ASSERT(0); // "unexpected method"
3811   }
3812   if (teams_swapped) {
3813     __kmp_restore_swapped_teams(th, team, task_state);
3814   }
3815 
3816   KA_TRACE(10,
3817            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3818             global_tid, packed_reduction_method, retval));
3819   return retval;
3820 }
3821 
3822 /*!
3823 @ingroup SYNCHRONIZATION
3824 @param loc source location information
3825 @param global_tid global thread id.
3826 @param lck pointer to the unique lock data structure
3827 
3828 Finish the execution of a blocking reduce.
3829 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3830 start function.
3831 */
3832 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3833                        kmp_critical_name *lck) {
3834 
3835   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3836   kmp_info_t *th;
3837   kmp_team_t *team;
3838   int teams_swapped = 0, task_state;
3839 
3840   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3841   __kmp_assert_valid_gtid(global_tid);
3842 
3843   th = __kmp_thread_from_gtid(global_tid);
3844   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3845 
3846   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3847 
3848   // this barrier should be visible to a customer and to the threading profile
3849   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3850   OMPT_REDUCTION_DECL(th, global_tid);
3851 
3852   if (packed_reduction_method == critical_reduce_block) {
3853     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3854 
3855     OMPT_REDUCTION_END;
3856 
3857 // TODO: implicit barrier: should be exposed
3858 #if OMPT_SUPPORT
3859     ompt_frame_t *ompt_frame;
3860     if (ompt_enabled.enabled) {
3861       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3862       if (ompt_frame->enter_frame.ptr == NULL)
3863         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3864     }
3865     OMPT_STORE_RETURN_ADDRESS(global_tid);
3866 #endif
3867 #if USE_ITT_NOTIFY
3868     __kmp_threads[global_tid]->th.th_ident = loc;
3869 #endif
3870     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3871 #if OMPT_SUPPORT && OMPT_OPTIONAL
3872     if (ompt_enabled.enabled) {
3873       ompt_frame->enter_frame = ompt_data_none;
3874     }
3875 #endif
3876 
3877   } else if (packed_reduction_method == empty_reduce_block) {
3878 
3879     OMPT_REDUCTION_END;
3880 
3881 // usage: if team size==1, no synchronization is required (Intel platforms only)
3882 
3883 // TODO: implicit barrier: should be exposed
3884 #if OMPT_SUPPORT
3885     ompt_frame_t *ompt_frame;
3886     if (ompt_enabled.enabled) {
3887       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3888       if (ompt_frame->enter_frame.ptr == NULL)
3889         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3890     }
3891     OMPT_STORE_RETURN_ADDRESS(global_tid);
3892 #endif
3893 #if USE_ITT_NOTIFY
3894     __kmp_threads[global_tid]->th.th_ident = loc;
3895 #endif
3896     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3897 #if OMPT_SUPPORT && OMPT_OPTIONAL
3898     if (ompt_enabled.enabled) {
3899       ompt_frame->enter_frame = ompt_data_none;
3900     }
3901 #endif
3902 
3903   } else if (packed_reduction_method == atomic_reduce_block) {
3904 
3905 #if OMPT_SUPPORT
3906     ompt_frame_t *ompt_frame;
3907     if (ompt_enabled.enabled) {
3908       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3909       if (ompt_frame->enter_frame.ptr == NULL)
3910         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3911     }
3912     OMPT_STORE_RETURN_ADDRESS(global_tid);
3913 #endif
3914 // TODO: implicit barrier: should be exposed
3915 #if USE_ITT_NOTIFY
3916     __kmp_threads[global_tid]->th.th_ident = loc;
3917 #endif
3918     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3919 #if OMPT_SUPPORT && OMPT_OPTIONAL
3920     if (ompt_enabled.enabled) {
3921       ompt_frame->enter_frame = ompt_data_none;
3922     }
3923 #endif
3924 
3925   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3926                                    tree_reduce_block)) {
3927 
3928     // only primary thread executes here (primary releases all other workers)
3929     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3930                             global_tid);
3931 
3932   } else {
3933 
3934     // should never reach this block
3935     KMP_ASSERT(0); // "unexpected method"
3936   }
3937   if (teams_swapped) {
3938     __kmp_restore_swapped_teams(th, team, task_state);
3939   }
3940 
3941   if (__kmp_env_consistency_check)
3942     __kmp_pop_sync(global_tid, ct_reduce, loc);
3943 
3944   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3945                 global_tid, packed_reduction_method));
3946 
3947   return;
3948 }
3949 
3950 #undef __KMP_GET_REDUCTION_METHOD
3951 #undef __KMP_SET_REDUCTION_METHOD
3952 
3953 /* end of interface to fast scalable reduce routines */
3954 
3955 kmp_uint64 __kmpc_get_taskid() {
3956 
3957   kmp_int32 gtid;
3958   kmp_info_t *thread;
3959 
3960   gtid = __kmp_get_gtid();
3961   if (gtid < 0) {
3962     return 0;
3963   }
3964   thread = __kmp_thread_from_gtid(gtid);
3965   return thread->th.th_current_task->td_task_id;
3966 
3967 } // __kmpc_get_taskid
3968 
3969 kmp_uint64 __kmpc_get_parent_taskid() {
3970 
3971   kmp_int32 gtid;
3972   kmp_info_t *thread;
3973   kmp_taskdata_t *parent_task;
3974 
3975   gtid = __kmp_get_gtid();
3976   if (gtid < 0) {
3977     return 0;
3978   }
3979   thread = __kmp_thread_from_gtid(gtid);
3980   parent_task = thread->th.th_current_task->td_parent;
3981   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3982 
3983 } // __kmpc_get_parent_taskid
3984 
3985 /*!
3986 @ingroup WORK_SHARING
3987 @param loc  source location information.
3988 @param gtid  global thread number.
3989 @param num_dims  number of associated doacross loops.
3990 @param dims  info on loops bounds.
3991 
3992 Initialize doacross loop information.
The compiler is expected to pass inclusive bounds,
e.g. for (i=2; i<9; i+=2): lo=2, up=8, st=2.
3995 */
3996 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3997                           const struct kmp_dim *dims) {
3998   __kmp_assert_valid_gtid(gtid);
3999   int j, idx;
4000   kmp_int64 last, trace_count;
4001   kmp_info_t *th = __kmp_threads[gtid];
4002   kmp_team_t *team = th->th.th_team;
4003   kmp_uint32 *flags;
4004   kmp_disp_t *pr_buf = th->th.th_dispatch;
4005   dispatch_shared_info_t *sh_buf;
4006 
4007   KA_TRACE(
4008       20,
4009       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
4010        gtid, num_dims, !team->t.t_serialized));
4011   KMP_DEBUG_ASSERT(dims != NULL);
4012   KMP_DEBUG_ASSERT(num_dims > 0);
4013 
4014   if (team->t.t_serialized) {
4015     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
4016     return; // no dependencies if team is serialized
4017   }
4018   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
4019   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
4020   // the next loop
4021   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4022 
4023   // Save bounds info into allocated private buffer
4024   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
4025   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
4026       th, sizeof(kmp_int64) * (4 * num_dims + 1));
4027   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4028   pr_buf->th_doacross_info[0] =
4029       (kmp_int64)num_dims; // first element is number of dimensions
4030   // Save also address of num_done in order to access it later without knowing
4031   // the buffer index
4032   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
4033   pr_buf->th_doacross_info[2] = dims[0].lo;
4034   pr_buf->th_doacross_info[3] = dims[0].up;
4035   pr_buf->th_doacross_info[4] = dims[0].st;
4036   last = 5;
4037   for (j = 1; j < num_dims; ++j) {
4038     kmp_int64
4039         range_length; // To keep ranges of all dimensions but the first dims[0]
4040     if (dims[j].st == 1) { // most common case
4041       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
4042       range_length = dims[j].up - dims[j].lo + 1;
4043     } else {
4044       if (dims[j].st > 0) {
4045         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
4046         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
4047       } else { // negative increment
4048         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
4049         range_length =
4050             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
4051       }
4052     }
4053     pr_buf->th_doacross_info[last++] = range_length;
4054     pr_buf->th_doacross_info[last++] = dims[j].lo;
4055     pr_buf->th_doacross_info[last++] = dims[j].up;
4056     pr_buf->th_doacross_info[last++] = dims[j].st;
4057   }
4058 
4059   // Compute total trip count.
4060   // Start with range of dims[0] which we don't need to keep in the buffer.
4061   if (dims[0].st == 1) { // most common case
4062     trace_count = dims[0].up - dims[0].lo + 1;
4063   } else if (dims[0].st > 0) {
4064     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
4065     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
4066   } else { // negative increment
4067     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
4068     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
4069   }
4070   for (j = 1; j < num_dims; ++j) {
4071     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
4072   }
4073   KMP_DEBUG_ASSERT(trace_count > 0);
4074 
4075   // Check if shared buffer is not occupied by other loop (idx -
4076   // __kmp_dispatch_num_buffers)
4077   if (idx != sh_buf->doacross_buf_idx) {
4078     // Shared buffer is occupied, wait for it to be free
4079     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
4080                  __kmp_eq_4, NULL);
4081   }
4082 #if KMP_32_BIT_ARCH
4083   // Check if we are the first thread. After the CAS the first thread gets 0,
4084   // others get 1 if initialization is in progress, allocated pointer otherwise.
4085   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
4086   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
4087       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
4088 #else
4089   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
4090       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
4091 #endif
4092   if (flags == NULL) {
4093     // we are the first thread, allocate the array of flags
4094     size_t size =
4095         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
4096     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4097     KMP_MB();
4098     sh_buf->doacross_flags = flags;
4099   } else if (flags == (kmp_uint32 *)1) {
4100 #if KMP_32_BIT_ARCH
4101     // initialization is still in progress, need to wait
4102     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4103 #else
4104     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4105 #endif
4106       KMP_YIELD(TRUE);
4107     KMP_MB();
4108   } else {
4109     KMP_MB();
4110   }
4111   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4112   pr_buf->th_doacross_flags =
4113       sh_buf->doacross_flags; // save private copy in order to not
4114   // touch shared buffer on each iteration
4115   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4116 }
4117 
4118 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4119   __kmp_assert_valid_gtid(gtid);
4120   kmp_int64 shft;
4121   size_t num_dims, i;
4122   kmp_uint32 flag;
4123   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4124   kmp_info_t *th = __kmp_threads[gtid];
4125   kmp_team_t *team = th->th.th_team;
4126   kmp_disp_t *pr_buf;
4127   kmp_int64 lo, up, st;
4128 
4129   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4130   if (team->t.t_serialized) {
4131     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4132     return; // no dependencies if team is serialized
4133   }
4134 
4135   // calculate sequential iteration number and check out-of-bounds condition
4136   pr_buf = th->th.th_dispatch;
4137   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4138   num_dims = (size_t)pr_buf->th_doacross_info[0];
4139   lo = pr_buf->th_doacross_info[2];
4140   up = pr_buf->th_doacross_info[3];
4141   st = pr_buf->th_doacross_info[4];
4142 #if OMPT_SUPPORT && OMPT_OPTIONAL
4143   ompt_dependence_t deps[num_dims];
4144 #endif
4145   if (st == 1) { // most common case
4146     if (vec[0] < lo || vec[0] > up) {
4147       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4148                     "bounds [%lld,%lld]\n",
4149                     gtid, vec[0], lo, up));
4150       return;
4151     }
4152     iter_number = vec[0] - lo;
4153   } else if (st > 0) {
4154     if (vec[0] < lo || vec[0] > up) {
4155       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4156                     "bounds [%lld,%lld]\n",
4157                     gtid, vec[0], lo, up));
4158       return;
4159     }
4160     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4161   } else { // negative increment
4162     if (vec[0] > lo || vec[0] < up) {
4163       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4164                     "bounds [%lld,%lld]\n",
4165                     gtid, vec[0], lo, up));
4166       return;
4167     }
4168     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4169   }
4170 #if OMPT_SUPPORT && OMPT_OPTIONAL
4171   deps[0].variable.value = iter_number;
4172   deps[0].dependence_type = ompt_dependence_type_sink;
4173 #endif
4174   for (i = 1; i < num_dims; ++i) {
4175     kmp_int64 iter, ln;
4176     size_t j = i * 4;
4177     ln = pr_buf->th_doacross_info[j + 1];
4178     lo = pr_buf->th_doacross_info[j + 2];
4179     up = pr_buf->th_doacross_info[j + 3];
4180     st = pr_buf->th_doacross_info[j + 4];
4181     if (st == 1) {
4182       if (vec[i] < lo || vec[i] > up) {
4183         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4184                       "bounds [%lld,%lld]\n",
4185                       gtid, vec[i], lo, up));
4186         return;
4187       }
4188       iter = vec[i] - lo;
4189     } else if (st > 0) {
4190       if (vec[i] < lo || vec[i] > up) {
4191         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4192                       "bounds [%lld,%lld]\n",
4193                       gtid, vec[i], lo, up));
4194         return;
4195       }
4196       iter = (kmp_uint64)(vec[i] - lo) / st;
4197     } else { // st < 0
4198       if (vec[i] > lo || vec[i] < up) {
4199         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4200                       "bounds [%lld,%lld]\n",
4201                       gtid, vec[i], lo, up));
4202         return;
4203       }
4204       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4205     }
4206     iter_number = iter + ln * iter_number;
4207 #if OMPT_SUPPORT && OMPT_OPTIONAL
4208     deps[i].variable.value = iter;
4209     deps[i].dependence_type = ompt_dependence_type_sink;
4210 #endif
4211   }
4212   shft = iter_number % 32; // use 32-bit granularity
4213   iter_number >>= 5; // divided by 32
4214   flag = 1 << shft;
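  // Example: linearized iteration 70 gives shft = 70 % 32 = 6 and word index
  // 70 >> 5 = 2, so this thread spins until bit 6 of flags word 2 is set.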
4215   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4216     KMP_YIELD(TRUE);
4217   }
4218   KMP_MB();
4219 #if OMPT_SUPPORT && OMPT_OPTIONAL
4220   if (ompt_enabled.ompt_callback_dependences) {
4221     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4222         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4223   }
4224 #endif
4225   KA_TRACE(20,
4226            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4227             gtid, (iter_number << 5) + shft));
4228 }
4229 
4230 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4231   __kmp_assert_valid_gtid(gtid);
4232   kmp_int64 shft;
4233   size_t num_dims, i;
4234   kmp_uint32 flag;
4235   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4236   kmp_info_t *th = __kmp_threads[gtid];
4237   kmp_team_t *team = th->th.th_team;
4238   kmp_disp_t *pr_buf;
4239   kmp_int64 lo, st;
4240 
4241   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4242   if (team->t.t_serialized) {
4243     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4244     return; // no dependencies if team is serialized
4245   }
4246 
4247   // calculate sequential iteration number (same as in "wait" but no
4248   // out-of-bounds checks)
4249   pr_buf = th->th.th_dispatch;
4250   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4251   num_dims = (size_t)pr_buf->th_doacross_info[0];
4252   lo = pr_buf->th_doacross_info[2];
4253   st = pr_buf->th_doacross_info[4];
4254 #if OMPT_SUPPORT && OMPT_OPTIONAL
4255   ompt_dependence_t deps[num_dims];
4256 #endif
4257   if (st == 1) { // most common case
4258     iter_number = vec[0] - lo;
4259   } else if (st > 0) {
4260     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4261   } else { // negative increment
4262     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4263   }
4264 #if OMPT_SUPPORT && OMPT_OPTIONAL
4265   deps[0].variable.value = iter_number;
4266   deps[0].dependence_type = ompt_dependence_type_source;
4267 #endif
4268   for (i = 1; i < num_dims; ++i) {
4269     kmp_int64 iter, ln;
4270     size_t j = i * 4;
4271     ln = pr_buf->th_doacross_info[j + 1];
4272     lo = pr_buf->th_doacross_info[j + 2];
4273     st = pr_buf->th_doacross_info[j + 4];
4274     if (st == 1) {
4275       iter = vec[i] - lo;
4276     } else if (st > 0) {
4277       iter = (kmp_uint64)(vec[i] - lo) / st;
4278     } else { // st < 0
4279       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4280     }
4281     iter_number = iter + ln * iter_number;
4282 #if OMPT_SUPPORT && OMPT_OPTIONAL
4283     deps[i].variable.value = iter;
4284     deps[i].dependence_type = ompt_dependence_type_source;
4285 #endif
4286   }
4287 #if OMPT_SUPPORT && OMPT_OPTIONAL
4288   if (ompt_enabled.ompt_callback_dependences) {
4289     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4290         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4291   }
4292 #endif
4293   shft = iter_number % 32; // use 32-bit granularity
4294   iter_number >>= 5; // divided by 32
4295   flag = 1 << shft;
4296   KMP_MB();
4297   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4298     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4299   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4300                 (iter_number << 5) + shft));
4301 }
4302 
4303 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4304   __kmp_assert_valid_gtid(gtid);
4305   kmp_int32 num_done;
4306   kmp_info_t *th = __kmp_threads[gtid];
4307   kmp_team_t *team = th->th.th_team;
4308   kmp_disp_t *pr_buf = th->th.th_dispatch;
4309 
4310   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4311   if (team->t.t_serialized) {
4312     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4313     return; // nothing to do
4314   }
4315   num_done =
4316       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4317   if (num_done == th->th.th_team_nproc) {
4318     // we are the last thread, need to free shared resources
4319     int idx = pr_buf->th_doacross_buf_idx - 1;
4320     dispatch_shared_info_t *sh_buf =
4321         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4322     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4323                      (kmp_int64)&sh_buf->doacross_num_done);
4324     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4325     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4326     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4327     sh_buf->doacross_flags = NULL;
4328     sh_buf->doacross_num_done = 0;
4329     sh_buf->doacross_buf_idx +=
4330         __kmp_dispatch_num_buffers; // free buffer for future re-use
4331   }
4332   // free private resources (need to keep buffer index forever)
4333   pr_buf->th_doacross_flags = NULL;
4334   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4335   pr_buf->th_doacross_info = NULL;
4336   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4337 }
4338 
4339 /* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */
4340 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4341   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4342 }
4343 
4344 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4345   return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
4346 }
4347 
4348 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4349                   omp_allocator_handle_t free_allocator) {
4350   return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4351                         free_allocator);
4352 }
4353 
4354 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4355   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4356 }
4357 
4358 int __kmpc_get_target_offload(void) {
4359   if (!__kmp_init_serial) {
4360     __kmp_serial_initialize();
4361   }
4362   return __kmp_target_offload;
4363 }
4364 
4365 int __kmpc_pause_resource(kmp_pause_status_t level) {
4366   if (!__kmp_init_serial) {
4367     return 1; // Can't pause if runtime is not initialized
4368   }
4369   return __kmp_pause_resource(level);
4370 }
4371 
4372 void __kmpc_error(ident_t *loc, int severity, const char *message) {
4373   if (!__kmp_init_serial)
4374     __kmp_serial_initialize();
4375 
4376   KMP_ASSERT(severity == severity_warning || severity == severity_fatal);
4377 
4378 #if OMPT_SUPPORT
4379   if (ompt_enabled.enabled && ompt_enabled.ompt_callback_error) {
4380     ompt_callbacks.ompt_callback(ompt_callback_error)(
4381         (ompt_severity_t)severity, message, KMP_STRLEN(message),
4382         OMPT_GET_RETURN_ADDRESS(0));
4383   }
4384 #endif // OMPT_SUPPORT
4385 
4386   char *src_loc;
4387   if (loc && loc->psource) {
4388     kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, false);
    src_loc =
        __kmp_str_format("%s:%d:%d", str_loc.file, str_loc.line, str_loc.col);
4391     __kmp_str_loc_free(&str_loc);
4392   } else {
4393     src_loc = __kmp_str_format("unknown");
4394   }
4395 
4396   if (severity == severity_warning)
4397     KMP_WARNING(UserDirectedWarning, src_loc, message);
4398   else
4399     KMP_FATAL(UserDirectedError, src_loc, message);
4400 
4401   __kmp_str_free(&src_loc);
4402 }
4403 
4404 #ifdef KMP_USE_VERSION_SYMBOLS
4405 // For GOMP compatibility there are two versions of each omp_* API.
4406 // One is the plain C symbol and one is the Fortran symbol with an appended
4407 // underscore. When we implement a specific ompc_* version of an omp_*
4408 // function, we want the plain GOMP versioned symbol to alias the ompc_* version
4409 // instead of the Fortran versions in kmp_ftn_entry.h
4410 extern "C" {
4411 // Have to undef these from omp.h so they aren't translated into
4412 // their ompc counterparts in the KMP_VERSION_OMPC_SYMBOL macros below
4413 #ifdef omp_set_affinity_format
4414 #undef omp_set_affinity_format
4415 #endif
4416 #ifdef omp_get_affinity_format
4417 #undef omp_get_affinity_format
4418 #endif
4419 #ifdef omp_display_affinity
4420 #undef omp_display_affinity
4421 #endif
4422 #ifdef omp_capture_affinity
4423 #undef omp_capture_affinity
4424 #endif
4425 KMP_VERSION_OMPC_SYMBOL(ompc_set_affinity_format, omp_set_affinity_format, 50,
4426                         "OMP_5.0");
4427 KMP_VERSION_OMPC_SYMBOL(ompc_get_affinity_format, omp_get_affinity_format, 50,
4428                         "OMP_5.0");
4429 KMP_VERSION_OMPC_SYMBOL(ompc_display_affinity, omp_display_affinity, 50,
4430                         "OMP_5.0");
4431 KMP_VERSION_OMPC_SYMBOL(ompc_capture_affinity, omp_capture_affinity, 50,
4432                         "OMP_5.0");
4433 } // extern "C"
4434 #endif
4435