1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43   } else if (__kmp_ignore_mppbeg() == FALSE) {
44     // By default __kmp_ignore_mppbeg() returns TRUE.
45     __kmp_internal_begin();
46     KC_TRACE(10, ("__kmpc_begin: called\n"));
47   }
48 }
49 
50 /*!
51  * @ingroup STARTUP_SHUTDOWN
52  * @param loc source location information
53  *
54  * Shutdown the runtime library. This is also optional, and even if called will
55  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
56  * zero.
57  */
58 void __kmpc_end(ident_t *loc) {
59   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
60   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
61   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62   // returns FALSE and __kmpc_end() will unregister this root (it can cause
63   // library shut down).
64   if (__kmp_ignore_mppend() == FALSE) {
65     KC_TRACE(10, ("__kmpc_end: called\n"));
66     KA_TRACE(30, ("__kmpc_end\n"));
67 
68     __kmp_internal_end_thread(-1);
69   }
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71   // Normal exit process on Windows does not allow worker threads of the final
72   // parallel region to finish reporting their events, so shutting down the
73   // library here fixes the issue at least for the cases where __kmpc_end() is
74   // placed properly.
75   if (ompt_enabled.enabled)
76     __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
78 }
79 
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84 
85 This function can be called in any context.
86 
87 If the runtime has ony been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the master thread is necessarily thread zero).
92 
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
97 */
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99   kmp_int32 gtid = __kmp_entry_gtid();
100 
101   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102 
103   return gtid;
104 }
105 
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110 
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
115 reflects the total over all such calls. Similarly the runtime maintains
116 underlying threads even when they are not active (since the cost of creating
117 and destroying OS threads is high), this call counts all such threads even if
118 they are not waiting for work.
119 */
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121   KC_TRACE(10,
122            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123 
124   return TCR_4(__kmp_all_nth);
125 }
126 
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135   return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137 
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145 
146   return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148 
149 /*!
150  * @ingroup DEPRECATED
151  * @param loc location description
152  *
153  * This function need not be called. It always returns TRUE.
154  */
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157 
158   return TRUE;
159 
160 #else
161 
162   const char *semi2;
163   const char *semi3;
164   int line_no;
165 
166   if (__kmp_par_range == 0) {
167     return TRUE;
168   }
169   semi2 = loc->psource;
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2 + 1, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   if (__kmp_par_range_filename[0]) {
182     const char *name = semi2 - 1;
183     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184       name--;
185     }
186     if ((*name == '/') || (*name == ';')) {
187       name++;
188     }
189     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190       return __kmp_par_range < 0;
191     }
192   }
193   semi3 = strchr(semi2 + 1, ';');
194   if (__kmp_par_range_routine[0]) {
195     if ((semi3 != NULL) && (semi3 > semi2) &&
196         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197       return __kmp_par_range < 0;
198     }
199   }
200   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202       return __kmp_par_range > 0;
203     }
204     return __kmp_par_range < 0;
205   }
206   return TRUE;
207 
208 #endif /* KMP_DEBUG */
209 }
210 
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218   return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220 
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226 
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
229 */
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                              kmp_int32 num_threads) {
232   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                 global_tid, num_threads));
234   __kmp_assert_valid_gtid(global_tid);
235   __kmp_push_num_threads(loc, global_tid, num_threads);
236 }
237 
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
240   /* the num_threads are automatically popped */
241 }
242 
243 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
244                            kmp_int32 proc_bind) {
245   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
246                 proc_bind));
247   __kmp_assert_valid_gtid(global_tid);
248   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
249 }
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   // If we were in a serial region, then stop the serial timer, record
266   // the event, and start parallel region timer
267   stats_state_e previous_state = KMP_GET_THREAD_STATE();
268   if (previous_state == stats_state_e::SERIAL_REGION) {
269     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
270   } else {
271     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
272   }
273   int inParallel = __kmpc_in_parallel(loc);
274   if (inParallel) {
275     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
276   } else {
277     KMP_COUNT_BLOCK(OMP_PARALLEL);
278   }
279 #endif
280 
281   // maybe to save thr_state is enough here
282   {
283     va_list ap;
284     va_start(ap, microtask);
285 
286 #if OMPT_SUPPORT
287     ompt_frame_t *ompt_frame;
288     if (ompt_enabled.enabled) {
289       kmp_info_t *master_th = __kmp_threads[gtid];
290       kmp_team_t *parent_team = master_th->th.th_team;
291       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
292       if (lwt)
293         ompt_frame = &(lwt->ompt_task_info.frame);
294       else {
295         int tid = __kmp_tid_from_gtid(gtid);
296         ompt_frame = &(
297             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
298       }
299       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
300     }
301     OMPT_STORE_RETURN_ADDRESS(gtid);
302 #endif
303 
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_FORKING();
306 #endif
307     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
308                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
309                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
310                     kmp_va_addr_of(ap));
311 #if INCLUDE_SSC_MARKS
312     SSC_MARK_JOINING();
313 #endif
314     __kmp_join_call(loc, gtid
315 #if OMPT_SUPPORT
316                     ,
317                     fork_context_intel
318 #endif
319     );
320 
321     va_end(ap);
322   }
323 
324 #if KMP_STATS_ENABLED
325   if (previous_state == stats_state_e::SERIAL_REGION) {
326     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
327     KMP_SET_THREAD_STATE(previous_state);
328   } else {
329     KMP_POP_PARTITIONED_TIMER();
330   }
331 #endif // KMP_STATS_ENABLED
332 }
333 
334 /*!
335 @ingroup PARALLEL
336 @param loc source location information
337 @param global_tid global thread number
338 @param num_teams number of teams requested for the teams construct
339 @param num_threads number of threads per team requested for the teams construct
340 
341 Set the number of teams to be used by the teams construct.
342 This call is only required if the teams construct has a `num_teams` clause
343 or a `thread_limit` clause (or both).
344 */
345 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
346                            kmp_int32 num_teams, kmp_int32 num_threads) {
347   KA_TRACE(20,
348            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
349             global_tid, num_teams, num_threads));
350   __kmp_assert_valid_gtid(global_tid);
351   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
352 }
353 
354 /*!
355 @ingroup PARALLEL
356 @param loc source location information
357 @param global_tid global thread number
358 @param num_teams_lo lower bound on number of teams requested for the teams
359 construct
360 @param num_teams_up upper bound on number of teams requested for the teams
361 construct
362 @param num_threads number of threads per team requested for the teams construct
363 
364 Set the number of teams to be used by the teams construct. The number of initial
365 teams cretaed will be greater than or equal to the lower bound and less than or
366 equal to the upper bound.
367 This call is only required if the teams construct has a `num_teams` clause
368 or a `thread_limit` clause (or both).
369 */
370 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
371                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
372                               kmp_int32 num_threads) {
373   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
374                 " num_teams_ub=%d num_threads=%d\n",
375                 global_tid, num_teams_lb, num_teams_ub, num_threads));
376   __kmp_assert_valid_gtid(global_tid);
377   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
378                           num_threads);
379 }
380 
381 /*!
382 @ingroup PARALLEL
383 @param loc  source location information
384 @param argc  total number of arguments in the ellipsis
385 @param microtask  pointer to callback routine consisting of outlined teams
386 construct
387 @param ...  pointers to shared variables that aren't global
388 
389 Do the actual fork and call the microtask in the relevant number of threads.
390 */
391 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
392                        ...) {
393   int gtid = __kmp_entry_gtid();
394   kmp_info_t *this_thr = __kmp_threads[gtid];
395   va_list ap;
396   va_start(ap, microtask);
397 
398 #if KMP_STATS_ENABLED
399   KMP_COUNT_BLOCK(OMP_TEAMS);
400   stats_state_e previous_state = KMP_GET_THREAD_STATE();
401   if (previous_state == stats_state_e::SERIAL_REGION) {
402     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
403   } else {
404     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
405   }
406 #endif
407 
408   // remember teams entry point and nesting level
409   this_thr->th.th_teams_microtask = microtask;
410   this_thr->th.th_teams_level =
411       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
412 
413 #if OMPT_SUPPORT
414   kmp_team_t *parent_team = this_thr->th.th_team;
415   int tid = __kmp_tid_from_gtid(gtid);
416   if (ompt_enabled.enabled) {
417     parent_team->t.t_implicit_task_taskdata[tid]
418         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
419   }
420   OMPT_STORE_RETURN_ADDRESS(gtid);
421 #endif
422 
423   // check if __kmpc_push_num_teams called, set default number of teams
424   // otherwise
425   if (this_thr->th.th_teams_size.nteams == 0) {
426     __kmp_push_num_teams(loc, gtid, 0, 0);
427   }
428   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
429   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
430   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
431 
432   __kmp_fork_call(
433       loc, gtid, fork_context_intel, argc,
434       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
435       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
436   __kmp_join_call(loc, gtid
437 #if OMPT_SUPPORT
438                   ,
439                   fork_context_intel
440 #endif
441   );
442 
443   // Pop current CG root off list
444   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
445   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
446   this_thr->th.th_cg_roots = tmp->up;
447   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
448                  " to node %p. cg_nthreads was %d\n",
449                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
450   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
451   int i = tmp->cg_nthreads--;
452   if (i == 1) { // check is we are the last thread in CG (not always the case)
453     __kmp_free(tmp);
454   }
455   // Restore current task's thread_limit from CG root
456   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
457   this_thr->th.th_current_task->td_icvs.thread_limit =
458       this_thr->th.th_cg_roots->cg_thread_limit;
459 
460   this_thr->th.th_teams_microtask = NULL;
461   this_thr->th.th_teams_level = 0;
462   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
463   va_end(ap);
464 #if KMP_STATS_ENABLED
465   if (previous_state == stats_state_e::SERIAL_REGION) {
466     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
467     KMP_SET_THREAD_STATE(previous_state);
468   } else {
469     KMP_POP_PARTITIONED_TIMER();
470   }
471 #endif // KMP_STATS_ENABLED
472 }
473 
474 // I don't think this function should ever have been exported.
475 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
476 // openmp code ever called it, but it's been exported from the RTL for so
477 // long that I'm afraid to remove the definition.
478 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
479 
480 /*!
481 @ingroup PARALLEL
482 @param loc  source location information
483 @param global_tid  global thread number
484 
485 Enter a serialized parallel construct. This interface is used to handle a
486 conditional parallel region, like this,
487 @code
488 #pragma omp parallel if (condition)
489 @endcode
490 when the condition is false.
491 */
492 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
493   // The implementation is now in kmp_runtime.cpp so that it can share static
494   // functions with kmp_fork_call since the tasks to be done are similar in
495   // each case.
496   __kmp_assert_valid_gtid(global_tid);
497 #if OMPT_SUPPORT
498   OMPT_STORE_RETURN_ADDRESS(global_tid);
499 #endif
500   __kmp_serialized_parallel(loc, global_tid);
501 }
502 
503 /*!
504 @ingroup PARALLEL
505 @param loc  source location information
506 @param global_tid  global thread number
507 
508 Leave a serialized parallel construct.
509 */
510 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
511   kmp_internal_control_t *top;
512   kmp_info_t *this_thr;
513   kmp_team_t *serial_team;
514 
515   KC_TRACE(10,
516            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
517 
518   /* skip all this code for autopar serialized loops since it results in
519      unacceptable overhead */
520   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
521     return;
522 
523   // Not autopar code
524   __kmp_assert_valid_gtid(global_tid);
525   if (!TCR_4(__kmp_init_parallel))
526     __kmp_parallel_initialize();
527 
528   __kmp_resume_if_soft_paused();
529 
530   this_thr = __kmp_threads[global_tid];
531   serial_team = this_thr->th.th_serial_team;
532 
533   kmp_task_team_t *task_team = this_thr->th.th_task_team;
534   // we need to wait for the proxy tasks before finishing the thread
535   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
536     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
537 
538   KMP_MB();
539   KMP_DEBUG_ASSERT(serial_team);
540   KMP_ASSERT(serial_team->t.t_serialized);
541   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
542   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
543   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
544   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
545 
546 #if OMPT_SUPPORT
547   if (ompt_enabled.enabled &&
548       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
549     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
550     if (ompt_enabled.ompt_callback_implicit_task) {
551       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
552           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
553           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
554     }
555 
556     // reset clear the task id only after unlinking the task
557     ompt_data_t *parent_task_data;
558     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
559 
560     if (ompt_enabled.ompt_callback_parallel_end) {
561       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
562           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
563           ompt_parallel_invoker_program | ompt_parallel_team,
564           OMPT_LOAD_RETURN_ADDRESS(global_tid));
565     }
566     __ompt_lw_taskteam_unlink(this_thr);
567     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
568   }
569 #endif
570 
571   /* If necessary, pop the internal control stack values and replace the team
572    * values */
573   top = serial_team->t.t_control_stack_top;
574   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
575     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
576     serial_team->t.t_control_stack_top = top->next;
577     __kmp_free(top);
578   }
579 
580   // if( serial_team -> t.t_serialized > 1 )
581   serial_team->t.t_level--;
582 
583   /* pop dispatch buffers stack */
584   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
585   {
586     dispatch_private_info_t *disp_buffer =
587         serial_team->t.t_dispatch->th_disp_buffer;
588     serial_team->t.t_dispatch->th_disp_buffer =
589         serial_team->t.t_dispatch->th_disp_buffer->next;
590     __kmp_free(disp_buffer);
591   }
592   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
593 
594   --serial_team->t.t_serialized;
595   if (serial_team->t.t_serialized == 0) {
596 
597     /* return to the parallel section */
598 
599 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
600     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
601       __kmp_clear_x87_fpu_status_word();
602       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
603       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
604     }
605 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
606 
607     this_thr->th.th_team = serial_team->t.t_parent;
608     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
609 
610     /* restore values cached in the thread */
611     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
612     this_thr->th.th_team_master =
613         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
614     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
615 
616     /* TODO the below shouldn't need to be adjusted for serialized teams */
617     this_thr->th.th_dispatch =
618         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
619 
620     __kmp_pop_current_task_from_thread(this_thr);
621 
622     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
623     this_thr->th.th_current_task->td_flags.executing = 1;
624 
625     if (__kmp_tasking_mode != tskm_immediate_exec) {
626       // Copy the task team from the new child / old parent team to the thread.
627       this_thr->th.th_task_team =
628           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
629       KA_TRACE(20,
630                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
631                 "team %p\n",
632                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
633     }
634   } else {
635     if (__kmp_tasking_mode != tskm_immediate_exec) {
636       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
637                     "depth of serial team %p to %d\n",
638                     global_tid, serial_team, serial_team->t.t_serialized));
639     }
640   }
641 
642   if (__kmp_env_consistency_check)
643     __kmp_pop_parallel(global_tid, NULL);
644 #if OMPT_SUPPORT
645   if (ompt_enabled.enabled)
646     this_thr->th.ompt_thread_info.state =
647         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
648                                            : ompt_state_work_parallel);
649 #endif
650 }
651 
652 /*!
653 @ingroup SYNCHRONIZATION
654 @param loc  source location information.
655 
656 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
657 depending on the memory ordering convention obeyed by the compiler
658 even that may not be necessary).
659 */
660 void __kmpc_flush(ident_t *loc) {
661   KC_TRACE(10, ("__kmpc_flush: called\n"));
662 
663   /* need explicit __mf() here since use volatile instead in library */
664   KMP_MB(); /* Flush all pending memory write invalidates.  */
665 
666 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
667 #if KMP_MIC
668 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
669 // We shouldn't need it, though, since the ABI rules require that
670 // * If the compiler generates NGO stores it also generates the fence
671 // * If users hand-code NGO stores they should insert the fence
672 // therefore no incomplete unordered stores should be visible.
673 #else
674   // C74404
675   // This is to address non-temporal store instructions (sfence needed).
676   // The clflush instruction is addressed either (mfence needed).
677   // Probably the non-temporal load monvtdqa instruction should also be
678   // addressed.
679   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
680   if (!__kmp_cpuinfo.initialized) {
681     __kmp_query_cpuid(&__kmp_cpuinfo);
682   }
683   if (!__kmp_cpuinfo.sse2) {
684     // CPU cannot execute SSE2 instructions.
685   } else {
686 #if KMP_COMPILER_ICC
687     _mm_mfence();
688 #elif KMP_COMPILER_MSVC
689     MemoryBarrier();
690 #else
691     __sync_synchronize();
692 #endif // KMP_COMPILER_ICC
693   }
694 #endif // KMP_MIC
695 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
696        KMP_ARCH_RISCV64)
697 // Nothing to see here move along
698 #elif KMP_ARCH_PPC64
699 // Nothing needed here (we have a real MB above).
700 #else
701 #error Unknown or unsupported architecture
702 #endif
703 
704 #if OMPT_SUPPORT && OMPT_OPTIONAL
705   if (ompt_enabled.ompt_callback_flush) {
706     ompt_callbacks.ompt_callback(ompt_callback_flush)(
707         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
708   }
709 #endif
710 }
711 
712 /* -------------------------------------------------------------------------- */
713 /*!
714 @ingroup SYNCHRONIZATION
715 @param loc source location information
716 @param global_tid thread id.
717 
718 Execute a barrier.
719 */
720 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
721   KMP_COUNT_BLOCK(OMP_BARRIER);
722   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
723   __kmp_assert_valid_gtid(global_tid);
724 
725   if (!TCR_4(__kmp_init_parallel))
726     __kmp_parallel_initialize();
727 
728   __kmp_resume_if_soft_paused();
729 
730   if (__kmp_env_consistency_check) {
731     if (loc == 0) {
732       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
733     }
734     __kmp_check_barrier(global_tid, ct_barrier, loc);
735   }
736 
737 #if OMPT_SUPPORT
738   ompt_frame_t *ompt_frame;
739   if (ompt_enabled.enabled) {
740     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
741     if (ompt_frame->enter_frame.ptr == NULL)
742       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
743   }
744   OMPT_STORE_RETURN_ADDRESS(global_tid);
745 #endif
746   __kmp_threads[global_tid]->th.th_ident = loc;
747   // TODO: explicit barrier_wait_id:
748   //   this function is called when 'barrier' directive is present or
749   //   implicit barrier at the end of a worksharing construct.
750   // 1) better to add a per-thread barrier counter to a thread data structure
751   // 2) set to 0 when a new team is created
752   // 4) no sync is required
753 
754   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
755 #if OMPT_SUPPORT && OMPT_OPTIONAL
756   if (ompt_enabled.enabled) {
757     ompt_frame->enter_frame = ompt_data_none;
758   }
759 #endif
760 }
761 
762 /* The BARRIER for a MASTER section is always explicit   */
763 /*!
764 @ingroup WORK_SHARING
765 @param loc  source location information.
766 @param global_tid  global thread number .
767 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
768 */
769 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
770   int status = 0;
771 
772   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
773   __kmp_assert_valid_gtid(global_tid);
774 
775   if (!TCR_4(__kmp_init_parallel))
776     __kmp_parallel_initialize();
777 
778   __kmp_resume_if_soft_paused();
779 
780   if (KMP_MASTER_GTID(global_tid)) {
781     KMP_COUNT_BLOCK(OMP_MASTER);
782     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
783     status = 1;
784   }
785 
786 #if OMPT_SUPPORT && OMPT_OPTIONAL
787   if (status) {
788     if (ompt_enabled.ompt_callback_masked) {
789       kmp_info_t *this_thr = __kmp_threads[global_tid];
790       kmp_team_t *team = this_thr->th.th_team;
791 
792       int tid = __kmp_tid_from_gtid(global_tid);
793       ompt_callbacks.ompt_callback(ompt_callback_masked)(
794           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
795           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
796           OMPT_GET_RETURN_ADDRESS(0));
797     }
798   }
799 #endif
800 
801   if (__kmp_env_consistency_check) {
802 #if KMP_USE_DYNAMIC_LOCK
803     if (status)
804       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
805     else
806       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
807 #else
808     if (status)
809       __kmp_push_sync(global_tid, ct_master, loc, NULL);
810     else
811       __kmp_check_sync(global_tid, ct_master, loc, NULL);
812 #endif
813   }
814 
815   return status;
816 }
817 
818 /*!
819 @ingroup WORK_SHARING
820 @param loc  source location information.
821 @param global_tid  global thread number .
822 
823 Mark the end of a <tt>master</tt> region. This should only be called by the
824 thread that executes the <tt>master</tt> region.
825 */
826 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
827   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
828   __kmp_assert_valid_gtid(global_tid);
829   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
830   KMP_POP_PARTITIONED_TIMER();
831 
832 #if OMPT_SUPPORT && OMPT_OPTIONAL
833   kmp_info_t *this_thr = __kmp_threads[global_tid];
834   kmp_team_t *team = this_thr->th.th_team;
835   if (ompt_enabled.ompt_callback_masked) {
836     int tid = __kmp_tid_from_gtid(global_tid);
837     ompt_callbacks.ompt_callback(ompt_callback_masked)(
838         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
839         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
840         OMPT_GET_RETURN_ADDRESS(0));
841   }
842 #endif
843 
844   if (__kmp_env_consistency_check) {
845     if (KMP_MASTER_GTID(global_tid))
846       __kmp_pop_sync(global_tid, ct_master, loc);
847   }
848 }
849 
850 /*!
851 @ingroup WORK_SHARING
852 @param loc  source location information.
853 @param gtid  global thread number.
854 
855 Start execution of an <tt>ordered</tt> construct.
856 */
857 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
858   int cid = 0;
859   kmp_info_t *th;
860   KMP_DEBUG_ASSERT(__kmp_init_serial);
861 
862   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
863   __kmp_assert_valid_gtid(gtid);
864 
865   if (!TCR_4(__kmp_init_parallel))
866     __kmp_parallel_initialize();
867 
868   __kmp_resume_if_soft_paused();
869 
870 #if USE_ITT_BUILD
871   __kmp_itt_ordered_prep(gtid);
872 // TODO: ordered_wait_id
873 #endif /* USE_ITT_BUILD */
874 
875   th = __kmp_threads[gtid];
876 
877 #if OMPT_SUPPORT && OMPT_OPTIONAL
878   kmp_team_t *team;
879   ompt_wait_id_t lck;
880   void *codeptr_ra;
881   OMPT_STORE_RETURN_ADDRESS(gtid);
882   if (ompt_enabled.enabled) {
883     team = __kmp_team_from_gtid(gtid);
884     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
885     /* OMPT state update */
886     th->th.ompt_thread_info.wait_id = lck;
887     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
888 
889     /* OMPT event callback */
890     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
891     if (ompt_enabled.ompt_callback_mutex_acquire) {
892       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
893           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
894           codeptr_ra);
895     }
896   }
897 #endif
898 
899   if (th->th.th_dispatch->th_deo_fcn != 0)
900     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
901   else
902     __kmp_parallel_deo(&gtid, &cid, loc);
903 
904 #if OMPT_SUPPORT && OMPT_OPTIONAL
905   if (ompt_enabled.enabled) {
906     /* OMPT state update */
907     th->th.ompt_thread_info.state = ompt_state_work_parallel;
908     th->th.ompt_thread_info.wait_id = 0;
909 
910     /* OMPT event callback */
911     if (ompt_enabled.ompt_callback_mutex_acquired) {
912       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
913           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
914     }
915   }
916 #endif
917 
918 #if USE_ITT_BUILD
919   __kmp_itt_ordered_start(gtid);
920 #endif /* USE_ITT_BUILD */
921 }
922 
923 /*!
924 @ingroup WORK_SHARING
925 @param loc  source location information.
926 @param gtid  global thread number.
927 
928 End execution of an <tt>ordered</tt> construct.
929 */
930 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
931   int cid = 0;
932   kmp_info_t *th;
933 
934   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
935   __kmp_assert_valid_gtid(gtid);
936 
937 #if USE_ITT_BUILD
938   __kmp_itt_ordered_end(gtid);
939 // TODO: ordered_wait_id
940 #endif /* USE_ITT_BUILD */
941 
942   th = __kmp_threads[gtid];
943 
944   if (th->th.th_dispatch->th_dxo_fcn != 0)
945     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
946   else
947     __kmp_parallel_dxo(&gtid, &cid, loc);
948 
949 #if OMPT_SUPPORT && OMPT_OPTIONAL
950   OMPT_STORE_RETURN_ADDRESS(gtid);
951   if (ompt_enabled.ompt_callback_mutex_released) {
952     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
953         ompt_mutex_ordered,
954         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
955             ->t.t_ordered.dt.t_value,
956         OMPT_LOAD_RETURN_ADDRESS(gtid));
957   }
958 #endif
959 }
960 
961 #if KMP_USE_DYNAMIC_LOCK
962 
963 static __forceinline void
964 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
965                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
966   // Pointer to the allocated indirect lock is written to crit, while indexing
967   // is ignored.
968   void *idx;
969   kmp_indirect_lock_t **lck;
970   lck = (kmp_indirect_lock_t **)crit;
971   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
972   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
973   KMP_SET_I_LOCK_LOCATION(ilk, loc);
974   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
975   KA_TRACE(20,
976            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
977 #if USE_ITT_BUILD
978   __kmp_itt_critical_creating(ilk->lock, loc);
979 #endif
980   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
981   if (status == 0) {
982 #if USE_ITT_BUILD
983     __kmp_itt_critical_destroyed(ilk->lock);
984 #endif
985     // We don't really need to destroy the unclaimed lock here since it will be
986     // cleaned up at program exit.
987     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
988   }
989   KMP_DEBUG_ASSERT(*lck != NULL);
990 }
991 
992 // Fast-path acquire tas lock
993 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
994   {                                                                            \
995     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
996     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
997     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
998     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
999         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1000       kmp_uint32 spins;                                                        \
1001       KMP_FSYNC_PREPARE(l);                                                    \
1002       KMP_INIT_YIELD(spins);                                                   \
1003       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1004       do {                                                                     \
1005         if (TCR_4(__kmp_nth) >                                                 \
1006             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1007           KMP_YIELD(TRUE);                                                     \
1008         } else {                                                               \
1009           KMP_YIELD_SPIN(spins);                                               \
1010         }                                                                      \
1011         __kmp_spin_backoff(&backoff);                                          \
1012       } while (                                                                \
1013           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1014           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1015     }                                                                          \
1016     KMP_FSYNC_ACQUIRED(l);                                                     \
1017   }
1018 
1019 // Fast-path test tas lock
1020 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1021   {                                                                            \
1022     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1023     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1024     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1025     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1026          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1027   }
1028 
1029 // Fast-path release tas lock
1030 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1031   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1032 
1033 #if KMP_USE_FUTEX
1034 
1035 #include <sys/syscall.h>
1036 #include <unistd.h>
1037 #ifndef FUTEX_WAIT
1038 #define FUTEX_WAIT 0
1039 #endif
1040 #ifndef FUTEX_WAKE
1041 #define FUTEX_WAKE 1
1042 #endif
1043 
1044 // Fast-path acquire futex lock
1045 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1046   {                                                                            \
1047     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1048     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1049     KMP_MB();                                                                  \
1050     KMP_FSYNC_PREPARE(ftx);                                                    \
1051     kmp_int32 poll_val;                                                        \
1052     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1053                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1054                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1055       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1056       if (!cond) {                                                             \
1057         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1058                                          poll_val |                            \
1059                                              KMP_LOCK_BUSY(1, futex))) {       \
1060           continue;                                                            \
1061         }                                                                      \
1062         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1063       }                                                                        \
1064       kmp_int32 rc;                                                            \
1065       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1066                         NULL, NULL, 0)) != 0) {                                \
1067         continue;                                                              \
1068       }                                                                        \
1069       gtid_code |= 1;                                                          \
1070     }                                                                          \
1071     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1072   }
1073 
1074 // Fast-path test futex lock
1075 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1076   {                                                                            \
1077     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1078     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
1079                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
1080       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1081       rc = TRUE;                                                               \
1082     } else {                                                                   \
1083       rc = FALSE;                                                              \
1084     }                                                                          \
1085   }
1086 
1087 // Fast-path release futex lock
1088 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1089   {                                                                            \
1090     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1091     KMP_MB();                                                                  \
1092     KMP_FSYNC_RELEASING(ftx);                                                  \
1093     kmp_int32 poll_val =                                                       \
1094         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1095     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1096       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1097               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1098     }                                                                          \
1099     KMP_MB();                                                                  \
1100     KMP_YIELD_OVERSUB();                                                       \
1101   }
1102 
1103 #endif // KMP_USE_FUTEX
1104 
1105 #else // KMP_USE_DYNAMIC_LOCK
1106 
1107 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1108                                                       ident_t const *loc,
1109                                                       kmp_int32 gtid) {
1110   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1111 
1112   // Because of the double-check, the following load doesn't need to be volatile
1113   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1114 
1115   if (lck == NULL) {
1116     void *idx;
1117 
1118     // Allocate & initialize the lock.
1119     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1120     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1121     __kmp_init_user_lock_with_checks(lck);
1122     __kmp_set_user_lock_location(lck, loc);
1123 #if USE_ITT_BUILD
1124     __kmp_itt_critical_creating(lck);
1125 // __kmp_itt_critical_creating() should be called *before* the first usage
1126 // of underlying lock. It is the only place where we can guarantee it. There
1127 // are chances the lock will destroyed with no usage, but it is not a
1128 // problem, because this is not real event seen by user but rather setting
1129 // name for object (lock). See more details in kmp_itt.h.
1130 #endif /* USE_ITT_BUILD */
1131 
1132     // Use a cmpxchg instruction to slam the start of the critical section with
1133     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1134     // and use the lock that the other thread allocated.
1135     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1136 
1137     if (status == 0) {
1138 // Deallocate the lock and reload the value.
1139 #if USE_ITT_BUILD
1140       __kmp_itt_critical_destroyed(lck);
1141 // Let ITT know the lock is destroyed and the same memory location may be reused
1142 // for another purpose.
1143 #endif /* USE_ITT_BUILD */
1144       __kmp_destroy_user_lock_with_checks(lck);
1145       __kmp_user_lock_free(&idx, gtid, lck);
1146       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1147       KMP_DEBUG_ASSERT(lck != NULL);
1148     }
1149   }
1150   return lck;
1151 }
1152 
1153 #endif // KMP_USE_DYNAMIC_LOCK
1154 
1155 /*!
1156 @ingroup WORK_SHARING
1157 @param loc  source location information.
1158 @param global_tid  global thread number.
1159 @param crit identity of the critical section. This could be a pointer to a lock
1160 associated with the critical section, or some other suitably unique value.
1161 
1162 Enter code protected by a `critical` construct.
1163 This function blocks until the executing thread can enter the critical section.
1164 */
1165 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1166                      kmp_critical_name *crit) {
1167 #if KMP_USE_DYNAMIC_LOCK
1168 #if OMPT_SUPPORT && OMPT_OPTIONAL
1169   OMPT_STORE_RETURN_ADDRESS(global_tid);
1170 #endif // OMPT_SUPPORT
1171   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1172 #else
1173   KMP_COUNT_BLOCK(OMP_CRITICAL);
1174 #if OMPT_SUPPORT && OMPT_OPTIONAL
1175   ompt_state_t prev_state = ompt_state_undefined;
1176   ompt_thread_info_t ti;
1177 #endif
1178   kmp_user_lock_p lck;
1179 
1180   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1181   __kmp_assert_valid_gtid(global_tid);
1182 
1183   // TODO: add THR_OVHD_STATE
1184 
1185   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1186   KMP_CHECK_USER_LOCK_INIT();
1187 
1188   if ((__kmp_user_lock_kind == lk_tas) &&
1189       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1190     lck = (kmp_user_lock_p)crit;
1191   }
1192 #if KMP_USE_FUTEX
1193   else if ((__kmp_user_lock_kind == lk_futex) &&
1194            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1195     lck = (kmp_user_lock_p)crit;
1196   }
1197 #endif
1198   else { // ticket, queuing or drdpa
1199     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1200   }
1201 
1202   if (__kmp_env_consistency_check)
1203     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1204 
1205     // since the critical directive binds to all threads, not just the current
1206     // team we have to check this even if we are in a serialized team.
1207     // also, even if we are the uber thread, we still have to conduct the lock,
1208     // as we have to contend with sibling threads.
1209 
1210 #if USE_ITT_BUILD
1211   __kmp_itt_critical_acquiring(lck);
1212 #endif /* USE_ITT_BUILD */
1213 #if OMPT_SUPPORT && OMPT_OPTIONAL
1214   OMPT_STORE_RETURN_ADDRESS(gtid);
1215   void *codeptr_ra = NULL;
1216   if (ompt_enabled.enabled) {
1217     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1218     /* OMPT state update */
1219     prev_state = ti.state;
1220     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1221     ti.state = ompt_state_wait_critical;
1222 
1223     /* OMPT event callback */
1224     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1225     if (ompt_enabled.ompt_callback_mutex_acquire) {
1226       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1227           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1228           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1229     }
1230   }
1231 #endif
1232   // Value of 'crit' should be good for using as a critical_id of the critical
1233   // section directive.
1234   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1235 
1236 #if USE_ITT_BUILD
1237   __kmp_itt_critical_acquired(lck);
1238 #endif /* USE_ITT_BUILD */
1239 #if OMPT_SUPPORT && OMPT_OPTIONAL
1240   if (ompt_enabled.enabled) {
1241     /* OMPT state update */
1242     ti.state = prev_state;
1243     ti.wait_id = 0;
1244 
1245     /* OMPT event callback */
1246     if (ompt_enabled.ompt_callback_mutex_acquired) {
1247       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1248           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1249     }
1250   }
1251 #endif
1252   KMP_POP_PARTITIONED_TIMER();
1253 
1254   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1255   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1256 #endif // KMP_USE_DYNAMIC_LOCK
1257 }
1258 
1259 #if KMP_USE_DYNAMIC_LOCK
1260 
1261 // Converts the given hint to an internal lock implementation
1262 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1263 #if KMP_USE_TSX
1264 #define KMP_TSX_LOCK(seq) lockseq_##seq
1265 #else
1266 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1267 #endif
1268 
1269 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1270 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1271 #else
1272 #define KMP_CPUINFO_RTM 0
1273 #endif
1274 
1275   // Hints that do not require further logic
1276   if (hint & kmp_lock_hint_hle)
1277     return KMP_TSX_LOCK(hle);
1278   if (hint & kmp_lock_hint_rtm)
1279     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1280   if (hint & kmp_lock_hint_adaptive)
1281     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1282 
1283   // Rule out conflicting hints first by returning the default lock
1284   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1285     return __kmp_user_lock_seq;
1286   if ((hint & omp_lock_hint_speculative) &&
1287       (hint & omp_lock_hint_nonspeculative))
1288     return __kmp_user_lock_seq;
1289 
1290   // Do not even consider speculation when it appears to be contended
1291   if (hint & omp_lock_hint_contended)
1292     return lockseq_queuing;
1293 
1294   // Uncontended lock without speculation
1295   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1296     return lockseq_tas;
1297 
1298   // Use RTM lock for speculation
1299   if (hint & omp_lock_hint_speculative)
1300     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1301 
1302   return __kmp_user_lock_seq;
1303 }
1304 
1305 #if OMPT_SUPPORT && OMPT_OPTIONAL
1306 #if KMP_USE_DYNAMIC_LOCK
1307 static kmp_mutex_impl_t
1308 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1309   if (user_lock) {
1310     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1311     case 0:
1312       break;
1313 #if KMP_USE_FUTEX
1314     case locktag_futex:
1315       return kmp_mutex_impl_queuing;
1316 #endif
1317     case locktag_tas:
1318       return kmp_mutex_impl_spin;
1319 #if KMP_USE_TSX
1320     case locktag_hle:
1321     case locktag_rtm_spin:
1322       return kmp_mutex_impl_speculative;
1323 #endif
1324     default:
1325       return kmp_mutex_impl_none;
1326     }
1327     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1328   }
1329   KMP_ASSERT(ilock);
1330   switch (ilock->type) {
1331 #if KMP_USE_TSX
1332   case locktag_adaptive:
1333   case locktag_rtm_queuing:
1334     return kmp_mutex_impl_speculative;
1335 #endif
1336   case locktag_nested_tas:
1337     return kmp_mutex_impl_spin;
1338 #if KMP_USE_FUTEX
1339   case locktag_nested_futex:
1340 #endif
1341   case locktag_ticket:
1342   case locktag_queuing:
1343   case locktag_drdpa:
1344   case locktag_nested_ticket:
1345   case locktag_nested_queuing:
1346   case locktag_nested_drdpa:
1347     return kmp_mutex_impl_queuing;
1348   default:
1349     return kmp_mutex_impl_none;
1350   }
1351 }
1352 #else
1353 // For locks without dynamic binding
1354 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1355   switch (__kmp_user_lock_kind) {
1356   case lk_tas:
1357     return kmp_mutex_impl_spin;
1358 #if KMP_USE_FUTEX
1359   case lk_futex:
1360 #endif
1361   case lk_ticket:
1362   case lk_queuing:
1363   case lk_drdpa:
1364     return kmp_mutex_impl_queuing;
1365 #if KMP_USE_TSX
1366   case lk_hle:
1367   case lk_rtm_queuing:
1368   case lk_rtm_spin:
1369   case lk_adaptive:
1370     return kmp_mutex_impl_speculative;
1371 #endif
1372   default:
1373     return kmp_mutex_impl_none;
1374   }
1375 }
1376 #endif // KMP_USE_DYNAMIC_LOCK
1377 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1378 
1379 /*!
1380 @ingroup WORK_SHARING
1381 @param loc  source location information.
1382 @param global_tid  global thread number.
1383 @param crit identity of the critical section. This could be a pointer to a lock
1384 associated with the critical section, or some other suitably unique value.
1385 @param hint the lock hint.
1386 
1387 Enter code protected by a `critical` construct with a hint. The hint value is
1388 used to suggest a lock implementation. This function blocks until the executing
1389 thread can enter the critical section unless the hint suggests use of
1390 speculative execution and the hardware supports it.
1391 */
1392 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1393                                kmp_critical_name *crit, uint32_t hint) {
1394   KMP_COUNT_BLOCK(OMP_CRITICAL);
1395   kmp_user_lock_p lck;
1396 #if OMPT_SUPPORT && OMPT_OPTIONAL
1397   ompt_state_t prev_state = ompt_state_undefined;
1398   ompt_thread_info_t ti;
1399   // This is the case, if called from __kmpc_critical:
1400   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1401   if (!codeptr)
1402     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1403 #endif
1404 
1405   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1406   __kmp_assert_valid_gtid(global_tid);
1407 
1408   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1409   // Check if it is initialized.
1410   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1411   if (*lk == 0) {
1412     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1413     if (KMP_IS_D_LOCK(lckseq)) {
1414       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1415                                   KMP_GET_D_TAG(lckseq));
1416     } else {
1417       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1418     }
1419   }
1420   // Branch for accessing the actual lock object and set operation. This
1421   // branching is inevitable since this lock initialization does not follow the
1422   // normal dispatch path (lock table is not used).
1423   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1424     lck = (kmp_user_lock_p)lk;
1425     if (__kmp_env_consistency_check) {
1426       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1427                       __kmp_map_hint_to_lock(hint));
1428     }
1429 #if USE_ITT_BUILD
1430     __kmp_itt_critical_acquiring(lck);
1431 #endif
1432 #if OMPT_SUPPORT && OMPT_OPTIONAL
1433     if (ompt_enabled.enabled) {
1434       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1435       /* OMPT state update */
1436       prev_state = ti.state;
1437       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1438       ti.state = ompt_state_wait_critical;
1439 
1440       /* OMPT event callback */
1441       if (ompt_enabled.ompt_callback_mutex_acquire) {
1442         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1443             ompt_mutex_critical, (unsigned int)hint,
1444             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1445             codeptr);
1446       }
1447     }
1448 #endif
1449 #if KMP_USE_INLINED_TAS
1450     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1451       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1452     } else
1453 #elif KMP_USE_INLINED_FUTEX
1454     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1455       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1456     } else
1457 #endif
1458     {
1459       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1460     }
1461   } else {
1462     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1463     lck = ilk->lock;
1464     if (__kmp_env_consistency_check) {
1465       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1466                       __kmp_map_hint_to_lock(hint));
1467     }
1468 #if USE_ITT_BUILD
1469     __kmp_itt_critical_acquiring(lck);
1470 #endif
1471 #if OMPT_SUPPORT && OMPT_OPTIONAL
1472     if (ompt_enabled.enabled) {
1473       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1474       /* OMPT state update */
1475       prev_state = ti.state;
1476       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1477       ti.state = ompt_state_wait_critical;
1478 
1479       /* OMPT event callback */
1480       if (ompt_enabled.ompt_callback_mutex_acquire) {
1481         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1482             ompt_mutex_critical, (unsigned int)hint,
1483             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1484             codeptr);
1485       }
1486     }
1487 #endif
1488     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1489   }
1490   KMP_POP_PARTITIONED_TIMER();
1491 
1492 #if USE_ITT_BUILD
1493   __kmp_itt_critical_acquired(lck);
1494 #endif /* USE_ITT_BUILD */
1495 #if OMPT_SUPPORT && OMPT_OPTIONAL
1496   if (ompt_enabled.enabled) {
1497     /* OMPT state update */
1498     ti.state = prev_state;
1499     ti.wait_id = 0;
1500 
1501     /* OMPT event callback */
1502     if (ompt_enabled.ompt_callback_mutex_acquired) {
1503       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1504           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1505     }
1506   }
1507 #endif
1508 
1509   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1510   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1511 } // __kmpc_critical_with_hint
1512 
1513 #endif // KMP_USE_DYNAMIC_LOCK
1514 
1515 /*!
1516 @ingroup WORK_SHARING
1517 @param loc  source location information.
1518 @param global_tid  global thread number .
1519 @param crit identity of the critical section. This could be a pointer to a lock
1520 associated with the critical section, or some other suitably unique value.
1521 
1522 Leave a critical section, releasing any lock that was held during its execution.
1523 */
1524 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1525                          kmp_critical_name *crit) {
1526   kmp_user_lock_p lck;
1527 
1528   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1529 
1530 #if KMP_USE_DYNAMIC_LOCK
1531   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1532     lck = (kmp_user_lock_p)crit;
1533     KMP_ASSERT(lck != NULL);
1534     if (__kmp_env_consistency_check) {
1535       __kmp_pop_sync(global_tid, ct_critical, loc);
1536     }
1537 #if USE_ITT_BUILD
1538     __kmp_itt_critical_releasing(lck);
1539 #endif
1540 #if KMP_USE_INLINED_TAS
1541     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1542       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1543     } else
1544 #elif KMP_USE_INLINED_FUTEX
1545     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1546       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1547     } else
1548 #endif
1549     {
1550       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1551     }
1552   } else {
1553     kmp_indirect_lock_t *ilk =
1554         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1555     KMP_ASSERT(ilk != NULL);
1556     lck = ilk->lock;
1557     if (__kmp_env_consistency_check) {
1558       __kmp_pop_sync(global_tid, ct_critical, loc);
1559     }
1560 #if USE_ITT_BUILD
1561     __kmp_itt_critical_releasing(lck);
1562 #endif
1563     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1564   }
1565 
1566 #else // KMP_USE_DYNAMIC_LOCK
1567 
1568   if ((__kmp_user_lock_kind == lk_tas) &&
1569       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1570     lck = (kmp_user_lock_p)crit;
1571   }
1572 #if KMP_USE_FUTEX
1573   else if ((__kmp_user_lock_kind == lk_futex) &&
1574            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1575     lck = (kmp_user_lock_p)crit;
1576   }
1577 #endif
1578   else { // ticket, queuing or drdpa
1579     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1580   }
1581 
1582   KMP_ASSERT(lck != NULL);
1583 
1584   if (__kmp_env_consistency_check)
1585     __kmp_pop_sync(global_tid, ct_critical, loc);
1586 
1587 #if USE_ITT_BUILD
1588   __kmp_itt_critical_releasing(lck);
1589 #endif /* USE_ITT_BUILD */
1590   // Value of 'crit' should be good for using as a critical_id of the critical
1591   // section directive.
1592   __kmp_release_user_lock_with_checks(lck, global_tid);
1593 
1594 #endif // KMP_USE_DYNAMIC_LOCK
1595 
1596 #if OMPT_SUPPORT && OMPT_OPTIONAL
1597   /* OMPT release event triggers after lock is released; place here to trigger
1598    * for all #if branches */
1599   OMPT_STORE_RETURN_ADDRESS(global_tid);
1600   if (ompt_enabled.ompt_callback_mutex_released) {
1601     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1602         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1603         OMPT_LOAD_RETURN_ADDRESS(0));
1604   }
1605 #endif
1606 
1607   KMP_POP_PARTITIONED_TIMER();
1608   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1609 }
1610 
1611 /*!
1612 @ingroup SYNCHRONIZATION
1613 @param loc source location information
1614 @param global_tid thread id.
1615 @return one if the thread should execute the master block, zero otherwise
1616 
1617 Start execution of a combined barrier and master. The barrier is executed inside
1618 this function.
1619 */
1620 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1621   int status;
1622   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1623   __kmp_assert_valid_gtid(global_tid);
1624 
1625   if (!TCR_4(__kmp_init_parallel))
1626     __kmp_parallel_initialize();
1627 
1628   __kmp_resume_if_soft_paused();
1629 
1630   if (__kmp_env_consistency_check)
1631     __kmp_check_barrier(global_tid, ct_barrier, loc);
1632 
1633 #if OMPT_SUPPORT
1634   ompt_frame_t *ompt_frame;
1635   if (ompt_enabled.enabled) {
1636     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1637     if (ompt_frame->enter_frame.ptr == NULL)
1638       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1639   }
1640   OMPT_STORE_RETURN_ADDRESS(global_tid);
1641 #endif
1642 #if USE_ITT_NOTIFY
1643   __kmp_threads[global_tid]->th.th_ident = loc;
1644 #endif
1645   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1646 #if OMPT_SUPPORT && OMPT_OPTIONAL
1647   if (ompt_enabled.enabled) {
1648     ompt_frame->enter_frame = ompt_data_none;
1649   }
1650 #endif
1651 
1652   return (status != 0) ? 0 : 1;
1653 }
1654 
1655 /*!
1656 @ingroup SYNCHRONIZATION
1657 @param loc source location information
1658 @param global_tid thread id.
1659 
1660 Complete the execution of a combined barrier and master. This function should
1661 only be called at the completion of the <tt>master</tt> code. Other threads will
1662 still be waiting at the barrier and this call releases them.
1663 */
1664 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1665   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1666   __kmp_assert_valid_gtid(global_tid);
1667   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1668 }
1669 
1670 /*!
1671 @ingroup SYNCHRONIZATION
1672 @param loc source location information
1673 @param global_tid thread id.
1674 @return one if the thread should execute the master block, zero otherwise
1675 
1676 Start execution of a combined barrier and master(nowait) construct.
1677 The barrier is executed inside this function.
1678 There is no equivalent "end" function, since the
1679 */
1680 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1681   kmp_int32 ret;
1682   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1683   __kmp_assert_valid_gtid(global_tid);
1684 
1685   if (!TCR_4(__kmp_init_parallel))
1686     __kmp_parallel_initialize();
1687 
1688   __kmp_resume_if_soft_paused();
1689 
1690   if (__kmp_env_consistency_check) {
1691     if (loc == 0) {
1692       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1693     }
1694     __kmp_check_barrier(global_tid, ct_barrier, loc);
1695   }
1696 
1697 #if OMPT_SUPPORT
1698   ompt_frame_t *ompt_frame;
1699   if (ompt_enabled.enabled) {
1700     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1701     if (ompt_frame->enter_frame.ptr == NULL)
1702       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1703   }
1704   OMPT_STORE_RETURN_ADDRESS(global_tid);
1705 #endif
1706 #if USE_ITT_NOTIFY
1707   __kmp_threads[global_tid]->th.th_ident = loc;
1708 #endif
1709   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1710 #if OMPT_SUPPORT && OMPT_OPTIONAL
1711   if (ompt_enabled.enabled) {
1712     ompt_frame->enter_frame = ompt_data_none;
1713   }
1714 #endif
1715 
1716   ret = __kmpc_master(loc, global_tid);
1717 
1718   if (__kmp_env_consistency_check) {
1719     /*  there's no __kmpc_end_master called; so the (stats) */
1720     /*  actions of __kmpc_end_master are done here          */
1721     if (ret) {
1722       /* only one thread should do the pop since only */
1723       /* one did the push (see __kmpc_master())       */
1724       __kmp_pop_sync(global_tid, ct_master, loc);
1725     }
1726   }
1727 
1728   return (ret);
1729 }
1730 
1731 /* The BARRIER for a SINGLE process section is always explicit   */
1732 /*!
1733 @ingroup WORK_SHARING
1734 @param loc  source location information
1735 @param global_tid  global thread number
1736 @return One if this thread should execute the single construct, zero otherwise.
1737 
1738 Test whether to execute a <tt>single</tt> construct.
1739 There are no implicit barriers in the two "single" calls, rather the compiler
1740 should introduce an explicit barrier if it is required.
1741 */
1742 
1743 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1744   __kmp_assert_valid_gtid(global_tid);
1745   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1746 
1747   if (rc) {
1748     // We are going to execute the single statement, so we should count it.
1749     KMP_COUNT_BLOCK(OMP_SINGLE);
1750     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1751   }
1752 
1753 #if OMPT_SUPPORT && OMPT_OPTIONAL
1754   kmp_info_t *this_thr = __kmp_threads[global_tid];
1755   kmp_team_t *team = this_thr->th.th_team;
1756   int tid = __kmp_tid_from_gtid(global_tid);
1757 
1758   if (ompt_enabled.enabled) {
1759     if (rc) {
1760       if (ompt_enabled.ompt_callback_work) {
1761         ompt_callbacks.ompt_callback(ompt_callback_work)(
1762             ompt_work_single_executor, ompt_scope_begin,
1763             &(team->t.ompt_team_info.parallel_data),
1764             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1765             1, OMPT_GET_RETURN_ADDRESS(0));
1766       }
1767     } else {
1768       if (ompt_enabled.ompt_callback_work) {
1769         ompt_callbacks.ompt_callback(ompt_callback_work)(
1770             ompt_work_single_other, ompt_scope_begin,
1771             &(team->t.ompt_team_info.parallel_data),
1772             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1773             1, OMPT_GET_RETURN_ADDRESS(0));
1774         ompt_callbacks.ompt_callback(ompt_callback_work)(
1775             ompt_work_single_other, ompt_scope_end,
1776             &(team->t.ompt_team_info.parallel_data),
1777             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1778             1, OMPT_GET_RETURN_ADDRESS(0));
1779       }
1780     }
1781   }
1782 #endif
1783 
1784   return rc;
1785 }
1786 
1787 /*!
1788 @ingroup WORK_SHARING
1789 @param loc  source location information
1790 @param global_tid  global thread number
1791 
1792 Mark the end of a <tt>single</tt> construct.  This function should
1793 only be called by the thread that executed the block of code protected
1794 by the `single` construct.
1795 */
1796 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1797   __kmp_assert_valid_gtid(global_tid);
1798   __kmp_exit_single(global_tid);
1799   KMP_POP_PARTITIONED_TIMER();
1800 
1801 #if OMPT_SUPPORT && OMPT_OPTIONAL
1802   kmp_info_t *this_thr = __kmp_threads[global_tid];
1803   kmp_team_t *team = this_thr->th.th_team;
1804   int tid = __kmp_tid_from_gtid(global_tid);
1805 
1806   if (ompt_enabled.ompt_callback_work) {
1807     ompt_callbacks.ompt_callback(ompt_callback_work)(
1808         ompt_work_single_executor, ompt_scope_end,
1809         &(team->t.ompt_team_info.parallel_data),
1810         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1811         OMPT_GET_RETURN_ADDRESS(0));
1812   }
1813 #endif
1814 }
1815 
1816 /*!
1817 @ingroup WORK_SHARING
1818 @param loc Source location
1819 @param global_tid Global thread id
1820 
1821 Mark the end of a statically scheduled loop.
1822 */
1823 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1824   KMP_POP_PARTITIONED_TIMER();
1825   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1826 
1827 #if OMPT_SUPPORT && OMPT_OPTIONAL
1828   if (ompt_enabled.ompt_callback_work) {
1829     ompt_work_t ompt_work_type = ompt_work_loop;
1830     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1831     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1832     // Determine workshare type
1833     if (loc != NULL) {
1834       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1835         ompt_work_type = ompt_work_loop;
1836       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1837         ompt_work_type = ompt_work_sections;
1838       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1839         ompt_work_type = ompt_work_distribute;
1840       } else {
1841         // use default set above.
1842         // a warning about this case is provided in __kmpc_for_static_init
1843       }
1844       KMP_DEBUG_ASSERT(ompt_work_type);
1845     }
1846     ompt_callbacks.ompt_callback(ompt_callback_work)(
1847         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1848         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1849   }
1850 #endif
1851   if (__kmp_env_consistency_check)
1852     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1853 }
1854 
1855 // User routines which take C-style arguments (call by value)
1856 // different from the Fortran equivalent routines
1857 
1858 void ompc_set_num_threads(int arg) {
1859   // !!!!! TODO: check the per-task binding
1860   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1861 }
1862 
1863 void ompc_set_dynamic(int flag) {
1864   kmp_info_t *thread;
1865 
1866   /* For the thread-private implementation of the internal controls */
1867   thread = __kmp_entry_thread();
1868 
1869   __kmp_save_internal_controls(thread);
1870 
1871   set__dynamic(thread, flag ? true : false);
1872 }
1873 
1874 void ompc_set_nested(int flag) {
1875   kmp_info_t *thread;
1876 
1877   /* For the thread-private internal controls implementation */
1878   thread = __kmp_entry_thread();
1879 
1880   __kmp_save_internal_controls(thread);
1881 
1882   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1883 }
1884 
1885 void ompc_set_max_active_levels(int max_active_levels) {
1886   /* TO DO */
1887   /* we want per-task implementation of this internal control */
1888 
1889   /* For the per-thread internal controls implementation */
1890   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1891 }
1892 
1893 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1894   // !!!!! TODO: check the per-task binding
1895   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1896 }
1897 
1898 int ompc_get_ancestor_thread_num(int level) {
1899   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1900 }
1901 
1902 int ompc_get_team_size(int level) {
1903   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1904 }
1905 
1906 /* OpenMP 5.0 Affinity Format API */
1907 
1908 void ompc_set_affinity_format(char const *format) {
1909   if (!__kmp_init_serial) {
1910     __kmp_serial_initialize();
1911   }
1912   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1913                          format, KMP_STRLEN(format) + 1);
1914 }
1915 
1916 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1917   size_t format_size;
1918   if (!__kmp_init_serial) {
1919     __kmp_serial_initialize();
1920   }
1921   format_size = KMP_STRLEN(__kmp_affinity_format);
1922   if (buffer && size) {
1923     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1924                            format_size + 1);
1925   }
1926   return format_size;
1927 }
1928 
1929 void ompc_display_affinity(char const *format) {
1930   int gtid;
1931   if (!TCR_4(__kmp_init_middle)) {
1932     __kmp_middle_initialize();
1933   }
1934   gtid = __kmp_get_gtid();
1935   __kmp_aux_display_affinity(gtid, format);
1936 }
1937 
1938 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1939                              char const *format) {
1940   int gtid;
1941   size_t num_required;
1942   kmp_str_buf_t capture_buf;
1943   if (!TCR_4(__kmp_init_middle)) {
1944     __kmp_middle_initialize();
1945   }
1946   gtid = __kmp_get_gtid();
1947   __kmp_str_buf_init(&capture_buf);
1948   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1949   if (buffer && buf_size) {
1950     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1951                            capture_buf.used + 1);
1952   }
1953   __kmp_str_buf_free(&capture_buf);
1954   return num_required;
1955 }
1956 
1957 void kmpc_set_stacksize(int arg) {
1958   // __kmp_aux_set_stacksize initializes the library if needed
1959   __kmp_aux_set_stacksize(arg);
1960 }
1961 
1962 void kmpc_set_stacksize_s(size_t arg) {
1963   // __kmp_aux_set_stacksize initializes the library if needed
1964   __kmp_aux_set_stacksize(arg);
1965 }
1966 
1967 void kmpc_set_blocktime(int arg) {
1968   int gtid, tid;
1969   kmp_info_t *thread;
1970 
1971   gtid = __kmp_entry_gtid();
1972   tid = __kmp_tid_from_gtid(gtid);
1973   thread = __kmp_thread_from_gtid(gtid);
1974 
1975   __kmp_aux_set_blocktime(arg, thread, tid);
1976 }
1977 
1978 void kmpc_set_library(int arg) {
1979   // __kmp_user_set_library initializes the library if needed
1980   __kmp_user_set_library((enum library_type)arg);
1981 }
1982 
1983 void kmpc_set_defaults(char const *str) {
1984   // __kmp_aux_set_defaults initializes the library if needed
1985   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1986 }
1987 
1988 void kmpc_set_disp_num_buffers(int arg) {
1989   // ignore after initialization because some teams have already
1990   // allocated dispatch buffers
1991   if (__kmp_init_serial == FALSE && arg >= KMP_MIN_DISP_NUM_BUFF &&
1992       arg <= KMP_MAX_DISP_NUM_BUFF) {
1993     __kmp_dispatch_num_buffers = arg;
1994   }
1995 }
1996 
1997 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1998 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1999   return -1;
2000 #else
2001   if (!TCR_4(__kmp_init_middle)) {
2002     __kmp_middle_initialize();
2003   }
2004   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2005 #endif
2006 }
2007 
2008 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2009 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2010   return -1;
2011 #else
2012   if (!TCR_4(__kmp_init_middle)) {
2013     __kmp_middle_initialize();
2014   }
2015   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2016 #endif
2017 }
2018 
2019 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2020 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2021   return -1;
2022 #else
2023   if (!TCR_4(__kmp_init_middle)) {
2024     __kmp_middle_initialize();
2025   }
2026   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2027 #endif
2028 }
2029 
2030 /* -------------------------------------------------------------------------- */
2031 /*!
2032 @ingroup THREADPRIVATE
2033 @param loc       source location information
2034 @param gtid      global thread number
2035 @param cpy_size  size of the cpy_data buffer
2036 @param cpy_data  pointer to data to be copied
2037 @param cpy_func  helper function to call for copying data
2038 @param didit     flag variable: 1=single thread; 0=not single thread
2039 
2040 __kmpc_copyprivate implements the interface for the private data broadcast
2041 needed for the copyprivate clause associated with a single region in an
2042 OpenMP<sup>*</sup> program (both C and Fortran).
2043 All threads participating in the parallel region call this routine.
2044 One of the threads (called the single thread) should have the <tt>didit</tt>
2045 variable set to 1 and all other threads should have that variable set to 0.
2046 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2047 
2048 The OpenMP specification forbids the use of nowait on the single region when a
2049 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2050 barrier internally to avoid race conditions, so the code generation for the
2051 single region should avoid generating a barrier after the call to @ref
2052 __kmpc_copyprivate.
2053 
2054 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2055 The <tt>loc</tt> parameter is a pointer to source location information.
2056 
2057 Internal implementation: The single thread will first copy its descriptor
2058 address (cpy_data) to a team-private location, then the other threads will each
2059 call the function pointed to by the parameter cpy_func, which carries out the
2060 copy by copying the data using the cpy_data buffer.
2061 
2062 The cpy_func routine used for the copy and the contents of the data area defined
2063 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2064 to be done. For instance, the cpy_data buffer can hold the actual data to be
2065 copied or it may hold a list of pointers to the data. The cpy_func routine must
2066 interpret the cpy_data buffer appropriately.
2067 
2068 The interface to cpy_func is as follows:
2069 @code
2070 void cpy_func( void *destination, void *source )
2071 @endcode
2072 where void *destination is the cpy_data pointer for the thread being copied to
2073 and void *source is the cpy_data pointer for the thread being copied from.
2074 */
2075 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2076                         void *cpy_data, void (*cpy_func)(void *, void *),
2077                         kmp_int32 didit) {
2078   void **data_ptr;
2079   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2080   __kmp_assert_valid_gtid(gtid);
2081 
2082   KMP_MB();
2083 
2084   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2085 
2086   if (__kmp_env_consistency_check) {
2087     if (loc == 0) {
2088       KMP_WARNING(ConstructIdentInvalid);
2089     }
2090   }
2091 
2092   // ToDo: Optimize the following two barriers into some kind of split barrier
2093 
2094   if (didit)
2095     *data_ptr = cpy_data;
2096 
2097 #if OMPT_SUPPORT
2098   ompt_frame_t *ompt_frame;
2099   if (ompt_enabled.enabled) {
2100     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2101     if (ompt_frame->enter_frame.ptr == NULL)
2102       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2103   }
2104   OMPT_STORE_RETURN_ADDRESS(gtid);
2105 #endif
2106 /* This barrier is not a barrier region boundary */
2107 #if USE_ITT_NOTIFY
2108   __kmp_threads[gtid]->th.th_ident = loc;
2109 #endif
2110   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2111 
2112   if (!didit)
2113     (*cpy_func)(cpy_data, *data_ptr);
2114 
2115   // Consider next barrier a user-visible barrier for barrier region boundaries
2116   // Nesting checks are already handled by the single construct checks
2117   {
2118 #if OMPT_SUPPORT
2119     OMPT_STORE_RETURN_ADDRESS(gtid);
2120 #endif
2121 #if USE_ITT_NOTIFY
2122     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2123 // tasks can overwrite the location)
2124 #endif
2125     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2126 #if OMPT_SUPPORT && OMPT_OPTIONAL
2127     if (ompt_enabled.enabled) {
2128       ompt_frame->enter_frame = ompt_data_none;
2129     }
2130 #endif
2131   }
2132 }
2133 
2134 /* -------------------------------------------------------------------------- */
2135 
2136 #define INIT_LOCK __kmp_init_user_lock_with_checks
2137 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2138 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2139 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2140 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2141 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2142   __kmp_acquire_nested_user_lock_with_checks_timed
2143 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2144 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2145 #define TEST_LOCK __kmp_test_user_lock_with_checks
2146 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2147 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2148 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2149 
2150 // TODO: Make check abort messages use location info & pass it into
2151 // with_checks routines
2152 
2153 #if KMP_USE_DYNAMIC_LOCK
2154 
2155 // internal lock initializer
2156 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2157                                                     kmp_dyna_lockseq_t seq) {
2158   if (KMP_IS_D_LOCK(seq)) {
2159     KMP_INIT_D_LOCK(lock, seq);
2160 #if USE_ITT_BUILD
2161     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2162 #endif
2163   } else {
2164     KMP_INIT_I_LOCK(lock, seq);
2165 #if USE_ITT_BUILD
2166     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2167     __kmp_itt_lock_creating(ilk->lock, loc);
2168 #endif
2169   }
2170 }
2171 
2172 // internal nest lock initializer
2173 static __forceinline void
2174 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2175                                kmp_dyna_lockseq_t seq) {
2176 #if KMP_USE_TSX
2177   // Don't have nested lock implementation for speculative locks
2178   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2179       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2180     seq = __kmp_user_lock_seq;
2181 #endif
2182   switch (seq) {
2183   case lockseq_tas:
2184     seq = lockseq_nested_tas;
2185     break;
2186 #if KMP_USE_FUTEX
2187   case lockseq_futex:
2188     seq = lockseq_nested_futex;
2189     break;
2190 #endif
2191   case lockseq_ticket:
2192     seq = lockseq_nested_ticket;
2193     break;
2194   case lockseq_queuing:
2195     seq = lockseq_nested_queuing;
2196     break;
2197   case lockseq_drdpa:
2198     seq = lockseq_nested_drdpa;
2199     break;
2200   default:
2201     seq = lockseq_nested_queuing;
2202   }
2203   KMP_INIT_I_LOCK(lock, seq);
2204 #if USE_ITT_BUILD
2205   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2206   __kmp_itt_lock_creating(ilk->lock, loc);
2207 #endif
2208 }
2209 
2210 /* initialize the lock with a hint */
2211 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2212                                 uintptr_t hint) {
2213   KMP_DEBUG_ASSERT(__kmp_init_serial);
2214   if (__kmp_env_consistency_check && user_lock == NULL) {
2215     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2216   }
2217 
2218   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2219 
2220 #if OMPT_SUPPORT && OMPT_OPTIONAL
2221   // This is the case, if called from omp_init_lock_with_hint:
2222   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2223   if (!codeptr)
2224     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2225   if (ompt_enabled.ompt_callback_lock_init) {
2226     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2227         ompt_mutex_lock, (omp_lock_hint_t)hint,
2228         __ompt_get_mutex_impl_type(user_lock),
2229         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2230   }
2231 #endif
2232 }
2233 
2234 /* initialize the lock with a hint */
2235 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2236                                      void **user_lock, uintptr_t hint) {
2237   KMP_DEBUG_ASSERT(__kmp_init_serial);
2238   if (__kmp_env_consistency_check && user_lock == NULL) {
2239     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2240   }
2241 
2242   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2243 
2244 #if OMPT_SUPPORT && OMPT_OPTIONAL
2245   // This is the case, if called from omp_init_lock_with_hint:
2246   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2247   if (!codeptr)
2248     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2249   if (ompt_enabled.ompt_callback_lock_init) {
2250     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2251         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2252         __ompt_get_mutex_impl_type(user_lock),
2253         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2254   }
2255 #endif
2256 }
2257 
2258 #endif // KMP_USE_DYNAMIC_LOCK
2259 
2260 /* initialize the lock */
2261 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2262 #if KMP_USE_DYNAMIC_LOCK
2263 
2264   KMP_DEBUG_ASSERT(__kmp_init_serial);
2265   if (__kmp_env_consistency_check && user_lock == NULL) {
2266     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2267   }
2268   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2269 
2270 #if OMPT_SUPPORT && OMPT_OPTIONAL
2271   // This is the case, if called from omp_init_lock_with_hint:
2272   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2273   if (!codeptr)
2274     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2275   if (ompt_enabled.ompt_callback_lock_init) {
2276     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2277         ompt_mutex_lock, omp_lock_hint_none,
2278         __ompt_get_mutex_impl_type(user_lock),
2279         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2280   }
2281 #endif
2282 
2283 #else // KMP_USE_DYNAMIC_LOCK
2284 
2285   static char const *const func = "omp_init_lock";
2286   kmp_user_lock_p lck;
2287   KMP_DEBUG_ASSERT(__kmp_init_serial);
2288 
2289   if (__kmp_env_consistency_check) {
2290     if (user_lock == NULL) {
2291       KMP_FATAL(LockIsUninitialized, func);
2292     }
2293   }
2294 
2295   KMP_CHECK_USER_LOCK_INIT();
2296 
2297   if ((__kmp_user_lock_kind == lk_tas) &&
2298       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2299     lck = (kmp_user_lock_p)user_lock;
2300   }
2301 #if KMP_USE_FUTEX
2302   else if ((__kmp_user_lock_kind == lk_futex) &&
2303            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2304     lck = (kmp_user_lock_p)user_lock;
2305   }
2306 #endif
2307   else {
2308     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2309   }
2310   INIT_LOCK(lck);
2311   __kmp_set_user_lock_location(lck, loc);
2312 
2313 #if OMPT_SUPPORT && OMPT_OPTIONAL
2314   // This is the case, if called from omp_init_lock_with_hint:
2315   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2316   if (!codeptr)
2317     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2318   if (ompt_enabled.ompt_callback_lock_init) {
2319     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2320         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2321         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2322   }
2323 #endif
2324 
2325 #if USE_ITT_BUILD
2326   __kmp_itt_lock_creating(lck);
2327 #endif /* USE_ITT_BUILD */
2328 
2329 #endif // KMP_USE_DYNAMIC_LOCK
2330 } // __kmpc_init_lock
2331 
2332 /* initialize the lock */
2333 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2334 #if KMP_USE_DYNAMIC_LOCK
2335 
2336   KMP_DEBUG_ASSERT(__kmp_init_serial);
2337   if (__kmp_env_consistency_check && user_lock == NULL) {
2338     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2339   }
2340   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2341 
2342 #if OMPT_SUPPORT && OMPT_OPTIONAL
2343   // This is the case, if called from omp_init_lock_with_hint:
2344   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2345   if (!codeptr)
2346     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2347   if (ompt_enabled.ompt_callback_lock_init) {
2348     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2349         ompt_mutex_nest_lock, omp_lock_hint_none,
2350         __ompt_get_mutex_impl_type(user_lock),
2351         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2352   }
2353 #endif
2354 
2355 #else // KMP_USE_DYNAMIC_LOCK
2356 
2357   static char const *const func = "omp_init_nest_lock";
2358   kmp_user_lock_p lck;
2359   KMP_DEBUG_ASSERT(__kmp_init_serial);
2360 
2361   if (__kmp_env_consistency_check) {
2362     if (user_lock == NULL) {
2363       KMP_FATAL(LockIsUninitialized, func);
2364     }
2365   }
2366 
2367   KMP_CHECK_USER_LOCK_INIT();
2368 
2369   if ((__kmp_user_lock_kind == lk_tas) &&
2370       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2371        OMP_NEST_LOCK_T_SIZE)) {
2372     lck = (kmp_user_lock_p)user_lock;
2373   }
2374 #if KMP_USE_FUTEX
2375   else if ((__kmp_user_lock_kind == lk_futex) &&
2376            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2377             OMP_NEST_LOCK_T_SIZE)) {
2378     lck = (kmp_user_lock_p)user_lock;
2379   }
2380 #endif
2381   else {
2382     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2383   }
2384 
2385   INIT_NESTED_LOCK(lck);
2386   __kmp_set_user_lock_location(lck, loc);
2387 
2388 #if OMPT_SUPPORT && OMPT_OPTIONAL
2389   // This is the case, if called from omp_init_lock_with_hint:
2390   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2391   if (!codeptr)
2392     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2393   if (ompt_enabled.ompt_callback_lock_init) {
2394     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2395         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2396         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2397   }
2398 #endif
2399 
2400 #if USE_ITT_BUILD
2401   __kmp_itt_lock_creating(lck);
2402 #endif /* USE_ITT_BUILD */
2403 
2404 #endif // KMP_USE_DYNAMIC_LOCK
2405 } // __kmpc_init_nest_lock
2406 
2407 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2408 #if KMP_USE_DYNAMIC_LOCK
2409 
2410 #if USE_ITT_BUILD
2411   kmp_user_lock_p lck;
2412   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2413     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2414   } else {
2415     lck = (kmp_user_lock_p)user_lock;
2416   }
2417   __kmp_itt_lock_destroyed(lck);
2418 #endif
2419 #if OMPT_SUPPORT && OMPT_OPTIONAL
2420   // This is the case, if called from omp_init_lock_with_hint:
2421   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2422   if (!codeptr)
2423     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2424   if (ompt_enabled.ompt_callback_lock_destroy) {
2425     kmp_user_lock_p lck;
2426     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2427       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2428     } else {
2429       lck = (kmp_user_lock_p)user_lock;
2430     }
2431     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2432         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2433   }
2434 #endif
2435   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2436 #else
2437   kmp_user_lock_p lck;
2438 
2439   if ((__kmp_user_lock_kind == lk_tas) &&
2440       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2441     lck = (kmp_user_lock_p)user_lock;
2442   }
2443 #if KMP_USE_FUTEX
2444   else if ((__kmp_user_lock_kind == lk_futex) &&
2445            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2446     lck = (kmp_user_lock_p)user_lock;
2447   }
2448 #endif
2449   else {
2450     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2451   }
2452 
2453 #if OMPT_SUPPORT && OMPT_OPTIONAL
2454   // This is the case, if called from omp_init_lock_with_hint:
2455   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2456   if (!codeptr)
2457     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2458   if (ompt_enabled.ompt_callback_lock_destroy) {
2459     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2460         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2461   }
2462 #endif
2463 
2464 #if USE_ITT_BUILD
2465   __kmp_itt_lock_destroyed(lck);
2466 #endif /* USE_ITT_BUILD */
2467   DESTROY_LOCK(lck);
2468 
2469   if ((__kmp_user_lock_kind == lk_tas) &&
2470       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2471     ;
2472   }
2473 #if KMP_USE_FUTEX
2474   else if ((__kmp_user_lock_kind == lk_futex) &&
2475            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2476     ;
2477   }
2478 #endif
2479   else {
2480     __kmp_user_lock_free(user_lock, gtid, lck);
2481   }
2482 #endif // KMP_USE_DYNAMIC_LOCK
2483 } // __kmpc_destroy_lock
2484 
2485 /* destroy the lock */
2486 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2487 #if KMP_USE_DYNAMIC_LOCK
2488 
2489 #if USE_ITT_BUILD
2490   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2491   __kmp_itt_lock_destroyed(ilk->lock);
2492 #endif
2493 #if OMPT_SUPPORT && OMPT_OPTIONAL
2494   // This is the case, if called from omp_init_lock_with_hint:
2495   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2496   if (!codeptr)
2497     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2498   if (ompt_enabled.ompt_callback_lock_destroy) {
2499     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2500         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2501   }
2502 #endif
2503   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2504 
2505 #else // KMP_USE_DYNAMIC_LOCK
2506 
2507   kmp_user_lock_p lck;
2508 
2509   if ((__kmp_user_lock_kind == lk_tas) &&
2510       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2511        OMP_NEST_LOCK_T_SIZE)) {
2512     lck = (kmp_user_lock_p)user_lock;
2513   }
2514 #if KMP_USE_FUTEX
2515   else if ((__kmp_user_lock_kind == lk_futex) &&
2516            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2517             OMP_NEST_LOCK_T_SIZE)) {
2518     lck = (kmp_user_lock_p)user_lock;
2519   }
2520 #endif
2521   else {
2522     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2523   }
2524 
2525 #if OMPT_SUPPORT && OMPT_OPTIONAL
2526   // This is the case, if called from omp_init_lock_with_hint:
2527   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2528   if (!codeptr)
2529     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2530   if (ompt_enabled.ompt_callback_lock_destroy) {
2531     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2532         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2533   }
2534 #endif
2535 
2536 #if USE_ITT_BUILD
2537   __kmp_itt_lock_destroyed(lck);
2538 #endif /* USE_ITT_BUILD */
2539 
2540   DESTROY_NESTED_LOCK(lck);
2541 
2542   if ((__kmp_user_lock_kind == lk_tas) &&
2543       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2544        OMP_NEST_LOCK_T_SIZE)) {
2545     ;
2546   }
2547 #if KMP_USE_FUTEX
2548   else if ((__kmp_user_lock_kind == lk_futex) &&
2549            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2550             OMP_NEST_LOCK_T_SIZE)) {
2551     ;
2552   }
2553 #endif
2554   else {
2555     __kmp_user_lock_free(user_lock, gtid, lck);
2556   }
2557 #endif // KMP_USE_DYNAMIC_LOCK
2558 } // __kmpc_destroy_nest_lock
2559 
2560 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2561   KMP_COUNT_BLOCK(OMP_set_lock);
2562 #if KMP_USE_DYNAMIC_LOCK
2563   int tag = KMP_EXTRACT_D_TAG(user_lock);
2564 #if USE_ITT_BUILD
2565   __kmp_itt_lock_acquiring(
2566       (kmp_user_lock_p)
2567           user_lock); // itt function will get to the right lock object.
2568 #endif
2569 #if OMPT_SUPPORT && OMPT_OPTIONAL
2570   // This is the case, if called from omp_init_lock_with_hint:
2571   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2572   if (!codeptr)
2573     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2574   if (ompt_enabled.ompt_callback_mutex_acquire) {
2575     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2576         ompt_mutex_lock, omp_lock_hint_none,
2577         __ompt_get_mutex_impl_type(user_lock),
2578         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2579   }
2580 #endif
2581 #if KMP_USE_INLINED_TAS
2582   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2583     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2584   } else
2585 #elif KMP_USE_INLINED_FUTEX
2586   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2587     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2588   } else
2589 #endif
2590   {
2591     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2592   }
2593 #if USE_ITT_BUILD
2594   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2595 #endif
2596 #if OMPT_SUPPORT && OMPT_OPTIONAL
2597   if (ompt_enabled.ompt_callback_mutex_acquired) {
2598     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2599         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2600   }
2601 #endif
2602 
2603 #else // KMP_USE_DYNAMIC_LOCK
2604 
2605   kmp_user_lock_p lck;
2606 
2607   if ((__kmp_user_lock_kind == lk_tas) &&
2608       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2609     lck = (kmp_user_lock_p)user_lock;
2610   }
2611 #if KMP_USE_FUTEX
2612   else if ((__kmp_user_lock_kind == lk_futex) &&
2613            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2614     lck = (kmp_user_lock_p)user_lock;
2615   }
2616 #endif
2617   else {
2618     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2619   }
2620 
2621 #if USE_ITT_BUILD
2622   __kmp_itt_lock_acquiring(lck);
2623 #endif /* USE_ITT_BUILD */
2624 #if OMPT_SUPPORT && OMPT_OPTIONAL
2625   // This is the case, if called from omp_init_lock_with_hint:
2626   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2627   if (!codeptr)
2628     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2629   if (ompt_enabled.ompt_callback_mutex_acquire) {
2630     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2631         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2632         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2633   }
2634 #endif
2635 
2636   ACQUIRE_LOCK(lck, gtid);
2637 
2638 #if USE_ITT_BUILD
2639   __kmp_itt_lock_acquired(lck);
2640 #endif /* USE_ITT_BUILD */
2641 
2642 #if OMPT_SUPPORT && OMPT_OPTIONAL
2643   if (ompt_enabled.ompt_callback_mutex_acquired) {
2644     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2645         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2646   }
2647 #endif
2648 
2649 #endif // KMP_USE_DYNAMIC_LOCK
2650 }
2651 
2652 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2653 #if KMP_USE_DYNAMIC_LOCK
2654 
2655 #if USE_ITT_BUILD
2656   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2657 #endif
2658 #if OMPT_SUPPORT && OMPT_OPTIONAL
2659   // This is the case, if called from omp_init_lock_with_hint:
2660   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2661   if (!codeptr)
2662     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2663   if (ompt_enabled.enabled) {
2664     if (ompt_enabled.ompt_callback_mutex_acquire) {
2665       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2666           ompt_mutex_nest_lock, omp_lock_hint_none,
2667           __ompt_get_mutex_impl_type(user_lock),
2668           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2669     }
2670   }
2671 #endif
2672   int acquire_status =
2673       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2674   (void)acquire_status;
2675 #if USE_ITT_BUILD
2676   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2677 #endif
2678 
2679 #if OMPT_SUPPORT && OMPT_OPTIONAL
2680   if (ompt_enabled.enabled) {
2681     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2682       if (ompt_enabled.ompt_callback_mutex_acquired) {
2683         // lock_first
2684         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2685             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2686             codeptr);
2687       }
2688     } else {
2689       if (ompt_enabled.ompt_callback_nest_lock) {
2690         // lock_next
2691         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2692             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2693       }
2694     }
2695   }
2696 #endif
2697 
2698 #else // KMP_USE_DYNAMIC_LOCK
2699   int acquire_status;
2700   kmp_user_lock_p lck;
2701 
2702   if ((__kmp_user_lock_kind == lk_tas) &&
2703       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2704        OMP_NEST_LOCK_T_SIZE)) {
2705     lck = (kmp_user_lock_p)user_lock;
2706   }
2707 #if KMP_USE_FUTEX
2708   else if ((__kmp_user_lock_kind == lk_futex) &&
2709            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2710             OMP_NEST_LOCK_T_SIZE)) {
2711     lck = (kmp_user_lock_p)user_lock;
2712   }
2713 #endif
2714   else {
2715     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2716   }
2717 
2718 #if USE_ITT_BUILD
2719   __kmp_itt_lock_acquiring(lck);
2720 #endif /* USE_ITT_BUILD */
2721 #if OMPT_SUPPORT && OMPT_OPTIONAL
2722   // This is the case, if called from omp_init_lock_with_hint:
2723   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2724   if (!codeptr)
2725     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2726   if (ompt_enabled.enabled) {
2727     if (ompt_enabled.ompt_callback_mutex_acquire) {
2728       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2729           ompt_mutex_nest_lock, omp_lock_hint_none,
2730           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2731           codeptr);
2732     }
2733   }
2734 #endif
2735 
2736   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2737 
2738 #if USE_ITT_BUILD
2739   __kmp_itt_lock_acquired(lck);
2740 #endif /* USE_ITT_BUILD */
2741 
2742 #if OMPT_SUPPORT && OMPT_OPTIONAL
2743   if (ompt_enabled.enabled) {
2744     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2745       if (ompt_enabled.ompt_callback_mutex_acquired) {
2746         // lock_first
2747         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2748             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2749       }
2750     } else {
2751       if (ompt_enabled.ompt_callback_nest_lock) {
2752         // lock_next
2753         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2754             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2755       }
2756     }
2757   }
2758 #endif
2759 
2760 #endif // KMP_USE_DYNAMIC_LOCK
2761 }
2762 
2763 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2764 #if KMP_USE_DYNAMIC_LOCK
2765 
2766   int tag = KMP_EXTRACT_D_TAG(user_lock);
2767 #if USE_ITT_BUILD
2768   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2769 #endif
2770 #if KMP_USE_INLINED_TAS
2771   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2772     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2773   } else
2774 #elif KMP_USE_INLINED_FUTEX
2775   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2776     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2777   } else
2778 #endif
2779   {
2780     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2781   }
2782 
2783 #if OMPT_SUPPORT && OMPT_OPTIONAL
2784   // This is the case, if called from omp_init_lock_with_hint:
2785   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2786   if (!codeptr)
2787     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2788   if (ompt_enabled.ompt_callback_mutex_released) {
2789     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2790         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2791   }
2792 #endif
2793 
2794 #else // KMP_USE_DYNAMIC_LOCK
2795 
2796   kmp_user_lock_p lck;
2797 
2798   /* Can't use serial interval since not block structured */
2799   /* release the lock */
2800 
2801   if ((__kmp_user_lock_kind == lk_tas) &&
2802       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2803 #if KMP_OS_LINUX &&                                                            \
2804     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2805 // "fast" path implemented to fix customer performance issue
2806 #if USE_ITT_BUILD
2807     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2808 #endif /* USE_ITT_BUILD */
2809     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2810     KMP_MB();
2811 
2812 #if OMPT_SUPPORT && OMPT_OPTIONAL
2813     // This is the case, if called from omp_init_lock_with_hint:
2814     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2815     if (!codeptr)
2816       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2817     if (ompt_enabled.ompt_callback_mutex_released) {
2818       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2819           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2820     }
2821 #endif
2822 
2823     return;
2824 #else
2825     lck = (kmp_user_lock_p)user_lock;
2826 #endif
2827   }
2828 #if KMP_USE_FUTEX
2829   else if ((__kmp_user_lock_kind == lk_futex) &&
2830            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2831     lck = (kmp_user_lock_p)user_lock;
2832   }
2833 #endif
2834   else {
2835     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2836   }
2837 
2838 #if USE_ITT_BUILD
2839   __kmp_itt_lock_releasing(lck);
2840 #endif /* USE_ITT_BUILD */
2841 
2842   RELEASE_LOCK(lck, gtid);
2843 
2844 #if OMPT_SUPPORT && OMPT_OPTIONAL
2845   // This is the case, if called from omp_init_lock_with_hint:
2846   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2847   if (!codeptr)
2848     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2849   if (ompt_enabled.ompt_callback_mutex_released) {
2850     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2851         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2852   }
2853 #endif
2854 
2855 #endif // KMP_USE_DYNAMIC_LOCK
2856 }
2857 
2858 /* release the lock */
2859 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2860 #if KMP_USE_DYNAMIC_LOCK
2861 
2862 #if USE_ITT_BUILD
2863   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2864 #endif
2865   int release_status =
2866       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2867   (void)release_status;
2868 
2869 #if OMPT_SUPPORT && OMPT_OPTIONAL
2870   // This is the case, if called from omp_init_lock_with_hint:
2871   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2872   if (!codeptr)
2873     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2874   if (ompt_enabled.enabled) {
2875     if (release_status == KMP_LOCK_RELEASED) {
2876       if (ompt_enabled.ompt_callback_mutex_released) {
2877         // release_lock_last
2878         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2879             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2880             codeptr);
2881       }
2882     } else if (ompt_enabled.ompt_callback_nest_lock) {
2883       // release_lock_prev
2884       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2885           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2886     }
2887   }
2888 #endif
2889 
2890 #else // KMP_USE_DYNAMIC_LOCK
2891 
2892   kmp_user_lock_p lck;
2893 
2894   /* Can't use serial interval since not block structured */
2895 
2896   if ((__kmp_user_lock_kind == lk_tas) &&
2897       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2898        OMP_NEST_LOCK_T_SIZE)) {
2899 #if KMP_OS_LINUX &&                                                            \
2900     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2901     // "fast" path implemented to fix customer performance issue
2902     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2903 #if USE_ITT_BUILD
2904     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2905 #endif /* USE_ITT_BUILD */
2906 
2907 #if OMPT_SUPPORT && OMPT_OPTIONAL
2908     int release_status = KMP_LOCK_STILL_HELD;
2909 #endif
2910 
2911     if (--(tl->lk.depth_locked) == 0) {
2912       TCW_4(tl->lk.poll, 0);
2913 #if OMPT_SUPPORT && OMPT_OPTIONAL
2914       release_status = KMP_LOCK_RELEASED;
2915 #endif
2916     }
2917     KMP_MB();
2918 
2919 #if OMPT_SUPPORT && OMPT_OPTIONAL
2920     // This is the case, if called from omp_init_lock_with_hint:
2921     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2922     if (!codeptr)
2923       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2924     if (ompt_enabled.enabled) {
2925       if (release_status == KMP_LOCK_RELEASED) {
2926         if (ompt_enabled.ompt_callback_mutex_released) {
2927           // release_lock_last
2928           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2929               ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2930         }
2931       } else if (ompt_enabled.ompt_callback_nest_lock) {
2932         // release_lock_previous
2933         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2934             ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2935       }
2936     }
2937 #endif
2938 
2939     return;
2940 #else
2941     lck = (kmp_user_lock_p)user_lock;
2942 #endif
2943   }
2944 #if KMP_USE_FUTEX
2945   else if ((__kmp_user_lock_kind == lk_futex) &&
2946            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2947             OMP_NEST_LOCK_T_SIZE)) {
2948     lck = (kmp_user_lock_p)user_lock;
2949   }
2950 #endif
2951   else {
2952     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2953   }
2954 
2955 #if USE_ITT_BUILD
2956   __kmp_itt_lock_releasing(lck);
2957 #endif /* USE_ITT_BUILD */
2958 
2959   int release_status;
2960   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2961 #if OMPT_SUPPORT && OMPT_OPTIONAL
2962   // This is the case, if called from omp_init_lock_with_hint:
2963   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2964   if (!codeptr)
2965     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2966   if (ompt_enabled.enabled) {
2967     if (release_status == KMP_LOCK_RELEASED) {
2968       if (ompt_enabled.ompt_callback_mutex_released) {
2969         // release_lock_last
2970         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2971             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2972       }
2973     } else if (ompt_enabled.ompt_callback_nest_lock) {
2974       // release_lock_previous
2975       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2976           ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2977     }
2978   }
2979 #endif
2980 
2981 #endif // KMP_USE_DYNAMIC_LOCK
2982 }
2983 
2984 /* try to acquire the lock */
2985 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2986   KMP_COUNT_BLOCK(OMP_test_lock);
2987 
2988 #if KMP_USE_DYNAMIC_LOCK
2989   int rc;
2990   int tag = KMP_EXTRACT_D_TAG(user_lock);
2991 #if USE_ITT_BUILD
2992   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2993 #endif
2994 #if OMPT_SUPPORT && OMPT_OPTIONAL
2995   // This is the case, if called from omp_init_lock_with_hint:
2996   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2997   if (!codeptr)
2998     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2999   if (ompt_enabled.ompt_callback_mutex_acquire) {
3000     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3001         ompt_mutex_lock, omp_lock_hint_none,
3002         __ompt_get_mutex_impl_type(user_lock),
3003         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3004   }
3005 #endif
3006 #if KMP_USE_INLINED_TAS
3007   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3008     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3009   } else
3010 #elif KMP_USE_INLINED_FUTEX
3011   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3012     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3013   } else
3014 #endif
3015   {
3016     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3017   }
3018   if (rc) {
3019 #if USE_ITT_BUILD
3020     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3021 #endif
3022 #if OMPT_SUPPORT && OMPT_OPTIONAL
3023     if (ompt_enabled.ompt_callback_mutex_acquired) {
3024       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3025           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3026     }
3027 #endif
3028     return FTN_TRUE;
3029   } else {
3030 #if USE_ITT_BUILD
3031     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3032 #endif
3033     return FTN_FALSE;
3034   }
3035 
3036 #else // KMP_USE_DYNAMIC_LOCK
3037 
3038   kmp_user_lock_p lck;
3039   int rc;
3040 
3041   if ((__kmp_user_lock_kind == lk_tas) &&
3042       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3043     lck = (kmp_user_lock_p)user_lock;
3044   }
3045 #if KMP_USE_FUTEX
3046   else if ((__kmp_user_lock_kind == lk_futex) &&
3047            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3048     lck = (kmp_user_lock_p)user_lock;
3049   }
3050 #endif
3051   else {
3052     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3053   }
3054 
3055 #if USE_ITT_BUILD
3056   __kmp_itt_lock_acquiring(lck);
3057 #endif /* USE_ITT_BUILD */
3058 #if OMPT_SUPPORT && OMPT_OPTIONAL
3059   // This is the case, if called from omp_init_lock_with_hint:
3060   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3061   if (!codeptr)
3062     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3063   if (ompt_enabled.ompt_callback_mutex_acquire) {
3064     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3065         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3066         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3067   }
3068 #endif
3069 
3070   rc = TEST_LOCK(lck, gtid);
3071 #if USE_ITT_BUILD
3072   if (rc) {
3073     __kmp_itt_lock_acquired(lck);
3074   } else {
3075     __kmp_itt_lock_cancelled(lck);
3076   }
3077 #endif /* USE_ITT_BUILD */
3078 #if OMPT_SUPPORT && OMPT_OPTIONAL
3079   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3080     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3081         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3082   }
3083 #endif
3084 
3085   return (rc ? FTN_TRUE : FTN_FALSE);
3086 
3087   /* Can't use serial interval since not block structured */
3088 
3089 #endif // KMP_USE_DYNAMIC_LOCK
3090 }
3091 
3092 /* try to acquire the lock */
3093 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3094 #if KMP_USE_DYNAMIC_LOCK
3095   int rc;
3096 #if USE_ITT_BUILD
3097   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3098 #endif
3099 #if OMPT_SUPPORT && OMPT_OPTIONAL
3100   // This is the case, if called from omp_init_lock_with_hint:
3101   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3102   if (!codeptr)
3103     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3104   if (ompt_enabled.ompt_callback_mutex_acquire) {
3105     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3106         ompt_mutex_nest_lock, omp_lock_hint_none,
3107         __ompt_get_mutex_impl_type(user_lock),
3108         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3109   }
3110 #endif
3111   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3112 #if USE_ITT_BUILD
3113   if (rc) {
3114     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3115   } else {
3116     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3117   }
3118 #endif
3119 #if OMPT_SUPPORT && OMPT_OPTIONAL
3120   if (ompt_enabled.enabled && rc) {
3121     if (rc == 1) {
3122       if (ompt_enabled.ompt_callback_mutex_acquired) {
3123         // lock_first
3124         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3125             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3126             codeptr);
3127       }
3128     } else {
3129       if (ompt_enabled.ompt_callback_nest_lock) {
3130         // lock_next
3131         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3132             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3133       }
3134     }
3135   }
3136 #endif
3137   return rc;
3138 
3139 #else // KMP_USE_DYNAMIC_LOCK
3140 
3141   kmp_user_lock_p lck;
3142   int rc;
3143 
3144   if ((__kmp_user_lock_kind == lk_tas) &&
3145       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3146        OMP_NEST_LOCK_T_SIZE)) {
3147     lck = (kmp_user_lock_p)user_lock;
3148   }
3149 #if KMP_USE_FUTEX
3150   else if ((__kmp_user_lock_kind == lk_futex) &&
3151            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3152             OMP_NEST_LOCK_T_SIZE)) {
3153     lck = (kmp_user_lock_p)user_lock;
3154   }
3155 #endif
3156   else {
3157     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3158   }
3159 
3160 #if USE_ITT_BUILD
3161   __kmp_itt_lock_acquiring(lck);
3162 #endif /* USE_ITT_BUILD */
3163 
3164 #if OMPT_SUPPORT && OMPT_OPTIONAL
3165   // This is the case, if called from omp_init_lock_with_hint:
3166   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3167   if (!codeptr)
3168     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3169   if (ompt_enabled.enabled) &&
3170         ompt_enabled.ompt_callback_mutex_acquire) {
3171       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3172           ompt_mutex_nest_lock, omp_lock_hint_none,
3173           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3174           codeptr);
3175     }
3176 #endif
3177 
3178   rc = TEST_NESTED_LOCK(lck, gtid);
3179 #if USE_ITT_BUILD
3180   if (rc) {
3181     __kmp_itt_lock_acquired(lck);
3182   } else {
3183     __kmp_itt_lock_cancelled(lck);
3184   }
3185 #endif /* USE_ITT_BUILD */
3186 #if OMPT_SUPPORT && OMPT_OPTIONAL
3187   if (ompt_enabled.enabled && rc) {
3188     if (rc == 1) {
3189       if (ompt_enabled.ompt_callback_mutex_acquired) {
3190         // lock_first
3191         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3192             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3193       }
3194     } else {
3195       if (ompt_enabled.ompt_callback_nest_lock) {
3196         // lock_next
3197         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3198             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3199       }
3200     }
3201   }
3202 #endif
3203   return rc;
3204 
3205   /* Can't use serial interval since not block structured */
3206 
3207 #endif // KMP_USE_DYNAMIC_LOCK
3208 }
3209 
3210 // Interface to fast scalable reduce methods routines
3211 
3212 // keep the selected method in a thread local structure for cross-function
3213 // usage: will be used in __kmpc_end_reduce* functions;
3214 // another solution: to re-determine the method one more time in
3215 // __kmpc_end_reduce* functions (new prototype required then)
3216 // AT: which solution is better?
3217 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3218   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3219 
3220 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3221   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3222 
3223 // description of the packed_reduction_method variable: look at the macros in
3224 // kmp.h
3225 
3226 // used in a critical section reduce block
3227 static __forceinline void
3228 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3229                                           kmp_critical_name *crit) {
3230 
3231   // this lock was visible to a customer and to the threading profile tool as a
3232   // serial overhead span (although it's used for an internal purpose only)
3233   //            why was it visible in previous implementation?
3234   //            should we keep it visible in new reduce block?
3235   kmp_user_lock_p lck;
3236 
3237 #if KMP_USE_DYNAMIC_LOCK
3238 
3239   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3240   // Check if it is initialized.
3241   if (*lk == 0) {
3242     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3243       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3244                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3245     } else {
3246       __kmp_init_indirect_csptr(crit, loc, global_tid,
3247                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3248     }
3249   }
3250   // Branch for accessing the actual lock object and set operation. This
3251   // branching is inevitable since this lock initialization does not follow the
3252   // normal dispatch path (lock table is not used).
3253   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3254     lck = (kmp_user_lock_p)lk;
3255     KMP_DEBUG_ASSERT(lck != NULL);
3256     if (__kmp_env_consistency_check) {
3257       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3258     }
3259     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3260   } else {
3261     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3262     lck = ilk->lock;
3263     KMP_DEBUG_ASSERT(lck != NULL);
3264     if (__kmp_env_consistency_check) {
3265       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3266     }
3267     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3268   }
3269 
3270 #else // KMP_USE_DYNAMIC_LOCK
3271 
3272   // We know that the fast reduction code is only emitted by Intel compilers
3273   // with 32 byte critical sections. If there isn't enough space, then we
3274   // have to use a pointer.
3275   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3276     lck = (kmp_user_lock_p)crit;
3277   } else {
3278     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3279   }
3280   KMP_DEBUG_ASSERT(lck != NULL);
3281 
3282   if (__kmp_env_consistency_check)
3283     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3284 
3285   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3286 
3287 #endif // KMP_USE_DYNAMIC_LOCK
3288 }
3289 
3290 // used in a critical section reduce block
3291 static __forceinline void
3292 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3293                                         kmp_critical_name *crit) {
3294 
3295   kmp_user_lock_p lck;
3296 
3297 #if KMP_USE_DYNAMIC_LOCK
3298 
3299   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3300     lck = (kmp_user_lock_p)crit;
3301     if (__kmp_env_consistency_check)
3302       __kmp_pop_sync(global_tid, ct_critical, loc);
3303     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3304   } else {
3305     kmp_indirect_lock_t *ilk =
3306         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3307     if (__kmp_env_consistency_check)
3308       __kmp_pop_sync(global_tid, ct_critical, loc);
3309     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3310   }
3311 
3312 #else // KMP_USE_DYNAMIC_LOCK
3313 
3314   // We know that the fast reduction code is only emitted by Intel compilers
3315   // with 32 byte critical sections. If there isn't enough space, then we have
3316   // to use a pointer.
3317   if (__kmp_base_user_lock_size > 32) {
3318     lck = *((kmp_user_lock_p *)crit);
3319     KMP_ASSERT(lck != NULL);
3320   } else {
3321     lck = (kmp_user_lock_p)crit;
3322   }
3323 
3324   if (__kmp_env_consistency_check)
3325     __kmp_pop_sync(global_tid, ct_critical, loc);
3326 
3327   __kmp_release_user_lock_with_checks(lck, global_tid);
3328 
3329 #endif // KMP_USE_DYNAMIC_LOCK
3330 } // __kmp_end_critical_section_reduce_block
3331 
3332 static __forceinline int
3333 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3334                                      int *task_state) {
3335   kmp_team_t *team;
3336 
3337   // Check if we are inside the teams construct?
3338   if (th->th.th_teams_microtask) {
3339     *team_p = team = th->th.th_team;
3340     if (team->t.t_level == th->th.th_teams_level) {
3341       // This is reduction at teams construct.
3342       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3343       // Let's swap teams temporarily for the reduction.
3344       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3345       th->th.th_team = team->t.t_parent;
3346       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3347       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3348       *task_state = th->th.th_task_state;
3349       th->th.th_task_state = 0;
3350 
3351       return 1;
3352     }
3353   }
3354   return 0;
3355 }
3356 
3357 static __forceinline void
3358 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3359   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3360   th->th.th_info.ds.ds_tid = 0;
3361   th->th.th_team = team;
3362   th->th.th_team_nproc = team->t.t_nproc;
3363   th->th.th_task_team = team->t.t_task_team[task_state];
3364   __kmp_type_convert(task_state, &(th->th.th_task_state));
3365 }
3366 
3367 /* 2.a.i. Reduce Block without a terminating barrier */
3368 /*!
3369 @ingroup SYNCHRONIZATION
3370 @param loc source location information
3371 @param global_tid global thread number
3372 @param num_vars number of items (variables) to be reduced
3373 @param reduce_size size of data in bytes to be reduced
3374 @param reduce_data pointer to data to be reduced
3375 @param reduce_func callback function providing reduction operation on two
3376 operands and returning result of reduction in lhs_data
3377 @param lck pointer to the unique lock data structure
3378 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3379 threads if atomic reduction needed
3380 
3381 The nowait version is used for a reduce clause with the nowait argument.
3382 */
3383 kmp_int32
3384 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3385                      size_t reduce_size, void *reduce_data,
3386                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3387                      kmp_critical_name *lck) {
3388 
3389   KMP_COUNT_BLOCK(REDUCE_nowait);
3390   int retval = 0;
3391   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3392   kmp_info_t *th;
3393   kmp_team_t *team;
3394   int teams_swapped = 0, task_state;
3395   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3396   __kmp_assert_valid_gtid(global_tid);
3397 
3398   // why do we need this initialization here at all?
3399   // Reduction clause can not be used as a stand-alone directive.
3400 
3401   // do not call __kmp_serial_initialize(), it will be called by
3402   // __kmp_parallel_initialize() if needed
3403   // possible detection of false-positive race by the threadchecker ???
3404   if (!TCR_4(__kmp_init_parallel))
3405     __kmp_parallel_initialize();
3406 
3407   __kmp_resume_if_soft_paused();
3408 
3409 // check correctness of reduce block nesting
3410 #if KMP_USE_DYNAMIC_LOCK
3411   if (__kmp_env_consistency_check)
3412     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3413 #else
3414   if (__kmp_env_consistency_check)
3415     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3416 #endif
3417 
3418   th = __kmp_thread_from_gtid(global_tid);
3419   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3420 
3421   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3422   // the value should be kept in a variable
3423   // the variable should be either a construct-specific or thread-specific
3424   // property, not a team specific property
3425   //     (a thread can reach the next reduce block on the next construct, reduce
3426   //     method may differ on the next construct)
3427   // an ident_t "loc" parameter could be used as a construct-specific property
3428   // (what if loc == 0?)
3429   //     (if both construct-specific and team-specific variables were shared,
3430   //     then unness extra syncs should be needed)
3431   // a thread-specific variable is better regarding two issues above (next
3432   // construct and extra syncs)
3433   // a thread-specific "th_local.reduction_method" variable is used currently
3434   // each thread executes 'determine' and 'set' lines (no need to execute by one
3435   // thread, to avoid unness extra syncs)
3436 
3437   packed_reduction_method = __kmp_determine_reduction_method(
3438       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3439   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3440 
3441   OMPT_REDUCTION_DECL(th, global_tid);
3442   if (packed_reduction_method == critical_reduce_block) {
3443 
3444     OMPT_REDUCTION_BEGIN;
3445 
3446     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3447     retval = 1;
3448 
3449   } else if (packed_reduction_method == empty_reduce_block) {
3450 
3451     OMPT_REDUCTION_BEGIN;
3452 
3453     // usage: if team size == 1, no synchronization is required ( Intel
3454     // platforms only )
3455     retval = 1;
3456 
3457   } else if (packed_reduction_method == atomic_reduce_block) {
3458 
3459     retval = 2;
3460 
3461     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3462     // won't be called by the code gen)
3463     //     (it's not quite good, because the checking block has been closed by
3464     //     this 'pop',
3465     //      but atomic operation has not been executed yet, will be executed
3466     //      slightly later, literally on next instruction)
3467     if (__kmp_env_consistency_check)
3468       __kmp_pop_sync(global_tid, ct_reduce, loc);
3469 
3470   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3471                                    tree_reduce_block)) {
3472 
3473 // AT: performance issue: a real barrier here
3474 // AT:     (if master goes slow, other threads are blocked here waiting for the
3475 // master to come and release them)
3476 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3477 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3478 // be confusing to a customer)
3479 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3480 // might go faster and be more in line with sense of NOWAIT
3481 // AT: TO DO: do epcc test and compare times
3482 
3483 // this barrier should be invisible to a customer and to the threading profile
3484 // tool (it's neither a terminating barrier nor customer's code, it's
3485 // used for an internal purpose)
3486 #if OMPT_SUPPORT
3487     // JP: can this barrier potentially leed to task scheduling?
3488     // JP: as long as there is a barrier in the implementation, OMPT should and
3489     // will provide the barrier events
3490     //         so we set-up the necessary frame/return addresses.
3491     ompt_frame_t *ompt_frame;
3492     if (ompt_enabled.enabled) {
3493       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3494       if (ompt_frame->enter_frame.ptr == NULL)
3495         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3496     }
3497     OMPT_STORE_RETURN_ADDRESS(global_tid);
3498 #endif
3499 #if USE_ITT_NOTIFY
3500     __kmp_threads[global_tid]->th.th_ident = loc;
3501 #endif
3502     retval =
3503         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3504                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3505     retval = (retval != 0) ? (0) : (1);
3506 #if OMPT_SUPPORT && OMPT_OPTIONAL
3507     if (ompt_enabled.enabled) {
3508       ompt_frame->enter_frame = ompt_data_none;
3509     }
3510 #endif
3511 
3512     // all other workers except master should do this pop here
3513     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3514     if (__kmp_env_consistency_check) {
3515       if (retval == 0) {
3516         __kmp_pop_sync(global_tid, ct_reduce, loc);
3517       }
3518     }
3519 
3520   } else {
3521 
3522     // should never reach this block
3523     KMP_ASSERT(0); // "unexpected method"
3524   }
3525   if (teams_swapped) {
3526     __kmp_restore_swapped_teams(th, team, task_state);
3527   }
3528   KA_TRACE(
3529       10,
3530       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3531        global_tid, packed_reduction_method, retval));
3532 
3533   return retval;
3534 }
3535 
3536 /*!
3537 @ingroup SYNCHRONIZATION
3538 @param loc source location information
3539 @param global_tid global thread id.
3540 @param lck pointer to the unique lock data structure
3541 
3542 Finish the execution of a reduce nowait.
3543 */
3544 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3545                               kmp_critical_name *lck) {
3546 
3547   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3548 
3549   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3550   __kmp_assert_valid_gtid(global_tid);
3551 
3552   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3553 
3554   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3555 
3556   if (packed_reduction_method == critical_reduce_block) {
3557 
3558     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3559     OMPT_REDUCTION_END;
3560 
3561   } else if (packed_reduction_method == empty_reduce_block) {
3562 
3563     // usage: if team size == 1, no synchronization is required ( on Intel
3564     // platforms only )
3565 
3566     OMPT_REDUCTION_END;
3567 
3568   } else if (packed_reduction_method == atomic_reduce_block) {
3569 
3570     // neither master nor other workers should get here
3571     //     (code gen does not generate this call in case 2: atomic reduce block)
3572     // actually it's better to remove this elseif at all;
3573     // after removal this value will checked by the 'else' and will assert
3574 
3575   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3576                                    tree_reduce_block)) {
3577 
3578     // only master gets here
3579     // OMPT: tree reduction is annotated in the barrier code
3580 
3581   } else {
3582 
3583     // should never reach this block
3584     KMP_ASSERT(0); // "unexpected method"
3585   }
3586 
3587   if (__kmp_env_consistency_check)
3588     __kmp_pop_sync(global_tid, ct_reduce, loc);
3589 
3590   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3591                 global_tid, packed_reduction_method));
3592 
3593   return;
3594 }
3595 
3596 /* 2.a.ii. Reduce Block with a terminating barrier */
3597 
3598 /*!
3599 @ingroup SYNCHRONIZATION
3600 @param loc source location information
3601 @param global_tid global thread number
3602 @param num_vars number of items (variables) to be reduced
3603 @param reduce_size size of data in bytes to be reduced
3604 @param reduce_data pointer to data to be reduced
3605 @param reduce_func callback function providing reduction operation on two
3606 operands and returning result of reduction in lhs_data
3607 @param lck pointer to the unique lock data structure
3608 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3609 threads if atomic reduction needed
3610 
3611 A blocking reduce that includes an implicit barrier.
3612 */
3613 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3614                         size_t reduce_size, void *reduce_data,
3615                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3616                         kmp_critical_name *lck) {
3617   KMP_COUNT_BLOCK(REDUCE_wait);
3618   int retval = 0;
3619   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3620   kmp_info_t *th;
3621   kmp_team_t *team;
3622   int teams_swapped = 0, task_state;
3623 
3624   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3625   __kmp_assert_valid_gtid(global_tid);
3626 
3627   // why do we need this initialization here at all?
3628   // Reduction clause can not be a stand-alone directive.
3629 
3630   // do not call __kmp_serial_initialize(), it will be called by
3631   // __kmp_parallel_initialize() if needed
3632   // possible detection of false-positive race by the threadchecker ???
3633   if (!TCR_4(__kmp_init_parallel))
3634     __kmp_parallel_initialize();
3635 
3636   __kmp_resume_if_soft_paused();
3637 
3638 // check correctness of reduce block nesting
3639 #if KMP_USE_DYNAMIC_LOCK
3640   if (__kmp_env_consistency_check)
3641     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3642 #else
3643   if (__kmp_env_consistency_check)
3644     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3645 #endif
3646 
3647   th = __kmp_thread_from_gtid(global_tid);
3648   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3649 
3650   packed_reduction_method = __kmp_determine_reduction_method(
3651       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3652   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3653 
3654   OMPT_REDUCTION_DECL(th, global_tid);
3655 
3656   if (packed_reduction_method == critical_reduce_block) {
3657 
3658     OMPT_REDUCTION_BEGIN;
3659     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3660     retval = 1;
3661 
3662   } else if (packed_reduction_method == empty_reduce_block) {
3663 
3664     OMPT_REDUCTION_BEGIN;
3665     // usage: if team size == 1, no synchronization is required ( Intel
3666     // platforms only )
3667     retval = 1;
3668 
3669   } else if (packed_reduction_method == atomic_reduce_block) {
3670 
3671     retval = 2;
3672 
3673   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3674                                    tree_reduce_block)) {
3675 
3676 // case tree_reduce_block:
3677 // this barrier should be visible to a customer and to the threading profile
3678 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3679 #if OMPT_SUPPORT
3680     ompt_frame_t *ompt_frame;
3681     if (ompt_enabled.enabled) {
3682       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3683       if (ompt_frame->enter_frame.ptr == NULL)
3684         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3685     }
3686     OMPT_STORE_RETURN_ADDRESS(global_tid);
3687 #endif
3688 #if USE_ITT_NOTIFY
3689     __kmp_threads[global_tid]->th.th_ident =
3690         loc; // needed for correct notification of frames
3691 #endif
3692     retval =
3693         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3694                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3695     retval = (retval != 0) ? (0) : (1);
3696 #if OMPT_SUPPORT && OMPT_OPTIONAL
3697     if (ompt_enabled.enabled) {
3698       ompt_frame->enter_frame = ompt_data_none;
3699     }
3700 #endif
3701 
3702     // all other workers except master should do this pop here
3703     // ( none of other workers except master will enter __kmpc_end_reduce() )
3704     if (__kmp_env_consistency_check) {
3705       if (retval == 0) { // 0: all other workers; 1: master
3706         __kmp_pop_sync(global_tid, ct_reduce, loc);
3707       }
3708     }
3709 
3710   } else {
3711 
3712     // should never reach this block
3713     KMP_ASSERT(0); // "unexpected method"
3714   }
3715   if (teams_swapped) {
3716     __kmp_restore_swapped_teams(th, team, task_state);
3717   }
3718 
3719   KA_TRACE(10,
3720            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3721             global_tid, packed_reduction_method, retval));
3722   return retval;
3723 }
3724 
3725 /*!
3726 @ingroup SYNCHRONIZATION
3727 @param loc source location information
3728 @param global_tid global thread id.
3729 @param lck pointer to the unique lock data structure
3730 
3731 Finish the execution of a blocking reduce.
3732 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3733 start function.
3734 */
3735 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3736                        kmp_critical_name *lck) {
3737 
3738   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3739   kmp_info_t *th;
3740   kmp_team_t *team;
3741   int teams_swapped = 0, task_state;
3742 
3743   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3744   __kmp_assert_valid_gtid(global_tid);
3745 
3746   th = __kmp_thread_from_gtid(global_tid);
3747   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3748 
3749   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3750 
3751   // this barrier should be visible to a customer and to the threading profile
3752   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3753   OMPT_REDUCTION_DECL(th, global_tid);
3754 
3755   if (packed_reduction_method == critical_reduce_block) {
3756     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3757 
3758     OMPT_REDUCTION_END;
3759 
3760 // TODO: implicit barrier: should be exposed
3761 #if OMPT_SUPPORT
3762     ompt_frame_t *ompt_frame;
3763     if (ompt_enabled.enabled) {
3764       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3765       if (ompt_frame->enter_frame.ptr == NULL)
3766         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3767     }
3768     OMPT_STORE_RETURN_ADDRESS(global_tid);
3769 #endif
3770 #if USE_ITT_NOTIFY
3771     __kmp_threads[global_tid]->th.th_ident = loc;
3772 #endif
3773     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3774 #if OMPT_SUPPORT && OMPT_OPTIONAL
3775     if (ompt_enabled.enabled) {
3776       ompt_frame->enter_frame = ompt_data_none;
3777     }
3778 #endif
3779 
3780   } else if (packed_reduction_method == empty_reduce_block) {
3781 
3782     OMPT_REDUCTION_END;
3783 
3784 // usage: if team size==1, no synchronization is required (Intel platforms only)
3785 
3786 // TODO: implicit barrier: should be exposed
3787 #if OMPT_SUPPORT
3788     ompt_frame_t *ompt_frame;
3789     if (ompt_enabled.enabled) {
3790       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3791       if (ompt_frame->enter_frame.ptr == NULL)
3792         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3793     }
3794     OMPT_STORE_RETURN_ADDRESS(global_tid);
3795 #endif
3796 #if USE_ITT_NOTIFY
3797     __kmp_threads[global_tid]->th.th_ident = loc;
3798 #endif
3799     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3800 #if OMPT_SUPPORT && OMPT_OPTIONAL
3801     if (ompt_enabled.enabled) {
3802       ompt_frame->enter_frame = ompt_data_none;
3803     }
3804 #endif
3805 
3806   } else if (packed_reduction_method == atomic_reduce_block) {
3807 
3808 #if OMPT_SUPPORT
3809     ompt_frame_t *ompt_frame;
3810     if (ompt_enabled.enabled) {
3811       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3812       if (ompt_frame->enter_frame.ptr == NULL)
3813         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3814     }
3815     OMPT_STORE_RETURN_ADDRESS(global_tid);
3816 #endif
3817 // TODO: implicit barrier: should be exposed
3818 #if USE_ITT_NOTIFY
3819     __kmp_threads[global_tid]->th.th_ident = loc;
3820 #endif
3821     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3822 #if OMPT_SUPPORT && OMPT_OPTIONAL
3823     if (ompt_enabled.enabled) {
3824       ompt_frame->enter_frame = ompt_data_none;
3825     }
3826 #endif
3827 
3828   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3829                                    tree_reduce_block)) {
3830 
3831     // only master executes here (master releases all other workers)
3832     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3833                             global_tid);
3834 
3835   } else {
3836 
3837     // should never reach this block
3838     KMP_ASSERT(0); // "unexpected method"
3839   }
3840   if (teams_swapped) {
3841     __kmp_restore_swapped_teams(th, team, task_state);
3842   }
3843 
3844   if (__kmp_env_consistency_check)
3845     __kmp_pop_sync(global_tid, ct_reduce, loc);
3846 
3847   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3848                 global_tid, packed_reduction_method));
3849 
3850   return;
3851 }
3852 
3853 #undef __KMP_GET_REDUCTION_METHOD
3854 #undef __KMP_SET_REDUCTION_METHOD
3855 
3856 /* end of interface to fast scalable reduce routines */
3857 
3858 kmp_uint64 __kmpc_get_taskid() {
3859 
3860   kmp_int32 gtid;
3861   kmp_info_t *thread;
3862 
3863   gtid = __kmp_get_gtid();
3864   if (gtid < 0) {
3865     return 0;
3866   }
3867   thread = __kmp_thread_from_gtid(gtid);
3868   return thread->th.th_current_task->td_task_id;
3869 
3870 } // __kmpc_get_taskid
3871 
3872 kmp_uint64 __kmpc_get_parent_taskid() {
3873 
3874   kmp_int32 gtid;
3875   kmp_info_t *thread;
3876   kmp_taskdata_t *parent_task;
3877 
3878   gtid = __kmp_get_gtid();
3879   if (gtid < 0) {
3880     return 0;
3881   }
3882   thread = __kmp_thread_from_gtid(gtid);
3883   parent_task = thread->th.th_current_task->td_parent;
3884   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3885 
3886 } // __kmpc_get_parent_taskid
3887 
3888 /*!
3889 @ingroup WORK_SHARING
3890 @param loc  source location information.
3891 @param gtid  global thread number.
3892 @param num_dims  number of associated doacross loops.
3893 @param dims  info on loops bounds.
3894 
3895 Initialize doacross loop information.
3896 Expect compiler send us inclusive bounds,
3897 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3898 */
3899 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3900                           const struct kmp_dim *dims) {
3901   __kmp_assert_valid_gtid(gtid);
3902   int j, idx;
3903   kmp_int64 last, trace_count;
3904   kmp_info_t *th = __kmp_threads[gtid];
3905   kmp_team_t *team = th->th.th_team;
3906   kmp_uint32 *flags;
3907   kmp_disp_t *pr_buf = th->th.th_dispatch;
3908   dispatch_shared_info_t *sh_buf;
3909 
3910   KA_TRACE(
3911       20,
3912       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3913        gtid, num_dims, !team->t.t_serialized));
3914   KMP_DEBUG_ASSERT(dims != NULL);
3915   KMP_DEBUG_ASSERT(num_dims > 0);
3916 
3917   if (team->t.t_serialized) {
3918     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3919     return; // no dependencies if team is serialized
3920   }
3921   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3922   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3923   // the next loop
3924   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3925 
3926   // Save bounds info into allocated private buffer
3927   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3928   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3929       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3930   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3931   pr_buf->th_doacross_info[0] =
3932       (kmp_int64)num_dims; // first element is number of dimensions
3933   // Save also address of num_done in order to access it later without knowing
3934   // the buffer index
3935   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3936   pr_buf->th_doacross_info[2] = dims[0].lo;
3937   pr_buf->th_doacross_info[3] = dims[0].up;
3938   pr_buf->th_doacross_info[4] = dims[0].st;
3939   last = 5;
3940   for (j = 1; j < num_dims; ++j) {
3941     kmp_int64
3942         range_length; // To keep ranges of all dimensions but the first dims[0]
3943     if (dims[j].st == 1) { // most common case
3944       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3945       range_length = dims[j].up - dims[j].lo + 1;
3946     } else {
3947       if (dims[j].st > 0) {
3948         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3949         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3950       } else { // negative increment
3951         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3952         range_length =
3953             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3954       }
3955     }
3956     pr_buf->th_doacross_info[last++] = range_length;
3957     pr_buf->th_doacross_info[last++] = dims[j].lo;
3958     pr_buf->th_doacross_info[last++] = dims[j].up;
3959     pr_buf->th_doacross_info[last++] = dims[j].st;
3960   }
3961 
3962   // Compute total trip count.
3963   // Start with range of dims[0] which we don't need to keep in the buffer.
3964   if (dims[0].st == 1) { // most common case
3965     trace_count = dims[0].up - dims[0].lo + 1;
3966   } else if (dims[0].st > 0) {
3967     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3968     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3969   } else { // negative increment
3970     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3971     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3972   }
3973   for (j = 1; j < num_dims; ++j) {
3974     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3975   }
3976   KMP_DEBUG_ASSERT(trace_count > 0);
3977 
3978   // Check if shared buffer is not occupied by other loop (idx -
3979   // __kmp_dispatch_num_buffers)
3980   if (idx != sh_buf->doacross_buf_idx) {
3981     // Shared buffer is occupied, wait for it to be free
3982     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3983                  __kmp_eq_4, NULL);
3984   }
3985 #if KMP_32_BIT_ARCH
3986   // Check if we are the first thread. After the CAS the first thread gets 0,
3987   // others get 1 if initialization is in progress, allocated pointer otherwise.
3988   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3989   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3990       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3991 #else
3992   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3993       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3994 #endif
3995   if (flags == NULL) {
3996     // we are the first thread, allocate the array of flags
3997     size_t size =
3998         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
3999     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
4000     KMP_MB();
4001     sh_buf->doacross_flags = flags;
4002   } else if (flags == (kmp_uint32 *)1) {
4003 #if KMP_32_BIT_ARCH
4004     // initialization is still in progress, need to wait
4005     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4006 #else
4007     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4008 #endif
4009       KMP_YIELD(TRUE);
4010     KMP_MB();
4011   } else {
4012     KMP_MB();
4013   }
4014   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4015   pr_buf->th_doacross_flags =
4016       sh_buf->doacross_flags; // save private copy in order to not
4017   // touch shared buffer on each iteration
4018   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4019 }
4020 
4021 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4022   __kmp_assert_valid_gtid(gtid);
4023   kmp_int64 shft;
4024   size_t num_dims, i;
4025   kmp_uint32 flag;
4026   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4027   kmp_info_t *th = __kmp_threads[gtid];
4028   kmp_team_t *team = th->th.th_team;
4029   kmp_disp_t *pr_buf;
4030   kmp_int64 lo, up, st;
4031 
4032   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4033   if (team->t.t_serialized) {
4034     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4035     return; // no dependencies if team is serialized
4036   }
4037 
4038   // calculate sequential iteration number and check out-of-bounds condition
4039   pr_buf = th->th.th_dispatch;
4040   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4041   num_dims = (size_t)pr_buf->th_doacross_info[0];
4042   lo = pr_buf->th_doacross_info[2];
4043   up = pr_buf->th_doacross_info[3];
4044   st = pr_buf->th_doacross_info[4];
4045 #if OMPT_SUPPORT && OMPT_OPTIONAL
4046   ompt_dependence_t deps[num_dims];
4047 #endif
4048   if (st == 1) { // most common case
4049     if (vec[0] < lo || vec[0] > up) {
4050       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4051                     "bounds [%lld,%lld]\n",
4052                     gtid, vec[0], lo, up));
4053       return;
4054     }
4055     iter_number = vec[0] - lo;
4056   } else if (st > 0) {
4057     if (vec[0] < lo || vec[0] > up) {
4058       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4059                     "bounds [%lld,%lld]\n",
4060                     gtid, vec[0], lo, up));
4061       return;
4062     }
4063     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4064   } else { // negative increment
4065     if (vec[0] > lo || vec[0] < up) {
4066       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4067                     "bounds [%lld,%lld]\n",
4068                     gtid, vec[0], lo, up));
4069       return;
4070     }
4071     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4072   }
4073 #if OMPT_SUPPORT && OMPT_OPTIONAL
4074   deps[0].variable.value = iter_number;
4075   deps[0].dependence_type = ompt_dependence_type_sink;
4076 #endif
4077   for (i = 1; i < num_dims; ++i) {
4078     kmp_int64 iter, ln;
4079     size_t j = i * 4;
4080     ln = pr_buf->th_doacross_info[j + 1];
4081     lo = pr_buf->th_doacross_info[j + 2];
4082     up = pr_buf->th_doacross_info[j + 3];
4083     st = pr_buf->th_doacross_info[j + 4];
4084     if (st == 1) {
4085       if (vec[i] < lo || vec[i] > up) {
4086         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4087                       "bounds [%lld,%lld]\n",
4088                       gtid, vec[i], lo, up));
4089         return;
4090       }
4091       iter = vec[i] - lo;
4092     } else if (st > 0) {
4093       if (vec[i] < lo || vec[i] > up) {
4094         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4095                       "bounds [%lld,%lld]\n",
4096                       gtid, vec[i], lo, up));
4097         return;
4098       }
4099       iter = (kmp_uint64)(vec[i] - lo) / st;
4100     } else { // st < 0
4101       if (vec[i] > lo || vec[i] < up) {
4102         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4103                       "bounds [%lld,%lld]\n",
4104                       gtid, vec[i], lo, up));
4105         return;
4106       }
4107       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4108     }
4109     iter_number = iter + ln * iter_number;
4110 #if OMPT_SUPPORT && OMPT_OPTIONAL
4111     deps[i].variable.value = iter;
4112     deps[i].dependence_type = ompt_dependence_type_sink;
4113 #endif
4114   }
4115   shft = iter_number % 32; // use 32-bit granularity
4116   iter_number >>= 5; // divided by 32
4117   flag = 1 << shft;
4118   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4119     KMP_YIELD(TRUE);
4120   }
4121   KMP_MB();
4122 #if OMPT_SUPPORT && OMPT_OPTIONAL
4123   if (ompt_enabled.ompt_callback_dependences) {
4124     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4125         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4126   }
4127 #endif
4128   KA_TRACE(20,
4129            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4130             gtid, (iter_number << 5) + shft));
4131 }
4132 
4133 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4134   __kmp_assert_valid_gtid(gtid);
4135   kmp_int64 shft;
4136   size_t num_dims, i;
4137   kmp_uint32 flag;
4138   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4139   kmp_info_t *th = __kmp_threads[gtid];
4140   kmp_team_t *team = th->th.th_team;
4141   kmp_disp_t *pr_buf;
4142   kmp_int64 lo, st;
4143 
4144   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4145   if (team->t.t_serialized) {
4146     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4147     return; // no dependencies if team is serialized
4148   }
4149 
4150   // calculate sequential iteration number (same as in "wait" but no
4151   // out-of-bounds checks)
4152   pr_buf = th->th.th_dispatch;
4153   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4154   num_dims = (size_t)pr_buf->th_doacross_info[0];
4155   lo = pr_buf->th_doacross_info[2];
4156   st = pr_buf->th_doacross_info[4];
4157 #if OMPT_SUPPORT && OMPT_OPTIONAL
4158   ompt_dependence_t deps[num_dims];
4159 #endif
4160   if (st == 1) { // most common case
4161     iter_number = vec[0] - lo;
4162   } else if (st > 0) {
4163     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4164   } else { // negative increment
4165     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4166   }
4167 #if OMPT_SUPPORT && OMPT_OPTIONAL
4168   deps[0].variable.value = iter_number;
4169   deps[0].dependence_type = ompt_dependence_type_source;
4170 #endif
4171   for (i = 1; i < num_dims; ++i) {
4172     kmp_int64 iter, ln;
4173     size_t j = i * 4;
4174     ln = pr_buf->th_doacross_info[j + 1];
4175     lo = pr_buf->th_doacross_info[j + 2];
4176     st = pr_buf->th_doacross_info[j + 4];
4177     if (st == 1) {
4178       iter = vec[i] - lo;
4179     } else if (st > 0) {
4180       iter = (kmp_uint64)(vec[i] - lo) / st;
4181     } else { // st < 0
4182       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4183     }
4184     iter_number = iter + ln * iter_number;
4185 #if OMPT_SUPPORT && OMPT_OPTIONAL
4186     deps[i].variable.value = iter;
4187     deps[i].dependence_type = ompt_dependence_type_source;
4188 #endif
4189   }
4190 #if OMPT_SUPPORT && OMPT_OPTIONAL
4191   if (ompt_enabled.ompt_callback_dependences) {
4192     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4193         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4194   }
4195 #endif
4196   shft = iter_number % 32; // use 32-bit granularity
4197   iter_number >>= 5; // divided by 32
4198   flag = 1 << shft;
4199   KMP_MB();
4200   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4201     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4202   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4203                 (iter_number << 5) + shft));
4204 }
4205 
4206 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4207   __kmp_assert_valid_gtid(gtid);
4208   kmp_int32 num_done;
4209   kmp_info_t *th = __kmp_threads[gtid];
4210   kmp_team_t *team = th->th.th_team;
4211   kmp_disp_t *pr_buf = th->th.th_dispatch;
4212 
4213   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4214   if (team->t.t_serialized) {
4215     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4216     return; // nothing to do
4217   }
4218   num_done =
4219       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4220   if (num_done == th->th.th_team_nproc) {
4221     // we are the last thread, need to free shared resources
4222     int idx = pr_buf->th_doacross_buf_idx - 1;
4223     dispatch_shared_info_t *sh_buf =
4224         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4225     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4226                      (kmp_int64)&sh_buf->doacross_num_done);
4227     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4228     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4229     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4230     sh_buf->doacross_flags = NULL;
4231     sh_buf->doacross_num_done = 0;
4232     sh_buf->doacross_buf_idx +=
4233         __kmp_dispatch_num_buffers; // free buffer for future re-use
4234   }
4235   // free private resources (need to keep buffer index forever)
4236   pr_buf->th_doacross_flags = NULL;
4237   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4238   pr_buf->th_doacross_info = NULL;
4239   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4240 }
4241 
4242 /* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */
4243 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4244   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4245 }
4246 
4247 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4248   return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
4249 }
4250 
4251 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4252                   omp_allocator_handle_t free_allocator) {
4253   return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4254                         free_allocator);
4255 }
4256 
4257 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4258   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4259 }
4260 
4261 int __kmpc_get_target_offload(void) {
4262   if (!__kmp_init_serial) {
4263     __kmp_serial_initialize();
4264   }
4265   return __kmp_target_offload;
4266 }
4267 
4268 int __kmpc_pause_resource(kmp_pause_status_t level) {
4269   if (!__kmp_init_serial) {
4270     return 1; // Can't pause if runtime is not initialized
4271   }
4272   return __kmp_pause_resource(level);
4273 }
4274