1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22 
23 #define MAX_MESSAGE 512
24 
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27 
28 /*!
29  * @ingroup STARTUP_SHUTDOWN
30  * @param loc   in   source location information
31  * @param flags in   for future use (currently ignored)
32  *
33  * Initialize the runtime library. This call is optional; if it is not made then
34  * it will be implicitly called by attempts to use other library functions.
35  */
36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37   // By default __kmpc_begin() is no-op.
38   char *env;
39   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40       __kmp_str_match_true(env)) {
41     __kmp_middle_initialize();
42     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43   } else if (__kmp_ignore_mppbeg() == FALSE) {
44     // By default __kmp_ignore_mppbeg() returns TRUE.
45     __kmp_internal_begin();
46     KC_TRACE(10, ("__kmpc_begin: called\n"));
47   }
48 }
49 
50 /*!
51  * @ingroup STARTUP_SHUTDOWN
52  * @param loc source location information
53  *
 * Shut down the runtime library. This is also optional, and even if called it
 * will not do anything unless the `KMP_IGNORE_MPPEND` environment variable is
 * set to zero.
57  */
58 void __kmpc_end(ident_t *loc) {
59   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
60   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
61   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62   // returns FALSE and __kmpc_end() will unregister this root (it can cause
63   // library shut down).
64   if (__kmp_ignore_mppend() == FALSE) {
65     KC_TRACE(10, ("__kmpc_end: called\n"));
66     KA_TRACE(30, ("__kmpc_end\n"));
67 
68     __kmp_internal_end_thread(-1);
69   }
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71   // Normal exit process on Windows does not allow worker threads of the final
72   // parallel region to finish reporting their events, so shutting down the
73   // library here fixes the issue at least for the cases where __kmpc_end() is
74   // placed properly.
75   if (ompt_enabled.enabled)
76     __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
78 }
79 
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84 
85 This function can be called in any context.
86 
If the runtime has only been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the master thread is necessarily thread zero).
92 
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
95 the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
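
A minimal usage sketch (illustrative only; `loc` stands for the ident_t the
compiler builds for the call site): the value is typically obtained once and
then reused in later __kmpc_* calls that expect a global thread number.
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
__kmpc_barrier(&loc, gtid); // reuse the cached gtid in later runtime calls
@endcode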
97 */
98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99   kmp_int32 gtid = __kmp_entry_gtid();
100 
101   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102 
103   return gtid;
104 }
105 
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110 
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, the runtime maintains
underlying OS threads even when they are not active (since the cost of creating
and destroying them is high); this call counts all such threads, even those
that are currently idle and merely waiting for work.
119 */
120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121   KC_TRACE(10,
122            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123 
124   return TCR_4(__kmp_all_nth);
125 }
126 
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135   return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137 
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145 
146   return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148 
149 /*!
150  * @ingroup DEPRECATED
151  * @param loc location description
152  *
153  * This function need not be called. It always returns TRUE.
154  */
155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157 
158   return TRUE;
159 
160 #else
161 
162   const char *semi2;
163   const char *semi3;
164   int line_no;
165 
166   if (__kmp_par_range == 0) {
167     return TRUE;
168   }
169   semi2 = loc->psource;
170   if (semi2 == NULL) {
171     return TRUE;
172   }
173   semi2 = strchr(semi2, ';');
174   if (semi2 == NULL) {
175     return TRUE;
176   }
177   semi2 = strchr(semi2 + 1, ';');
178   if (semi2 == NULL) {
179     return TRUE;
180   }
181   if (__kmp_par_range_filename[0]) {
182     const char *name = semi2 - 1;
183     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184       name--;
185     }
186     if ((*name == '/') || (*name == ';')) {
187       name++;
188     }
189     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190       return __kmp_par_range < 0;
191     }
192   }
193   semi3 = strchr(semi2 + 1, ';');
194   if (__kmp_par_range_routine[0]) {
195     if ((semi3 != NULL) && (semi3 > semi2) &&
196         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197       return __kmp_par_range < 0;
198     }
199   }
200   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202       return __kmp_par_range > 0;
203     }
204     return __kmp_par_range < 0;
205   }
206   return TRUE;
207 
208 #endif /* KMP_DEBUG */
209 }
210 
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218   return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220 
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226 
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
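
As an illustrative sketch (not verbatim compiler output; `loc`, `gtid` and
`outlined_fn` are assumed to be provided by the compiler), a construct such as
`#pragma omp parallel num_threads(4)` is lowered to a push followed by the fork:
@code
__kmpc_push_num_threads(&loc, gtid, 4); // from the num_threads(4) clause
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_fn); // no shared arguments
@endcode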
229 */
230 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
231                              kmp_int32 num_threads) {
232   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
233                 global_tid, num_threads));
234   __kmp_assert_valid_gtid(global_tid);
235   __kmp_push_num_threads(loc, global_tid, num_threads);
236 }
237 
238 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
239   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
  /* the num_threads value is automatically popped */
241 }
242 
243 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
244                            kmp_int32 proc_bind) {
245   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
246                 proc_bind));
247   __kmp_assert_valid_gtid(global_tid);
248   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
249 }
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
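
A rough sketch of the expected calling convention (names are hypothetical, and
real compiler output differs in detail): the outlined routine receives pointers
to the global and bound thread numbers, followed by the shared-variable
pointers passed through the ellipsis.
@code
// Hypothetical outlined body for: #pragma omp parallel shared(x)
static void outlined_fn(kmp_int32 *global_tid, kmp_int32 *bound_tid, int *x) {
  (*x)++; // user code operating on the shared variable
}

void enclosing_fn(ident_t *loc) {
  int x = 0;
  __kmpc_fork_call(loc, 1, (kmpc_micro)outlined_fn, &x); // one shared argument
}
@endcode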
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   // If we were in a serial region, then stop the serial timer, record
266   // the event, and start parallel region timer
267   stats_state_e previous_state = KMP_GET_THREAD_STATE();
268   if (previous_state == stats_state_e::SERIAL_REGION) {
269     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
270   } else {
271     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
272   }
273   int inParallel = __kmpc_in_parallel(loc);
274   if (inParallel) {
275     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
276   } else {
277     KMP_COUNT_BLOCK(OMP_PARALLEL);
278   }
279 #endif
280 
  // Perhaps saving thr_state would be enough here
282   {
283     va_list ap;
284     va_start(ap, microtask);
285 
286 #if OMPT_SUPPORT
287     ompt_frame_t *ompt_frame;
288     if (ompt_enabled.enabled) {
289       kmp_info_t *master_th = __kmp_threads[gtid];
290       kmp_team_t *parent_team = master_th->th.th_team;
291       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
292       if (lwt)
293         ompt_frame = &(lwt->ompt_task_info.frame);
294       else {
295         int tid = __kmp_tid_from_gtid(gtid);
296         ompt_frame = &(
297             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
298       }
299       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
300     }
301     OMPT_STORE_RETURN_ADDRESS(gtid);
302 #endif
303 
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_FORKING();
306 #endif
307     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
308                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
309                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
310                     kmp_va_addr_of(ap));
311 #if INCLUDE_SSC_MARKS
312     SSC_MARK_JOINING();
313 #endif
314     __kmp_join_call(loc, gtid
315 #if OMPT_SUPPORT
316                     ,
317                     fork_context_intel
318 #endif
319     );
320 
321     va_end(ap);
322   }
323 
324 #if KMP_STATS_ENABLED
325   if (previous_state == stats_state_e::SERIAL_REGION) {
326     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
327     KMP_SET_THREAD_STATE(previous_state);
328   } else {
329     KMP_POP_PARTITIONED_TIMER();
330   }
331 #endif // KMP_STATS_ENABLED
332 }
333 
334 /*!
335 @ingroup PARALLEL
336 @param loc source location information
337 @param global_tid global thread number
338 @param num_teams number of teams requested for the teams construct
339 @param num_threads number of threads per team requested for the teams construct
340 
341 Set the number of teams to be used by the teams construct.
342 This call is only required if the teams construct has a `num_teams` clause
343 or a `thread_limit` clause (or both).
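
A hypothetical lowering sketch (illustrative only; `loc`, `gtid` and
`outlined_teams_fn` are assumed): `#pragma omp teams num_teams(4)
thread_limit(8)` pushes the clause values before the teams fork.
@code
__kmpc_push_num_teams(&loc, gtid, 4, 8); // num_teams=4, thread_limit=8
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams_fn);
@endcode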
344 */
345 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
346                            kmp_int32 num_teams, kmp_int32 num_threads) {
347   KA_TRACE(20,
348            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
349             global_tid, num_teams, num_threads));
350   __kmp_assert_valid_gtid(global_tid);
351   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
352 }
353 
354 /*!
355 @ingroup PARALLEL
356 @param loc source location information
357 @param global_tid global thread number
@param num_teams_lb lower bound on the number of teams requested for the teams
construct
@param num_teams_ub upper bound on the number of teams requested for the teams
construct
@param num_threads number of threads per team requested for the teams construct

Set the number of teams to be used by the teams construct. The number of initial
teams created will be greater than or equal to the lower bound and less than or
equal to the upper bound.
367 This call is only required if the teams construct has a `num_teams` clause
368 or a `thread_limit` clause (or both).
369 */
370 void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
371                               kmp_int32 num_teams_lb, kmp_int32 num_teams_ub,
372                               kmp_int32 num_threads) {
373   KA_TRACE(20, ("__kmpc_push_num_teams_51: enter T#%d num_teams_lb=%d"
374                 " num_teams_ub=%d num_threads=%d\n",
375                 global_tid, num_teams_lb, num_teams_ub, num_threads));
376   __kmp_assert_valid_gtid(global_tid);
377   __kmp_push_num_teams_51(loc, global_tid, num_teams_lb, num_teams_ub,
378                           num_threads);
379 }
380 
381 /*!
382 @ingroup PARALLEL
383 @param loc  source location information
384 @param argc  total number of arguments in the ellipsis
385 @param microtask  pointer to callback routine consisting of outlined teams
386 construct
387 @param ...  pointers to shared variables that aren't global
388 
389 Do the actual fork and call the microtask in the relevant number of threads.
390 */
391 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
392                        ...) {
393   int gtid = __kmp_entry_gtid();
394   kmp_info_t *this_thr = __kmp_threads[gtid];
395   va_list ap;
396   va_start(ap, microtask);
397 
398 #if KMP_STATS_ENABLED
399   KMP_COUNT_BLOCK(OMP_TEAMS);
400   stats_state_e previous_state = KMP_GET_THREAD_STATE();
401   if (previous_state == stats_state_e::SERIAL_REGION) {
402     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
403   } else {
404     KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
405   }
406 #endif
407 
408   // remember teams entry point and nesting level
409   this_thr->th.th_teams_microtask = microtask;
410   this_thr->th.th_teams_level =
411       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
412 
413 #if OMPT_SUPPORT
414   kmp_team_t *parent_team = this_thr->th.th_team;
415   int tid = __kmp_tid_from_gtid(gtid);
416   if (ompt_enabled.enabled) {
417     parent_team->t.t_implicit_task_taskdata[tid]
418         .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
419   }
420   OMPT_STORE_RETURN_ADDRESS(gtid);
421 #endif
422 
  // check if __kmpc_push_num_teams was called; set the default number of teams
  // otherwise
425   if (this_thr->th.th_teams_size.nteams == 0) {
426     __kmp_push_num_teams(loc, gtid, 0, 0);
427   }
428   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
429   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
430   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
431 
432   __kmp_fork_call(
433       loc, gtid, fork_context_intel, argc,
434       VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
435       VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
436   __kmp_join_call(loc, gtid
437 #if OMPT_SUPPORT
438                   ,
439                   fork_context_intel
440 #endif
441   );
442 
443   // Pop current CG root off list
444   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
445   kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
446   this_thr->th.th_cg_roots = tmp->up;
447   KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
448                  " to node %p. cg_nthreads was %d\n",
449                  this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
450   KMP_DEBUG_ASSERT(tmp->cg_nthreads);
451   int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
453     __kmp_free(tmp);
454   }
455   // Restore current task's thread_limit from CG root
456   KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
457   this_thr->th.th_current_task->td_icvs.thread_limit =
458       this_thr->th.th_cg_roots->cg_thread_limit;
459 
460   this_thr->th.th_teams_microtask = NULL;
461   this_thr->th.th_teams_level = 0;
462   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
463   va_end(ap);
464 #if KMP_STATS_ENABLED
465   if (previous_state == stats_state_e::SERIAL_REGION) {
466     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
467     KMP_SET_THREAD_STATE(previous_state);
468   } else {
469     KMP_POP_PARTITIONED_TIMER();
470   }
471 #endif // KMP_STATS_ENABLED
472 }
473 
474 // I don't think this function should ever have been exported.
475 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
476 // openmp code ever called it, but it's been exported from the RTL for so
477 // long that I'm afraid to remove the definition.
478 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
479 
480 /*!
481 @ingroup PARALLEL
482 @param loc  source location information
483 @param global_tid  global thread number
484 
485 Enter a serialized parallel construct. This interface is used to handle a
486 conditional parallel region, like this,
487 @code
488 #pragma omp parallel if (condition)
489 @endcode
490 when the condition is false.
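
A hypothetical lowering (illustrative only; `outlined_fn` is the outlined
parallel body) tests the condition and either forks or runs the body serially
between the serialized-parallel entry and exit calls:
@code
kmp_int32 gtid = __kmpc_global_thread_num(&loc);
if (condition) {
  __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_fn);
} else {
  __kmpc_serialized_parallel(&loc, gtid);
  kmp_int32 zero = 0;
  outlined_fn(&gtid, &zero); // body executed by the encountering thread only
  __kmpc_end_serialized_parallel(&loc, gtid);
}
@endcode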
491 */
492 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
493   // The implementation is now in kmp_runtime.cpp so that it can share static
494   // functions with kmp_fork_call since the tasks to be done are similar in
495   // each case.
496   __kmp_assert_valid_gtid(global_tid);
497 #if OMPT_SUPPORT
498   OMPT_STORE_RETURN_ADDRESS(global_tid);
499 #endif
500   __kmp_serialized_parallel(loc, global_tid);
501 }
502 
503 /*!
504 @ingroup PARALLEL
505 @param loc  source location information
506 @param global_tid  global thread number
507 
508 Leave a serialized parallel construct.
509 */
510 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
511   kmp_internal_control_t *top;
512   kmp_info_t *this_thr;
513   kmp_team_t *serial_team;
514 
515   KC_TRACE(10,
516            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
517 
518   /* skip all this code for autopar serialized loops since it results in
519      unacceptable overhead */
520   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
521     return;
522 
523   // Not autopar code
524   __kmp_assert_valid_gtid(global_tid);
525   if (!TCR_4(__kmp_init_parallel))
526     __kmp_parallel_initialize();
527 
528   __kmp_resume_if_soft_paused();
529 
530   this_thr = __kmp_threads[global_tid];
531   serial_team = this_thr->th.th_serial_team;
532 
533   kmp_task_team_t *task_team = this_thr->th.th_task_team;
534   // we need to wait for the proxy tasks before finishing the thread
535   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
536     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
537 
538   KMP_MB();
539   KMP_DEBUG_ASSERT(serial_team);
540   KMP_ASSERT(serial_team->t.t_serialized);
541   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
542   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
543   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
544   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
545 
546 #if OMPT_SUPPORT
547   if (ompt_enabled.enabled &&
548       this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
549     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
550     if (ompt_enabled.ompt_callback_implicit_task) {
551       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
552           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
553           OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
554     }
555 
    // reset/clear the task id only after unlinking the task
557     ompt_data_t *parent_task_data;
558     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
559 
560     if (ompt_enabled.ompt_callback_parallel_end) {
561       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
562           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
563           ompt_parallel_invoker_program | ompt_parallel_team,
564           OMPT_LOAD_RETURN_ADDRESS(global_tid));
565     }
566     __ompt_lw_taskteam_unlink(this_thr);
567     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
568   }
569 #endif
570 
571   /* If necessary, pop the internal control stack values and replace the team
572    * values */
573   top = serial_team->t.t_control_stack_top;
574   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
575     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
576     serial_team->t.t_control_stack_top = top->next;
577     __kmp_free(top);
578   }
579 
580   // if( serial_team -> t.t_serialized > 1 )
581   serial_team->t.t_level--;
582 
583   /* pop dispatch buffers stack */
584   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
585   {
586     dispatch_private_info_t *disp_buffer =
587         serial_team->t.t_dispatch->th_disp_buffer;
588     serial_team->t.t_dispatch->th_disp_buffer =
589         serial_team->t.t_dispatch->th_disp_buffer->next;
590     __kmp_free(disp_buffer);
591   }
592   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
593 
594   --serial_team->t.t_serialized;
595   if (serial_team->t.t_serialized == 0) {
596 
597     /* return to the parallel section */
598 
599 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
600     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
601       __kmp_clear_x87_fpu_status_word();
602       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
603       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
604     }
605 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
606 
607     this_thr->th.th_team = serial_team->t.t_parent;
608     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
609 
610     /* restore values cached in the thread */
611     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
612     this_thr->th.th_team_master =
613         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
614     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
615 
616     /* TODO the below shouldn't need to be adjusted for serialized teams */
617     this_thr->th.th_dispatch =
618         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
619 
620     __kmp_pop_current_task_from_thread(this_thr);
621 
622     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
623     this_thr->th.th_current_task->td_flags.executing = 1;
624 
625     if (__kmp_tasking_mode != tskm_immediate_exec) {
626       // Copy the task team from the new child / old parent team to the thread.
627       this_thr->th.th_task_team =
628           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
629       KA_TRACE(20,
630                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
631                 "team %p\n",
632                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
633     }
634   } else {
635     if (__kmp_tasking_mode != tskm_immediate_exec) {
636       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
637                     "depth of serial team %p to %d\n",
638                     global_tid, serial_team, serial_team->t.t_serialized));
639     }
640   }
641 
642   if (__kmp_env_consistency_check)
643     __kmp_pop_parallel(global_tid, NULL);
644 #if OMPT_SUPPORT
645   if (ompt_enabled.enabled)
646     this_thr->th.ompt_thread_info.state =
647         ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
648                                            : ompt_state_work_parallel);
649 #endif
650 }
651 
652 /*!
653 @ingroup SYNCHRONIZATION
654 @param loc  source location information.
655 
656 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
657 depending on the memory ordering convention obeyed by the compiler
658 even that may not be necessary).
659 */
660 void __kmpc_flush(ident_t *loc) {
661   KC_TRACE(10, ("__kmpc_flush: called\n"));
662 
  /* need explicit __mf() here since we use volatile instead in the library */
664   KMP_MB(); /* Flush all pending memory write invalidates.  */
665 
666 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
667 #if KMP_MIC
668 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
669 // We shouldn't need it, though, since the ABI rules require that
670 // * If the compiler generates NGO stores it also generates the fence
671 // * If users hand-code NGO stores they should insert the fence
672 // therefore no incomplete unordered stores should be visible.
673 #else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
679   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
680   if (!__kmp_cpuinfo.initialized) {
681     __kmp_query_cpuid(&__kmp_cpuinfo);
682   }
683   if (!__kmp_cpuinfo.sse2) {
684     // CPU cannot execute SSE2 instructions.
685   } else {
686 #if KMP_COMPILER_ICC
687     _mm_mfence();
688 #elif KMP_COMPILER_MSVC
689     MemoryBarrier();
690 #else
691     __sync_synchronize();
692 #endif // KMP_COMPILER_ICC
693   }
694 #endif // KMP_MIC
695 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
696        KMP_ARCH_RISCV64)
// Nothing to see here; move along
698 #elif KMP_ARCH_PPC64
699 // Nothing needed here (we have a real MB above).
700 #else
701 #error Unknown or unsupported architecture
702 #endif
703 
704 #if OMPT_SUPPORT && OMPT_OPTIONAL
705   if (ompt_enabled.ompt_callback_flush) {
706     ompt_callbacks.ompt_callback(ompt_callback_flush)(
707         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
708   }
709 #endif
710 }
711 
712 /* -------------------------------------------------------------------------- */
713 /*!
714 @ingroup SYNCHRONIZATION
715 @param loc source location information
716 @param global_tid thread id.
717 
718 Execute a barrier.
719 */
720 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
721   KMP_COUNT_BLOCK(OMP_BARRIER);
722   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
723   __kmp_assert_valid_gtid(global_tid);
724 
725   if (!TCR_4(__kmp_init_parallel))
726     __kmp_parallel_initialize();
727 
728   __kmp_resume_if_soft_paused();
729 
730   if (__kmp_env_consistency_check) {
731     if (loc == 0) {
732       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
733     }
734     __kmp_check_barrier(global_tid, ct_barrier, loc);
735   }
736 
737 #if OMPT_SUPPORT
738   ompt_frame_t *ompt_frame;
739   if (ompt_enabled.enabled) {
740     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
741     if (ompt_frame->enter_frame.ptr == NULL)
742       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
743   }
744   OMPT_STORE_RETURN_ADDRESS(global_tid);
745 #endif
746   __kmp_threads[global_tid]->th.th_ident = loc;
747   // TODO: explicit barrier_wait_id:
  //   this function is called when the 'barrier' directive is present or at
  //   the implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required
753 
754   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
755 #if OMPT_SUPPORT && OMPT_OPTIONAL
756   if (ompt_enabled.enabled) {
757     ompt_frame->enter_frame = ompt_data_none;
758   }
759 #endif
760 }
761 
762 /* The BARRIER for a MASTER section is always explicit   */
763 /*!
764 @ingroup WORK_SHARING
765 @param loc  source location information.
@param global_tid  global thread number.
767 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
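
A typical lowering sketch (illustrative; `loc` and `gtid` assumed) brackets the
region body with the begin/end calls, guarded by the return value:
@code
if (__kmpc_master(&loc, gtid)) {
  // ... body of the master region ...
  __kmpc_end_master(&loc, gtid);
}
@endcode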
768 */
769 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
770   int status = 0;
771 
772   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
773   __kmp_assert_valid_gtid(global_tid);
774 
775   if (!TCR_4(__kmp_init_parallel))
776     __kmp_parallel_initialize();
777 
778   __kmp_resume_if_soft_paused();
779 
780   if (KMP_MASTER_GTID(global_tid)) {
781     KMP_COUNT_BLOCK(OMP_MASTER);
782     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
783     status = 1;
784   }
785 
786 #if OMPT_SUPPORT && OMPT_OPTIONAL
787   if (status) {
788     if (ompt_enabled.ompt_callback_masked) {
789       kmp_info_t *this_thr = __kmp_threads[global_tid];
790       kmp_team_t *team = this_thr->th.th_team;
791 
792       int tid = __kmp_tid_from_gtid(global_tid);
793       ompt_callbacks.ompt_callback(ompt_callback_masked)(
794           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
795           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
796           OMPT_GET_RETURN_ADDRESS(0));
797     }
798   }
799 #endif
800 
801   if (__kmp_env_consistency_check) {
802 #if KMP_USE_DYNAMIC_LOCK
803     if (status)
804       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
805     else
806       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
807 #else
808     if (status)
809       __kmp_push_sync(global_tid, ct_master, loc, NULL);
810     else
811       __kmp_check_sync(global_tid, ct_master, loc, NULL);
812 #endif
813   }
814 
815   return status;
816 }
817 
818 /*!
819 @ingroup WORK_SHARING
820 @param loc  source location information.
@param global_tid  global thread number.
822 
823 Mark the end of a <tt>master</tt> region. This should only be called by the
824 thread that executes the <tt>master</tt> region.
825 */
826 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
827   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
828   __kmp_assert_valid_gtid(global_tid);
829   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
830   KMP_POP_PARTITIONED_TIMER();
831 
832 #if OMPT_SUPPORT && OMPT_OPTIONAL
833   kmp_info_t *this_thr = __kmp_threads[global_tid];
834   kmp_team_t *team = this_thr->th.th_team;
835   if (ompt_enabled.ompt_callback_masked) {
836     int tid = __kmp_tid_from_gtid(global_tid);
837     ompt_callbacks.ompt_callback(ompt_callback_masked)(
838         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
839         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
840         OMPT_GET_RETURN_ADDRESS(0));
841   }
842 #endif
843 
844   if (__kmp_env_consistency_check) {
845     if (KMP_MASTER_GTID(global_tid))
846       __kmp_pop_sync(global_tid, ct_master, loc);
847   }
848 }
849 
850 /*!
851 @ingroup WORK_SHARING
852 @param loc  source location information.
853 @param gtid  global thread number.
854 
855 Start execution of an <tt>ordered</tt> construct.
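
A sketch of the expected bracketing inside a loop with an `ordered` clause
(illustrative only; `loc` and `gtid` assumed):
@code
__kmpc_ordered(&loc, gtid);
// ... body of the ordered region, executed in iteration order ...
__kmpc_end_ordered(&loc, gtid);
@endcode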
856 */
857 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
858   int cid = 0;
859   kmp_info_t *th;
860   KMP_DEBUG_ASSERT(__kmp_init_serial);
861 
862   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
863   __kmp_assert_valid_gtid(gtid);
864 
865   if (!TCR_4(__kmp_init_parallel))
866     __kmp_parallel_initialize();
867 
868   __kmp_resume_if_soft_paused();
869 
870 #if USE_ITT_BUILD
871   __kmp_itt_ordered_prep(gtid);
872 // TODO: ordered_wait_id
873 #endif /* USE_ITT_BUILD */
874 
875   th = __kmp_threads[gtid];
876 
877 #if OMPT_SUPPORT && OMPT_OPTIONAL
878   kmp_team_t *team;
879   ompt_wait_id_t lck;
880   void *codeptr_ra;
881   OMPT_STORE_RETURN_ADDRESS(gtid);
882   if (ompt_enabled.enabled) {
883     team = __kmp_team_from_gtid(gtid);
884     lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
885     /* OMPT state update */
886     th->th.ompt_thread_info.wait_id = lck;
887     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
888 
889     /* OMPT event callback */
890     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
891     if (ompt_enabled.ompt_callback_mutex_acquire) {
892       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
893           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
894           codeptr_ra);
895     }
896   }
897 #endif
898 
899   if (th->th.th_dispatch->th_deo_fcn != 0)
900     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
901   else
902     __kmp_parallel_deo(&gtid, &cid, loc);
903 
904 #if OMPT_SUPPORT && OMPT_OPTIONAL
905   if (ompt_enabled.enabled) {
906     /* OMPT state update */
907     th->th.ompt_thread_info.state = ompt_state_work_parallel;
908     th->th.ompt_thread_info.wait_id = 0;
909 
910     /* OMPT event callback */
911     if (ompt_enabled.ompt_callback_mutex_acquired) {
912       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
913           ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
914     }
915   }
916 #endif
917 
918 #if USE_ITT_BUILD
919   __kmp_itt_ordered_start(gtid);
920 #endif /* USE_ITT_BUILD */
921 }
922 
923 /*!
924 @ingroup WORK_SHARING
925 @param loc  source location information.
926 @param gtid  global thread number.
927 
928 End execution of an <tt>ordered</tt> construct.
929 */
930 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
931   int cid = 0;
932   kmp_info_t *th;
933 
934   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
935   __kmp_assert_valid_gtid(gtid);
936 
937 #if USE_ITT_BUILD
938   __kmp_itt_ordered_end(gtid);
939 // TODO: ordered_wait_id
940 #endif /* USE_ITT_BUILD */
941 
942   th = __kmp_threads[gtid];
943 
944   if (th->th.th_dispatch->th_dxo_fcn != 0)
945     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
946   else
947     __kmp_parallel_dxo(&gtid, &cid, loc);
948 
949 #if OMPT_SUPPORT && OMPT_OPTIONAL
950   OMPT_STORE_RETURN_ADDRESS(gtid);
951   if (ompt_enabled.ompt_callback_mutex_released) {
952     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
953         ompt_mutex_ordered,
954         (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
955             ->t.t_ordered.dt.t_value,
956         OMPT_LOAD_RETURN_ADDRESS(gtid));
957   }
958 #endif
959 }
960 
961 #if KMP_USE_DYNAMIC_LOCK
962 
963 static __forceinline void
964 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
965                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
966   // Pointer to the allocated indirect lock is written to crit, while indexing
967   // is ignored.
968   void *idx;
969   kmp_indirect_lock_t **lck;
970   lck = (kmp_indirect_lock_t **)crit;
971   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
972   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
973   KMP_SET_I_LOCK_LOCATION(ilk, loc);
974   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
975   KA_TRACE(20,
976            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
977 #if USE_ITT_BUILD
978   __kmp_itt_critical_creating(ilk->lock, loc);
979 #endif
980   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
981   if (status == 0) {
982 #if USE_ITT_BUILD
983     __kmp_itt_critical_destroyed(ilk->lock);
984 #endif
985     // We don't really need to destroy the unclaimed lock here since it will be
986     // cleaned up at program exit.
987     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
988   }
989   KMP_DEBUG_ASSERT(*lck != NULL);
990 }
991 
992 // Fast-path acquire tas lock
993 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
994   {                                                                            \
995     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
996     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
997     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
998     if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
999         !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
1000       kmp_uint32 spins;                                                        \
1001       KMP_FSYNC_PREPARE(l);                                                    \
1002       KMP_INIT_YIELD(spins);                                                   \
1003       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
1004       do {                                                                     \
1005         if (TCR_4(__kmp_nth) >                                                 \
1006             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
1007           KMP_YIELD(TRUE);                                                     \
1008         } else {                                                               \
1009           KMP_YIELD_SPIN(spins);                                               \
1010         }                                                                      \
1011         __kmp_spin_backoff(&backoff);                                          \
1012       } while (                                                                \
1013           KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
1014           !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
1015     }                                                                          \
1016     KMP_FSYNC_ACQUIRED(l);                                                     \
1017   }
1018 
1019 // Fast-path test tas lock
1020 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
1021   {                                                                            \
1022     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
1023     kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
1024     kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
1025     rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
1026          __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
1027   }
1028 
1029 // Fast-path release tas lock
1030 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
1031   { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1032 
1033 #if KMP_USE_FUTEX
1034 
1035 #include <sys/syscall.h>
1036 #include <unistd.h>
1037 #ifndef FUTEX_WAIT
1038 #define FUTEX_WAIT 0
1039 #endif
1040 #ifndef FUTEX_WAKE
1041 #define FUTEX_WAKE 1
1042 #endif
1043 
1044 // Fast-path acquire futex lock
1045 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
1046   {                                                                            \
1047     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1048     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
1049     KMP_MB();                                                                  \
1050     KMP_FSYNC_PREPARE(ftx);                                                    \
1051     kmp_int32 poll_val;                                                        \
1052     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
1053                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
1054                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
1055       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
1056       if (!cond) {                                                             \
1057         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
1058                                          poll_val |                            \
1059                                              KMP_LOCK_BUSY(1, futex))) {       \
1060           continue;                                                            \
1061         }                                                                      \
1062         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
1063       }                                                                        \
1064       kmp_int32 rc;                                                            \
1065       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
1066                         NULL, NULL, 0)) != 0) {                                \
1067         continue;                                                              \
1068       }                                                                        \
1069       gtid_code |= 1;                                                          \
1070     }                                                                          \
1071     KMP_FSYNC_ACQUIRED(ftx);                                                   \
1072   }
1073 
1074 // Fast-path test futex lock
1075 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
1076   {                                                                            \
1077     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1078     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
                                    KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {  \
1080       KMP_FSYNC_ACQUIRED(ftx);                                                 \
1081       rc = TRUE;                                                               \
1082     } else {                                                                   \
1083       rc = FALSE;                                                              \
1084     }                                                                          \
1085   }
1086 
1087 // Fast-path release futex lock
1088 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
1089   {                                                                            \
1090     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
1091     KMP_MB();                                                                  \
1092     KMP_FSYNC_RELEASING(ftx);                                                  \
1093     kmp_int32 poll_val =                                                       \
1094         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
1095     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
1096       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
1097               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
1098     }                                                                          \
1099     KMP_MB();                                                                  \
1100     KMP_YIELD_OVERSUB();                                                       \
1101   }
1102 
1103 #endif // KMP_USE_FUTEX
1104 
1105 #else // KMP_USE_DYNAMIC_LOCK
1106 
1107 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1108                                                       ident_t const *loc,
1109                                                       kmp_int32 gtid) {
1110   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1111 
1112   // Because of the double-check, the following load doesn't need to be volatile
1113   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1114 
1115   if (lck == NULL) {
1116     void *idx;
1117 
1118     // Allocate & initialize the lock.
1119     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1120     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1121     __kmp_init_user_lock_with_checks(lck);
1122     __kmp_set_user_lock_location(lck, loc);
1123 #if USE_ITT_BUILD
1124     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage
// of the underlying lock. It is the only place where we can guarantee it. There
// is a chance the lock will be destroyed with no usage, but that is not a
// problem, because this is not a real event seen by the user but rather sets a
// name for the object (lock). See more details in kmp_itt.h.
1130 #endif /* USE_ITT_BUILD */
1131 
1132     // Use a cmpxchg instruction to slam the start of the critical section with
1133     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1134     // and use the lock that the other thread allocated.
1135     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1136 
1137     if (status == 0) {
1138 // Deallocate the lock and reload the value.
1139 #if USE_ITT_BUILD
1140       __kmp_itt_critical_destroyed(lck);
1141 // Let ITT know the lock is destroyed and the same memory location may be reused
1142 // for another purpose.
1143 #endif /* USE_ITT_BUILD */
1144       __kmp_destroy_user_lock_with_checks(lck);
1145       __kmp_user_lock_free(&idx, gtid, lck);
1146       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1147       KMP_DEBUG_ASSERT(lck != NULL);
1148     }
1149   }
1150   return lck;
1151 }
1152 
1153 #endif // KMP_USE_DYNAMIC_LOCK
1154 
1155 /*!
1156 @ingroup WORK_SHARING
1157 @param loc  source location information.
1158 @param global_tid  global thread number.
1159 @param crit identity of the critical section. This could be a pointer to a lock
1160 associated with the critical section, or some other suitably unique value.
1161 
1162 Enter code protected by a `critical` construct.
1163 This function blocks until the executing thread can enter the critical section.
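
An illustrative sketch (names are hypothetical): each critical-section name is
typically backed by a zero-initialized kmp_critical_name that identifies the
section, and the body is bracketed by the enter/exit calls.
@code
static kmp_critical_name crit_name = {0}; // one per critical-section name
__kmpc_critical(&loc, gtid, &crit_name);
// ... body of the critical region ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode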
1164 */
1165 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1166                      kmp_critical_name *crit) {
1167 #if KMP_USE_DYNAMIC_LOCK
1168 #if OMPT_SUPPORT && OMPT_OPTIONAL
1169   OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
1171   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1172 #else
1173   KMP_COUNT_BLOCK(OMP_CRITICAL);
1174 #if OMPT_SUPPORT && OMPT_OPTIONAL
1175   ompt_state_t prev_state = ompt_state_undefined;
1176   ompt_thread_info_t ti;
1177 #endif
1178   kmp_user_lock_p lck;
1179 
1180   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1181   __kmp_assert_valid_gtid(global_tid);
1182 
1183   // TODO: add THR_OVHD_STATE
1184 
1185   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1186   KMP_CHECK_USER_LOCK_INIT();
1187 
1188   if ((__kmp_user_lock_kind == lk_tas) &&
1189       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1190     lck = (kmp_user_lock_p)crit;
1191   }
1192 #if KMP_USE_FUTEX
1193   else if ((__kmp_user_lock_kind == lk_futex) &&
1194            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1195     lck = (kmp_user_lock_p)crit;
1196   }
1197 #endif
1198   else { // ticket, queuing or drdpa
1199     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1200   }
1201 
1202   if (__kmp_env_consistency_check)
1203     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1204 
    // Since the critical directive binds to all threads, not just the current
    // team, we have to check this even if we are in a serialized team.
    // Also, even if we are the uber thread, we still have to acquire the lock,
    // as we have to contend with sibling threads.
1209 
1210 #if USE_ITT_BUILD
1211   __kmp_itt_critical_acquiring(lck);
1212 #endif /* USE_ITT_BUILD */
1213 #if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(global_tid);
1215   void *codeptr_ra = NULL;
1216   if (ompt_enabled.enabled) {
1217     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1218     /* OMPT state update */
1219     prev_state = ti.state;
1220     ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1221     ti.state = ompt_state_wait_critical;
1222 
1223     /* OMPT event callback */
    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1225     if (ompt_enabled.ompt_callback_mutex_acquire) {
1226       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1227           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1228           (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1229     }
1230   }
1231 #endif
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1234   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1235 
1236 #if USE_ITT_BUILD
1237   __kmp_itt_critical_acquired(lck);
1238 #endif /* USE_ITT_BUILD */
1239 #if OMPT_SUPPORT && OMPT_OPTIONAL
1240   if (ompt_enabled.enabled) {
1241     /* OMPT state update */
1242     ti.state = prev_state;
1243     ti.wait_id = 0;
1244 
1245     /* OMPT event callback */
1246     if (ompt_enabled.ompt_callback_mutex_acquired) {
1247       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1248           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1249     }
1250   }
1251 #endif
1252   KMP_POP_PARTITIONED_TIMER();
1253 
1254   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1255   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1256 #endif // KMP_USE_DYNAMIC_LOCK
1257 }
1258 
1259 #if KMP_USE_DYNAMIC_LOCK
1260 
1261 // Converts the given hint to an internal lock implementation
1262 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1263 #if KMP_USE_TSX
1264 #define KMP_TSX_LOCK(seq) lockseq_##seq
1265 #else
1266 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1267 #endif
1268 
1269 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1270 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1271 #else
1272 #define KMP_CPUINFO_RTM 0
1273 #endif
1274 
1275   // Hints that do not require further logic
1276   if (hint & kmp_lock_hint_hle)
1277     return KMP_TSX_LOCK(hle);
1278   if (hint & kmp_lock_hint_rtm)
1279     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_queuing) : __kmp_user_lock_seq;
1280   if (hint & kmp_lock_hint_adaptive)
1281     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1282 
1283   // Rule out conflicting hints first by returning the default lock
1284   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1285     return __kmp_user_lock_seq;
1286   if ((hint & omp_lock_hint_speculative) &&
1287       (hint & omp_lock_hint_nonspeculative))
1288     return __kmp_user_lock_seq;
1289 
1290   // Do not even consider speculation when it appears to be contended
1291   if (hint & omp_lock_hint_contended)
1292     return lockseq_queuing;
1293 
1294   // Uncontended lock without speculation
1295   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1296     return lockseq_tas;
1297 
1298   // Use RTM lock for speculation
1299   if (hint & omp_lock_hint_speculative)
1300     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm_spin) : __kmp_user_lock_seq;
1301 
1302   return __kmp_user_lock_seq;
1303 }
1304 
1305 #if OMPT_SUPPORT && OMPT_OPTIONAL
1306 #if KMP_USE_DYNAMIC_LOCK
1307 static kmp_mutex_impl_t
1308 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1309   if (user_lock) {
1310     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1311     case 0:
1312       break;
1313 #if KMP_USE_FUTEX
1314     case locktag_futex:
1315       return kmp_mutex_impl_queuing;
1316 #endif
1317     case locktag_tas:
1318       return kmp_mutex_impl_spin;
1319 #if KMP_USE_TSX
1320     case locktag_hle:
1321     case locktag_rtm_spin:
1322       return kmp_mutex_impl_speculative;
1323 #endif
1324     default:
1325       return kmp_mutex_impl_none;
1326     }
1327     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1328   }
1329   KMP_ASSERT(ilock);
1330   switch (ilock->type) {
1331 #if KMP_USE_TSX
1332   case locktag_adaptive:
1333   case locktag_rtm_queuing:
1334     return kmp_mutex_impl_speculative;
1335 #endif
1336   case locktag_nested_tas:
1337     return kmp_mutex_impl_spin;
1338 #if KMP_USE_FUTEX
1339   case locktag_nested_futex:
1340 #endif
1341   case locktag_ticket:
1342   case locktag_queuing:
1343   case locktag_drdpa:
1344   case locktag_nested_ticket:
1345   case locktag_nested_queuing:
1346   case locktag_nested_drdpa:
1347     return kmp_mutex_impl_queuing;
1348   default:
1349     return kmp_mutex_impl_none;
1350   }
1351 }
1352 #else
1353 // For locks without dynamic binding
1354 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1355   switch (__kmp_user_lock_kind) {
1356   case lk_tas:
1357     return kmp_mutex_impl_spin;
1358 #if KMP_USE_FUTEX
1359   case lk_futex:
1360 #endif
1361   case lk_ticket:
1362   case lk_queuing:
1363   case lk_drdpa:
1364     return kmp_mutex_impl_queuing;
1365 #if KMP_USE_TSX
1366   case lk_hle:
1367   case lk_rtm_queuing:
1368   case lk_rtm_spin:
1369   case lk_adaptive:
1370     return kmp_mutex_impl_speculative;
1371 #endif
1372   default:
1373     return kmp_mutex_impl_none;
1374   }
1375 }
1376 #endif // KMP_USE_DYNAMIC_LOCK
1377 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1378 
1379 /*!
1380 @ingroup WORK_SHARING
1381 @param loc  source location information.
1382 @param global_tid  global thread number.
1383 @param crit identity of the critical section. This could be a pointer to a lock
1384 associated with the critical section, or some other suitably unique value.
1385 @param hint the lock hint.
1386 
1387 Enter code protected by a `critical` construct with a hint. The hint value is
1388 used to suggest a lock implementation. This function blocks until the executing
1389 thread can enter the critical section unless the hint suggests use of
1390 speculative execution and the hardware supports it.
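
An illustrative sketch, assuming a `hint(omp_sync_hint_speculative)` clause on
the critical construct (`loc`, `gtid` and `crit_name` as in the plain
__kmpc_critical example):
@code
__kmpc_critical_with_hint(&loc, gtid, &crit_name, omp_lock_hint_speculative);
// ... body of the critical region ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode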
1391 */
1392 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1393                                kmp_critical_name *crit, uint32_t hint) {
1394   KMP_COUNT_BLOCK(OMP_CRITICAL);
1395   kmp_user_lock_p lck;
1396 #if OMPT_SUPPORT && OMPT_OPTIONAL
1397   ompt_state_t prev_state = ompt_state_undefined;
1398   ompt_thread_info_t ti;
  // This is the case if called from __kmpc_critical:
1400   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1401   if (!codeptr)
1402     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1403 #endif
1404 
1405   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1406   __kmp_assert_valid_gtid(global_tid);
1407 
1408   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1409   // Check if it is initialized.
1410   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1411   if (*lk == 0) {
1412     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1413     if (KMP_IS_D_LOCK(lckseq)) {
1414       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1415                                   KMP_GET_D_TAG(lckseq));
1416     } else {
1417       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1418     }
1419   }
1420   // Branch for accessing the actual lock object and set operation. This
1421   // branching is inevitable since this lock initialization does not follow the
1422   // normal dispatch path (lock table is not used).
1423   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1424     lck = (kmp_user_lock_p)lk;
1425     if (__kmp_env_consistency_check) {
1426       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1427                       __kmp_map_hint_to_lock(hint));
1428     }
1429 #if USE_ITT_BUILD
1430     __kmp_itt_critical_acquiring(lck);
1431 #endif
1432 #if OMPT_SUPPORT && OMPT_OPTIONAL
1433     if (ompt_enabled.enabled) {
1434       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1435       /* OMPT state update */
1436       prev_state = ti.state;
1437       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1438       ti.state = ompt_state_wait_critical;
1439 
1440       /* OMPT event callback */
1441       if (ompt_enabled.ompt_callback_mutex_acquire) {
1442         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1443             ompt_mutex_critical, (unsigned int)hint,
1444             __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1445             codeptr);
1446       }
1447     }
1448 #endif
1449 #if KMP_USE_INLINED_TAS
1450     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1451       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1452     } else
1453 #elif KMP_USE_INLINED_FUTEX
1454     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1455       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1456     } else
1457 #endif
1458     {
1459       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1460     }
1461   } else {
1462     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1463     lck = ilk->lock;
1464     if (__kmp_env_consistency_check) {
1465       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1466                       __kmp_map_hint_to_lock(hint));
1467     }
1468 #if USE_ITT_BUILD
1469     __kmp_itt_critical_acquiring(lck);
1470 #endif
1471 #if OMPT_SUPPORT && OMPT_OPTIONAL
1472     if (ompt_enabled.enabled) {
1473       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1474       /* OMPT state update */
1475       prev_state = ti.state;
1476       ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1477       ti.state = ompt_state_wait_critical;
1478 
1479       /* OMPT event callback */
1480       if (ompt_enabled.ompt_callback_mutex_acquire) {
1481         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1482             ompt_mutex_critical, (unsigned int)hint,
1483             __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1484             codeptr);
1485       }
1486     }
1487 #endif
1488     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1489   }
1490   KMP_POP_PARTITIONED_TIMER();
1491 
1492 #if USE_ITT_BUILD
1493   __kmp_itt_critical_acquired(lck);
1494 #endif /* USE_ITT_BUILD */
1495 #if OMPT_SUPPORT && OMPT_OPTIONAL
1496   if (ompt_enabled.enabled) {
1497     /* OMPT state update */
1498     ti.state = prev_state;
1499     ti.wait_id = 0;
1500 
1501     /* OMPT event callback */
1502     if (ompt_enabled.ompt_callback_mutex_acquired) {
1503       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1504           ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1505     }
1506   }
1507 #endif
1508 
1509   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
  KA_TRACE(15, ("__kmpc_critical_with_hint: done T#%d\n", global_tid));
1511 } // __kmpc_critical_with_hint
1512 
1513 #endif // KMP_USE_DYNAMIC_LOCK
1514 
1515 /*!
1516 @ingroup WORK_SHARING
1517 @param loc  source location information.
@param global_tid  global thread number.
1519 @param crit identity of the critical section. This could be a pointer to a lock
1520 associated with the critical section, or some other suitably unique value.
1521 
1522 Leave a critical section, releasing any lock that was held during its execution.
1523 */
1524 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1525                          kmp_critical_name *crit) {
1526   kmp_user_lock_p lck;
1527 
1528   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1529 
1530 #if KMP_USE_DYNAMIC_LOCK
1531   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1532     lck = (kmp_user_lock_p)crit;
1533     KMP_ASSERT(lck != NULL);
1534     if (__kmp_env_consistency_check) {
1535       __kmp_pop_sync(global_tid, ct_critical, loc);
1536     }
1537 #if USE_ITT_BUILD
1538     __kmp_itt_critical_releasing(lck);
1539 #endif
1540 #if KMP_USE_INLINED_TAS
1541     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1542       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1543     } else
1544 #elif KMP_USE_INLINED_FUTEX
1545     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1546       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1547     } else
1548 #endif
1549     {
1550       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1551     }
1552   } else {
1553     kmp_indirect_lock_t *ilk =
1554         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1555     KMP_ASSERT(ilk != NULL);
1556     lck = ilk->lock;
1557     if (__kmp_env_consistency_check) {
1558       __kmp_pop_sync(global_tid, ct_critical, loc);
1559     }
1560 #if USE_ITT_BUILD
1561     __kmp_itt_critical_releasing(lck);
1562 #endif
1563     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1564   }
1565 
1566 #else // KMP_USE_DYNAMIC_LOCK
1567 
1568   if ((__kmp_user_lock_kind == lk_tas) &&
1569       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1570     lck = (kmp_user_lock_p)crit;
1571   }
1572 #if KMP_USE_FUTEX
1573   else if ((__kmp_user_lock_kind == lk_futex) &&
1574            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1575     lck = (kmp_user_lock_p)crit;
1576   }
1577 #endif
1578   else { // ticket, queuing or drdpa
1579     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1580   }
1581 
1582   KMP_ASSERT(lck != NULL);
1583 
1584   if (__kmp_env_consistency_check)
1585     __kmp_pop_sync(global_tid, ct_critical, loc);
1586 
1587 #if USE_ITT_BUILD
1588   __kmp_itt_critical_releasing(lck);
1589 #endif /* USE_ITT_BUILD */
  // The value of 'crit' should be usable as the critical_id of the critical
  // section directive.
1592   __kmp_release_user_lock_with_checks(lck, global_tid);
1593 
1594 #endif // KMP_USE_DYNAMIC_LOCK
1595 
1596 #if OMPT_SUPPORT && OMPT_OPTIONAL
1597   /* OMPT release event triggers after lock is released; place here to trigger
1598    * for all #if branches */
1599   OMPT_STORE_RETURN_ADDRESS(global_tid);
1600   if (ompt_enabled.ompt_callback_mutex_released) {
1601     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1602         ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1603         OMPT_LOAD_RETURN_ADDRESS(0));
1604   }
1605 #endif
1606 
1607   KMP_POP_PARTITIONED_TIMER();
1608   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1609 }
1610 
1611 /*!
1612 @ingroup SYNCHRONIZATION
1613 @param loc source location information
1614 @param global_tid thread id.
1615 @return one if the thread should execute the master block, zero otherwise
1616 
1617 Start execution of a combined barrier and master. The barrier is executed inside
1618 this function.
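
For illustration, the calling pattern a compiler is expected to emit
(<tt>loc</tt> and <tt>gtid</tt> stand for the usual arguments):
@code
if (__kmpc_barrier_master(&loc, gtid)) {
  // code executed only by the thread that owns the master block
  __kmpc_end_barrier_master(&loc, gtid);
}
@endcode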
1619 */
1620 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1621   int status;
1622   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1623   __kmp_assert_valid_gtid(global_tid);
1624 
1625   if (!TCR_4(__kmp_init_parallel))
1626     __kmp_parallel_initialize();
1627 
1628   __kmp_resume_if_soft_paused();
1629 
1630   if (__kmp_env_consistency_check)
1631     __kmp_check_barrier(global_tid, ct_barrier, loc);
1632 
1633 #if OMPT_SUPPORT
1634   ompt_frame_t *ompt_frame;
1635   if (ompt_enabled.enabled) {
1636     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1637     if (ompt_frame->enter_frame.ptr == NULL)
1638       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1639   }
1640   OMPT_STORE_RETURN_ADDRESS(global_tid);
1641 #endif
1642 #if USE_ITT_NOTIFY
1643   __kmp_threads[global_tid]->th.th_ident = loc;
1644 #endif
1645   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1646 #if OMPT_SUPPORT && OMPT_OPTIONAL
1647   if (ompt_enabled.enabled) {
1648     ompt_frame->enter_frame = ompt_data_none;
1649   }
1650 #endif
1651 
1652   return (status != 0) ? 0 : 1;
1653 }
1654 
1655 /*!
1656 @ingroup SYNCHRONIZATION
1657 @param loc source location information
1658 @param global_tid thread id.
1659 
1660 Complete the execution of a combined barrier and master. This function should
1661 only be called at the completion of the <tt>master</tt> code. Other threads will
1662 still be waiting at the barrier and this call releases them.
1663 */
1664 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1665   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1666   __kmp_assert_valid_gtid(global_tid);
1667   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1668 }
1669 
1670 /*!
1671 @ingroup SYNCHRONIZATION
1672 @param loc source location information
1673 @param global_tid thread id.
1674 @return one if the thread should execute the master block, zero otherwise
1675 
1676 Start execution of a combined barrier and master(nowait) construct.
1677 The barrier is executed inside this function.
There is no equivalent "end" function: because of the nowait, the other
threads do not wait for the master block to finish, so there is nothing to
release afterwards; the bookkeeping that __kmpc_end_master would otherwise
perform is done inside this call.
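
An illustrative lowering (<tt>loc</tt> and <tt>gtid</tt> are placeholders for
the usual arguments):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
  // master block; no matching "end" call is emitted
}
@endcode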
1679 */
1680 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1681   kmp_int32 ret;
1682   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1683   __kmp_assert_valid_gtid(global_tid);
1684 
1685   if (!TCR_4(__kmp_init_parallel))
1686     __kmp_parallel_initialize();
1687 
1688   __kmp_resume_if_soft_paused();
1689 
1690   if (__kmp_env_consistency_check) {
1691     if (loc == 0) {
1692       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1693     }
1694     __kmp_check_barrier(global_tid, ct_barrier, loc);
1695   }
1696 
1697 #if OMPT_SUPPORT
1698   ompt_frame_t *ompt_frame;
1699   if (ompt_enabled.enabled) {
1700     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1701     if (ompt_frame->enter_frame.ptr == NULL)
1702       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1703   }
1704   OMPT_STORE_RETURN_ADDRESS(global_tid);
1705 #endif
1706 #if USE_ITT_NOTIFY
1707   __kmp_threads[global_tid]->th.th_ident = loc;
1708 #endif
1709   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1710 #if OMPT_SUPPORT && OMPT_OPTIONAL
1711   if (ompt_enabled.enabled) {
1712     ompt_frame->enter_frame = ompt_data_none;
1713   }
1714 #endif
1715 
1716   ret = __kmpc_master(loc, global_tid);
1717 
1718   if (__kmp_env_consistency_check) {
1719     /*  there's no __kmpc_end_master called; so the (stats) */
1720     /*  actions of __kmpc_end_master are done here          */
1721     if (ret) {
1722       /* only one thread should do the pop since only */
1723       /* one did the push (see __kmpc_master())       */
1724       __kmp_pop_sync(global_tid, ct_master, loc);
1725     }
1726   }
1727 
1728   return (ret);
1729 }
1730 
1731 /* The BARRIER for a SINGLE process section is always explicit   */
1732 /*!
1733 @ingroup WORK_SHARING
1734 @param loc  source location information
1735 @param global_tid  global thread number
1736 @return One if this thread should execute the single construct, zero otherwise.
1737 
1738 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
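
For illustration, a <tt>single</tt> construct without a nowait clause might be
lowered roughly as follows (<tt>loc</tt> and <tt>gtid</tt> are placeholders):
@code
if (__kmpc_single(&loc, gtid)) {
  // body of the single construct
  __kmpc_end_single(&loc, gtid);
}
__kmpc_barrier(&loc, gtid); // explicit barrier, omitted when nowait is present
@endcode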
1741 */
1742 
1743 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1744   __kmp_assert_valid_gtid(global_tid);
1745   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1746 
1747   if (rc) {
1748     // We are going to execute the single statement, so we should count it.
1749     KMP_COUNT_BLOCK(OMP_SINGLE);
1750     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1751   }
1752 
1753 #if OMPT_SUPPORT && OMPT_OPTIONAL
1754   kmp_info_t *this_thr = __kmp_threads[global_tid];
1755   kmp_team_t *team = this_thr->th.th_team;
1756   int tid = __kmp_tid_from_gtid(global_tid);
1757 
1758   if (ompt_enabled.enabled) {
1759     if (rc) {
1760       if (ompt_enabled.ompt_callback_work) {
1761         ompt_callbacks.ompt_callback(ompt_callback_work)(
1762             ompt_work_single_executor, ompt_scope_begin,
1763             &(team->t.ompt_team_info.parallel_data),
1764             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1765             1, OMPT_GET_RETURN_ADDRESS(0));
1766       }
1767     } else {
1768       if (ompt_enabled.ompt_callback_work) {
1769         ompt_callbacks.ompt_callback(ompt_callback_work)(
1770             ompt_work_single_other, ompt_scope_begin,
1771             &(team->t.ompt_team_info.parallel_data),
1772             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1773             1, OMPT_GET_RETURN_ADDRESS(0));
1774         ompt_callbacks.ompt_callback(ompt_callback_work)(
1775             ompt_work_single_other, ompt_scope_end,
1776             &(team->t.ompt_team_info.parallel_data),
1777             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1778             1, OMPT_GET_RETURN_ADDRESS(0));
1779       }
1780     }
1781   }
1782 #endif
1783 
1784   return rc;
1785 }
1786 
1787 /*!
1788 @ingroup WORK_SHARING
1789 @param loc  source location information
1790 @param global_tid  global thread number
1791 
1792 Mark the end of a <tt>single</tt> construct.  This function should
1793 only be called by the thread that executed the block of code protected
1794 by the `single` construct.
1795 */
1796 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1797   __kmp_assert_valid_gtid(global_tid);
1798   __kmp_exit_single(global_tid);
1799   KMP_POP_PARTITIONED_TIMER();
1800 
1801 #if OMPT_SUPPORT && OMPT_OPTIONAL
1802   kmp_info_t *this_thr = __kmp_threads[global_tid];
1803   kmp_team_t *team = this_thr->th.th_team;
1804   int tid = __kmp_tid_from_gtid(global_tid);
1805 
1806   if (ompt_enabled.ompt_callback_work) {
1807     ompt_callbacks.ompt_callback(ompt_callback_work)(
1808         ompt_work_single_executor, ompt_scope_end,
1809         &(team->t.ompt_team_info.parallel_data),
1810         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1811         OMPT_GET_RETURN_ADDRESS(0));
1812   }
1813 #endif
1814 }
1815 
1816 /*!
1817 @ingroup WORK_SHARING
1818 @param loc Source location
1819 @param global_tid Global thread id
1820 
1821 Mark the end of a statically scheduled loop.
1822 */
1823 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1824   KMP_POP_PARTITIONED_TIMER();
1825   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1826 
1827 #if OMPT_SUPPORT && OMPT_OPTIONAL
1828   if (ompt_enabled.ompt_callback_work) {
1829     ompt_work_t ompt_work_type = ompt_work_loop;
1830     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1831     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1832     // Determine workshare type
1833     if (loc != NULL) {
1834       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1835         ompt_work_type = ompt_work_loop;
1836       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1837         ompt_work_type = ompt_work_sections;
1838       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1839         ompt_work_type = ompt_work_distribute;
1840       } else {
1841         // use default set above.
1842         // a warning about this case is provided in __kmpc_for_static_init
1843       }
1844       KMP_DEBUG_ASSERT(ompt_work_type);
1845     }
1846     ompt_callbacks.ompt_callback(ompt_callback_work)(
1847         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1848         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1849   }
1850 #endif
1851   if (__kmp_env_consistency_check)
1852     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1853 }
1854 
// User routines which take C-style arguments (call by value), in contrast to
// the Fortran-style entry points, which take their arguments by reference
1857 
1858 void ompc_set_num_threads(int arg) {
1859   // !!!!! TODO: check the per-task binding
1860   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1861 }
1862 
1863 void ompc_set_dynamic(int flag) {
1864   kmp_info_t *thread;
1865 
1866   /* For the thread-private implementation of the internal controls */
1867   thread = __kmp_entry_thread();
1868 
1869   __kmp_save_internal_controls(thread);
1870 
1871   set__dynamic(thread, flag ? true : false);
1872 }
1873 
1874 void ompc_set_nested(int flag) {
1875   kmp_info_t *thread;
1876 
1877   /* For the thread-private internal controls implementation */
1878   thread = __kmp_entry_thread();
1879 
1880   __kmp_save_internal_controls(thread);
1881 
1882   set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1883 }
1884 
1885 void ompc_set_max_active_levels(int max_active_levels) {
1886   /* TO DO */
1887   /* we want per-task implementation of this internal control */
1888 
1889   /* For the per-thread internal controls implementation */
1890   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1891 }
1892 
1893 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1894   // !!!!! TODO: check the per-task binding
1895   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1896 }
1897 
1898 int ompc_get_ancestor_thread_num(int level) {
1899   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1900 }
1901 
1902 int ompc_get_team_size(int level) {
1903   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1904 }
1905 
1906 /* OpenMP 5.0 Affinity Format API */
1907 
1908 void ompc_set_affinity_format(char const *format) {
1909   if (!__kmp_init_serial) {
1910     __kmp_serial_initialize();
1911   }
1912   __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1913                          format, KMP_STRLEN(format) + 1);
1914 }
1915 
1916 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1917   size_t format_size;
1918   if (!__kmp_init_serial) {
1919     __kmp_serial_initialize();
1920   }
1921   format_size = KMP_STRLEN(__kmp_affinity_format);
1922   if (buffer && size) {
1923     __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1924                            format_size + 1);
1925   }
1926   return format_size;
1927 }
1928 
1929 void ompc_display_affinity(char const *format) {
1930   int gtid;
1931   if (!TCR_4(__kmp_init_middle)) {
1932     __kmp_middle_initialize();
1933   }
1934   gtid = __kmp_get_gtid();
1935   __kmp_aux_display_affinity(gtid, format);
1936 }
1937 
1938 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1939                              char const *format) {
1940   int gtid;
1941   size_t num_required;
1942   kmp_str_buf_t capture_buf;
1943   if (!TCR_4(__kmp_init_middle)) {
1944     __kmp_middle_initialize();
1945   }
1946   gtid = __kmp_get_gtid();
1947   __kmp_str_buf_init(&capture_buf);
1948   num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1949   if (buffer && buf_size) {
1950     __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1951                            capture_buf.used + 1);
1952   }
1953   __kmp_str_buf_free(&capture_buf);
1954   return num_required;
1955 }
1956 
1957 void kmpc_set_stacksize(int arg) {
1958   // __kmp_aux_set_stacksize initializes the library if needed
1959   __kmp_aux_set_stacksize(arg);
1960 }
1961 
1962 void kmpc_set_stacksize_s(size_t arg) {
1963   // __kmp_aux_set_stacksize initializes the library if needed
1964   __kmp_aux_set_stacksize(arg);
1965 }
1966 
1967 void kmpc_set_blocktime(int arg) {
1968   int gtid, tid;
1969   kmp_info_t *thread;
1970 
1971   gtid = __kmp_entry_gtid();
1972   tid = __kmp_tid_from_gtid(gtid);
1973   thread = __kmp_thread_from_gtid(gtid);
1974 
1975   __kmp_aux_set_blocktime(arg, thread, tid);
1976 }
1977 
1978 void kmpc_set_library(int arg) {
1979   // __kmp_user_set_library initializes the library if needed
1980   __kmp_user_set_library((enum library_type)arg);
1981 }
1982 
1983 void kmpc_set_defaults(char const *str) {
1984   // __kmp_aux_set_defaults initializes the library if needed
1985   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1986 }
1987 
1988 void kmpc_set_disp_num_buffers(int arg) {
1989   // ignore after initialization because some teams have already
1990   // allocated dispatch buffers
1991   if (__kmp_init_serial == 0 && arg > 0)
1992     __kmp_dispatch_num_buffers = arg;
1993 }
1994 
1995 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1996 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1997   return -1;
1998 #else
1999   if (!TCR_4(__kmp_init_middle)) {
2000     __kmp_middle_initialize();
2001   }
2002   return __kmp_aux_set_affinity_mask_proc(proc, mask);
2003 #endif
2004 }
2005 
2006 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
2007 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2008   return -1;
2009 #else
2010   if (!TCR_4(__kmp_init_middle)) {
2011     __kmp_middle_initialize();
2012   }
2013   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
2014 #endif
2015 }
2016 
2017 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
2018 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
2019   return -1;
2020 #else
2021   if (!TCR_4(__kmp_init_middle)) {
2022     __kmp_middle_initialize();
2023   }
2024   return __kmp_aux_get_affinity_mask_proc(proc, mask);
2025 #endif
2026 }
2027 
2028 /* -------------------------------------------------------------------------- */
2029 /*!
2030 @ingroup THREADPRIVATE
2031 @param loc       source location information
2032 @param gtid      global thread number
2033 @param cpy_size  size of the cpy_data buffer
2034 @param cpy_data  pointer to data to be copied
2035 @param cpy_func  helper function to call for copying data
2036 @param didit     flag variable: 1=single thread; 0=not single thread
2037 
2038 __kmpc_copyprivate implements the interface for the private data broadcast
2039 needed for the copyprivate clause associated with a single region in an
2040 OpenMP<sup>*</sup> program (both C and Fortran).
2041 All threads participating in the parallel region call this routine.
2042 One of the threads (called the single thread) should have the <tt>didit</tt>
2043 variable set to 1 and all other threads should have that variable set to 0.
2044 All threads pass a pointer to a data buffer (cpy_data) that they have built.
2045 
2046 The OpenMP specification forbids the use of nowait on the single region when a
2047 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
2048 barrier internally to avoid race conditions, so the code generation for the
2049 single region should avoid generating a barrier after the call to @ref
2050 __kmpc_copyprivate.
2051 
2052 The <tt>gtid</tt> parameter is the global thread id for the current thread.
2053 The <tt>loc</tt> parameter is a pointer to source location information.
2054 
Internal implementation: the single thread first copies the address of its
cpy_data buffer to a team-private location; then, after a barrier, each of the
other threads calls the function pointed to by cpy_func, passing its own
cpy_data pointer as the destination and the single thread's cpy_data pointer
as the source.
2059 
2060 The cpy_func routine used for the copy and the contents of the data area defined
2061 by cpy_data and cpy_size may be built in any fashion that will allow the copy
2062 to be done. For instance, the cpy_data buffer can hold the actual data to be
2063 copied or it may hold a list of pointers to the data. The cpy_func routine must
2064 interpret the cpy_data buffer appropriately.
2065 
2066 The interface to cpy_func is as follows:
2067 @code
2068 void cpy_func( void *destination, void *source )
2069 @endcode
2070 where void *destination is the cpy_data pointer for the thread being copied to
2071 and void *source is the cpy_data pointer for the thread being copied from.
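
For illustration only, a compiler might lower
`#pragma omp single copyprivate(x)` for a private <tt>int x</tt> roughly as
below; the helper <tt>copy_x</tt> and the value stored into <tt>x</tt> are
hypothetical:
@code
// helper generated for copyprivate(x)
static void copy_x(void *dst, void *src) { *(int *)dst = *(int *)src; }

// inside the outlined parallel region (loc/gtid as usual):
int x; // each thread's private copy
kmp_int32 didit = 0;
if (__kmpc_single(&loc, gtid)) {
  x = 42; // produced by the single thread only
  didit = 1;
  __kmpc_end_single(&loc, gtid);
}
__kmpc_copyprivate(&loc, gtid, sizeof(x), &x, copy_x, didit);
// no extra barrier is emitted afterwards; the call synchronizes internally
@endcode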
2072 */
2073 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
2074                         void *cpy_data, void (*cpy_func)(void *, void *),
2075                         kmp_int32 didit) {
2076   void **data_ptr;
2077   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
2078   __kmp_assert_valid_gtid(gtid);
2079 
2080   KMP_MB();
2081 
2082   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
2083 
2084   if (__kmp_env_consistency_check) {
2085     if (loc == 0) {
2086       KMP_WARNING(ConstructIdentInvalid);
2087     }
2088   }
2089 
2090   // ToDo: Optimize the following two barriers into some kind of split barrier
2091 
2092   if (didit)
2093     *data_ptr = cpy_data;
2094 
2095 #if OMPT_SUPPORT
2096   ompt_frame_t *ompt_frame;
2097   if (ompt_enabled.enabled) {
2098     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
2099     if (ompt_frame->enter_frame.ptr == NULL)
2100       ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2101   }
2102   OMPT_STORE_RETURN_ADDRESS(gtid);
2103 #endif
2104 /* This barrier is not a barrier region boundary */
2105 #if USE_ITT_NOTIFY
2106   __kmp_threads[gtid]->th.th_ident = loc;
2107 #endif
2108   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2109 
2110   if (!didit)
2111     (*cpy_func)(cpy_data, *data_ptr);
2112 
2113   // Consider next barrier a user-visible barrier for barrier region boundaries
2114   // Nesting checks are already handled by the single construct checks
2115   {
2116 #if OMPT_SUPPORT
2117     OMPT_STORE_RETURN_ADDRESS(gtid);
2118 #endif
2119 #if USE_ITT_NOTIFY
    // TODO: check if this is needed (e.g. tasks can overwrite the location)
    __kmp_threads[gtid]->th.th_ident = loc;
2122 #endif
2123     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2124 #if OMPT_SUPPORT && OMPT_OPTIONAL
2125     if (ompt_enabled.enabled) {
2126       ompt_frame->enter_frame = ompt_data_none;
2127     }
2128 #endif
2129   }
2130 }
2131 
2132 /* -------------------------------------------------------------------------- */
2133 
2134 #define INIT_LOCK __kmp_init_user_lock_with_checks
2135 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
2136 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
2137 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
2138 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
2139 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
2140   __kmp_acquire_nested_user_lock_with_checks_timed
2141 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
2142 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
2143 #define TEST_LOCK __kmp_test_user_lock_with_checks
2144 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
2145 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
2146 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2147 
2148 // TODO: Make check abort messages use location info & pass it into
2149 // with_checks routines
2150 
2151 #if KMP_USE_DYNAMIC_LOCK
2152 
2153 // internal lock initializer
2154 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2155                                                     kmp_dyna_lockseq_t seq) {
2156   if (KMP_IS_D_LOCK(seq)) {
2157     KMP_INIT_D_LOCK(lock, seq);
2158 #if USE_ITT_BUILD
2159     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2160 #endif
2161   } else {
2162     KMP_INIT_I_LOCK(lock, seq);
2163 #if USE_ITT_BUILD
2164     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2165     __kmp_itt_lock_creating(ilk->lock, loc);
2166 #endif
2167   }
2168 }
2169 
2170 // internal nest lock initializer
2171 static __forceinline void
2172 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2173                                kmp_dyna_lockseq_t seq) {
2174 #if KMP_USE_TSX
2175   // Don't have nested lock implementation for speculative locks
2176   if (seq == lockseq_hle || seq == lockseq_rtm_queuing ||
2177       seq == lockseq_rtm_spin || seq == lockseq_adaptive)
2178     seq = __kmp_user_lock_seq;
2179 #endif
2180   switch (seq) {
2181   case lockseq_tas:
2182     seq = lockseq_nested_tas;
2183     break;
2184 #if KMP_USE_FUTEX
2185   case lockseq_futex:
2186     seq = lockseq_nested_futex;
2187     break;
2188 #endif
2189   case lockseq_ticket:
2190     seq = lockseq_nested_ticket;
2191     break;
2192   case lockseq_queuing:
2193     seq = lockseq_nested_queuing;
2194     break;
2195   case lockseq_drdpa:
2196     seq = lockseq_nested_drdpa;
2197     break;
2198   default:
2199     seq = lockseq_nested_queuing;
2200   }
2201   KMP_INIT_I_LOCK(lock, seq);
2202 #if USE_ITT_BUILD
2203   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2204   __kmp_itt_lock_creating(ilk->lock, loc);
2205 #endif
2206 }
2207 
2208 /* initialize the lock with a hint */
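/* A user-level sketch of the API this entry point supports (illustrative
   only):

     omp_lock_t l;
     omp_init_lock_with_hint(&l, omp_lock_hint_speculative);
     omp_set_lock(&l);
     // ... work protected by the lock ...
     omp_unset_lock(&l);
     omp_destroy_lock(&l);
*/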
2209 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2210                                 uintptr_t hint) {
2211   KMP_DEBUG_ASSERT(__kmp_init_serial);
2212   if (__kmp_env_consistency_check && user_lock == NULL) {
2213     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2214   }
2215 
2216   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2217 
2218 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if we were called from omp_init_lock_with_hint:
2220   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2221   if (!codeptr)
2222     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2223   if (ompt_enabled.ompt_callback_lock_init) {
2224     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2225         ompt_mutex_lock, (omp_lock_hint_t)hint,
2226         __ompt_get_mutex_impl_type(user_lock),
2227         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2228   }
2229 #endif
2230 }
2231 
/* initialize the nested lock with a hint */
2233 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2234                                      void **user_lock, uintptr_t hint) {
2235   KMP_DEBUG_ASSERT(__kmp_init_serial);
2236   if (__kmp_env_consistency_check && user_lock == NULL) {
2237     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2238   }
2239 
2240   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2241 
2242 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if we were called from omp_init_nest_lock_with_hint:
2244   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2245   if (!codeptr)
2246     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2247   if (ompt_enabled.ompt_callback_lock_init) {
2248     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2249         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2250         __ompt_get_mutex_impl_type(user_lock),
2251         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2252   }
2253 #endif
2254 }
2255 
2256 #endif // KMP_USE_DYNAMIC_LOCK
2257 
2258 /* initialize the lock */
2259 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2260 #if KMP_USE_DYNAMIC_LOCK
2261 
2262   KMP_DEBUG_ASSERT(__kmp_init_serial);
2263   if (__kmp_env_consistency_check && user_lock == NULL) {
2264     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2265   }
2266   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2267 
2268 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2270   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2271   if (!codeptr)
2272     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2273   if (ompt_enabled.ompt_callback_lock_init) {
2274     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2275         ompt_mutex_lock, omp_lock_hint_none,
2276         __ompt_get_mutex_impl_type(user_lock),
2277         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2278   }
2279 #endif
2280 
2281 #else // KMP_USE_DYNAMIC_LOCK
2282 
2283   static char const *const func = "omp_init_lock";
2284   kmp_user_lock_p lck;
2285   KMP_DEBUG_ASSERT(__kmp_init_serial);
2286 
2287   if (__kmp_env_consistency_check) {
2288     if (user_lock == NULL) {
2289       KMP_FATAL(LockIsUninitialized, func);
2290     }
2291   }
2292 
2293   KMP_CHECK_USER_LOCK_INIT();
2294 
2295   if ((__kmp_user_lock_kind == lk_tas) &&
2296       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2297     lck = (kmp_user_lock_p)user_lock;
2298   }
2299 #if KMP_USE_FUTEX
2300   else if ((__kmp_user_lock_kind == lk_futex) &&
2301            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2302     lck = (kmp_user_lock_p)user_lock;
2303   }
2304 #endif
2305   else {
2306     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2307   }
2308   INIT_LOCK(lck);
2309   __kmp_set_user_lock_location(lck, loc);
2310 
2311 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2313   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2314   if (!codeptr)
2315     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2316   if (ompt_enabled.ompt_callback_lock_init) {
2317     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2318         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2319         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2320   }
2321 #endif
2322 
2323 #if USE_ITT_BUILD
2324   __kmp_itt_lock_creating(lck);
2325 #endif /* USE_ITT_BUILD */
2326 
2327 #endif // KMP_USE_DYNAMIC_LOCK
2328 } // __kmpc_init_lock
2329 
/* initialize the nested lock */
2331 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2332 #if KMP_USE_DYNAMIC_LOCK
2333 
2334   KMP_DEBUG_ASSERT(__kmp_init_serial);
2335   if (__kmp_env_consistency_check && user_lock == NULL) {
2336     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2337   }
2338   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2339 
2340 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2342   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2343   if (!codeptr)
2344     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2345   if (ompt_enabled.ompt_callback_lock_init) {
2346     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2347         ompt_mutex_nest_lock, omp_lock_hint_none,
2348         __ompt_get_mutex_impl_type(user_lock),
2349         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2350   }
2351 #endif
2352 
2353 #else // KMP_USE_DYNAMIC_LOCK
2354 
2355   static char const *const func = "omp_init_nest_lock";
2356   kmp_user_lock_p lck;
2357   KMP_DEBUG_ASSERT(__kmp_init_serial);
2358 
2359   if (__kmp_env_consistency_check) {
2360     if (user_lock == NULL) {
2361       KMP_FATAL(LockIsUninitialized, func);
2362     }
2363   }
2364 
2365   KMP_CHECK_USER_LOCK_INIT();
2366 
2367   if ((__kmp_user_lock_kind == lk_tas) &&
2368       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2369        OMP_NEST_LOCK_T_SIZE)) {
2370     lck = (kmp_user_lock_p)user_lock;
2371   }
2372 #if KMP_USE_FUTEX
2373   else if ((__kmp_user_lock_kind == lk_futex) &&
2374            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2375             OMP_NEST_LOCK_T_SIZE)) {
2376     lck = (kmp_user_lock_p)user_lock;
2377   }
2378 #endif
2379   else {
2380     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2381   }
2382 
2383   INIT_NESTED_LOCK(lck);
2384   __kmp_set_user_lock_location(lck, loc);
2385 
2386 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2388   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2389   if (!codeptr)
2390     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2391   if (ompt_enabled.ompt_callback_lock_init) {
2392     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2393         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2394         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2395   }
2396 #endif
2397 
2398 #if USE_ITT_BUILD
2399   __kmp_itt_lock_creating(lck);
2400 #endif /* USE_ITT_BUILD */
2401 
2402 #endif // KMP_USE_DYNAMIC_LOCK
2403 } // __kmpc_init_nest_lock
2404 
2405 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2406 #if KMP_USE_DYNAMIC_LOCK
2407 
2408 #if USE_ITT_BUILD
2409   kmp_user_lock_p lck;
2410   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2411     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2412   } else {
2413     lck = (kmp_user_lock_p)user_lock;
2414   }
2415   __kmp_itt_lock_destroyed(lck);
2416 #endif
2417 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2419   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2420   if (!codeptr)
2421     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2422   if (ompt_enabled.ompt_callback_lock_destroy) {
2423     kmp_user_lock_p lck;
2424     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2425       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2426     } else {
2427       lck = (kmp_user_lock_p)user_lock;
2428     }
2429     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2430         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2431   }
2432 #endif
2433   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2434 #else
2435   kmp_user_lock_p lck;
2436 
2437   if ((__kmp_user_lock_kind == lk_tas) &&
2438       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2439     lck = (kmp_user_lock_p)user_lock;
2440   }
2441 #if KMP_USE_FUTEX
2442   else if ((__kmp_user_lock_kind == lk_futex) &&
2443            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2444     lck = (kmp_user_lock_p)user_lock;
2445   }
2446 #endif
2447   else {
2448     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2449   }
2450 
2451 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2453   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2454   if (!codeptr)
2455     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2456   if (ompt_enabled.ompt_callback_lock_destroy) {
2457     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2458         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2459   }
2460 #endif
2461 
2462 #if USE_ITT_BUILD
2463   __kmp_itt_lock_destroyed(lck);
2464 #endif /* USE_ITT_BUILD */
2465   DESTROY_LOCK(lck);
2466 
2467   if ((__kmp_user_lock_kind == lk_tas) &&
2468       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2469     ;
2470   }
2471 #if KMP_USE_FUTEX
2472   else if ((__kmp_user_lock_kind == lk_futex) &&
2473            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2474     ;
2475   }
2476 #endif
2477   else {
2478     __kmp_user_lock_free(user_lock, gtid, lck);
2479   }
2480 #endif // KMP_USE_DYNAMIC_LOCK
2481 } // __kmpc_destroy_lock
2482 
/* destroy the nested lock */
2484 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2485 #if KMP_USE_DYNAMIC_LOCK
2486 
2487 #if USE_ITT_BUILD
2488   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2489   __kmp_itt_lock_destroyed(ilk->lock);
2490 #endif
2491 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2493   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2494   if (!codeptr)
2495     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2496   if (ompt_enabled.ompt_callback_lock_destroy) {
2497     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2498         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2499   }
2500 #endif
2501   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2502 
2503 #else // KMP_USE_DYNAMIC_LOCK
2504 
2505   kmp_user_lock_p lck;
2506 
2507   if ((__kmp_user_lock_kind == lk_tas) &&
2508       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2509        OMP_NEST_LOCK_T_SIZE)) {
2510     lck = (kmp_user_lock_p)user_lock;
2511   }
2512 #if KMP_USE_FUTEX
2513   else if ((__kmp_user_lock_kind == lk_futex) &&
2514            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2515             OMP_NEST_LOCK_T_SIZE)) {
2516     lck = (kmp_user_lock_p)user_lock;
2517   }
2518 #endif
2519   else {
2520     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2521   }
2522 
2523 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2525   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2526   if (!codeptr)
2527     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2528   if (ompt_enabled.ompt_callback_lock_destroy) {
2529     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2530         ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2531   }
2532 #endif
2533 
2534 #if USE_ITT_BUILD
2535   __kmp_itt_lock_destroyed(lck);
2536 #endif /* USE_ITT_BUILD */
2537 
2538   DESTROY_NESTED_LOCK(lck);
2539 
2540   if ((__kmp_user_lock_kind == lk_tas) &&
2541       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2542        OMP_NEST_LOCK_T_SIZE)) {
2543     ;
2544   }
2545 #if KMP_USE_FUTEX
2546   else if ((__kmp_user_lock_kind == lk_futex) &&
2547            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2548             OMP_NEST_LOCK_T_SIZE)) {
2549     ;
2550   }
2551 #endif
2552   else {
2553     __kmp_user_lock_free(user_lock, gtid, lck);
2554   }
2555 #endif // KMP_USE_DYNAMIC_LOCK
2556 } // __kmpc_destroy_nest_lock
2557 
2558 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2559   KMP_COUNT_BLOCK(OMP_set_lock);
2560 #if KMP_USE_DYNAMIC_LOCK
2561   int tag = KMP_EXTRACT_D_TAG(user_lock);
2562 #if USE_ITT_BUILD
2563   __kmp_itt_lock_acquiring(
2564       (kmp_user_lock_p)
2565           user_lock); // itt function will get to the right lock object.
2566 #endif
2567 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2569   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2570   if (!codeptr)
2571     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2572   if (ompt_enabled.ompt_callback_mutex_acquire) {
2573     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2574         ompt_mutex_lock, omp_lock_hint_none,
2575         __ompt_get_mutex_impl_type(user_lock),
2576         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2577   }
2578 #endif
2579 #if KMP_USE_INLINED_TAS
2580   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2581     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2582   } else
2583 #elif KMP_USE_INLINED_FUTEX
2584   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2585     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2586   } else
2587 #endif
2588   {
2589     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2590   }
2591 #if USE_ITT_BUILD
2592   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2593 #endif
2594 #if OMPT_SUPPORT && OMPT_OPTIONAL
2595   if (ompt_enabled.ompt_callback_mutex_acquired) {
2596     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2597         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2598   }
2599 #endif
2600 
2601 #else // KMP_USE_DYNAMIC_LOCK
2602 
2603   kmp_user_lock_p lck;
2604 
2605   if ((__kmp_user_lock_kind == lk_tas) &&
2606       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2607     lck = (kmp_user_lock_p)user_lock;
2608   }
2609 #if KMP_USE_FUTEX
2610   else if ((__kmp_user_lock_kind == lk_futex) &&
2611            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2612     lck = (kmp_user_lock_p)user_lock;
2613   }
2614 #endif
2615   else {
2616     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2617   }
2618 
2619 #if USE_ITT_BUILD
2620   __kmp_itt_lock_acquiring(lck);
2621 #endif /* USE_ITT_BUILD */
2622 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2624   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2625   if (!codeptr)
2626     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2627   if (ompt_enabled.ompt_callback_mutex_acquire) {
2628     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2629         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2630         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2631   }
2632 #endif
2633 
2634   ACQUIRE_LOCK(lck, gtid);
2635 
2636 #if USE_ITT_BUILD
2637   __kmp_itt_lock_acquired(lck);
2638 #endif /* USE_ITT_BUILD */
2639 
2640 #if OMPT_SUPPORT && OMPT_OPTIONAL
2641   if (ompt_enabled.ompt_callback_mutex_acquired) {
2642     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2643         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2644   }
2645 #endif
2646 
2647 #endif // KMP_USE_DYNAMIC_LOCK
2648 }
2649 
2650 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2651 #if KMP_USE_DYNAMIC_LOCK
2652 
2653 #if USE_ITT_BUILD
2654   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2655 #endif
2656 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2658   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2659   if (!codeptr)
2660     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2661   if (ompt_enabled.enabled) {
2662     if (ompt_enabled.ompt_callback_mutex_acquire) {
2663       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2664           ompt_mutex_nest_lock, omp_lock_hint_none,
2665           __ompt_get_mutex_impl_type(user_lock),
2666           (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2667     }
2668   }
2669 #endif
2670   int acquire_status =
2671       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2672   (void)acquire_status;
2673 #if USE_ITT_BUILD
2674   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2675 #endif
2676 
2677 #if OMPT_SUPPORT && OMPT_OPTIONAL
2678   if (ompt_enabled.enabled) {
2679     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2680       if (ompt_enabled.ompt_callback_mutex_acquired) {
2681         // lock_first
2682         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2683             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2684             codeptr);
2685       }
2686     } else {
2687       if (ompt_enabled.ompt_callback_nest_lock) {
2688         // lock_next
2689         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2690             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2691       }
2692     }
2693   }
2694 #endif
2695 
2696 #else // KMP_USE_DYNAMIC_LOCK
2697   int acquire_status;
2698   kmp_user_lock_p lck;
2699 
2700   if ((__kmp_user_lock_kind == lk_tas) &&
2701       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2702        OMP_NEST_LOCK_T_SIZE)) {
2703     lck = (kmp_user_lock_p)user_lock;
2704   }
2705 #if KMP_USE_FUTEX
2706   else if ((__kmp_user_lock_kind == lk_futex) &&
2707            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2708             OMP_NEST_LOCK_T_SIZE)) {
2709     lck = (kmp_user_lock_p)user_lock;
2710   }
2711 #endif
2712   else {
2713     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2714   }
2715 
2716 #if USE_ITT_BUILD
2717   __kmp_itt_lock_acquiring(lck);
2718 #endif /* USE_ITT_BUILD */
2719 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2721   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2722   if (!codeptr)
2723     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2724   if (ompt_enabled.enabled) {
2725     if (ompt_enabled.ompt_callback_mutex_acquire) {
2726       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2727           ompt_mutex_nest_lock, omp_lock_hint_none,
2728           __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
2729           codeptr);
2730     }
2731   }
2732 #endif
2733 
2734   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2735 
2736 #if USE_ITT_BUILD
2737   __kmp_itt_lock_acquired(lck);
2738 #endif /* USE_ITT_BUILD */
2739 
2740 #if OMPT_SUPPORT && OMPT_OPTIONAL
2741   if (ompt_enabled.enabled) {
2742     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2743       if (ompt_enabled.ompt_callback_mutex_acquired) {
2744         // lock_first
2745         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2746             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2747       }
2748     } else {
2749       if (ompt_enabled.ompt_callback_nest_lock) {
2750         // lock_next
2751         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2752             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2753       }
2754     }
2755   }
2756 #endif
2757 
2758 #endif // KMP_USE_DYNAMIC_LOCK
2759 }
2760 
2761 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2762 #if KMP_USE_DYNAMIC_LOCK
2763 
2764   int tag = KMP_EXTRACT_D_TAG(user_lock);
2765 #if USE_ITT_BUILD
2766   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2767 #endif
2768 #if KMP_USE_INLINED_TAS
2769   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2770     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2771   } else
2772 #elif KMP_USE_INLINED_FUTEX
2773   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2774     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2775   } else
2776 #endif
2777   {
2778     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2779   }
2780 
2781 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2783   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2784   if (!codeptr)
2785     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2786   if (ompt_enabled.ompt_callback_mutex_released) {
2787     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2788         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2789   }
2790 #endif
2791 
2792 #else // KMP_USE_DYNAMIC_LOCK
2793 
2794   kmp_user_lock_p lck;
2795 
2796   /* Can't use serial interval since not block structured */
2797   /* release the lock */
2798 
2799   if ((__kmp_user_lock_kind == lk_tas) &&
2800       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2801 #if KMP_OS_LINUX &&                                                            \
2802     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2803 // "fast" path implemented to fix customer performance issue
2804 #if USE_ITT_BUILD
2805     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2806 #endif /* USE_ITT_BUILD */
2807     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2808     KMP_MB();
2809 
2810 #if OMPT_SUPPORT && OMPT_OPTIONAL
    // codeptr may already have been set by the user-facing API entry point:
2812     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2813     if (!codeptr)
2814       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2815     if (ompt_enabled.ompt_callback_mutex_released) {
2816       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2818     }
2819 #endif
2820 
2821     return;
2822 #else
2823     lck = (kmp_user_lock_p)user_lock;
2824 #endif
2825   }
2826 #if KMP_USE_FUTEX
2827   else if ((__kmp_user_lock_kind == lk_futex) &&
2828            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2829     lck = (kmp_user_lock_p)user_lock;
2830   }
2831 #endif
2832   else {
2833     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2834   }
2835 
2836 #if USE_ITT_BUILD
2837   __kmp_itt_lock_releasing(lck);
2838 #endif /* USE_ITT_BUILD */
2839 
2840   RELEASE_LOCK(lck, gtid);
2841 
2842 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2844   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2845   if (!codeptr)
2846     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2847   if (ompt_enabled.ompt_callback_mutex_released) {
2848     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2849         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2850   }
2851 #endif
2852 
2853 #endif // KMP_USE_DYNAMIC_LOCK
2854 }
2855 
/* release the nested lock */
2857 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2858 #if KMP_USE_DYNAMIC_LOCK
2859 
2860 #if USE_ITT_BUILD
2861   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2862 #endif
2863   int release_status =
2864       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2865   (void)release_status;
2866 
2867 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2869   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2870   if (!codeptr)
2871     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2872   if (ompt_enabled.enabled) {
2873     if (release_status == KMP_LOCK_RELEASED) {
2874       if (ompt_enabled.ompt_callback_mutex_released) {
2875         // release_lock_last
2876         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2877             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2878             codeptr);
2879       }
2880     } else if (ompt_enabled.ompt_callback_nest_lock) {
2881       // release_lock_prev
2882       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2883           ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2884     }
2885   }
2886 #endif
2887 
2888 #else // KMP_USE_DYNAMIC_LOCK
2889 
2890   kmp_user_lock_p lck;
2891 
2892   /* Can't use serial interval since not block structured */
2893 
2894   if ((__kmp_user_lock_kind == lk_tas) &&
2895       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2896        OMP_NEST_LOCK_T_SIZE)) {
2897 #if KMP_OS_LINUX &&                                                            \
2898     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2899     // "fast" path implemented to fix customer performance issue
2900     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2901 #if USE_ITT_BUILD
2902     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2903 #endif /* USE_ITT_BUILD */
2904 
2905 #if OMPT_SUPPORT && OMPT_OPTIONAL
2906     int release_status = KMP_LOCK_STILL_HELD;
2907 #endif
2908 
2909     if (--(tl->lk.depth_locked) == 0) {
2910       TCW_4(tl->lk.poll, 0);
2911 #if OMPT_SUPPORT && OMPT_OPTIONAL
2912       release_status = KMP_LOCK_RELEASED;
2913 #endif
2914     }
2915     KMP_MB();
2916 
2917 #if OMPT_SUPPORT && OMPT_OPTIONAL
    // codeptr may already have been set by the user-facing API entry point:
2919     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2920     if (!codeptr)
2921       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2922     if (ompt_enabled.enabled) {
2923       if (release_status == KMP_LOCK_RELEASED) {
2924         if (ompt_enabled.ompt_callback_mutex_released) {
2925           // release_lock_last
2926           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
              ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
              codeptr);
2928         }
2929       } else if (ompt_enabled.ompt_callback_nest_lock) {
2930         // release_lock_previous
2931         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2933       }
2934     }
2935 #endif
2936 
2937     return;
2938 #else
2939     lck = (kmp_user_lock_p)user_lock;
2940 #endif
2941   }
2942 #if KMP_USE_FUTEX
2943   else if ((__kmp_user_lock_kind == lk_futex) &&
2944            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2945             OMP_NEST_LOCK_T_SIZE)) {
2946     lck = (kmp_user_lock_p)user_lock;
2947   }
2948 #endif
2949   else {
2950     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2951   }
2952 
2953 #if USE_ITT_BUILD
2954   __kmp_itt_lock_releasing(lck);
2955 #endif /* USE_ITT_BUILD */
2956 
2957   int release_status;
2958   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2959 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2961   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2962   if (!codeptr)
2963     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2964   if (ompt_enabled.enabled) {
2965     if (release_status == KMP_LOCK_RELEASED) {
2966       if (ompt_enabled.ompt_callback_mutex_released) {
2967         // release_lock_last
2968         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2969             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2970       }
2971     } else if (ompt_enabled.ompt_callback_nest_lock) {
2972       // release_lock_previous
2973       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2975     }
2976   }
2977 #endif
2978 
2979 #endif // KMP_USE_DYNAMIC_LOCK
2980 }
2981 
2982 /* try to acquire the lock */
2983 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2984   KMP_COUNT_BLOCK(OMP_test_lock);
2985 
2986 #if KMP_USE_DYNAMIC_LOCK
2987   int rc;
2988   int tag = KMP_EXTRACT_D_TAG(user_lock);
2989 #if USE_ITT_BUILD
2990   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2991 #endif
2992 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // codeptr may already have been set by the user-facing API entry point:
2994   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2995   if (!codeptr)
2996     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2997   if (ompt_enabled.ompt_callback_mutex_acquire) {
2998     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2999         ompt_mutex_lock, omp_lock_hint_none,
3000         __ompt_get_mutex_impl_type(user_lock),
3001         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3002   }
3003 #endif
3004 #if KMP_USE_INLINED_TAS
3005   if (tag == locktag_tas && !__kmp_env_consistency_check) {
3006     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
3007   } else
3008 #elif KMP_USE_INLINED_FUTEX
3009   if (tag == locktag_futex && !__kmp_env_consistency_check) {
3010     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
3011   } else
3012 #endif
3013   {
3014     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
3015   }
3016   if (rc) {
3017 #if USE_ITT_BUILD
3018     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3019 #endif
3020 #if OMPT_SUPPORT && OMPT_OPTIONAL
3021     if (ompt_enabled.ompt_callback_mutex_acquired) {
3022       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3023           ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3024     }
3025 #endif
3026     return FTN_TRUE;
3027   } else {
3028 #if USE_ITT_BUILD
3029     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3030 #endif
3031     return FTN_FALSE;
3032   }
3033 
3034 #else // KMP_USE_DYNAMIC_LOCK
3035 
3036   kmp_user_lock_p lck;
3037   int rc;
3038 
3039   if ((__kmp_user_lock_kind == lk_tas) &&
3040       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3041     lck = (kmp_user_lock_p)user_lock;
3042   }
3043 #if KMP_USE_FUTEX
3044   else if ((__kmp_user_lock_kind == lk_futex) &&
3045            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3046     lck = (kmp_user_lock_p)user_lock;
3047   }
3048 #endif
3049   else {
3050     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3051   }
3052 
3053 #if USE_ITT_BUILD
3054   __kmp_itt_lock_acquiring(lck);
3055 #endif /* USE_ITT_BUILD */
3056 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3058   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3059   if (!codeptr)
3060     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3061   if (ompt_enabled.ompt_callback_mutex_acquire) {
3062     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3063         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3064         (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3065   }
3066 #endif
3067 
3068   rc = TEST_LOCK(lck, gtid);
3069 #if USE_ITT_BUILD
3070   if (rc) {
3071     __kmp_itt_lock_acquired(lck);
3072   } else {
3073     __kmp_itt_lock_cancelled(lck);
3074   }
3075 #endif /* USE_ITT_BUILD */
3076 #if OMPT_SUPPORT && OMPT_OPTIONAL
3077   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3078     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3079         ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3080   }
3081 #endif
3082 
3083   return (rc ? FTN_TRUE : FTN_FALSE);
3084 
3085   /* Can't use serial interval since not block structured */
3086 
3087 #endif // KMP_USE_DYNAMIC_LOCK
3088 }
3089 
3090 /* try to acquire the lock */
3091 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3092 #if KMP_USE_DYNAMIC_LOCK
3093   int rc;
3094 #if USE_ITT_BUILD
3095   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3096 #endif
3097 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3099   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3100   if (!codeptr)
3101     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3102   if (ompt_enabled.ompt_callback_mutex_acquire) {
3103     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3104         ompt_mutex_nest_lock, omp_lock_hint_none,
3105         __ompt_get_mutex_impl_type(user_lock),
3106         (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3107   }
3108 #endif
3109   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3110 #if USE_ITT_BUILD
3111   if (rc) {
3112     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3113   } else {
3114     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3115   }
3116 #endif
3117 #if OMPT_SUPPORT && OMPT_OPTIONAL
3118   if (ompt_enabled.enabled && rc) {
3119     if (rc == 1) {
3120       if (ompt_enabled.ompt_callback_mutex_acquired) {
3121         // lock_first
3122         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3123             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3124             codeptr);
3125       }
3126     } else {
3127       if (ompt_enabled.ompt_callback_nest_lock) {
3128         // lock_next
3129         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3130             ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3131       }
3132     }
3133   }
3134 #endif
3135   return rc;
3136 
3137 #else // KMP_USE_DYNAMIC_LOCK
3138 
3139   kmp_user_lock_p lck;
3140   int rc;
3141 
3142   if ((__kmp_user_lock_kind == lk_tas) &&
3143       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3144        OMP_NEST_LOCK_T_SIZE)) {
3145     lck = (kmp_user_lock_p)user_lock;
3146   }
3147 #if KMP_USE_FUTEX
3148   else if ((__kmp_user_lock_kind == lk_futex) &&
3149            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3150             OMP_NEST_LOCK_T_SIZE)) {
3151     lck = (kmp_user_lock_p)user_lock;
3152   }
3153 #endif
3154   else {
3155     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3156   }
3157 
3158 #if USE_ITT_BUILD
3159   __kmp_itt_lock_acquiring(lck);
3160 #endif /* USE_ITT_BUILD */
3161 
3162 #if OMPT_SUPPORT && OMPT_OPTIONAL
  // This is the case if called from omp_init_lock_with_hint:
3164   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3165   if (!codeptr)
3166     codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled &&
      ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
        codeptr);
  }
3174 #endif
3175 
3176   rc = TEST_NESTED_LOCK(lck, gtid);
3177 #if USE_ITT_BUILD
3178   if (rc) {
3179     __kmp_itt_lock_acquired(lck);
3180   } else {
3181     __kmp_itt_lock_cancelled(lck);
3182   }
3183 #endif /* USE_ITT_BUILD */
3184 #if OMPT_SUPPORT && OMPT_OPTIONAL
3185   if (ompt_enabled.enabled && rc) {
3186     if (rc == 1) {
3187       if (ompt_enabled.ompt_callback_mutex_acquired) {
3188         // lock_first
3189         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3190             ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3191       }
3192     } else {
3193       if (ompt_enabled.ompt_callback_nest_lock) {
3194         // lock_next
3195         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3196             ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3197       }
3198     }
3199   }
3200 #endif
3201   return rc;
3202 
3203   /* Can't use serial interval since not block structured */
3204 
3205 #endif // KMP_USE_DYNAMIC_LOCK
3206 }
3207 
3208 // Interface to fast scalable reduce methods routines
3209 
// Keep the selected method in a thread-local structure for cross-function
// usage: it will be used in the __kmpc_end_reduce* functions.
// An alternative would be to re-determine the method one more time in the
// __kmpc_end_reduce* functions (a new prototype would be required then).
// AT: which solution is better?
3215 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
3216   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
3217 
3218 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
3219   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
3220 
3221 // description of the packed_reduction_method variable: look at the macros in
3222 // kmp.h
3223 
3224 // used in a critical section reduce block
3225 static __forceinline void
3226 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3227                                           kmp_critical_name *crit) {
3228 
  // This lock was visible to a customer and to the threading profile tool as a
  // serial overhead span (although it's used for an internal purpose only).
  //            Why was it visible in the previous implementation?
  //            Should we keep it visible in the new reduce block?
3233   kmp_user_lock_p lck;
3234 
3235 #if KMP_USE_DYNAMIC_LOCK
3236 
3237   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3238   // Check if it is initialized.
3239   if (*lk == 0) {
3240     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3241       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3242                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3243     } else {
3244       __kmp_init_indirect_csptr(crit, loc, global_tid,
3245                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3246     }
3247   }
3248   // Branch for accessing the actual lock object and set operation. This
3249   // branching is inevitable since this lock initialization does not follow the
3250   // normal dispatch path (lock table is not used).
3251   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3252     lck = (kmp_user_lock_p)lk;
3253     KMP_DEBUG_ASSERT(lck != NULL);
3254     if (__kmp_env_consistency_check) {
3255       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3256     }
3257     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3258   } else {
3259     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3260     lck = ilk->lock;
3261     KMP_DEBUG_ASSERT(lck != NULL);
3262     if (__kmp_env_consistency_check) {
3263       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3264     }
3265     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3266   }
3267 
3268 #else // KMP_USE_DYNAMIC_LOCK
3269 
3270   // We know that the fast reduction code is only emitted by Intel compilers
3271   // with 32 byte critical sections. If there isn't enough space, then we
3272   // have to use a pointer.
3273   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3274     lck = (kmp_user_lock_p)crit;
3275   } else {
3276     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3277   }
3278   KMP_DEBUG_ASSERT(lck != NULL);
3279 
3280   if (__kmp_env_consistency_check)
3281     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3282 
3283   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3284 
3285 #endif // KMP_USE_DYNAMIC_LOCK
3286 }
3287 
3288 // used in a critical section reduce block
3289 static __forceinline void
3290 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3291                                         kmp_critical_name *crit) {
3292 
3293   kmp_user_lock_p lck;
3294 
3295 #if KMP_USE_DYNAMIC_LOCK
3296 
3297   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3298     lck = (kmp_user_lock_p)crit;
3299     if (__kmp_env_consistency_check)
3300       __kmp_pop_sync(global_tid, ct_critical, loc);
3301     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3302   } else {
3303     kmp_indirect_lock_t *ilk =
3304         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3305     if (__kmp_env_consistency_check)
3306       __kmp_pop_sync(global_tid, ct_critical, loc);
3307     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3308   }
3309 
3310 #else // KMP_USE_DYNAMIC_LOCK
3311 
3312   // We know that the fast reduction code is only emitted by Intel compilers
3313   // with 32 byte critical sections. If there isn't enough space, then we have
3314   // to use a pointer.
3315   if (__kmp_base_user_lock_size > 32) {
3316     lck = *((kmp_user_lock_p *)crit);
3317     KMP_ASSERT(lck != NULL);
3318   } else {
3319     lck = (kmp_user_lock_p)crit;
3320   }
3321 
3322   if (__kmp_env_consistency_check)
3323     __kmp_pop_sync(global_tid, ct_critical, loc);
3324 
3325   __kmp_release_user_lock_with_checks(lck, global_tid);
3326 
3327 #endif // KMP_USE_DYNAMIC_LOCK
3328 } // __kmp_end_critical_section_reduce_block
3329 
3330 static __forceinline int
3331 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3332                                      int *task_state) {
3333   kmp_team_t *team;
3334 
  // Check whether we are inside a teams construct.
3336   if (th->th.th_teams_microtask) {
3337     *team_p = team = th->th.th_team;
3338     if (team->t.t_level == th->th.th_teams_level) {
3339       // This is reduction at teams construct.
3340       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3341       // Let's swap teams temporarily for the reduction.
3342       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3343       th->th.th_team = team->t.t_parent;
3344       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3345       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3346       *task_state = th->th.th_task_state;
3347       th->th.th_task_state = 0;
3348 
3349       return 1;
3350     }
3351   }
3352   return 0;
3353 }
3354 
3355 static __forceinline void
3356 __kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
3357   // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
3358   th->th.th_info.ds.ds_tid = 0;
3359   th->th.th_team = team;
3360   th->th.th_team_nproc = team->t.t_nproc;
3361   th->th.th_task_team = team->t.t_task_team[task_state];
3362   __kmp_type_convert(task_state, &(th->th.th_task_state));
3363 }
3364 
3365 /* 2.a.i. Reduce Block without a terminating barrier */
3366 /*!
3367 @ingroup SYNCHRONIZATION
3368 @param loc source location information
3369 @param global_tid global thread number
3370 @param num_vars number of items (variables) to be reduced
3371 @param reduce_size size of data in bytes to be reduced
3372 @param reduce_data pointer to data to be reduced
3373 @param reduce_func callback function providing reduction operation on two
3374 operands and returning result of reduction in lhs_data
3375 @param lck pointer to the unique lock data structure
3376 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3377 threads if atomic reduction needed
3378 
The nowait version is used for a reduction clause on a construct that also
carries a nowait clause.
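
A sketch of the code a compiler typically emits around this call (illustrative
only; the exact lowering and the names used here are compiler-specific):
@code
    ret = __kmpc_reduce_nowait(loc, gtid, 1, sizeof(red_list), red_list,
                               reduce_func, &crit);
    switch (ret) {
    case 1: // this thread combines the partial results directly
      // ... combine red_list into the original variables ...
      __kmpc_end_reduce_nowait(loc, gtid, &crit);
      break;
    case 2: // every thread contributes with atomic updates
      // ... atomically update the original variables ...
      break;
    default: // 0: nothing left to do for this thread
      break;
    }
@endcode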
3380 */
3381 kmp_int32
3382 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3383                      size_t reduce_size, void *reduce_data,
3384                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3385                      kmp_critical_name *lck) {
3386 
3387   KMP_COUNT_BLOCK(REDUCE_nowait);
3388   int retval = 0;
3389   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3390   kmp_info_t *th;
3391   kmp_team_t *team;
3392   int teams_swapped = 0, task_state;
3393   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3394   __kmp_assert_valid_gtid(global_tid);
3395 
  // Why do we need this initialization here at all?
  // A reduction clause cannot be used as a stand-alone directive.

  // Do not call __kmp_serial_initialize(); it will be called by
  // __kmp_parallel_initialize() if needed.
  // Possible detection of a false-positive race by the thread checker???
3402   if (!TCR_4(__kmp_init_parallel))
3403     __kmp_parallel_initialize();
3404 
3405   __kmp_resume_if_soft_paused();
3406 
3407 // check correctness of reduce block nesting
3408 #if KMP_USE_DYNAMIC_LOCK
3409   if (__kmp_env_consistency_check)
3410     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3411 #else
3412   if (__kmp_env_consistency_check)
3413     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3414 #endif
3415 
3416   th = __kmp_thread_from_gtid(global_tid);
3417   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3418 
  // The packed_reduction_method value will be reused by the __kmp_end_reduce*
  // functions, so it must be kept in a variable.
  // The variable should be a construct-specific or thread-specific property,
  // not a team-specific one:
  //     (a thread can reach the next reduce block on the next construct, and
  //     the reduce method may differ on that construct)
  // The ident_t "loc" parameter could be used as a construct-specific property
  // (but what if loc == 0?)
  //     (and if a construct-specific or team-specific variable were shared,
  //     unnecessary extra syncs would be needed)
  // A thread-specific variable is better with respect to both issues above
  // (next construct and extra syncs); a thread-specific
  // "th_local.reduction_method" variable is used currently.
  // Each thread executes the 'determine' and 'set' lines; there is no need to
  // restrict this to one thread, which avoids unnecessary extra syncs.
3434 
3435   packed_reduction_method = __kmp_determine_reduction_method(
3436       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3437   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3438 
3439   OMPT_REDUCTION_DECL(th, global_tid);
3440   if (packed_reduction_method == critical_reduce_block) {
3441 
3442     OMPT_REDUCTION_BEGIN;
3443 
3444     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3445     retval = 1;
3446 
3447   } else if (packed_reduction_method == empty_reduce_block) {
3448 
3449     OMPT_REDUCTION_BEGIN;
3450 
    // If the team size is 1, no synchronization is required (Intel platforms
    // only).
3453     retval = 1;
3454 
3455   } else if (packed_reduction_method == atomic_reduce_block) {
3456 
3457     retval = 2;
3458 
    // All threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen).
    //     (It's not ideal, because the checking block has been closed by this
    //     'pop', but the atomic operation has not been executed yet; it will
    //     execute slightly later, literally on the next instruction.)
3465     if (__kmp_env_consistency_check)
3466       __kmp_pop_sync(global_tid, ct_reduce, loc);
3467 
3468   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3469                                    tree_reduce_block)) {
3470 
// AT: performance issue: a real barrier here
// AT:     (if the master is slow, other threads are blocked here waiting for
// AT:     the master to come and release them)
// AT:     (it's not what a customer might expect when specifying the NOWAIT
// AT:     clause)
// AT:     (specifying NOWAIT won't improve performance; it will just be
// AT:     confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other
// AT: design) might go faster and be more in line with the sense of NOWAIT
// AT: TODO: run the EPCC test and compare times

// This barrier should be invisible to a customer and to the threading profile
// tool (it's neither a terminating barrier nor customer code; it's used
// for an internal purpose).
3484 #if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events, so we set up the necessary
    // frame/return addresses.
3489     ompt_frame_t *ompt_frame;
3490     if (ompt_enabled.enabled) {
3491       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3492       if (ompt_frame->enter_frame.ptr == NULL)
3493         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3494     }
3495     OMPT_STORE_RETURN_ADDRESS(global_tid);
3496 #endif
3497 #if USE_ITT_NOTIFY
3498     __kmp_threads[global_tid]->th.th_ident = loc;
3499 #endif
3500     retval =
3501         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3502                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3503     retval = (retval != 0) ? (0) : (1);
3504 #if OMPT_SUPPORT && OMPT_OPTIONAL
3505     if (ompt_enabled.enabled) {
3506       ompt_frame->enter_frame = ompt_data_none;
3507     }
3508 #endif
3509 
    // All workers except the master should do this pop here
    //     (none of the other workers will get to __kmpc_end_reduce_nowait()).
3512     if (__kmp_env_consistency_check) {
3513       if (retval == 0) {
3514         __kmp_pop_sync(global_tid, ct_reduce, loc);
3515       }
3516     }
3517 
3518   } else {
3519 
3520     // should never reach this block
3521     KMP_ASSERT(0); // "unexpected method"
3522   }
3523   if (teams_swapped) {
3524     __kmp_restore_swapped_teams(th, team, task_state);
3525   }
3526   KA_TRACE(
3527       10,
3528       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3529        global_tid, packed_reduction_method, retval));
3530 
3531   return retval;
3532 }
3533 
3534 /*!
3535 @ingroup SYNCHRONIZATION
3536 @param loc source location information
3537 @param global_tid global thread id.
3538 @param lck pointer to the unique lock data structure
3539 
3540 Finish the execution of a reduce nowait.
3541 */
3542 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3543                               kmp_critical_name *lck) {
3544 
3545   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3546 
3547   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3548   __kmp_assert_valid_gtid(global_tid);
3549 
3550   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3551 
3552   OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3553 
3554   if (packed_reduction_method == critical_reduce_block) {
3555 
3556     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3557     OMPT_REDUCTION_END;
3558 
3559   } else if (packed_reduction_method == empty_reduce_block) {
3560 
    // If the team size is 1, no synchronization is required (Intel platforms
    // only).
3563 
3564     OMPT_REDUCTION_END;
3565 
3566   } else if (packed_reduction_method == atomic_reduce_block) {
3567 
    // Neither master nor other workers should get here
    //     (code gen does not generate this call in case 2: atomic reduce block).
    // Actually it would be better to remove this else-if entirely; after
    // removal this value will be checked by the 'else' branch and will assert.
3572 
3573   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3574                                    tree_reduce_block)) {
3575 
3576     // only master gets here
3577     // OMPT: tree reduction is annotated in the barrier code
3578 
3579   } else {
3580 
3581     // should never reach this block
3582     KMP_ASSERT(0); // "unexpected method"
3583   }
3584 
3585   if (__kmp_env_consistency_check)
3586     __kmp_pop_sync(global_tid, ct_reduce, loc);
3587 
3588   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3589                 global_tid, packed_reduction_method));
3590 
3591   return;
3592 }
3593 
3594 /* 2.a.ii. Reduce Block with a terminating barrier */
3595 
3596 /*!
3597 @ingroup SYNCHRONIZATION
3598 @param loc source location information
3599 @param global_tid global thread number
3600 @param num_vars number of items (variables) to be reduced
3601 @param reduce_size size of data in bytes to be reduced
3602 @param reduce_data pointer to data to be reduced
3603 @param reduce_func callback function providing reduction operation on two
3604 operands and returning result of reduction in lhs_data
3605 @param lck pointer to the unique lock data structure
3606 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3607 threads if atomic reduction needed
3608 
3609 A blocking reduce that includes an implicit barrier.
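
A sketch of typical compiler-emitted code for the blocking form (illustrative
only; the exact lowering is compiler-specific). Unlike the nowait form,
__kmpc_end_reduce() is also reached on the atomic path so that the terminating
barrier is executed:
@code
    ret = __kmpc_reduce(loc, gtid, 1, sizeof(red_list), red_list,
                        reduce_func, &crit);
    if (ret == 1) {
      // ... combine red_list into the original variables ...
      __kmpc_end_reduce(loc, gtid, &crit);
    } else if (ret == 2) {
      // ... atomically update the original variables ...
      __kmpc_end_reduce(loc, gtid, &crit);
    }
@endcode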
3610 */
3611 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3612                         size_t reduce_size, void *reduce_data,
3613                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3614                         kmp_critical_name *lck) {
3615   KMP_COUNT_BLOCK(REDUCE_wait);
3616   int retval = 0;
3617   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3618   kmp_info_t *th;
3619   kmp_team_t *team;
3620   int teams_swapped = 0, task_state;
3621 
3622   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3623   __kmp_assert_valid_gtid(global_tid);
3624 
  // Why do we need this initialization here at all?
  // A reduction clause cannot be a stand-alone directive.

  // Do not call __kmp_serial_initialize(); it will be called by
  // __kmp_parallel_initialize() if needed.
  // Possible detection of a false-positive race by the thread checker???
3631   if (!TCR_4(__kmp_init_parallel))
3632     __kmp_parallel_initialize();
3633 
3634   __kmp_resume_if_soft_paused();
3635 
3636 // check correctness of reduce block nesting
3637 #if KMP_USE_DYNAMIC_LOCK
3638   if (__kmp_env_consistency_check)
3639     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3640 #else
3641   if (__kmp_env_consistency_check)
3642     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3643 #endif
3644 
3645   th = __kmp_thread_from_gtid(global_tid);
3646   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3647 
3648   packed_reduction_method = __kmp_determine_reduction_method(
3649       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3650   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3651 
3652   OMPT_REDUCTION_DECL(th, global_tid);
3653 
3654   if (packed_reduction_method == critical_reduce_block) {
3655 
3656     OMPT_REDUCTION_BEGIN;
3657     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3658     retval = 1;
3659 
3660   } else if (packed_reduction_method == empty_reduce_block) {
3661 
3662     OMPT_REDUCTION_BEGIN;
    // If the team size is 1, no synchronization is required (Intel platforms
    // only).
3665     retval = 1;
3666 
3667   } else if (packed_reduction_method == atomic_reduce_block) {
3668 
3669     retval = 2;
3670 
3671   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3672                                    tree_reduce_block)) {
3673 
3674 // case tree_reduce_block:
3675 // this barrier should be visible to a customer and to the threading profile
3676 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3677 #if OMPT_SUPPORT
3678     ompt_frame_t *ompt_frame;
3679     if (ompt_enabled.enabled) {
3680       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3681       if (ompt_frame->enter_frame.ptr == NULL)
3682         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3683     }
3684     OMPT_STORE_RETURN_ADDRESS(global_tid);
3685 #endif
3686 #if USE_ITT_NOTIFY
3687     __kmp_threads[global_tid]->th.th_ident =
3688         loc; // needed for correct notification of frames
3689 #endif
3690     retval =
3691         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3692                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3693     retval = (retval != 0) ? (0) : (1);
3694 #if OMPT_SUPPORT && OMPT_OPTIONAL
3695     if (ompt_enabled.enabled) {
3696       ompt_frame->enter_frame = ompt_data_none;
3697     }
3698 #endif
3699 
    // All workers except the master should do this pop here
    // (only the master will enter __kmpc_end_reduce()).
3702     if (__kmp_env_consistency_check) {
3703       if (retval == 0) { // 0: all other workers; 1: master
3704         __kmp_pop_sync(global_tid, ct_reduce, loc);
3705       }
3706     }
3707 
3708   } else {
3709 
3710     // should never reach this block
3711     KMP_ASSERT(0); // "unexpected method"
3712   }
3713   if (teams_swapped) {
3714     __kmp_restore_swapped_teams(th, team, task_state);
3715   }
3716 
3717   KA_TRACE(10,
3718            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3719             global_tid, packed_reduction_method, retval));
3720   return retval;
3721 }
3722 
3723 /*!
3724 @ingroup SYNCHRONIZATION
3725 @param loc source location information
3726 @param global_tid global thread id.
3727 @param lck pointer to the unique lock data structure
3728 
3729 Finish the execution of a blocking reduce.
3730 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3731 start function.
3732 */
3733 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3734                        kmp_critical_name *lck) {
3735 
3736   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3737   kmp_info_t *th;
3738   kmp_team_t *team;
3739   int teams_swapped = 0, task_state;
3740 
3741   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3742   __kmp_assert_valid_gtid(global_tid);
3743 
3744   th = __kmp_thread_from_gtid(global_tid);
3745   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3746 
3747   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3748 
3749   // this barrier should be visible to a customer and to the threading profile
3750   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3751   OMPT_REDUCTION_DECL(th, global_tid);
3752 
3753   if (packed_reduction_method == critical_reduce_block) {
3754     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3755 
3756     OMPT_REDUCTION_END;
3757 
3758 // TODO: implicit barrier: should be exposed
3759 #if OMPT_SUPPORT
3760     ompt_frame_t *ompt_frame;
3761     if (ompt_enabled.enabled) {
3762       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3763       if (ompt_frame->enter_frame.ptr == NULL)
3764         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3765     }
3766     OMPT_STORE_RETURN_ADDRESS(global_tid);
3767 #endif
3768 #if USE_ITT_NOTIFY
3769     __kmp_threads[global_tid]->th.th_ident = loc;
3770 #endif
3771     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3772 #if OMPT_SUPPORT && OMPT_OPTIONAL
3773     if (ompt_enabled.enabled) {
3774       ompt_frame->enter_frame = ompt_data_none;
3775     }
3776 #endif
3777 
3778   } else if (packed_reduction_method == empty_reduce_block) {
3779 
3780     OMPT_REDUCTION_END;
3781 
// If the team size is 1, no synchronization is required
// (Intel platforms only).
3783 
3784 // TODO: implicit barrier: should be exposed
3785 #if OMPT_SUPPORT
3786     ompt_frame_t *ompt_frame;
3787     if (ompt_enabled.enabled) {
3788       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3789       if (ompt_frame->enter_frame.ptr == NULL)
3790         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3791     }
3792     OMPT_STORE_RETURN_ADDRESS(global_tid);
3793 #endif
3794 #if USE_ITT_NOTIFY
3795     __kmp_threads[global_tid]->th.th_ident = loc;
3796 #endif
3797     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3798 #if OMPT_SUPPORT && OMPT_OPTIONAL
3799     if (ompt_enabled.enabled) {
3800       ompt_frame->enter_frame = ompt_data_none;
3801     }
3802 #endif
3803 
3804   } else if (packed_reduction_method == atomic_reduce_block) {
3805 
3806 #if OMPT_SUPPORT
3807     ompt_frame_t *ompt_frame;
3808     if (ompt_enabled.enabled) {
3809       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3810       if (ompt_frame->enter_frame.ptr == NULL)
3811         ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3812     }
3813     OMPT_STORE_RETURN_ADDRESS(global_tid);
3814 #endif
3815 // TODO: implicit barrier: should be exposed
3816 #if USE_ITT_NOTIFY
3817     __kmp_threads[global_tid]->th.th_ident = loc;
3818 #endif
3819     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3820 #if OMPT_SUPPORT && OMPT_OPTIONAL
3821     if (ompt_enabled.enabled) {
3822       ompt_frame->enter_frame = ompt_data_none;
3823     }
3824 #endif
3825 
3826   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3827                                    tree_reduce_block)) {
3828 
3829     // only master executes here (master releases all other workers)
3830     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3831                             global_tid);
3832 
3833   } else {
3834 
3835     // should never reach this block
3836     KMP_ASSERT(0); // "unexpected method"
3837   }
3838   if (teams_swapped) {
3839     __kmp_restore_swapped_teams(th, team, task_state);
3840   }
3841 
3842   if (__kmp_env_consistency_check)
3843     __kmp_pop_sync(global_tid, ct_reduce, loc);
3844 
3845   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3846                 global_tid, packed_reduction_method));
3847 
3848   return;
3849 }
3850 
3851 #undef __KMP_GET_REDUCTION_METHOD
3852 #undef __KMP_SET_REDUCTION_METHOD
3853 
3854 /* end of interface to fast scalable reduce routines */
3855 
3856 kmp_uint64 __kmpc_get_taskid() {
3857 
3858   kmp_int32 gtid;
3859   kmp_info_t *thread;
3860 
3861   gtid = __kmp_get_gtid();
3862   if (gtid < 0) {
3863     return 0;
3864   }
3865   thread = __kmp_thread_from_gtid(gtid);
3866   return thread->th.th_current_task->td_task_id;
3867 
3868 } // __kmpc_get_taskid
3869 
3870 kmp_uint64 __kmpc_get_parent_taskid() {
3871 
3872   kmp_int32 gtid;
3873   kmp_info_t *thread;
3874   kmp_taskdata_t *parent_task;
3875 
3876   gtid = __kmp_get_gtid();
3877   if (gtid < 0) {
3878     return 0;
3879   }
3880   thread = __kmp_thread_from_gtid(gtid);
3881   parent_task = thread->th.th_current_task->td_parent;
3882   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3883 
3884 } // __kmpc_get_parent_taskid
3885 
3886 /*!
3887 @ingroup WORK_SHARING
3888 @param loc  source location information.
3889 @param gtid  global thread number.
3890 @param num_dims  number of associated doacross loops.
3891 @param dims  info on loops bounds.
3892 
3893 Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for (i = 2; i < 9; i += 2) gives lo = 2, up = 8, st = 2.
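
As an illustrative sketch (the exact lowering is compiler-specific), a
doacross loop nest such as
@code
    #pragma omp for ordered(1)
    for (i = 2; i < 9; i += 2) {
      #pragma omp ordered depend(sink : i - 2)
      // ... consume the result of iteration i - 2 ...
      #pragma omp ordered depend(source)
      // ... produce the result of iteration i ...
    }
@endcode
is lowered to __kmpc_doacross_init() before the loop, __kmpc_doacross_wait()
at each depend(sink) point, __kmpc_doacross_post() at each depend(source)
point, and __kmpc_doacross_fini() after the loop.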
3896 */
3897 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3898                           const struct kmp_dim *dims) {
3899   __kmp_assert_valid_gtid(gtid);
3900   int j, idx;
3901   kmp_int64 last, trace_count;
3902   kmp_info_t *th = __kmp_threads[gtid];
3903   kmp_team_t *team = th->th.th_team;
3904   kmp_uint32 *flags;
3905   kmp_disp_t *pr_buf = th->th.th_dispatch;
3906   dispatch_shared_info_t *sh_buf;
3907 
3908   KA_TRACE(
3909       20,
3910       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3911        gtid, num_dims, !team->t.t_serialized));
3912   KMP_DEBUG_ASSERT(dims != NULL);
3913   KMP_DEBUG_ASSERT(num_dims > 0);
3914 
3915   if (team->t.t_serialized) {
3916     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3917     return; // no dependencies if team is serialized
3918   }
3919   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3920   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3921   // the next loop
3922   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3923 
3924   // Save bounds info into allocated private buffer
3925   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3926   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3927       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3928   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3929   pr_buf->th_doacross_info[0] =
3930       (kmp_int64)num_dims; // first element is number of dimensions
  // Also save the address of num_done in order to access it later without
  // knowing the buffer index
3933   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3934   pr_buf->th_doacross_info[2] = dims[0].lo;
3935   pr_buf->th_doacross_info[3] = dims[0].up;
3936   pr_buf->th_doacross_info[4] = dims[0].st;
3937   last = 5;
3938   for (j = 1; j < num_dims; ++j) {
    kmp_int64 range_length; // keeps the range of each dimension except the
                            // first one, dims[0]
3941     if (dims[j].st == 1) { // most common case
3942       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3943       range_length = dims[j].up - dims[j].lo + 1;
3944     } else {
3945       if (dims[j].st > 0) {
3946         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3947         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3948       } else { // negative increment
3949         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3950         range_length =
3951             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3952       }
3953     }
3954     pr_buf->th_doacross_info[last++] = range_length;
3955     pr_buf->th_doacross_info[last++] = dims[j].lo;
3956     pr_buf->th_doacross_info[last++] = dims[j].up;
3957     pr_buf->th_doacross_info[last++] = dims[j].st;
3958   }
3959 
3960   // Compute total trip count.
3961   // Start with range of dims[0] which we don't need to keep in the buffer.
3962   if (dims[0].st == 1) { // most common case
3963     trace_count = dims[0].up - dims[0].lo + 1;
3964   } else if (dims[0].st > 0) {
3965     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3966     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3967   } else { // negative increment
3968     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3969     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3970   }
3971   for (j = 1; j < num_dims; ++j) {
3972     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3973   }
3974   KMP_DEBUG_ASSERT(trace_count > 0);
3975 
  // Check whether the shared buffer is still occupied by another loop (the
  // loop that used buffer index idx - __kmp_dispatch_num_buffers)
3978   if (idx != sh_buf->doacross_buf_idx) {
3979     // Shared buffer is occupied, wait for it to be free
3980     __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3981                  __kmp_eq_4, NULL);
3982   }
3983 #if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, or the allocated pointer
  // otherwise. Treat the pointer as a volatile integer (value 0 or 1) until
  // memory is allocated.
3987   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3988       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3989 #else
3990   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3991       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3992 #endif
3993   if (flags == NULL) {
3994     // we are the first thread, allocate the array of flags
3995     size_t size =
3996         (size_t)trace_count / 8 + 8; // in bytes, use single bit per iteration
3997     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3998     KMP_MB();
3999     sh_buf->doacross_flags = flags;
4000   } else if (flags == (kmp_uint32 *)1) {
4001 #if KMP_32_BIT_ARCH
4002     // initialization is still in progress, need to wait
4003     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
4004 #else
4005     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
4006 #endif
4007       KMP_YIELD(TRUE);
4008     KMP_MB();
4009   } else {
4010     KMP_MB();
4011   }
4012   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
4013   pr_buf->th_doacross_flags =
4014       sh_buf->doacross_flags; // save private copy in order to not
4015   // touch shared buffer on each iteration
4016   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
4017 }
4018 
4019 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
4020   __kmp_assert_valid_gtid(gtid);
4021   kmp_int64 shft;
4022   size_t num_dims, i;
4023   kmp_uint32 flag;
4024   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4025   kmp_info_t *th = __kmp_threads[gtid];
4026   kmp_team_t *team = th->th.th_team;
4027   kmp_disp_t *pr_buf;
4028   kmp_int64 lo, up, st;
4029 
4030   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4031   if (team->t.t_serialized) {
4032     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4033     return; // no dependencies if team is serialized
4034   }
4035 
4036   // calculate sequential iteration number and check out-of-bounds condition
4037   pr_buf = th->th.th_dispatch;
4038   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4039   num_dims = (size_t)pr_buf->th_doacross_info[0];
4040   lo = pr_buf->th_doacross_info[2];
4041   up = pr_buf->th_doacross_info[3];
4042   st = pr_buf->th_doacross_info[4];
4043 #if OMPT_SUPPORT && OMPT_OPTIONAL
4044   ompt_dependence_t deps[num_dims];
4045 #endif
4046   if (st == 1) { // most common case
4047     if (vec[0] < lo || vec[0] > up) {
4048       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4049                     "bounds [%lld,%lld]\n",
4050                     gtid, vec[0], lo, up));
4051       return;
4052     }
4053     iter_number = vec[0] - lo;
4054   } else if (st > 0) {
4055     if (vec[0] < lo || vec[0] > up) {
4056       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4057                     "bounds [%lld,%lld]\n",
4058                     gtid, vec[0], lo, up));
4059       return;
4060     }
4061     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4062   } else { // negative increment
4063     if (vec[0] > lo || vec[0] < up) {
4064       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4065                     "bounds [%lld,%lld]\n",
4066                     gtid, vec[0], lo, up));
4067       return;
4068     }
4069     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4070   }
4071 #if OMPT_SUPPORT && OMPT_OPTIONAL
4072   deps[0].variable.value = iter_number;
4073   deps[0].dependence_type = ompt_dependence_type_sink;
4074 #endif
4075   for (i = 1; i < num_dims; ++i) {
4076     kmp_int64 iter, ln;
4077     size_t j = i * 4;
4078     ln = pr_buf->th_doacross_info[j + 1];
4079     lo = pr_buf->th_doacross_info[j + 2];
4080     up = pr_buf->th_doacross_info[j + 3];
4081     st = pr_buf->th_doacross_info[j + 4];
4082     if (st == 1) {
4083       if (vec[i] < lo || vec[i] > up) {
4084         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4085                       "bounds [%lld,%lld]\n",
4086                       gtid, vec[i], lo, up));
4087         return;
4088       }
4089       iter = vec[i] - lo;
4090     } else if (st > 0) {
4091       if (vec[i] < lo || vec[i] > up) {
4092         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4093                       "bounds [%lld,%lld]\n",
4094                       gtid, vec[i], lo, up));
4095         return;
4096       }
4097       iter = (kmp_uint64)(vec[i] - lo) / st;
4098     } else { // st < 0
4099       if (vec[i] > lo || vec[i] < up) {
4100         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4101                       "bounds [%lld,%lld]\n",
4102                       gtid, vec[i], lo, up));
4103         return;
4104       }
4105       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4106     }
4107     iter_number = iter + ln * iter_number;
4108 #if OMPT_SUPPORT && OMPT_OPTIONAL
4109     deps[i].variable.value = iter;
4110     deps[i].dependence_type = ompt_dependence_type_sink;
4111 #endif
4112   }
4113   shft = iter_number % 32; // use 32-bit granularity
4114   iter_number >>= 5; // divided by 32
4115   flag = 1 << shft;
4116   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4117     KMP_YIELD(TRUE);
4118   }
4119   KMP_MB();
4120 #if OMPT_SUPPORT && OMPT_OPTIONAL
4121   if (ompt_enabled.ompt_callback_dependences) {
4122     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4123         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4124   }
4125 #endif
4126   KA_TRACE(20,
4127            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4128             gtid, (iter_number << 5) + shft));
4129 }
4130 
4131 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4132   __kmp_assert_valid_gtid(gtid);
4133   kmp_int64 shft;
4134   size_t num_dims, i;
4135   kmp_uint32 flag;
4136   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4137   kmp_info_t *th = __kmp_threads[gtid];
4138   kmp_team_t *team = th->th.th_team;
4139   kmp_disp_t *pr_buf;
4140   kmp_int64 lo, st;
4141 
4142   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4143   if (team->t.t_serialized) {
4144     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4145     return; // no dependencies if team is serialized
4146   }
4147 
4148   // calculate sequential iteration number (same as in "wait" but no
4149   // out-of-bounds checks)
4150   pr_buf = th->th.th_dispatch;
4151   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4152   num_dims = (size_t)pr_buf->th_doacross_info[0];
4153   lo = pr_buf->th_doacross_info[2];
4154   st = pr_buf->th_doacross_info[4];
4155 #if OMPT_SUPPORT && OMPT_OPTIONAL
4156   ompt_dependence_t deps[num_dims];
4157 #endif
4158   if (st == 1) { // most common case
4159     iter_number = vec[0] - lo;
4160   } else if (st > 0) {
4161     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4162   } else { // negative increment
4163     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4164   }
4165 #if OMPT_SUPPORT && OMPT_OPTIONAL
4166   deps[0].variable.value = iter_number;
4167   deps[0].dependence_type = ompt_dependence_type_source;
4168 #endif
4169   for (i = 1; i < num_dims; ++i) {
4170     kmp_int64 iter, ln;
4171     size_t j = i * 4;
4172     ln = pr_buf->th_doacross_info[j + 1];
4173     lo = pr_buf->th_doacross_info[j + 2];
4174     st = pr_buf->th_doacross_info[j + 4];
4175     if (st == 1) {
4176       iter = vec[i] - lo;
4177     } else if (st > 0) {
4178       iter = (kmp_uint64)(vec[i] - lo) / st;
4179     } else { // st < 0
4180       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4181     }
4182     iter_number = iter + ln * iter_number;
4183 #if OMPT_SUPPORT && OMPT_OPTIONAL
4184     deps[i].variable.value = iter;
4185     deps[i].dependence_type = ompt_dependence_type_source;
4186 #endif
4187   }
4188 #if OMPT_SUPPORT && OMPT_OPTIONAL
4189   if (ompt_enabled.ompt_callback_dependences) {
4190     ompt_callbacks.ompt_callback(ompt_callback_dependences)(
4191         &(OMPT_CUR_TASK_INFO(th)->task_data), deps, (kmp_uint32)num_dims);
4192   }
4193 #endif
4194   shft = iter_number % 32; // use 32-bit granularity
4195   iter_number >>= 5; // divided by 32
4196   flag = 1 << shft;
4197   KMP_MB();
4198   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4199     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4200   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4201                 (iter_number << 5) + shft));
4202 }
4203 
4204 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4205   __kmp_assert_valid_gtid(gtid);
4206   kmp_int32 num_done;
4207   kmp_info_t *th = __kmp_threads[gtid];
4208   kmp_team_t *team = th->th.th_team;
4209   kmp_disp_t *pr_buf = th->th.th_dispatch;
4210 
4211   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4212   if (team->t.t_serialized) {
4213     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4214     return; // nothing to do
4215   }
4216   num_done =
4217       KMP_TEST_THEN_INC32((kmp_uintptr_t)(pr_buf->th_doacross_info[1])) + 1;
4218   if (num_done == th->th.th_team_nproc) {
4219     // we are the last thread, need to free shared resources
4220     int idx = pr_buf->th_doacross_buf_idx - 1;
4221     dispatch_shared_info_t *sh_buf =
4222         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4223     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4224                      (kmp_int64)&sh_buf->doacross_num_done);
4225     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4226     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4227     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4228     sh_buf->doacross_flags = NULL;
4229     sh_buf->doacross_num_done = 0;
4230     sh_buf->doacross_buf_idx +=
4231         __kmp_dispatch_num_buffers; // free buffer for future re-use
4232   }
4233   // free private resources (need to keep buffer index forever)
4234   pr_buf->th_doacross_flags = NULL;
4235   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4236   pr_buf->th_doacross_info = NULL;
4237   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4238 }
4239 
4240 /* omp_alloc/omp_calloc/omp_free only defined for C/C++, not for Fortran */
4241 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4242   return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4243 }
4244 
4245 void *omp_calloc(size_t nmemb, size_t size, omp_allocator_handle_t allocator) {
4246   return __kmpc_calloc(__kmp_entry_gtid(), nmemb, size, allocator);
4247 }
4248 
4249 void *omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
4250                   omp_allocator_handle_t free_allocator) {
4251   return __kmpc_realloc(__kmp_entry_gtid(), ptr, size, allocator,
4252                         free_allocator);
4253 }
4254 
4255 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4256   __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4257 }
4258 
4259 int __kmpc_get_target_offload(void) {
4260   if (!__kmp_init_serial) {
4261     __kmp_serial_initialize();
4262   }
4263   return __kmp_target_offload;
4264 }
4265 
4266 int __kmpc_pause_resource(kmp_pause_status_t level) {
4267   if (!__kmp_init_serial) {
4268     return 1; // Can't pause if runtime is not initialized
4269   }
4270   return __kmp_pause_resource(level);
4271 }
4272