1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #define MAX_MESSAGE 512
27 
28 // flags will be used in future, e.g. to implement openmp_strict library
29 // restrictions
30 
31 /*!
32  * @ingroup STARTUP_SHUTDOWN
33  * @param loc   in   source location information
34  * @param flags in   for future use (currently ignored)
35  *
36  * Initialize the runtime library. This call is optional; if it is not made then
37  * it will be implicitly called by attempts to use other library functions.
38  */
39 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
40   // By default __kmpc_begin() is no-op.
41   char *env;
42   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
43       __kmp_str_match_true(env)) {
44     __kmp_middle_initialize();
45     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
46   } else if (__kmp_ignore_mppbeg() == FALSE) {
47     // By default __kmp_ignore_mppbeg() returns TRUE.
48     __kmp_internal_begin();
49     KC_TRACE(10, ("__kmpc_begin: called\n"));
50   }
51 }
52 
53 /*!
54  * @ingroup STARTUP_SHUTDOWN
55  * @param loc source location information
56  *
57  * Shutdown the runtime library. This is also optional, and even if called will
58  * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
59  * zero.
60  */
61 void __kmpc_end(ident_t *loc) {
62   // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
63   // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
64   // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
65   // returns FALSE and __kmpc_end() will unregister this root (it can cause
66   // library shut down).
67   if (__kmp_ignore_mppend() == FALSE) {
68     KC_TRACE(10, ("__kmpc_end: called\n"));
69     KA_TRACE(30, ("__kmpc_end\n"));
70 
71     __kmp_internal_end_thread(-1);
72   }
73 }
74 
75 /*!
76 @ingroup THREAD_STATES
77 @param loc Source location information.
78 @return The global thread index of the active thread.
79 
80 This function can be called in any context.
81 
82 If the runtime has ony been entered at the outermost level from a
83 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
84 that which would be returned by omp_get_thread_num() in the outermost
85 active parallel construct. (Or zero if there is no active parallel
86 construct, since the master thread is necessarily thread zero).
87 
88 If multiple non-OpenMP threads all enter an OpenMP construct then this
89 will be a unique thread identifier among all the threads created by
90 the OpenMP runtime (but the value cannote be defined in terms of
91 OpenMP thread ids returned by omp_get_thread_num()).
92 */
93 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
94   kmp_int32 gtid = __kmp_entry_gtid();
95 
96   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
97 
98   return gtid;
99 }
100 
101 /*!
102 @ingroup THREAD_STATES
103 @param loc Source location information.
104 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
105 
106 This function can be called in any context.
107 It returns the total number of threads under the control of the OpenMP runtime.
108 That is not a number that can be determined by any OpenMP standard calls, since
109 the library may be called from more than one non-OpenMP thread, and this
110 reflects the total over all such calls. Similarly the runtime maintains
111 underlying threads even when they are not active (since the cost of creating
112 and destroying OS threads is high), this call counts all such threads even if
113 they are not waiting for work.
114 */
115 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
116   KC_TRACE(10,
117            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
118 
119   return TCR_4(__kmp_all_nth);
120 }
121 
122 /*!
123 @ingroup THREAD_STATES
124 @param loc Source location information.
125 @return The thread number of the calling thread in the innermost active parallel
126 construct.
127 */
128 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
129   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
130   return __kmp_tid_from_gtid(__kmp_entry_gtid());
131 }
132 
133 /*!
134 @ingroup THREAD_STATES
135 @param loc Source location information.
136 @return The number of threads in the innermost active parallel construct.
137 */
138 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
139   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
140 
141   return __kmp_entry_thread()->th.th_team->t.t_nproc;
142 }
143 
144 /*!
145  * @ingroup DEPRECATED
146  * @param loc location description
147  *
148  * This function need not be called. It always returns TRUE.
149  */
150 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
151 #ifndef KMP_DEBUG
152 
153   return TRUE;
154 
155 #else
156 
157   const char *semi2;
158   const char *semi3;
159   int line_no;
160 
161   if (__kmp_par_range == 0) {
162     return TRUE;
163   }
164   semi2 = loc->psource;
165   if (semi2 == NULL) {
166     return TRUE;
167   }
168   semi2 = strchr(semi2, ';');
169   if (semi2 == NULL) {
170     return TRUE;
171   }
172   semi2 = strchr(semi2 + 1, ';');
173   if (semi2 == NULL) {
174     return TRUE;
175   }
176   if (__kmp_par_range_filename[0]) {
177     const char *name = semi2 - 1;
178     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
179       name--;
180     }
181     if ((*name == '/') || (*name == ';')) {
182       name++;
183     }
184     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
185       return __kmp_par_range < 0;
186     }
187   }
188   semi3 = strchr(semi2 + 1, ';');
189   if (__kmp_par_range_routine[0]) {
190     if ((semi3 != NULL) && (semi3 > semi2) &&
191         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
192       return __kmp_par_range < 0;
193     }
194   }
195   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
196     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
197       return __kmp_par_range > 0;
198     }
199     return __kmp_par_range < 0;
200   }
201   return TRUE;
202 
203 #endif /* KMP_DEBUG */
204 }
205 
206 /*!
207 @ingroup THREAD_STATES
208 @param loc Source location information.
209 @return 1 if this thread is executing inside an active parallel region, zero if
210 not.
211 */
212 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
213   return __kmp_entry_thread()->th.th_root->r.r_active;
214 }
215 
216 /*!
217 @ingroup PARALLEL
218 @param loc source location information
219 @param global_tid global thread number
220 @param num_threads number of threads requested for this parallel construct
221 
222 Set the number of threads to be used by the next fork spawned by this thread.
223 This call is only required if the parallel construct has a `num_threads` clause.
224 */
225 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
226                              kmp_int32 num_threads) {
227   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
228                 global_tid, num_threads));
229 
230   __kmp_push_num_threads(loc, global_tid, num_threads);
231 }
232 
233 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
234   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
235 
236   /* the num_threads are automatically popped */
237 }
238 
239 #if OMP_40_ENABLED
240 
241 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
242                            kmp_int32 proc_bind) {
243   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
244                 proc_bind));
245 
246   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
247 }
248 
249 #endif /* OMP_40_ENABLED */
250 
251 /*!
252 @ingroup PARALLEL
253 @param loc  source location information
254 @param argc  total number of arguments in the ellipsis
255 @param microtask  pointer to callback routine consisting of outlined parallel
256 construct
257 @param ...  pointers to shared variables that aren't global
258 
259 Do the actual fork and call the microtask in the relevant number of threads.
260 */
261 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
262   int gtid = __kmp_entry_gtid();
263 
264 #if (KMP_STATS_ENABLED)
265   // If we were in a serial region, then stop the serial timer, record
266   // the event, and start parallel region timer
267   stats_state_e previous_state = KMP_GET_THREAD_STATE();
268   if (previous_state == stats_state_e::SERIAL_REGION) {
269     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
270   } else {
271     KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
272   }
273   int inParallel = __kmpc_in_parallel(loc);
274   if (inParallel) {
275     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
276   } else {
277     KMP_COUNT_BLOCK(OMP_PARALLEL);
278   }
279 #endif
280 
281   // maybe to save thr_state is enough here
282   {
283     va_list ap;
284     va_start(ap, microtask);
285 
286 #if OMPT_SUPPORT
287     omp_frame_t *ompt_frame;
288     if (ompt_enabled.enabled) {
289       kmp_info_t *master_th = __kmp_threads[gtid];
290       kmp_team_t *parent_team = master_th->th.th_team;
291       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
292       if (lwt)
293         ompt_frame = &(lwt->ompt_task_info.frame);
294       else {
295         int tid = __kmp_tid_from_gtid(gtid);
296         ompt_frame = &(
297             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
298       }
299       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
300       OMPT_STORE_RETURN_ADDRESS(gtid);
301     }
302 #endif
303 
304 #if INCLUDE_SSC_MARKS
305     SSC_MARK_FORKING();
306 #endif
307     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
308                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
309                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
310 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
311 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
312                     &ap
313 #else
314                     ap
315 #endif
316                     );
317 #if INCLUDE_SSC_MARKS
318     SSC_MARK_JOINING();
319 #endif
320     __kmp_join_call(loc, gtid
321 #if OMPT_SUPPORT
322                     ,
323                     fork_context_intel
324 #endif
325                     );
326 
327     va_end(ap);
328   }
329 
330 #if KMP_STATS_ENABLED
331   if (previous_state == stats_state_e::SERIAL_REGION) {
332     KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
333   } else {
334     KMP_POP_PARTITIONED_TIMER();
335   }
336 #endif // KMP_STATS_ENABLED
337 }
338 
339 #if OMP_40_ENABLED
340 /*!
341 @ingroup PARALLEL
342 @param loc source location information
343 @param global_tid global thread number
344 @param num_teams number of teams requested for the teams construct
345 @param num_threads number of threads per team requested for the teams construct
346 
347 Set the number of teams to be used by the teams construct.
348 This call is only required if the teams construct has a `num_teams` clause
349 or a `thread_limit` clause (or both).
350 */
351 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
352                            kmp_int32 num_teams, kmp_int32 num_threads) {
353   KA_TRACE(20,
354            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
355             global_tid, num_teams, num_threads));
356 
357   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
358 }
359 
360 /*!
361 @ingroup PARALLEL
362 @param loc  source location information
363 @param argc  total number of arguments in the ellipsis
364 @param microtask  pointer to callback routine consisting of outlined teams
365 construct
366 @param ...  pointers to shared variables that aren't global
367 
368 Do the actual fork and call the microtask in the relevant number of threads.
369 */
370 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
371                        ...) {
372   int gtid = __kmp_entry_gtid();
373   kmp_info_t *this_thr = __kmp_threads[gtid];
374   va_list ap;
375   va_start(ap, microtask);
376 
377   KMP_COUNT_BLOCK(OMP_TEAMS);
378 
379   // remember teams entry point and nesting level
380   this_thr->th.th_teams_microtask = microtask;
381   this_thr->th.th_teams_level =
382       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
383 
384 #if OMPT_SUPPORT
385   kmp_team_t *parent_team = this_thr->th.th_team;
386   int tid = __kmp_tid_from_gtid(gtid);
387   if (ompt_enabled.enabled) {
388     parent_team->t.t_implicit_task_taskdata[tid]
389         .ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
390   }
391   OMPT_STORE_RETURN_ADDRESS(gtid);
392 #endif
393 
394   // check if __kmpc_push_num_teams called, set default number of teams
395   // otherwise
396   if (this_thr->th.th_teams_size.nteams == 0) {
397     __kmp_push_num_teams(loc, gtid, 0, 0);
398   }
399   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
400   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
401   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
402 
403   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
404                   VOLATILE_CAST(microtask_t)
405                       __kmp_teams_master, // "wrapped" task
406                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
407 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
408                   &ap
409 #else
410                   ap
411 #endif
412                   );
413   __kmp_join_call(loc, gtid
414 #if OMPT_SUPPORT
415                   ,
416                   fork_context_intel
417 #endif
418                   );
419 
420   this_thr->th.th_teams_microtask = NULL;
421   this_thr->th.th_teams_level = 0;
422   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
423   va_end(ap);
424 }
425 #endif /* OMP_40_ENABLED */
426 
427 // I don't think this function should ever have been exported.
428 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
429 // openmp code ever called it, but it's been exported from the RTL for so
430 // long that I'm afraid to remove the definition.
431 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
432 
433 /*!
434 @ingroup PARALLEL
435 @param loc  source location information
436 @param global_tid  global thread number
437 
438 Enter a serialized parallel construct. This interface is used to handle a
439 conditional parallel region, like this,
440 @code
441 #pragma omp parallel if (condition)
442 @endcode
443 when the condition is false.
444 */
445 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
446 // The implementation is now in kmp_runtime.cpp so that it can share static
447 // functions with kmp_fork_call since the tasks to be done are similar in
448 // each case.
449 #if OMPT_SUPPORT
450   OMPT_STORE_RETURN_ADDRESS(global_tid);
451 #endif
452   __kmp_serialized_parallel(loc, global_tid);
453 }
454 
455 /*!
456 @ingroup PARALLEL
457 @param loc  source location information
458 @param global_tid  global thread number
459 
460 Leave a serialized parallel construct.
461 */
462 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
463   kmp_internal_control_t *top;
464   kmp_info_t *this_thr;
465   kmp_team_t *serial_team;
466 
467   KC_TRACE(10,
468            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
469 
470   /* skip all this code for autopar serialized loops since it results in
471      unacceptable overhead */
472   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
473     return;
474 
475   // Not autopar code
476   if (!TCR_4(__kmp_init_parallel))
477     __kmp_parallel_initialize();
478 
479   this_thr = __kmp_threads[global_tid];
480   serial_team = this_thr->th.th_serial_team;
481 
482 #if OMP_45_ENABLED
483   kmp_task_team_t *task_team = this_thr->th.th_task_team;
484 
485   // we need to wait for the proxy tasks before finishing the thread
486   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
487     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
488 #endif
489 
490   KMP_MB();
491   KMP_DEBUG_ASSERT(serial_team);
492   KMP_ASSERT(serial_team->t.t_serialized);
493   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
494   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
495   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
496   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
497 
498 #if OMPT_SUPPORT
499   if (ompt_enabled.enabled &&
500       this_thr->th.ompt_thread_info.state != omp_state_overhead) {
501     OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = NULL;
502     if (ompt_enabled.ompt_callback_implicit_task) {
503       ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
504           ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
505           OMPT_CUR_TASK_INFO(this_thr)->thread_num);
506     }
507 
508     // reset clear the task id only after unlinking the task
509     ompt_data_t *parent_task_data;
510     __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
511 
512     if (ompt_enabled.ompt_callback_parallel_end) {
513       ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
514           &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
515           ompt_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
516     }
517     __ompt_lw_taskteam_unlink(this_thr);
518     this_thr->th.ompt_thread_info.state = omp_state_overhead;
519   }
520 #endif
521 
522   /* If necessary, pop the internal control stack values and replace the team
523    * values */
524   top = serial_team->t.t_control_stack_top;
525   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
526     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
527     serial_team->t.t_control_stack_top = top->next;
528     __kmp_free(top);
529   }
530 
531   // if( serial_team -> t.t_serialized > 1 )
532   serial_team->t.t_level--;
533 
534   /* pop dispatch buffers stack */
535   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
536   {
537     dispatch_private_info_t *disp_buffer =
538         serial_team->t.t_dispatch->th_disp_buffer;
539     serial_team->t.t_dispatch->th_disp_buffer =
540         serial_team->t.t_dispatch->th_disp_buffer->next;
541     __kmp_free(disp_buffer);
542   }
543 
544   --serial_team->t.t_serialized;
545   if (serial_team->t.t_serialized == 0) {
546 
547 /* return to the parallel section */
548 
549 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
550     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
551       __kmp_clear_x87_fpu_status_word();
552       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
553       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
554     }
555 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
556 
557     this_thr->th.th_team = serial_team->t.t_parent;
558     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
559 
560     /* restore values cached in the thread */
561     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
562     this_thr->th.th_team_master =
563         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
564     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
565 
566     /* TODO the below shouldn't need to be adjusted for serialized teams */
567     this_thr->th.th_dispatch =
568         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
569 
570     __kmp_pop_current_task_from_thread(this_thr);
571 
572     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
573     this_thr->th.th_current_task->td_flags.executing = 1;
574 
575     if (__kmp_tasking_mode != tskm_immediate_exec) {
576       // Copy the task team from the new child / old parent team to the thread.
577       this_thr->th.th_task_team =
578           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
579       KA_TRACE(20,
580                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
581                 "team %p\n",
582                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
583     }
584   } else {
585     if (__kmp_tasking_mode != tskm_immediate_exec) {
586       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
587                     "depth of serial team %p to %d\n",
588                     global_tid, serial_team, serial_team->t.t_serialized));
589     }
590   }
591 
592   if (__kmp_env_consistency_check)
593     __kmp_pop_parallel(global_tid, NULL);
594 #if OMPT_SUPPORT
595   if (ompt_enabled.enabled)
596     this_thr->th.ompt_thread_info.state =
597         ((this_thr->th.th_team_serialized) ? omp_state_work_serial
598                                            : omp_state_work_parallel);
599 #endif
600 }
601 
602 /*!
603 @ingroup SYNCHRONIZATION
604 @param loc  source location information.
605 
606 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
607 depending on the memory ordering convention obeyed by the compiler
608 even that may not be necessary).
609 */
610 void __kmpc_flush(ident_t *loc) {
611   KC_TRACE(10, ("__kmpc_flush: called\n"));
612 
613   /* need explicit __mf() here since use volatile instead in library */
614   KMP_MB(); /* Flush all pending memory write invalidates.  */
615 
616 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
617 #if KMP_MIC
618 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
619 // We shouldn't need it, though, since the ABI rules require that
620 // * If the compiler generates NGO stores it also generates the fence
621 // * If users hand-code NGO stores they should insert the fence
622 // therefore no incomplete unordered stores should be visible.
623 #else
624   // C74404
625   // This is to address non-temporal store instructions (sfence needed).
626   // The clflush instruction is addressed either (mfence needed).
627   // Probably the non-temporal load monvtdqa instruction should also be
628   // addressed.
629   // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
630   if (!__kmp_cpuinfo.initialized) {
631     __kmp_query_cpuid(&__kmp_cpuinfo);
632   }
633   if (!__kmp_cpuinfo.sse2) {
634     // CPU cannot execute SSE2 instructions.
635   } else {
636 #if KMP_COMPILER_ICC
637     _mm_mfence();
638 #elif KMP_COMPILER_MSVC
639     MemoryBarrier();
640 #else
641     __sync_synchronize();
642 #endif // KMP_COMPILER_ICC
643   }
644 #endif // KMP_MIC
645 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
646 // Nothing to see here move along
647 #elif KMP_ARCH_PPC64
648 // Nothing needed here (we have a real MB above).
649 #if KMP_OS_CNK
650   // The flushing thread needs to yield here; this prevents a
651   // busy-waiting thread from saturating the pipeline. flush is
652   // often used in loops like this:
653   // while (!flag) {
654   //   #pragma omp flush(flag)
655   // }
656   // and adding the yield here is good for at least a 10x speedup
657   // when running >2 threads per core (on the NAS LU benchmark).
658   __kmp_yield(TRUE);
659 #endif
660 #else
661 #error Unknown or unsupported architecture
662 #endif
663 
664 #if OMPT_SUPPORT && OMPT_OPTIONAL
665   if (ompt_enabled.ompt_callback_flush) {
666     ompt_callbacks.ompt_callback(ompt_callback_flush)(
667         __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
668   }
669 #endif
670 }
671 
672 /* -------------------------------------------------------------------------- */
673 /*!
674 @ingroup SYNCHRONIZATION
675 @param loc source location information
676 @param global_tid thread id.
677 
678 Execute a barrier.
679 */
680 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
681   KMP_COUNT_BLOCK(OMP_BARRIER);
682   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
683 
684   if (!TCR_4(__kmp_init_parallel))
685     __kmp_parallel_initialize();
686 
687   if (__kmp_env_consistency_check) {
688     if (loc == 0) {
689       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
690     }
691 
692     __kmp_check_barrier(global_tid, ct_barrier, loc);
693   }
694 
695 #if OMPT_SUPPORT
696   omp_frame_t *ompt_frame;
697   if (ompt_enabled.enabled) {
698     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
699     if (ompt_frame->enter_frame == NULL)
700       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
701     OMPT_STORE_RETURN_ADDRESS(global_tid);
702   }
703 #endif
704   __kmp_threads[global_tid]->th.th_ident = loc;
705   // TODO: explicit barrier_wait_id:
706   //   this function is called when 'barrier' directive is present or
707   //   implicit barrier at the end of a worksharing construct.
708   // 1) better to add a per-thread barrier counter to a thread data structure
709   // 2) set to 0 when a new team is created
710   // 4) no sync is required
711 
712   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
713 #if OMPT_SUPPORT && OMPT_OPTIONAL
714   if (ompt_enabled.enabled) {
715     ompt_frame->enter_frame = NULL;
716   }
717 #endif
718 }
719 
720 /* The BARRIER for a MASTER section is always explicit   */
721 /*!
722 @ingroup WORK_SHARING
723 @param loc  source location information.
724 @param global_tid  global thread number .
725 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
726 */
727 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
728   int status = 0;
729 
730   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
731 
732   if (!TCR_4(__kmp_init_parallel))
733     __kmp_parallel_initialize();
734 
735   if (KMP_MASTER_GTID(global_tid)) {
736     KMP_COUNT_BLOCK(OMP_MASTER);
737     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
738     status = 1;
739   }
740 
741 #if OMPT_SUPPORT && OMPT_OPTIONAL
742   if (status) {
743     if (ompt_enabled.ompt_callback_master) {
744       kmp_info_t *this_thr = __kmp_threads[global_tid];
745       kmp_team_t *team = this_thr->th.th_team;
746 
747       int tid = __kmp_tid_from_gtid(global_tid);
748       ompt_callbacks.ompt_callback(ompt_callback_master)(
749           ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
750           &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
751           OMPT_GET_RETURN_ADDRESS(0));
752     }
753   }
754 #endif
755 
756   if (__kmp_env_consistency_check) {
757 #if KMP_USE_DYNAMIC_LOCK
758     if (status)
759       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
760     else
761       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
762 #else
763     if (status)
764       __kmp_push_sync(global_tid, ct_master, loc, NULL);
765     else
766       __kmp_check_sync(global_tid, ct_master, loc, NULL);
767 #endif
768   }
769 
770   return status;
771 }
772 
773 /*!
774 @ingroup WORK_SHARING
775 @param loc  source location information.
776 @param global_tid  global thread number .
777 
778 Mark the end of a <tt>master</tt> region. This should only be called by the
779 thread that executes the <tt>master</tt> region.
780 */
781 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
782   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
783 
784   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
785   KMP_POP_PARTITIONED_TIMER();
786 
787 #if OMPT_SUPPORT && OMPT_OPTIONAL
788   kmp_info_t *this_thr = __kmp_threads[global_tid];
789   kmp_team_t *team = this_thr->th.th_team;
790   if (ompt_enabled.ompt_callback_master) {
791     int tid = __kmp_tid_from_gtid(global_tid);
792     ompt_callbacks.ompt_callback(ompt_callback_master)(
793         ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
794         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
795         OMPT_GET_RETURN_ADDRESS(0));
796   }
797 #endif
798 
799   if (__kmp_env_consistency_check) {
800     if (global_tid < 0)
801       KMP_WARNING(ThreadIdentInvalid);
802 
803     if (KMP_MASTER_GTID(global_tid))
804       __kmp_pop_sync(global_tid, ct_master, loc);
805   }
806 }
807 
808 /*!
809 @ingroup WORK_SHARING
810 @param loc  source location information.
811 @param gtid  global thread number.
812 
813 Start execution of an <tt>ordered</tt> construct.
814 */
815 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
816   int cid = 0;
817   kmp_info_t *th;
818   KMP_DEBUG_ASSERT(__kmp_init_serial);
819 
820   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
821 
822   if (!TCR_4(__kmp_init_parallel))
823     __kmp_parallel_initialize();
824 
825 #if USE_ITT_BUILD
826   __kmp_itt_ordered_prep(gtid);
827 // TODO: ordered_wait_id
828 #endif /* USE_ITT_BUILD */
829 
830   th = __kmp_threads[gtid];
831 
832 #if OMPT_SUPPORT && OMPT_OPTIONAL
833   kmp_team_t *team;
834   omp_wait_id_t lck;
835   void *codeptr_ra;
836   if (ompt_enabled.enabled) {
837     OMPT_STORE_RETURN_ADDRESS(gtid);
838     team = __kmp_team_from_gtid(gtid);
839     lck = (omp_wait_id_t)&team->t.t_ordered.dt.t_value;
840     /* OMPT state update */
841     th->th.ompt_thread_info.wait_id = lck;
842     th->th.ompt_thread_info.state = omp_state_wait_ordered;
843 
844     /* OMPT event callback */
845     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
846     if (ompt_enabled.ompt_callback_mutex_acquire) {
847       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
848           ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin,
849           (omp_wait_id_t)lck, codeptr_ra);
850     }
851   }
852 #endif
853 
854   if (th->th.th_dispatch->th_deo_fcn != 0)
855     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
856   else
857     __kmp_parallel_deo(&gtid, &cid, loc);
858 
859 #if OMPT_SUPPORT && OMPT_OPTIONAL
860   if (ompt_enabled.enabled) {
861     /* OMPT state update */
862     th->th.ompt_thread_info.state = omp_state_work_parallel;
863     th->th.ompt_thread_info.wait_id = 0;
864 
865     /* OMPT event callback */
866     if (ompt_enabled.ompt_callback_mutex_acquired) {
867       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
868           ompt_mutex_ordered, (omp_wait_id_t)lck, codeptr_ra);
869     }
870   }
871 #endif
872 
873 #if USE_ITT_BUILD
874   __kmp_itt_ordered_start(gtid);
875 #endif /* USE_ITT_BUILD */
876 }
877 
878 /*!
879 @ingroup WORK_SHARING
880 @param loc  source location information.
881 @param gtid  global thread number.
882 
883 End execution of an <tt>ordered</tt> construct.
884 */
885 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
886   int cid = 0;
887   kmp_info_t *th;
888 
889   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
890 
891 #if USE_ITT_BUILD
892   __kmp_itt_ordered_end(gtid);
893 // TODO: ordered_wait_id
894 #endif /* USE_ITT_BUILD */
895 
896   th = __kmp_threads[gtid];
897 
898   if (th->th.th_dispatch->th_dxo_fcn != 0)
899     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
900   else
901     __kmp_parallel_dxo(&gtid, &cid, loc);
902 
903 #if OMPT_SUPPORT && OMPT_OPTIONAL
904   OMPT_STORE_RETURN_ADDRESS(gtid);
905   if (ompt_enabled.ompt_callback_mutex_released) {
906     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
907         ompt_mutex_ordered,
908         (omp_wait_id_t)&__kmp_team_from_gtid(gtid)->t.t_ordered.dt.t_value,
909         OMPT_LOAD_RETURN_ADDRESS(gtid));
910   }
911 #endif
912 }
913 
914 #if KMP_USE_DYNAMIC_LOCK
915 
916 static __forceinline void
917 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
918                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
919   // Pointer to the allocated indirect lock is written to crit, while indexing
920   // is ignored.
921   void *idx;
922   kmp_indirect_lock_t **lck;
923   lck = (kmp_indirect_lock_t **)crit;
924   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
925   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
926   KMP_SET_I_LOCK_LOCATION(ilk, loc);
927   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
928   KA_TRACE(20,
929            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
930 #if USE_ITT_BUILD
931   __kmp_itt_critical_creating(ilk->lock, loc);
932 #endif
933   int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
934   if (status == 0) {
935 #if USE_ITT_BUILD
936     __kmp_itt_critical_destroyed(ilk->lock);
937 #endif
938     // We don't really need to destroy the unclaimed lock here since it will be
939     // cleaned up at program exit.
940     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
941   }
942   KMP_DEBUG_ASSERT(*lck != NULL);
943 }
944 
// Fast-path acquire tas lock
//
// Expands to a brace-enclosed block that treats `lock` as a kmp_tas_lock_t and
// acquires it for thread `gtid`:
//  - the "busy" value stored in lk.poll encodes gtid + 1;
//  - the first attempt is a relaxed load of lk.poll compared against the free
//    value, followed by __kmp_atomic_compare_store_acq(free -> busy);
//  - if that fails, KMP_FSYNC_PREPARE is issued, the spin counter is
//    initialized and the thread yields: KMP_YIELD(TRUE) when __kmp_nth exceeds
//    the processor count (__kmp_avail_proc if non-zero, else __kmp_xproc),
//    KMP_YIELD_SPIN otherwise;
//  - it then retries the same load + compare-store in a loop, calling
//    __kmp_spin_backoff() and yielding the same way between attempts;
//  - KMP_FSYNC_ACQUIRED is issued once the lock is held.
// `lock` and `gtid` are substituted without parentheses, so callers should
// pass simple expressions. Only /* */ comments may be added inside the body
// because every line ends in a continuation backslash.
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
      kmp_uint32 spins;                                                        \
      KMP_FSYNC_PREPARE(l);                                                    \
      KMP_INIT_YIELD(spins);                                                   \
      if (TCR_4(__kmp_nth) >                                                   \
          (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
        KMP_YIELD(TRUE);                                                       \
      } else {                                                                 \
        KMP_YIELD_SPIN(spins);                                                 \
      }                                                                        \
      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
      while (                                                                  \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {  \
        __kmp_spin_backoff(&backoff);                                          \
        if (TCR_4(__kmp_nth) >                                                 \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
          KMP_YIELD(TRUE);                                                     \
        } else {                                                               \
          KMP_YIELD_SPIN(spins);                                               \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(l);                                                     \
  }
977 
// Fast-path test tas lock
//
// Non-blocking attempt to take the TAS lock `lock` for thread `gtid`. `rc` must
// be an assignable expression; it receives the result of "lk.poll currently
// holds the free value AND the compare-store from free to busy succeeded".
// The busy value encodes gtid + 1. No spinning or yielding is done here.
#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
  }
987 
// Fast-path release tas lock
//
// Stores the TAS free value into lk.poll of `lock` using KMP_ATOMIC_ST_REL.
// The `gtid` argument is accepted for symmetry with the other lock macros but
// is not used by the expansion.
#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
991 
992 #if KMP_USE_FUTEX
993 
994 #include <sys/syscall.h>
995 #include <unistd.h>
996 #ifndef FUTEX_WAIT
997 #define FUTEX_WAIT 0
998 #endif
999 #ifndef FUTEX_WAKE
1000 #define FUTEX_WAKE 1
1001 #endif
1002 
// Fast-path acquire futex lock
//
// Expands to a brace-enclosed block that acquires the futex lock `lock` for
// thread `gtid`. The owner code written into lk.poll is (gtid + 1) << 1, which
// leaves bit 0 of the stripped value free; the release macro tests that bit
// before issuing FUTEX_WAKE.
//
// Loop, until the compare-and-store from the free value returns the free value:
//  - if bit 0 of the observed (stripped) value is clear, try to set it with a
//    second compare-and-store and mirror that into the local poll_val;
//  - call futex(FUTEX_WAIT) on lk.poll with poll_val as the expected value;
//    a non-zero return restarts the loop;
//  - after a successful wait, set bit 0 in gtid_code so the value this thread
//    eventually installs keeps that bit set.
//
// NOTE(review): the inner test uses `!KMP_COMPARE_AND_STORE_RET32(...)`. The
// outer loop compares the same macro's result against the free value, which
// suggests it yields the previously stored value rather than a success flag;
// if so, the negation does not detect a failed store -- confirm against the
// macro's definition.
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
    KMP_MB();                                                                  \
    KMP_FSYNC_PREPARE(ftx);                                                    \
    kmp_int32 poll_val;                                                        \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
      if (!cond) {                                                             \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
                                         poll_val |                            \
                                             KMP_LOCK_BUSY(1, futex))) {       \
          continue;                                                            \
        }                                                                      \
        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
      }                                                                        \
      kmp_int32 rc;                                                            \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
                        NULL, NULL, 0)) != 0) {                                \
        continue;                                                              \
      }                                                                        \
      gtid_code |= 1;                                                          \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(ftx);                                                   \
  }
1032 
// Fast-path test futex lock
//
// Non-blocking attempt to take the futex lock `lock` for thread `gtid`: a
// single compare-and-store from the free value to the owner code
// ((gtid + 1) << 1, the same encoding KMP_ACQUIRE_FUTEX_LOCK uses). `rc` must
// be an assignable expression; it is set to TRUE on success (after issuing
// KMP_FSYNC_ACQUIRED) and FALSE otherwise.
//
// The `lock` and `gtid` arguments are parenthesized and the shift is written
// explicitly so that the expansion does not depend on operator precedence of
// the caller's argument expressions.
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)(lock);                        \
    if (KMP_COMPARE_AND_STORE_ACQ32(                                           \
            &(ftx->lk.poll), KMP_LOCK_FREE(futex),                             \
            KMP_LOCK_BUSY(((gtid) + 1) << 1, futex))) {                        \
      KMP_FSYNC_ACQUIRED(ftx);                                                 \
      rc = TRUE;                                                               \
    } else {                                                                   \
      rc = FALSE;                                                              \
    }                                                                          \
  }
1045 
// Fast-path release futex lock
//
// Expands to a brace-enclosed block that releases the futex lock `lock`:
//  - atomically exchanges lk.poll with the free value;
//  - if bit 0 of the previous (stripped) value was set, issues
//    futex(FUTEX_WAKE) on lk.poll, passing KMP_LOCK_BUSY(1, futex) as the
//    value argument;
//  - finally yields when __kmp_nth exceeds the processor count
//    (__kmp_avail_proc if non-zero, else __kmp_xproc).
// The `gtid` argument is not used by the expansion. The return value of the
// futex syscall is ignored.
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    KMP_MB();                                                                  \
    KMP_FSYNC_RELEASING(ftx);                                                  \
    kmp_int32 poll_val =                                                       \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
    }                                                                          \
    KMP_MB();                                                                  \
    KMP_YIELD(TCR_4(__kmp_nth) >                                               \
              (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
  }
1062 
1063 #endif // KMP_USE_FUTEX
1064 
1065 #else // KMP_USE_DYNAMIC_LOCK
1066 
1067 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1068                                                       ident_t const *loc,
1069                                                       kmp_int32 gtid) {
1070   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1071 
1072   // Because of the double-check, the following load doesn't need to be volatile
1073   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1074 
1075   if (lck == NULL) {
1076     void *idx;
1077 
1078     // Allocate & initialize the lock.
1079     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1080     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1081     __kmp_init_user_lock_with_checks(lck);
1082     __kmp_set_user_lock_location(lck, loc);
1083 #if USE_ITT_BUILD
1084     __kmp_itt_critical_creating(lck);
1085 // __kmp_itt_critical_creating() should be called *before* the first usage
1086 // of underlying lock. It is the only place where we can guarantee it. There
1087 // are chances the lock will destroyed with no usage, but it is not a
1088 // problem, because this is not real event seen by user but rather setting
1089 // name for object (lock). See more details in kmp_itt.h.
1090 #endif /* USE_ITT_BUILD */
1091 
1092     // Use a cmpxchg instruction to slam the start of the critical section with
1093     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1094     // and use the lock that the other thread allocated.
1095     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1096 
1097     if (status == 0) {
1098 // Deallocate the lock and reload the value.
1099 #if USE_ITT_BUILD
1100       __kmp_itt_critical_destroyed(lck);
1101 // Let ITT know the lock is destroyed and the same memory location may be reused
1102 // for another purpose.
1103 #endif /* USE_ITT_BUILD */
1104       __kmp_destroy_user_lock_with_checks(lck);
1105       __kmp_user_lock_free(&idx, gtid, lck);
1106       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1107       KMP_DEBUG_ASSERT(lck != NULL);
1108     }
1109   }
1110   return lck;
1111 }
1112 
1113 #endif // KMP_USE_DYNAMIC_LOCK
1114 
1115 /*!
1116 @ingroup WORK_SHARING
1117 @param loc  source location information.
1118 @param global_tid  global thread number .
1119 @param crit identity of the critical section. This could be a pointer to a lock
1120 associated with the critical section, or some other suitably unique value.
1121 
1122 Enter code protected by a `critical` construct.
1123 This function blocks until the executing thread can enter the critical section.
1124 */
1125 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1126                      kmp_critical_name *crit) {
1127 #if KMP_USE_DYNAMIC_LOCK
1128 #if OMPT_SUPPORT && OMPT_OPTIONAL
1129   OMPT_STORE_RETURN_ADDRESS(global_tid);
1130 #endif // OMPT_SUPPORT
1131   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1132 #else
1133   KMP_COUNT_BLOCK(OMP_CRITICAL);
1134 #if OMPT_SUPPORT && OMPT_OPTIONAL
1135   omp_state_t prev_state = omp_state_undefined;
1136   ompt_thread_info_t ti;
1137 #endif
1138   kmp_user_lock_p lck;
1139 
1140   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1141 
1142   // TODO: add THR_OVHD_STATE
1143 
1144   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1145   KMP_CHECK_USER_LOCK_INIT();
1146 
1147   if ((__kmp_user_lock_kind == lk_tas) &&
1148       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1149     lck = (kmp_user_lock_p)crit;
1150   }
1151 #if KMP_USE_FUTEX
1152   else if ((__kmp_user_lock_kind == lk_futex) &&
1153            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1154     lck = (kmp_user_lock_p)crit;
1155   }
1156 #endif
1157   else { // ticket, queuing or drdpa
1158     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1159   }
1160 
1161   if (__kmp_env_consistency_check)
1162     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1163 
1164 // since the critical directive binds to all threads, not just the current
1165 // team we have to check this even if we are in a serialized team.
1166 // also, even if we are the uber thread, we still have to conduct the lock,
1167 // as we have to contend with sibling threads.
1168 
1169 #if USE_ITT_BUILD
1170   __kmp_itt_critical_acquiring(lck);
1171 #endif /* USE_ITT_BUILD */
1172 #if OMPT_SUPPORT && OMPT_OPTIONAL
1173   OMPT_STORE_RETURN_ADDRESS(gtid);
1174   void *codeptr_ra = NULL;
1175   if (ompt_enabled.enabled) {
1176     ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1177     /* OMPT state update */
1178     prev_state = ti.state;
1179     ti.wait_id = (omp_wait_id_t)lck;
1180     ti.state = omp_state_wait_critical;
1181 
1182     /* OMPT event callback */
1183     codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1184     if (ompt_enabled.ompt_callback_mutex_acquire) {
1185       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1186           ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1187           (omp_wait_id_t)crit, codeptr_ra);
1188     }
1189   }
1190 #endif
1191   // Value of 'crit' should be good for using as a critical_id of the critical
1192   // section directive.
1193   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1194 
1195 #if USE_ITT_BUILD
1196   __kmp_itt_critical_acquired(lck);
1197 #endif /* USE_ITT_BUILD */
1198 #if OMPT_SUPPORT && OMPT_OPTIONAL
1199   if (ompt_enabled.enabled) {
1200     /* OMPT state update */
1201     ti.state = prev_state;
1202     ti.wait_id = 0;
1203 
1204     /* OMPT event callback */
1205     if (ompt_enabled.ompt_callback_mutex_acquired) {
1206       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1207           ompt_mutex_critical, (omp_wait_id_t)crit, codeptr_ra);
1208     }
1209   }
1210 #endif
1211   KMP_POP_PARTITIONED_TIMER();
1212 
1213   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1214   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1215 #endif // KMP_USE_DYNAMIC_LOCK
1216 }
1217 
1218 #if KMP_USE_DYNAMIC_LOCK
1219 
1220 // Converts the given hint to an internal lock implementation
1221 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1222 #if KMP_USE_TSX
1223 #define KMP_TSX_LOCK(seq) lockseq_##seq
1224 #else
1225 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1226 #endif
1227 
1228 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1229 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1230 #else
1231 #define KMP_CPUINFO_RTM 0
1232 #endif
1233 
1234   // Hints that do not require further logic
1235   if (hint & kmp_lock_hint_hle)
1236     return KMP_TSX_LOCK(hle);
1237   if (hint & kmp_lock_hint_rtm)
1238     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1239   if (hint & kmp_lock_hint_adaptive)
1240     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1241 
1242   // Rule out conflicting hints first by returning the default lock
1243   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1244     return __kmp_user_lock_seq;
1245   if ((hint & omp_lock_hint_speculative) &&
1246       (hint & omp_lock_hint_nonspeculative))
1247     return __kmp_user_lock_seq;
1248 
1249   // Do not even consider speculation when it appears to be contended
1250   if (hint & omp_lock_hint_contended)
1251     return lockseq_queuing;
1252 
1253   // Uncontended lock without speculation
1254   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1255     return lockseq_tas;
1256 
1257   // HLE lock for speculation
1258   if (hint & omp_lock_hint_speculative)
1259     return KMP_TSX_LOCK(hle);
1260 
1261   return __kmp_user_lock_seq;
1262 }
1263 
1264 #if OMPT_SUPPORT && OMPT_OPTIONAL
1265 static kmp_mutex_impl_t
1266 __ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
1267   if (user_lock) {
1268     switch (KMP_EXTRACT_D_TAG(user_lock)) {
1269     case 0:
1270       break;
1271 #if KMP_USE_FUTEX
1272     case locktag_futex:
1273       return kmp_mutex_impl_queuing;
1274 #endif
1275     case locktag_tas:
1276       return kmp_mutex_impl_spin;
1277 #if KMP_USE_TSX
1278     case locktag_hle:
1279       return kmp_mutex_impl_speculative;
1280 #endif
1281     default:
1282       return ompt_mutex_impl_unknown;
1283     }
1284     ilock = KMP_LOOKUP_I_LOCK(user_lock);
1285   }
1286   KMP_ASSERT(ilock);
1287   switch (ilock->type) {
1288 #if KMP_USE_TSX
1289   case locktag_adaptive:
1290   case locktag_rtm:
1291     return kmp_mutex_impl_speculative;
1292 #endif
1293   case locktag_nested_tas:
1294     return kmp_mutex_impl_spin;
1295 #if KMP_USE_FUTEX
1296   case locktag_nested_futex:
1297 #endif
1298   case locktag_ticket:
1299   case locktag_queuing:
1300   case locktag_drdpa:
1301   case locktag_nested_ticket:
1302   case locktag_nested_queuing:
1303   case locktag_nested_drdpa:
1304     return kmp_mutex_impl_queuing;
1305   default:
1306     return ompt_mutex_impl_unknown;
1307   }
1308 }
1309 
1310 // For locks without dynamic binding
1311 static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
1312   switch (__kmp_user_lock_kind) {
1313   case lk_tas:
1314     return kmp_mutex_impl_spin;
1315 #if KMP_USE_FUTEX
1316   case lk_futex:
1317 #endif
1318   case lk_ticket:
1319   case lk_queuing:
1320   case lk_drdpa:
1321     return kmp_mutex_impl_queuing;
1322 #if KMP_USE_TSX
1323   case lk_hle:
1324   case lk_rtm:
1325   case lk_adaptive:
1326     return kmp_mutex_impl_speculative;
1327 #endif
1328   default:
1329     return ompt_mutex_impl_unknown;
1330   }
1331 }
1332 #endif
1333 
1334 /*!
1335 @ingroup WORK_SHARING
1336 @param loc  source location information.
1337 @param global_tid  global thread number.
1338 @param crit identity of the critical section. This could be a pointer to a lock
1339 associated with the critical section, or some other suitably unique value.
1340 @param hint the lock hint.
1341 
1342 Enter code protected by a `critical` construct with a hint. The hint value is
1343 used to suggest a lock implementation. This function blocks until the executing
1344 thread can enter the critical section unless the hint suggests use of
1345 speculative execution and the hardware supports it.
1346 */
1347 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1348                                kmp_critical_name *crit, uintptr_t hint) {
1349   KMP_COUNT_BLOCK(OMP_CRITICAL);
1350   kmp_user_lock_p lck;
1351 #if OMPT_SUPPORT && OMPT_OPTIONAL
1352   omp_state_t prev_state = omp_state_undefined;
1353   ompt_thread_info_t ti;
1354   // This is the case, if called from __kmpc_critical:
1355   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1356   if (!codeptr)
1357     codeptr = OMPT_GET_RETURN_ADDRESS(0);
1358 #endif
1359 
1360   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1361 
1362   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1363   // Check if it is initialized.
1364   KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1365   if (*lk == 0) {
1366     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1367     if (KMP_IS_D_LOCK(lckseq)) {
1368       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1369                                   KMP_GET_D_TAG(lckseq));
1370     } else {
1371       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1372     }
1373   }
1374   // Branch for accessing the actual lock object and set operation. This
1375   // branching is inevitable since this lock initialization does not follow the
1376   // normal dispatch path (lock table is not used).
1377   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1378     lck = (kmp_user_lock_p)lk;
1379     if (__kmp_env_consistency_check) {
1380       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1381                       __kmp_map_hint_to_lock(hint));
1382     }
1383 #if USE_ITT_BUILD
1384     __kmp_itt_critical_acquiring(lck);
1385 #endif
1386 #if OMPT_SUPPORT && OMPT_OPTIONAL
1387     if (ompt_enabled.enabled) {
1388       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1389       /* OMPT state update */
1390       prev_state = ti.state;
1391       ti.wait_id = (omp_wait_id_t)lck;
1392       ti.state = omp_state_wait_critical;
1393 
1394       /* OMPT event callback */
1395       if (ompt_enabled.ompt_callback_mutex_acquire) {
1396         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1397             ompt_mutex_critical, (unsigned int)hint,
1398             __ompt_get_mutex_impl_type(crit), (omp_wait_id_t)crit, codeptr);
1399       }
1400     }
1401 #endif
1402 #if KMP_USE_INLINED_TAS
1403     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1404       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1405     } else
1406 #elif KMP_USE_INLINED_FUTEX
1407     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1408       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1409     } else
1410 #endif
1411     {
1412       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1413     }
1414   } else {
1415     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1416     lck = ilk->lock;
1417     if (__kmp_env_consistency_check) {
1418       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1419                       __kmp_map_hint_to_lock(hint));
1420     }
1421 #if USE_ITT_BUILD
1422     __kmp_itt_critical_acquiring(lck);
1423 #endif
1424 #if OMPT_SUPPORT && OMPT_OPTIONAL
1425     if (ompt_enabled.enabled) {
1426       ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1427       /* OMPT state update */
1428       prev_state = ti.state;
1429       ti.wait_id = (omp_wait_id_t)lck;
1430       ti.state = omp_state_wait_critical;
1431 
1432       /* OMPT event callback */
1433       if (ompt_enabled.ompt_callback_mutex_acquire) {
1434         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1435             ompt_mutex_critical, (unsigned int)hint,
1436             __ompt_get_mutex_impl_type(0, ilk), (omp_wait_id_t)crit, codeptr);
1437       }
1438     }
1439 #endif
1440     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1441   }
1442   KMP_POP_PARTITIONED_TIMER();
1443 
1444 #if USE_ITT_BUILD
1445   __kmp_itt_critical_acquired(lck);
1446 #endif /* USE_ITT_BUILD */
1447 #if OMPT_SUPPORT && OMPT_OPTIONAL
1448   if (ompt_enabled.enabled) {
1449     /* OMPT state update */
1450     ti.state = prev_state;
1451     ti.wait_id = 0;
1452 
1453     /* OMPT event callback */
1454     if (ompt_enabled.ompt_callback_mutex_acquired) {
1455       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1456           ompt_mutex_critical, (omp_wait_id_t)crit, codeptr);
1457     }
1458   }
1459 #endif
1460 
1461   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1462   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1463 } // __kmpc_critical_with_hint
1464 
1465 #endif // KMP_USE_DYNAMIC_LOCK
1466 
1467 /*!
1468 @ingroup WORK_SHARING
1469 @param loc  source location information.
1470 @param global_tid  global thread number .
1471 @param crit identity of the critical section. This could be a pointer to a lock
1472 associated with the critical section, or some other suitably unique value.
1473 
1474 Leave a critical section, releasing any lock that was held during its execution.
1475 */
1476 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1477                          kmp_critical_name *crit) {
1478   kmp_user_lock_p lck;
1479 
1480   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1481 
1482 #if KMP_USE_DYNAMIC_LOCK
1483   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1484     lck = (kmp_user_lock_p)crit;
1485     KMP_ASSERT(lck != NULL);
1486     if (__kmp_env_consistency_check) {
1487       __kmp_pop_sync(global_tid, ct_critical, loc);
1488     }
1489 #if USE_ITT_BUILD
1490     __kmp_itt_critical_releasing(lck);
1491 #endif
1492 #if KMP_USE_INLINED_TAS
1493     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1494       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1495     } else
1496 #elif KMP_USE_INLINED_FUTEX
1497     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1498       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1499     } else
1500 #endif
1501     {
1502       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1503     }
1504   } else {
1505     kmp_indirect_lock_t *ilk =
1506         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1507     KMP_ASSERT(ilk != NULL);
1508     lck = ilk->lock;
1509     if (__kmp_env_consistency_check) {
1510       __kmp_pop_sync(global_tid, ct_critical, loc);
1511     }
1512 #if USE_ITT_BUILD
1513     __kmp_itt_critical_releasing(lck);
1514 #endif
1515     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1516   }
1517 
1518 #else // KMP_USE_DYNAMIC_LOCK
1519 
1520   if ((__kmp_user_lock_kind == lk_tas) &&
1521       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1522     lck = (kmp_user_lock_p)crit;
1523   }
1524 #if KMP_USE_FUTEX
1525   else if ((__kmp_user_lock_kind == lk_futex) &&
1526            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1527     lck = (kmp_user_lock_p)crit;
1528   }
1529 #endif
1530   else { // ticket, queuing or drdpa
1531     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1532   }
1533 
1534   KMP_ASSERT(lck != NULL);
1535 
1536   if (__kmp_env_consistency_check)
1537     __kmp_pop_sync(global_tid, ct_critical, loc);
1538 
1539 #if USE_ITT_BUILD
1540   __kmp_itt_critical_releasing(lck);
1541 #endif /* USE_ITT_BUILD */
1542   // Value of 'crit' should be good for using as a critical_id of the critical
1543   // section directive.
1544   __kmp_release_user_lock_with_checks(lck, global_tid);
1545 
1546 #endif // KMP_USE_DYNAMIC_LOCK
1547 
1548 #if OMPT_SUPPORT && OMPT_OPTIONAL
1549   /* OMPT release event triggers after lock is released; place here to trigger
1550    * for all #if branches */
1551   OMPT_STORE_RETURN_ADDRESS(global_tid);
1552   if (ompt_enabled.ompt_callback_mutex_released) {
1553     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1554         ompt_mutex_critical, (omp_wait_id_t)crit, OMPT_LOAD_RETURN_ADDRESS(0));
1555   }
1556 #endif
1557 
1558   KMP_POP_PARTITIONED_TIMER();
1559   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1560 }
1561 
1562 /*!
1563 @ingroup SYNCHRONIZATION
1564 @param loc source location information
1565 @param global_tid thread id.
1566 @return one if the thread should execute the master block, zero otherwise
1567 
1568 Start execution of a combined barrier and master. The barrier is executed inside
1569 this function.
1570 */
1571 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1572   int status;
1573 
1574   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1575 
1576   if (!TCR_4(__kmp_init_parallel))
1577     __kmp_parallel_initialize();
1578 
1579   if (__kmp_env_consistency_check)
1580     __kmp_check_barrier(global_tid, ct_barrier, loc);
1581 
1582 #if OMPT_SUPPORT
1583   omp_frame_t *ompt_frame;
1584   if (ompt_enabled.enabled) {
1585     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1586     if (ompt_frame->enter_frame == NULL)
1587       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1588     OMPT_STORE_RETURN_ADDRESS(global_tid);
1589   }
1590 #endif
1591 #if USE_ITT_NOTIFY
1592   __kmp_threads[global_tid]->th.th_ident = loc;
1593 #endif
1594   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1595 #if OMPT_SUPPORT && OMPT_OPTIONAL
1596   if (ompt_enabled.enabled) {
1597     ompt_frame->enter_frame = NULL;
1598   }
1599 #endif
1600 
1601   return (status != 0) ? 0 : 1;
1602 }
1603 
1604 /*!
1605 @ingroup SYNCHRONIZATION
1606 @param loc source location information
1607 @param global_tid thread id.
1608 
1609 Complete the execution of a combined barrier and master. This function should
1610 only be called at the completion of the <tt>master</tt> code. Other threads will
1611 still be waiting at the barrier and this call releases them.
1612 */
1613 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1614   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1615 
1616   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1617 }
1618 
1619 /*!
1620 @ingroup SYNCHRONIZATION
1621 @param loc source location information
1622 @param global_tid thread id.
1623 @return one if the thread should execute the master block, zero otherwise
1624 
1625 Start execution of a combined barrier and master(nowait) construct.
1626 The barrier is executed inside this function.
1627 There is no equivalent "end" function, since the
1628 */
1629 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1630   kmp_int32 ret;
1631 
1632   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1633 
1634   if (!TCR_4(__kmp_init_parallel))
1635     __kmp_parallel_initialize();
1636 
1637   if (__kmp_env_consistency_check) {
1638     if (loc == 0) {
1639       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1640     }
1641     __kmp_check_barrier(global_tid, ct_barrier, loc);
1642   }
1643 
1644 #if OMPT_SUPPORT
1645   omp_frame_t *ompt_frame;
1646   if (ompt_enabled.enabled) {
1647     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1648     if (ompt_frame->enter_frame == NULL)
1649       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
1650     OMPT_STORE_RETURN_ADDRESS(global_tid);
1651   }
1652 #endif
1653 #if USE_ITT_NOTIFY
1654   __kmp_threads[global_tid]->th.th_ident = loc;
1655 #endif
1656   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1657 #if OMPT_SUPPORT && OMPT_OPTIONAL
1658   if (ompt_enabled.enabled) {
1659     ompt_frame->enter_frame = NULL;
1660   }
1661 #endif
1662 
1663   ret = __kmpc_master(loc, global_tid);
1664 
1665   if (__kmp_env_consistency_check) {
1666     /*  there's no __kmpc_end_master called; so the (stats) */
1667     /*  actions of __kmpc_end_master are done here          */
1668 
1669     if (global_tid < 0) {
1670       KMP_WARNING(ThreadIdentInvalid);
1671     }
1672     if (ret) {
1673       /* only one thread should do the pop since only */
1674       /* one did the push (see __kmpc_master())       */
1675 
1676       __kmp_pop_sync(global_tid, ct_master, loc);
1677     }
1678   }
1679 
1680   return (ret);
1681 }
1682 
1683 /* The BARRIER for a SINGLE process section is always explicit   */
1684 /*!
1685 @ingroup WORK_SHARING
1686 @param loc  source location information
1687 @param global_tid  global thread number
1688 @return One if this thread should execute the single construct, zero otherwise.
1689 
1690 Test whether to execute a <tt>single</tt> construct.
1691 There are no implicit barriers in the two "single" calls, rather the compiler
1692 should introduce an explicit barrier if it is required.
1693 */
1694 
1695 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1696   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1697 
1698   if (rc) {
1699     // We are going to execute the single statement, so we should count it.
1700     KMP_COUNT_BLOCK(OMP_SINGLE);
1701     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1702   }
1703 
1704 #if OMPT_SUPPORT && OMPT_OPTIONAL
1705   kmp_info_t *this_thr = __kmp_threads[global_tid];
1706   kmp_team_t *team = this_thr->th.th_team;
1707   int tid = __kmp_tid_from_gtid(global_tid);
1708 
1709   if (ompt_enabled.enabled) {
1710     if (rc) {
1711       if (ompt_enabled.ompt_callback_work) {
1712         ompt_callbacks.ompt_callback(ompt_callback_work)(
1713             ompt_work_single_executor, ompt_scope_begin,
1714             &(team->t.ompt_team_info.parallel_data),
1715             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1716             1, OMPT_GET_RETURN_ADDRESS(0));
1717       }
1718     } else {
1719       if (ompt_enabled.ompt_callback_work) {
1720         ompt_callbacks.ompt_callback(ompt_callback_work)(
1721             ompt_work_single_other, ompt_scope_begin,
1722             &(team->t.ompt_team_info.parallel_data),
1723             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1724             1, OMPT_GET_RETURN_ADDRESS(0));
1725         ompt_callbacks.ompt_callback(ompt_callback_work)(
1726             ompt_work_single_other, ompt_scope_end,
1727             &(team->t.ompt_team_info.parallel_data),
1728             &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1729             1, OMPT_GET_RETURN_ADDRESS(0));
1730       }
1731     }
1732   }
1733 #endif
1734 
1735   return rc;
1736 }
1737 
1738 /*!
1739 @ingroup WORK_SHARING
1740 @param loc  source location information
1741 @param global_tid  global thread number
1742 
1743 Mark the end of a <tt>single</tt> construct.  This function should
1744 only be called by the thread that executed the block of code protected
1745 by the `single` construct.
1746 */
1747 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1748   __kmp_exit_single(global_tid);
1749   KMP_POP_PARTITIONED_TIMER();
1750 
1751 #if OMPT_SUPPORT && OMPT_OPTIONAL
1752   kmp_info_t *this_thr = __kmp_threads[global_tid];
1753   kmp_team_t *team = this_thr->th.th_team;
1754   int tid = __kmp_tid_from_gtid(global_tid);
1755 
1756   if (ompt_enabled.ompt_callback_work) {
1757     ompt_callbacks.ompt_callback(ompt_callback_work)(
1758         ompt_work_single_executor, ompt_scope_end,
1759         &(team->t.ompt_team_info.parallel_data),
1760         &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1761         OMPT_GET_RETURN_ADDRESS(0));
1762   }
1763 #endif
1764 }
1765 
1766 /*!
1767 @ingroup WORK_SHARING
1768 @param loc Source location
1769 @param global_tid Global thread id
1770 
1771 Mark the end of a statically scheduled loop.
1772 */
1773 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1774   KMP_POP_PARTITIONED_TIMER();
1775   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1776 
1777 #if OMPT_SUPPORT && OMPT_OPTIONAL
1778   if (ompt_enabled.ompt_callback_work) {
1779     ompt_work_type_t ompt_work_type = ompt_work_loop;
1780     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1781     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1782     // Determine workshare type
1783     if (loc != NULL) {
1784       if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1785         ompt_work_type = ompt_work_loop;
1786       } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1787         ompt_work_type = ompt_work_sections;
1788       } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1789         ompt_work_type = ompt_work_distribute;
1790       } else {
1791         // use default set above.
1792         // a warning about this case is provided in __kmpc_for_static_init
1793       }
1794       KMP_DEBUG_ASSERT(ompt_work_type);
1795     }
1796     ompt_callbacks.ompt_callback(ompt_callback_work)(
1797         ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1798         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1799   }
1800 #endif
1801   if (__kmp_env_consistency_check)
1802     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1803 }
1804 
1805 // User routines which take C-style arguments (call by value)
1806 // different from the Fortran equivalent routines
1807 
1808 void ompc_set_num_threads(int arg) {
1809   // !!!!! TODO: check the per-task binding
1810   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1811 }
1812 
1813 void ompc_set_dynamic(int flag) {
1814   kmp_info_t *thread;
1815 
1816   /* For the thread-private implementation of the internal controls */
1817   thread = __kmp_entry_thread();
1818 
1819   __kmp_save_internal_controls(thread);
1820 
1821   set__dynamic(thread, flag ? TRUE : FALSE);
1822 }
1823 
1824 void ompc_set_nested(int flag) {
1825   kmp_info_t *thread;
1826 
1827   /* For the thread-private internal controls implementation */
1828   thread = __kmp_entry_thread();
1829 
1830   __kmp_save_internal_controls(thread);
1831 
1832   set__nested(thread, flag ? TRUE : FALSE);
1833 }
1834 
1835 void ompc_set_max_active_levels(int max_active_levels) {
1836   /* TO DO */
1837   /* we want per-task implementation of this internal control */
1838 
1839   /* For the per-thread internal controls implementation */
1840   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1841 }
1842 
1843 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1844   // !!!!! TODO: check the per-task binding
1845   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1846 }
1847 
1848 int ompc_get_ancestor_thread_num(int level) {
1849   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1850 }
1851 
1852 int ompc_get_team_size(int level) {
1853   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1854 }
1855 
1856 void kmpc_set_stacksize(int arg) {
1857   // __kmp_aux_set_stacksize initializes the library if needed
1858   __kmp_aux_set_stacksize(arg);
1859 }
1860 
1861 void kmpc_set_stacksize_s(size_t arg) {
1862   // __kmp_aux_set_stacksize initializes the library if needed
1863   __kmp_aux_set_stacksize(arg);
1864 }
1865 
1866 void kmpc_set_blocktime(int arg) {
1867   int gtid, tid;
1868   kmp_info_t *thread;
1869 
1870   gtid = __kmp_entry_gtid();
1871   tid = __kmp_tid_from_gtid(gtid);
1872   thread = __kmp_thread_from_gtid(gtid);
1873 
1874   __kmp_aux_set_blocktime(arg, thread, tid);
1875 }
1876 
1877 void kmpc_set_library(int arg) {
1878   // __kmp_user_set_library initializes the library if needed
1879   __kmp_user_set_library((enum library_type)arg);
1880 }
1881 
1882 void kmpc_set_defaults(char const *str) {
1883   // __kmp_aux_set_defaults initializes the library if needed
1884   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1885 }
1886 
1887 void kmpc_set_disp_num_buffers(int arg) {
1888   // ignore after initialization because some teams have already
1889   // allocated dispatch buffers
1890   if (__kmp_init_serial == 0 && arg > 0)
1891     __kmp_dispatch_num_buffers = arg;
1892 }
1893 
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  // Stub build or no affinity support: always report failure.
  return -1;
#else
  // Middle initialization must have run before the affinity helper is used.
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  const int status = __kmp_aux_set_affinity_mask_proc(proc, mask);
  return status;
#endif
}
1904 
int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  // Stub build or no affinity support: always report failure.
  return -1;
#else
  // Middle initialization must have run before the affinity helper is used.
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  const int status = __kmp_aux_unset_affinity_mask_proc(proc, mask);
  return status;
#endif
}
1915 
int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  // Stub build or no affinity support: always report failure.
  return -1;
#else
  // Middle initialization must have run before the affinity helper is used.
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  const int status = __kmp_aux_get_affinity_mask_proc(proc, mask);
  return status;
#endif
}
1926 
1927 /* -------------------------------------------------------------------------- */
1928 /*!
1929 @ingroup THREADPRIVATE
1930 @param loc       source location information
1931 @param gtid      global thread number
1932 @param cpy_size  size of the cpy_data buffer
1933 @param cpy_data  pointer to data to be copied
1934 @param cpy_func  helper function to call for copying data
1935 @param didit     flag variable: 1=single thread; 0=not single thread
1936 
1937 __kmpc_copyprivate implements the interface for the private data broadcast
1938 needed for the copyprivate clause associated with a single region in an
1939 OpenMP<sup>*</sup> program (both C and Fortran).
1940 All threads participating in the parallel region call this routine.
1941 One of the threads (called the single thread) should have the <tt>didit</tt>
1942 variable set to 1 and all other threads should have that variable set to 0.
1943 All threads pass a pointer to a data buffer (cpy_data) that they have built.
1944 
1945 The OpenMP specification forbids the use of nowait on the single region when a
1946 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
1947 barrier internally to avoid race conditions, so the code generation for the
1948 single region should avoid generating a barrier after the call to @ref
1949 __kmpc_copyprivate.
1950 
1951 The <tt>gtid</tt> parameter is the global thread id for the current thread.
1952 The <tt>loc</tt> parameter is a pointer to source location information.
1953 
1954 Internal implementation: The single thread will first copy its descriptor
1955 address (cpy_data) to a team-private location, then the other threads will each
1956 call the function pointed to by the parameter cpy_func, which carries out the
1957 copy by copying the data using the cpy_data buffer.
1958 
1959 The cpy_func routine used for the copy and the contents of the data area defined
1960 by cpy_data and cpy_size may be built in any fashion that will allow the copy
1961 to be done. For instance, the cpy_data buffer can hold the actual data to be
1962 copied or it may hold a list of pointers to the data. The cpy_func routine must
1963 interpret the cpy_data buffer appropriately.
1964 
1965 The interface to cpy_func is as follows:
1966 @code
1967 void cpy_func( void *destination, void *source )
1968 @endcode
1969 where void *destination is the cpy_data pointer for the thread being copied to
1970 and void *source is the cpy_data pointer for the thread being copied from.
1971 */
1972 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
1973                         void *cpy_data, void (*cpy_func)(void *, void *),
1974                         kmp_int32 didit) {
1975   void **data_ptr;
1976 
1977   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
1978 
1979   KMP_MB();
1980 
1981   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
1982 
1983   if (__kmp_env_consistency_check) {
1984     if (loc == 0) {
1985       KMP_WARNING(ConstructIdentInvalid);
1986     }
1987   }
1988 
1989   // ToDo: Optimize the following two barriers into some kind of split barrier
1990 
1991   if (didit)
1992     *data_ptr = cpy_data;
1993 
1994 #if OMPT_SUPPORT
1995   omp_frame_t *ompt_frame;
1996   if (ompt_enabled.enabled) {
1997     __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1998     if (ompt_frame->enter_frame == NULL)
1999       ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
2000     OMPT_STORE_RETURN_ADDRESS(gtid);
2001   }
2002 #endif
2003 /* This barrier is not a barrier region boundary */
2004 #if USE_ITT_NOTIFY
2005   __kmp_threads[gtid]->th.th_ident = loc;
2006 #endif
2007   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2008 
2009   if (!didit)
2010     (*cpy_func)(cpy_data, *data_ptr);
2011 
2012 // Consider next barrier a user-visible barrier for barrier region boundaries
2013 // Nesting checks are already handled by the single construct checks
2014 
2015 #if OMPT_SUPPORT
2016   if (ompt_enabled.enabled) {
2017     OMPT_STORE_RETURN_ADDRESS(gtid);
2018   }
2019 #endif
2020 #if USE_ITT_NOTIFY
2021   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
2022 // tasks can overwrite the location)
2023 #endif
2024   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2025 #if OMPT_SUPPORT && OMPT_OPTIONAL
2026   if (ompt_enabled.enabled) {
2027     ompt_frame->enter_frame = NULL;
2028   }
2029 #endif
2030 }
2031 
2032 /* -------------------------------------------------------------------------- */
2033 
// Short aliases for the "*_with_checks" user-lock routines. In the code
// visible below they are used in the non-dynamic-lock (#else) branches of the
// lock entry points, e.g. INIT_LOCK, INIT_NESTED_LOCK, ACQUIRE_LOCK,
// DESTROY_LOCK and DESTROY_NESTED_LOCK.
#define INIT_LOCK __kmp_init_user_lock_with_checks
#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK_TIMED                                              \
  __kmp_acquire_nested_user_lock_with_checks_timed
#define RELEASE_LOCK __kmp_release_user_lock_with_checks
#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
#define TEST_LOCK __kmp_test_user_lock_with_checks
#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2047 
2048 // TODO: Make check abort messages use location info & pass it into
2049 // with_checks routines
2050 
2051 #if KMP_USE_DYNAMIC_LOCK
2052 
2053 // internal lock initializer
2054 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2055                                                     kmp_dyna_lockseq_t seq) {
2056   if (KMP_IS_D_LOCK(seq)) {
2057     KMP_INIT_D_LOCK(lock, seq);
2058 #if USE_ITT_BUILD
2059     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2060 #endif
2061   } else {
2062     KMP_INIT_I_LOCK(lock, seq);
2063 #if USE_ITT_BUILD
2064     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2065     __kmp_itt_lock_creating(ilk->lock, loc);
2066 #endif
2067   }
2068 }
2069 
2070 // internal nest lock initializer
2071 static __forceinline void
2072 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2073                                kmp_dyna_lockseq_t seq) {
2074 #if KMP_USE_TSX
2075   // Don't have nested lock implementation for speculative locks
2076   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2077     seq = __kmp_user_lock_seq;
2078 #endif
2079   switch (seq) {
2080   case lockseq_tas:
2081     seq = lockseq_nested_tas;
2082     break;
2083 #if KMP_USE_FUTEX
2084   case lockseq_futex:
2085     seq = lockseq_nested_futex;
2086     break;
2087 #endif
2088   case lockseq_ticket:
2089     seq = lockseq_nested_ticket;
2090     break;
2091   case lockseq_queuing:
2092     seq = lockseq_nested_queuing;
2093     break;
2094   case lockseq_drdpa:
2095     seq = lockseq_nested_drdpa;
2096     break;
2097   default:
2098     seq = lockseq_nested_queuing;
2099   }
2100   KMP_INIT_I_LOCK(lock, seq);
2101 #if USE_ITT_BUILD
2102   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2103   __kmp_itt_lock_creating(ilk->lock, loc);
2104 #endif
2105 }
2106 
2107 /* initialize the lock with a hint */
2108 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2109                                 uintptr_t hint) {
2110   KMP_DEBUG_ASSERT(__kmp_init_serial);
2111   if (__kmp_env_consistency_check && user_lock == NULL) {
2112     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2113   }
2114 
2115   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2116 
2117 #if OMPT_SUPPORT && OMPT_OPTIONAL
2118   // This is the case, if called from omp_init_lock_with_hint:
2119   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2120   if (!codeptr)
2121     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2122   if (ompt_enabled.ompt_callback_lock_init) {
2123     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2124         ompt_mutex_lock, (omp_lock_hint_t)hint,
2125         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2126         codeptr);
2127   }
2128 #endif
2129 }
2130 
2131 /* initialize the lock with a hint */
2132 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2133                                      void **user_lock, uintptr_t hint) {
2134   KMP_DEBUG_ASSERT(__kmp_init_serial);
2135   if (__kmp_env_consistency_check && user_lock == NULL) {
2136     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2137   }
2138 
2139   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2140 
2141 #if OMPT_SUPPORT && OMPT_OPTIONAL
2142   // This is the case, if called from omp_init_lock_with_hint:
2143   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2144   if (!codeptr)
2145     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2146   if (ompt_enabled.ompt_callback_lock_init) {
2147     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2148         ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2149         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2150         codeptr);
2151   }
2152 #endif
2153 }
2154 
2155 #endif // KMP_USE_DYNAMIC_LOCK
2156 
2157 /* initialize the lock */
2158 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2159 #if KMP_USE_DYNAMIC_LOCK
2160 
2161   KMP_DEBUG_ASSERT(__kmp_init_serial);
2162   if (__kmp_env_consistency_check && user_lock == NULL) {
2163     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
2164   }
2165   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2166 
2167 #if OMPT_SUPPORT && OMPT_OPTIONAL
2168   // This is the case, if called from omp_init_lock_with_hint:
2169   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2170   if (!codeptr)
2171     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2172   if (ompt_enabled.ompt_callback_lock_init) {
2173     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2174         ompt_mutex_lock, omp_lock_hint_none,
2175         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2176         codeptr);
2177   }
2178 #endif
2179 
2180 #else // KMP_USE_DYNAMIC_LOCK
2181 
2182   static char const *const func = "omp_init_lock";
2183   kmp_user_lock_p lck;
2184   KMP_DEBUG_ASSERT(__kmp_init_serial);
2185 
2186   if (__kmp_env_consistency_check) {
2187     if (user_lock == NULL) {
2188       KMP_FATAL(LockIsUninitialized, func);
2189     }
2190   }
2191 
2192   KMP_CHECK_USER_LOCK_INIT();
2193 
2194   if ((__kmp_user_lock_kind == lk_tas) &&
2195       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2196     lck = (kmp_user_lock_p)user_lock;
2197   }
2198 #if KMP_USE_FUTEX
2199   else if ((__kmp_user_lock_kind == lk_futex) &&
2200            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2201     lck = (kmp_user_lock_p)user_lock;
2202   }
2203 #endif
2204   else {
2205     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2206   }
2207   INIT_LOCK(lck);
2208   __kmp_set_user_lock_location(lck, loc);
2209 
2210 #if OMPT_SUPPORT && OMPT_OPTIONAL
2211   // This is the case, if called from omp_init_lock_with_hint:
2212   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2213   if (!codeptr)
2214     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2215   if (ompt_enabled.ompt_callback_lock_init) {
2216     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2217         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2218         (omp_wait_id_t)user_lock, codeptr);
2219   }
2220 #endif
2221 
2222 #if USE_ITT_BUILD
2223   __kmp_itt_lock_creating(lck);
2224 #endif /* USE_ITT_BUILD */
2225 
2226 #endif // KMP_USE_DYNAMIC_LOCK
2227 } // __kmpc_init_lock
2228 
2229 /* initialize the lock */
2230 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2231 #if KMP_USE_DYNAMIC_LOCK
2232 
2233   KMP_DEBUG_ASSERT(__kmp_init_serial);
2234   if (__kmp_env_consistency_check && user_lock == NULL) {
2235     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
2236   }
2237   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
2238 
2239 #if OMPT_SUPPORT && OMPT_OPTIONAL
2240   // This is the case, if called from omp_init_lock_with_hint:
2241   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2242   if (!codeptr)
2243     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2244   if (ompt_enabled.ompt_callback_lock_init) {
2245     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2246         ompt_mutex_nest_lock, omp_lock_hint_none,
2247         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2248         codeptr);
2249   }
2250 #endif
2251 
2252 #else // KMP_USE_DYNAMIC_LOCK
2253 
2254   static char const *const func = "omp_init_nest_lock";
2255   kmp_user_lock_p lck;
2256   KMP_DEBUG_ASSERT(__kmp_init_serial);
2257 
2258   if (__kmp_env_consistency_check) {
2259     if (user_lock == NULL) {
2260       KMP_FATAL(LockIsUninitialized, func);
2261     }
2262   }
2263 
2264   KMP_CHECK_USER_LOCK_INIT();
2265 
2266   if ((__kmp_user_lock_kind == lk_tas) &&
2267       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2268        OMP_NEST_LOCK_T_SIZE)) {
2269     lck = (kmp_user_lock_p)user_lock;
2270   }
2271 #if KMP_USE_FUTEX
2272   else if ((__kmp_user_lock_kind == lk_futex) &&
2273            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2274             OMP_NEST_LOCK_T_SIZE)) {
2275     lck = (kmp_user_lock_p)user_lock;
2276   }
2277 #endif
2278   else {
2279     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
2280   }
2281 
2282   INIT_NESTED_LOCK(lck);
2283   __kmp_set_user_lock_location(lck, loc);
2284 
2285 #if OMPT_SUPPORT && OMPT_OPTIONAL
2286   // This is the case, if called from omp_init_lock_with_hint:
2287   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2288   if (!codeptr)
2289     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2290   if (ompt_enabled.ompt_callback_lock_init) {
2291     ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2292         ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2293         (omp_wait_id_t)user_lock, codeptr);
2294   }
2295 #endif
2296 
2297 #if USE_ITT_BUILD
2298   __kmp_itt_lock_creating(lck);
2299 #endif /* USE_ITT_BUILD */
2300 
2301 #endif // KMP_USE_DYNAMIC_LOCK
2302 } // __kmpc_init_nest_lock
2303 
2304 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2305 #if KMP_USE_DYNAMIC_LOCK
2306 
2307 #if USE_ITT_BUILD
2308   kmp_user_lock_p lck;
2309   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2310     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2311   } else {
2312     lck = (kmp_user_lock_p)user_lock;
2313   }
2314   __kmp_itt_lock_destroyed(lck);
2315 #endif
2316 #if OMPT_SUPPORT && OMPT_OPTIONAL
2317   // This is the case, if called from omp_init_lock_with_hint:
2318   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2319   if (!codeptr)
2320     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2321   if (ompt_enabled.ompt_callback_lock_destroy) {
2322     kmp_user_lock_p lck;
2323     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2324       lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2325     } else {
2326       lck = (kmp_user_lock_p)user_lock;
2327     }
2328     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2329         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2330   }
2331 #endif
2332   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2333 #else
2334   kmp_user_lock_p lck;
2335 
2336   if ((__kmp_user_lock_kind == lk_tas) &&
2337       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2338     lck = (kmp_user_lock_p)user_lock;
2339   }
2340 #if KMP_USE_FUTEX
2341   else if ((__kmp_user_lock_kind == lk_futex) &&
2342            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2343     lck = (kmp_user_lock_p)user_lock;
2344   }
2345 #endif
2346   else {
2347     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2348   }
2349 
2350 #if OMPT_SUPPORT && OMPT_OPTIONAL
2351   // This is the case, if called from omp_init_lock_with_hint:
2352   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2353   if (!codeptr)
2354     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2355   if (ompt_enabled.ompt_callback_lock_destroy) {
2356     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2357         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2358   }
2359 #endif
2360 
2361 #if USE_ITT_BUILD
2362   __kmp_itt_lock_destroyed(lck);
2363 #endif /* USE_ITT_BUILD */
2364   DESTROY_LOCK(lck);
2365 
2366   if ((__kmp_user_lock_kind == lk_tas) &&
2367       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2368     ;
2369   }
2370 #if KMP_USE_FUTEX
2371   else if ((__kmp_user_lock_kind == lk_futex) &&
2372            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2373     ;
2374   }
2375 #endif
2376   else {
2377     __kmp_user_lock_free(user_lock, gtid, lck);
2378   }
2379 #endif // KMP_USE_DYNAMIC_LOCK
2380 } // __kmpc_destroy_lock
2381 
2382 /* destroy the lock */
2383 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2384 #if KMP_USE_DYNAMIC_LOCK
2385 
2386 #if USE_ITT_BUILD
2387   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2388   __kmp_itt_lock_destroyed(ilk->lock);
2389 #endif
2390 #if OMPT_SUPPORT && OMPT_OPTIONAL
2391   // This is the case, if called from omp_init_lock_with_hint:
2392   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2393   if (!codeptr)
2394     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2395   if (ompt_enabled.ompt_callback_lock_destroy) {
2396     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2397         ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2398   }
2399 #endif
2400   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2401 
2402 #else // KMP_USE_DYNAMIC_LOCK
2403 
2404   kmp_user_lock_p lck;
2405 
2406   if ((__kmp_user_lock_kind == lk_tas) &&
2407       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2408        OMP_NEST_LOCK_T_SIZE)) {
2409     lck = (kmp_user_lock_p)user_lock;
2410   }
2411 #if KMP_USE_FUTEX
2412   else if ((__kmp_user_lock_kind == lk_futex) &&
2413            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2414             OMP_NEST_LOCK_T_SIZE)) {
2415     lck = (kmp_user_lock_p)user_lock;
2416   }
2417 #endif
2418   else {
2419     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2420   }
2421 
2422 #if OMPT_SUPPORT && OMPT_OPTIONAL
2423   // This is the case, if called from omp_init_lock_with_hint:
2424   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2425   if (!codeptr)
2426     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2427   if (ompt_enabled.ompt_callback_lock_destroy) {
2428     ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2429         ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2430   }
2431 #endif
2432 
2433 #if USE_ITT_BUILD
2434   __kmp_itt_lock_destroyed(lck);
2435 #endif /* USE_ITT_BUILD */
2436 
2437   DESTROY_NESTED_LOCK(lck);
2438 
2439   if ((__kmp_user_lock_kind == lk_tas) &&
2440       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2441        OMP_NEST_LOCK_T_SIZE)) {
2442     ;
2443   }
2444 #if KMP_USE_FUTEX
2445   else if ((__kmp_user_lock_kind == lk_futex) &&
2446            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2447             OMP_NEST_LOCK_T_SIZE)) {
2448     ;
2449   }
2450 #endif
2451   else {
2452     __kmp_user_lock_free(user_lock, gtid, lck);
2453   }
2454 #endif // KMP_USE_DYNAMIC_LOCK
2455 } // __kmpc_destroy_nest_lock
2456 
2457 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2458   KMP_COUNT_BLOCK(OMP_set_lock);
2459 #if KMP_USE_DYNAMIC_LOCK
2460   int tag = KMP_EXTRACT_D_TAG(user_lock);
2461 #if USE_ITT_BUILD
2462   __kmp_itt_lock_acquiring(
2463       (kmp_user_lock_p)
2464           user_lock); // itt function will get to the right lock object.
2465 #endif
2466 #if OMPT_SUPPORT && OMPT_OPTIONAL
2467   // This is the case, if called from omp_init_lock_with_hint:
2468   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2469   if (!codeptr)
2470     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2471   if (ompt_enabled.ompt_callback_mutex_acquire) {
2472     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2473         ompt_mutex_lock, omp_lock_hint_none,
2474         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2475         codeptr);
2476   }
2477 #endif
2478 #if KMP_USE_INLINED_TAS
2479   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2480     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2481   } else
2482 #elif KMP_USE_INLINED_FUTEX
2483   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2484     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2485   } else
2486 #endif
2487   {
2488     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2489   }
2490 #if USE_ITT_BUILD
2491   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2492 #endif
2493 #if OMPT_SUPPORT && OMPT_OPTIONAL
2494   if (ompt_enabled.ompt_callback_mutex_acquired) {
2495     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2496         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2497   }
2498 #endif
2499 
2500 #else // KMP_USE_DYNAMIC_LOCK
2501 
2502   kmp_user_lock_p lck;
2503 
2504   if ((__kmp_user_lock_kind == lk_tas) &&
2505       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2506     lck = (kmp_user_lock_p)user_lock;
2507   }
2508 #if KMP_USE_FUTEX
2509   else if ((__kmp_user_lock_kind == lk_futex) &&
2510            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2511     lck = (kmp_user_lock_p)user_lock;
2512   }
2513 #endif
2514   else {
2515     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2516   }
2517 
2518 #if USE_ITT_BUILD
2519   __kmp_itt_lock_acquiring(lck);
2520 #endif /* USE_ITT_BUILD */
2521 #if OMPT_SUPPORT && OMPT_OPTIONAL
2522   // This is the case, if called from omp_init_lock_with_hint:
2523   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2524   if (!codeptr)
2525     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2526   if (ompt_enabled.ompt_callback_mutex_acquire) {
2527     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2528         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2529         (omp_wait_id_t)lck, codeptr);
2530   }
2531 #endif
2532 
2533   ACQUIRE_LOCK(lck, gtid);
2534 
2535 #if USE_ITT_BUILD
2536   __kmp_itt_lock_acquired(lck);
2537 #endif /* USE_ITT_BUILD */
2538 
2539 #if OMPT_SUPPORT && OMPT_OPTIONAL
2540   if (ompt_enabled.ompt_callback_mutex_acquired) {
2541     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2542         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2543   }
2544 #endif
2545 
2546 #endif // KMP_USE_DYNAMIC_LOCK
2547 }
2548 
2549 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2550 #if KMP_USE_DYNAMIC_LOCK
2551 
2552 #if USE_ITT_BUILD
2553   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2554 #endif
2555 #if OMPT_SUPPORT && OMPT_OPTIONAL
2556   // This is the case, if called from omp_init_lock_with_hint:
2557   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2558   if (!codeptr)
2559     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2560   if (ompt_enabled.enabled) {
2561     if (ompt_enabled.ompt_callback_mutex_acquire) {
2562       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2563           ompt_mutex_nest_lock, omp_lock_hint_none,
2564           __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2565           codeptr);
2566     }
2567   }
2568 #endif
2569   int acquire_status =
2570       KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2571 #if USE_ITT_BUILD
2572   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2573 #endif
2574 
2575 #if OMPT_SUPPORT && OMPT_OPTIONAL
2576   if (ompt_enabled.enabled) {
2577     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2578       if (ompt_enabled.ompt_callback_mutex_acquired) {
2579         // lock_first
2580         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2581             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2582       }
2583     } else {
2584       if (ompt_enabled.ompt_callback_nest_lock) {
2585         // lock_next
2586         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2587             ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
2588       }
2589     }
2590   }
2591 #endif
2592 
2593 #else // KMP_USE_DYNAMIC_LOCK
2594   int acquire_status;
2595   kmp_user_lock_p lck;
2596 
2597   if ((__kmp_user_lock_kind == lk_tas) &&
2598       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2599        OMP_NEST_LOCK_T_SIZE)) {
2600     lck = (kmp_user_lock_p)user_lock;
2601   }
2602 #if KMP_USE_FUTEX
2603   else if ((__kmp_user_lock_kind == lk_futex) &&
2604            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2605             OMP_NEST_LOCK_T_SIZE)) {
2606     lck = (kmp_user_lock_p)user_lock;
2607   }
2608 #endif
2609   else {
2610     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2611   }
2612 
2613 #if USE_ITT_BUILD
2614   __kmp_itt_lock_acquiring(lck);
2615 #endif /* USE_ITT_BUILD */
2616 #if OMPT_SUPPORT && OMPT_OPTIONAL
2617   // This is the case, if called from omp_init_lock_with_hint:
2618   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2619   if (!codeptr)
2620     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2621   if (ompt_enabled.enabled) {
2622     if (ompt_enabled.ompt_callback_mutex_acquire) {
2623       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2624           ompt_mutex_nest_lock, omp_lock_hint_none,
2625           __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
2626     }
2627   }
2628 #endif
2629 
2630   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2631 
2632 #if USE_ITT_BUILD
2633   __kmp_itt_lock_acquired(lck);
2634 #endif /* USE_ITT_BUILD */
2635 
2636 #if OMPT_SUPPORT && OMPT_OPTIONAL
2637   if (ompt_enabled.enabled) {
2638     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2639       if (ompt_enabled.ompt_callback_mutex_acquired) {
2640         // lock_first
2641         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2642             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2643       }
2644     } else {
2645       if (ompt_enabled.ompt_callback_nest_lock) {
2646         // lock_next
2647         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2648             ompt_scope_begin, (omp_wait_id_t)lck, codeptr);
2649       }
2650     }
2651   }
2652 #endif
2653 
2654 #endif // KMP_USE_DYNAMIC_LOCK
2655 }
2656 
2657 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2658 #if KMP_USE_DYNAMIC_LOCK
2659 
2660   int tag = KMP_EXTRACT_D_TAG(user_lock);
2661 #if USE_ITT_BUILD
2662   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2663 #endif
2664 #if KMP_USE_INLINED_TAS
2665   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2666     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2667   } else
2668 #elif KMP_USE_INLINED_FUTEX
2669   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2670     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2671   } else
2672 #endif
2673   {
2674     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2675   }
2676 
2677 #if OMPT_SUPPORT && OMPT_OPTIONAL
2678   // This is the case, if called from omp_init_lock_with_hint:
2679   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2680   if (!codeptr)
2681     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2682   if (ompt_enabled.ompt_callback_mutex_released) {
2683     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2684         ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2685   }
2686 #endif
2687 
2688 #else // KMP_USE_DYNAMIC_LOCK
2689 
2690   kmp_user_lock_p lck;
2691 
2692   /* Can't use serial interval since not block structured */
2693   /* release the lock */
2694 
2695   if ((__kmp_user_lock_kind == lk_tas) &&
2696       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2697 #if KMP_OS_LINUX &&                                                            \
2698     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2699 // "fast" path implemented to fix customer performance issue
2700 #if USE_ITT_BUILD
2701     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2702 #endif /* USE_ITT_BUILD */
2703     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2704     KMP_MB();
2705 
2706 #if OMPT_SUPPORT && OMPT_OPTIONAL
2707     // This is the case, if called from omp_init_lock_with_hint:
2708     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2709     if (!codeptr)
2710       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2711     if (ompt_enabled.ompt_callback_mutex_released) {
2712       ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2713           ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2714     }
2715 #endif
2716 
2717     return;
2718 #else
2719     lck = (kmp_user_lock_p)user_lock;
2720 #endif
2721   }
2722 #if KMP_USE_FUTEX
2723   else if ((__kmp_user_lock_kind == lk_futex) &&
2724            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2725     lck = (kmp_user_lock_p)user_lock;
2726   }
2727 #endif
2728   else {
2729     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2730   }
2731 
2732 #if USE_ITT_BUILD
2733   __kmp_itt_lock_releasing(lck);
2734 #endif /* USE_ITT_BUILD */
2735 
2736   RELEASE_LOCK(lck, gtid);
2737 
2738 #if OMPT_SUPPORT && OMPT_OPTIONAL
2739   // This is the case, if called from omp_init_lock_with_hint:
2740   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2741   if (!codeptr)
2742     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2743   if (ompt_enabled.ompt_callback_mutex_released) {
2744     ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2745         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2746   }
2747 #endif
2748 
2749 #endif // KMP_USE_DYNAMIC_LOCK
2750 }
2751 
2752 /* release the lock */
2753 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2754 #if KMP_USE_DYNAMIC_LOCK
2755 
2756 #if USE_ITT_BUILD
2757   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2758 #endif
2759   int release_status =
2760       KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2761 
2762 #if OMPT_SUPPORT && OMPT_OPTIONAL
2763   // This is the case, if called from omp_init_lock_with_hint:
2764   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2765   if (!codeptr)
2766     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2767   if (ompt_enabled.enabled) {
2768     if (release_status == KMP_LOCK_RELEASED) {
2769       if (ompt_enabled.ompt_callback_mutex_released) {
2770         // release_lock_last
2771         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2772             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
2773       }
2774     } else if (ompt_enabled.ompt_callback_nest_lock) {
2775       // release_lock_prev
2776       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2777           ompt_scope_end, (omp_wait_id_t)user_lock, codeptr);
2778     }
2779   }
2780 #endif
2781 
2782 #else // KMP_USE_DYNAMIC_LOCK
2783 
2784   kmp_user_lock_p lck;
2785 
2786   /* Can't use serial interval since not block structured */
2787 
2788   if ((__kmp_user_lock_kind == lk_tas) &&
2789       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2790        OMP_NEST_LOCK_T_SIZE)) {
2791 #if KMP_OS_LINUX &&                                                            \
2792     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2793     // "fast" path implemented to fix customer performance issue
2794     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2795 #if USE_ITT_BUILD
2796     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2797 #endif /* USE_ITT_BUILD */
2798 
2799 #if OMPT_SUPPORT && OMPT_OPTIONAL
2800     int release_status = KMP_LOCK_STILL_HELD;
2801 #endif
2802 
2803     if (--(tl->lk.depth_locked) == 0) {
2804       TCW_4(tl->lk.poll, 0);
2805 #if OMPT_SUPPORT && OMPT_OPTIONAL
2806       release_status = KMP_LOCK_RELEASED;
2807 #endif
2808     }
2809     KMP_MB();
2810 
2811 #if OMPT_SUPPORT && OMPT_OPTIONAL
2812     // This is the case, if called from omp_init_lock_with_hint:
2813     void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2814     if (!codeptr)
2815       codeptr = OMPT_GET_RETURN_ADDRESS(0);
2816     if (ompt_enabled.enabled) {
2817       if (release_status == KMP_LOCK_RELEASED) {
2818         if (ompt_enabled.ompt_callback_mutex_released) {
2819           // release_lock_last
2820           ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2821               ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2822         }
2823       } else if (ompt_enabled.ompt_callback_nest_lock) {
2824         // release_lock_previous
2825         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2826             ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr);
2827       }
2828     }
2829 #endif
2830 
2831     return;
2832 #else
2833     lck = (kmp_user_lock_p)user_lock;
2834 #endif
2835   }
2836 #if KMP_USE_FUTEX
2837   else if ((__kmp_user_lock_kind == lk_futex) &&
2838            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2839             OMP_NEST_LOCK_T_SIZE)) {
2840     lck = (kmp_user_lock_p)user_lock;
2841   }
2842 #endif
2843   else {
2844     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2845   }
2846 
2847 #if USE_ITT_BUILD
2848   __kmp_itt_lock_releasing(lck);
2849 #endif /* USE_ITT_BUILD */
2850 
2851   int release_status;
2852   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2853 #if OMPT_SUPPORT && OMPT_OPTIONAL
2854   // This is the case, if called from omp_init_lock_with_hint:
2855   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2856   if (!codeptr)
2857     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2858   if (ompt_enabled.enabled) {
2859     if (release_status == KMP_LOCK_RELEASED) {
2860       if (ompt_enabled.ompt_callback_mutex_released) {
2861         // release_lock_last
2862         ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2863             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
2864       }
2865     } else if (ompt_enabled.ompt_callback_nest_lock) {
2866       // release_lock_previous
2867       ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2868           ompt_mutex_scope_end, (omp_wait_id_t)lck, codeptr);
2869     }
2870   }
2871 #endif
2872 
2873 #endif // KMP_USE_DYNAMIC_LOCK
2874 }
2875 
2876 /* try to acquire the lock */
2877 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2878   KMP_COUNT_BLOCK(OMP_test_lock);
2879 
2880 #if KMP_USE_DYNAMIC_LOCK
2881   int rc;
2882   int tag = KMP_EXTRACT_D_TAG(user_lock);
2883 #if USE_ITT_BUILD
2884   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2885 #endif
2886 #if OMPT_SUPPORT && OMPT_OPTIONAL
2887   // This is the case, if called from omp_init_lock_with_hint:
2888   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2889   if (!codeptr)
2890     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2891   if (ompt_enabled.ompt_callback_mutex_acquire) {
2892     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2893         ompt_mutex_lock, omp_lock_hint_none,
2894         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
2895         codeptr);
2896   }
2897 #endif
2898 #if KMP_USE_INLINED_TAS
2899   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2900     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2901   } else
2902 #elif KMP_USE_INLINED_FUTEX
2903   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2904     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2905   } else
2906 #endif
2907   {
2908     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2909   }
2910   if (rc) {
2911 #if USE_ITT_BUILD
2912     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2913 #endif
2914 #if OMPT_SUPPORT && OMPT_OPTIONAL
2915     if (ompt_enabled.ompt_callback_mutex_acquired) {
2916       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2917           ompt_mutex_lock, (omp_wait_id_t)user_lock, codeptr);
2918     }
2919 #endif
2920     return FTN_TRUE;
2921   } else {
2922 #if USE_ITT_BUILD
2923     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2924 #endif
2925     return FTN_FALSE;
2926   }
2927 
2928 #else // KMP_USE_DYNAMIC_LOCK
2929 
2930   kmp_user_lock_p lck;
2931   int rc;
2932 
2933   if ((__kmp_user_lock_kind == lk_tas) &&
2934       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2935     lck = (kmp_user_lock_p)user_lock;
2936   }
2937 #if KMP_USE_FUTEX
2938   else if ((__kmp_user_lock_kind == lk_futex) &&
2939            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2940     lck = (kmp_user_lock_p)user_lock;
2941   }
2942 #endif
2943   else {
2944     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
2945   }
2946 
2947 #if USE_ITT_BUILD
2948   __kmp_itt_lock_acquiring(lck);
2949 #endif /* USE_ITT_BUILD */
2950 #if OMPT_SUPPORT && OMPT_OPTIONAL
2951   // This is the case, if called from omp_init_lock_with_hint:
2952   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2953   if (!codeptr)
2954     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2955   if (ompt_enabled.ompt_callback_mutex_acquire) {
2956     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2957         ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
2958         (omp_wait_id_t)lck, codeptr);
2959   }
2960 #endif
2961 
2962   rc = TEST_LOCK(lck, gtid);
2963 #if USE_ITT_BUILD
2964   if (rc) {
2965     __kmp_itt_lock_acquired(lck);
2966   } else {
2967     __kmp_itt_lock_cancelled(lck);
2968   }
2969 #endif /* USE_ITT_BUILD */
2970 #if OMPT_SUPPORT && OMPT_OPTIONAL
2971   if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
2972     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
2973         ompt_mutex_lock, (omp_wait_id_t)lck, codeptr);
2974   }
2975 #endif
2976 
2977   return (rc ? FTN_TRUE : FTN_FALSE);
2978 
2979 /* Can't use serial interval since not block structured */
2980 
2981 #endif // KMP_USE_DYNAMIC_LOCK
2982 }
2983 
2984 /* try to acquire the lock */
2985 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2986 #if KMP_USE_DYNAMIC_LOCK
2987   int rc;
2988 #if USE_ITT_BUILD
2989   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2990 #endif
2991 #if OMPT_SUPPORT && OMPT_OPTIONAL
2992   // This is the case, if called from omp_init_lock_with_hint:
2993   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2994   if (!codeptr)
2995     codeptr = OMPT_GET_RETURN_ADDRESS(0);
2996   if (ompt_enabled.ompt_callback_mutex_acquire) {
2997     ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2998         ompt_mutex_nest_lock, omp_lock_hint_none,
2999         __ompt_get_mutex_impl_type(user_lock), (omp_wait_id_t)user_lock,
3000         codeptr);
3001   }
3002 #endif
3003   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3004 #if USE_ITT_BUILD
3005   if (rc) {
3006     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3007   } else {
3008     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3009   }
3010 #endif
3011 #if OMPT_SUPPORT && OMPT_OPTIONAL
3012   if (ompt_enabled.enabled && rc) {
3013     if (rc == 1) {
3014       if (ompt_enabled.ompt_callback_mutex_acquired) {
3015         // lock_first
3016         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3017             ompt_mutex_nest_lock, (omp_wait_id_t)user_lock, codeptr);
3018       }
3019     } else {
3020       if (ompt_enabled.ompt_callback_nest_lock) {
3021         // lock_next
3022         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3023             ompt_scope_begin, (omp_wait_id_t)user_lock, codeptr);
3024       }
3025     }
3026   }
3027 #endif
3028   return rc;
3029 
3030 #else // KMP_USE_DYNAMIC_LOCK
3031 
3032   kmp_user_lock_p lck;
3033   int rc;
3034 
3035   if ((__kmp_user_lock_kind == lk_tas) &&
3036       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3037        OMP_NEST_LOCK_T_SIZE)) {
3038     lck = (kmp_user_lock_p)user_lock;
3039   }
3040 #if KMP_USE_FUTEX
3041   else if ((__kmp_user_lock_kind == lk_futex) &&
3042            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3043             OMP_NEST_LOCK_T_SIZE)) {
3044     lck = (kmp_user_lock_p)user_lock;
3045   }
3046 #endif
3047   else {
3048     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3049   }
3050 
3051 #if USE_ITT_BUILD
3052   __kmp_itt_lock_acquiring(lck);
3053 #endif /* USE_ITT_BUILD */
3054 
3055 #if OMPT_SUPPORT && OMPT_OPTIONAL
3056   // This is the case, if called from omp_init_lock_with_hint:
3057   void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3058   if (!codeptr)
3059     codeptr = OMPT_GET_RETURN_ADDRESS(0);
3060   if (ompt_enabled.enabled) &&
3061         ompt_enabled.ompt_callback_mutex_acquire) {
3062       ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3063           ompt_mutex_nest_lock, omp_lock_hint_none,
3064           __ompt_get_mutex_impl_type(), (omp_wait_id_t)lck, codeptr);
3065     }
3066 #endif
3067 
3068   rc = TEST_NESTED_LOCK(lck, gtid);
3069 #if USE_ITT_BUILD
3070   if (rc) {
3071     __kmp_itt_lock_acquired(lck);
3072   } else {
3073     __kmp_itt_lock_cancelled(lck);
3074   }
3075 #endif /* USE_ITT_BUILD */
3076 #if OMPT_SUPPORT && OMPT_OPTIONAL
3077   if (ompt_enabled.enabled && rc) {
3078     if (rc == 1) {
3079       if (ompt_enabled.ompt_callback_mutex_acquired) {
3080         // lock_first
3081         ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3082             ompt_mutex_nest_lock, (omp_wait_id_t)lck, codeptr);
3083       }
3084     } else {
3085       if (ompt_enabled.ompt_callback_nest_lock) {
3086         // lock_next
3087         ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3088             ompt_mutex_scope_begin, (omp_wait_id_t)lck, codeptr);
3089       }
3090     }
3091   }
3092 #endif
3093   return rc;
3094 
3095 /* Can't use serial interval since not block structured */
3096 
3097 #endif // KMP_USE_DYNAMIC_LOCK
3098 }
3099 
// Interface to the fast, scalable reduction routines.

// The reduction method selected by a __kmpc_reduce* call is kept in a
// per-thread slot (th_local.packed_reduction_method) so that it can be read
// back later by the __kmpc_end_reduce* functions.
// An alternative would be to re-determine the method a second time inside the
// __kmpc_end_reduce* functions (that would require a new prototype).
// AT: which solution is better?

// Store `rmethod` as the packed reduction method of thread `gtid`.
#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))

// Read the packed reduction method currently stored for thread `gtid`.
#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)

// For a description of the packed_reduction_method variable, see the macros
// in kmp.h.
3115 
3116 // used in a critical section reduce block
3117 static __forceinline void
3118 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3119                                           kmp_critical_name *crit) {
3120 
3121   // this lock was visible to a customer and to the threading profile tool as a
3122   // serial overhead span (although it's used for an internal purpose only)
3123   //            why was it visible in previous implementation?
3124   //            should we keep it visible in new reduce block?
3125   kmp_user_lock_p lck;
3126 
3127 #if KMP_USE_DYNAMIC_LOCK
3128 
3129   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3130   // Check if it is initialized.
3131   if (*lk == 0) {
3132     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3133       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3134                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
3135     } else {
3136       __kmp_init_indirect_csptr(crit, loc, global_tid,
3137                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
3138     }
3139   }
3140   // Branch for accessing the actual lock object and set operation. This
3141   // branching is inevitable since this lock initialization does not follow the
3142   // normal dispatch path (lock table is not used).
3143   if (KMP_EXTRACT_D_TAG(lk) != 0) {
3144     lck = (kmp_user_lock_p)lk;
3145     KMP_DEBUG_ASSERT(lck != NULL);
3146     if (__kmp_env_consistency_check) {
3147       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3148     }
3149     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3150   } else {
3151     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3152     lck = ilk->lock;
3153     KMP_DEBUG_ASSERT(lck != NULL);
3154     if (__kmp_env_consistency_check) {
3155       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3156     }
3157     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3158   }
3159 
3160 #else // KMP_USE_DYNAMIC_LOCK
3161 
3162   // We know that the fast reduction code is only emitted by Intel compilers
3163   // with 32 byte critical sections. If there isn't enough space, then we
3164   // have to use a pointer.
3165   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3166     lck = (kmp_user_lock_p)crit;
3167   } else {
3168     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3169   }
3170   KMP_DEBUG_ASSERT(lck != NULL);
3171 
3172   if (__kmp_env_consistency_check)
3173     __kmp_push_sync(global_tid, ct_critical, loc, lck);
3174 
3175   __kmp_acquire_user_lock_with_checks(lck, global_tid);
3176 
3177 #endif // KMP_USE_DYNAMIC_LOCK
3178 }
3179 
3180 // used in a critical section reduce block
3181 static __forceinline void
3182 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3183                                         kmp_critical_name *crit) {
3184 
3185   kmp_user_lock_p lck;
3186 
3187 #if KMP_USE_DYNAMIC_LOCK
3188 
3189   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3190     lck = (kmp_user_lock_p)crit;
3191     if (__kmp_env_consistency_check)
3192       __kmp_pop_sync(global_tid, ct_critical, loc);
3193     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3194   } else {
3195     kmp_indirect_lock_t *ilk =
3196         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3197     if (__kmp_env_consistency_check)
3198       __kmp_pop_sync(global_tid, ct_critical, loc);
3199     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3200   }
3201 
3202 #else // KMP_USE_DYNAMIC_LOCK
3203 
3204   // We know that the fast reduction code is only emitted by Intel compilers
3205   // with 32 byte critical sections. If there isn't enough space, then we have
3206   // to use a pointer.
3207   if (__kmp_base_user_lock_size > 32) {
3208     lck = *((kmp_user_lock_p *)crit);
3209     KMP_ASSERT(lck != NULL);
3210   } else {
3211     lck = (kmp_user_lock_p)crit;
3212   }
3213 
3214   if (__kmp_env_consistency_check)
3215     __kmp_pop_sync(global_tid, ct_critical, loc);
3216 
3217   __kmp_release_user_lock_with_checks(lck, global_tid);
3218 
3219 #endif // KMP_USE_DYNAMIC_LOCK
3220 } // __kmp_end_critical_section_reduce_block
3221 
3222 #if OMP_40_ENABLED
3223 static __forceinline int
3224 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3225                                      int *task_state) {
3226   kmp_team_t *team;
3227 
3228   // Check if we are inside the teams construct?
3229   if (th->th.th_teams_microtask) {
3230     *team_p = team = th->th.th_team;
3231     if (team->t.t_level == th->th.th_teams_level) {
3232       // This is reduction at teams construct.
3233       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3234       // Let's swap teams temporarily for the reduction.
3235       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3236       th->th.th_team = team->t.t_parent;
3237       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3238       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3239       *task_state = th->th.th_task_state;
3240       th->th.th_task_state = 0;
3241 
3242       return 1;
3243     }
3244   }
3245   return 0;
3246 }
3247 
// Undoes the temporary team swap: points `th` back at `team` and restores the
// saved `task_state`. `team` and `task_state` are the values that
// __kmp_swap_teams_for_teams_reduction handed back through its out-parameters.
static __forceinline void
__kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
  // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
  // The swap routine asserts tid == 0 before swapping, so tid is reset to 0.
  th->th.th_info.ds.ds_tid = 0;
  th->th.th_team = team;
  th->th.th_team_nproc = team->t.t_nproc;
  // Re-select the task team slot that matches the restored task state.
  th->th.th_task_team = team->t.t_task_team[task_state];
  th->th.th_task_state = task_state;
}
3257 #endif
3258 
3259 /* 2.a.i. Reduce Block without a terminating barrier */
3260 /*!
3261 @ingroup SYNCHRONIZATION
3262 @param loc source location information
3263 @param global_tid global thread number
3264 @param num_vars number of items (variables) to be reduced
3265 @param reduce_size size of data in bytes to be reduced
3266 @param reduce_data pointer to data to be reduced
3267 @param reduce_func callback function providing reduction operation on two
3268 operands and returning result of reduction in lhs_data
3269 @param lck pointer to the unique lock data structure
3270 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3271 threads if atomic reduction needed
3272 
3273 The nowait version is used for a reduce clause with the nowait argument.
3274 */
3275 kmp_int32
3276 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3277                      size_t reduce_size, void *reduce_data,
3278                      void (*reduce_func)(void *lhs_data, void *rhs_data),
3279                      kmp_critical_name *lck) {
3280 
3281   KMP_COUNT_BLOCK(REDUCE_nowait);
3282   int retval = 0;
3283   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3284 #if OMP_40_ENABLED
3285   kmp_info_t *th;
3286   kmp_team_t *team;
3287   int teams_swapped = 0, task_state;
3288 #endif
3289   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
3290 
3291   // why do we need this initialization here at all?
3292   // Reduction clause can not be used as a stand-alone directive.
3293 
3294   // do not call __kmp_serial_initialize(), it will be called by
3295   // __kmp_parallel_initialize() if needed
3296   // possible detection of false-positive race by the threadchecker ???
3297   if (!TCR_4(__kmp_init_parallel))
3298     __kmp_parallel_initialize();
3299 
3300 // check correctness of reduce block nesting
3301 #if KMP_USE_DYNAMIC_LOCK
3302   if (__kmp_env_consistency_check)
3303     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3304 #else
3305   if (__kmp_env_consistency_check)
3306     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3307 #endif
3308 
3309 #if OMP_40_ENABLED
3310   th = __kmp_thread_from_gtid(global_tid);
3311   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3312 #endif // OMP_40_ENABLED
3313 
3314   // packed_reduction_method value will be reused by __kmp_end_reduce* function,
3315   // the value should be kept in a variable
3316   // the variable should be either a construct-specific or thread-specific
3317   // property, not a team specific property
3318   //     (a thread can reach the next reduce block on the next construct, reduce
3319   //     method may differ on the next construct)
3320   // an ident_t "loc" parameter could be used as a construct-specific property
3321   // (what if loc == 0?)
3322   //     (if both construct-specific and team-specific variables were shared,
3323   //     then unness extra syncs should be needed)
3324   // a thread-specific variable is better regarding two issues above (next
3325   // construct and extra syncs)
3326   // a thread-specific "th_local.reduction_method" variable is used currently
3327   // each thread executes 'determine' and 'set' lines (no need to execute by one
3328   // thread, to avoid unness extra syncs)
3329 
3330   packed_reduction_method = __kmp_determine_reduction_method(
3331       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3332   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3333 
3334   if (packed_reduction_method == critical_reduce_block) {
3335 
3336     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3337     retval = 1;
3338 
3339   } else if (packed_reduction_method == empty_reduce_block) {
3340 
3341     // usage: if team size == 1, no synchronization is required ( Intel
3342     // platforms only )
3343     retval = 1;
3344 
3345   } else if (packed_reduction_method == atomic_reduce_block) {
3346 
3347     retval = 2;
3348 
3349     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
3350     // won't be called by the code gen)
3351     //     (it's not quite good, because the checking block has been closed by
3352     //     this 'pop',
3353     //      but atomic operation has not been executed yet, will be executed
3354     //      slightly later, literally on next instruction)
3355     if (__kmp_env_consistency_check)
3356       __kmp_pop_sync(global_tid, ct_reduce, loc);
3357 
3358   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3359                                    tree_reduce_block)) {
3360 
3361 // AT: performance issue: a real barrier here
3362 // AT:     (if master goes slow, other threads are blocked here waiting for the
3363 // master to come and release them)
3364 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
3365 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
3366 // be confusing to a customer)
3367 // AT: another implementation of *barrier_gather*nowait() (or some other design)
3368 // might go faster and be more in line with sense of NOWAIT
3369 // AT: TO DO: do epcc test and compare times
3370 
3371 // this barrier should be invisible to a customer and to the threading profile
3372 // tool (it's neither a terminating barrier nor customer's code, it's
3373 // used for an internal purpose)
3374 #if OMPT_SUPPORT
3375     // JP: can this barrier potentially leed to task scheduling?
3376     // JP: as long as there is a barrier in the implementation, OMPT should and
3377     // will provide the barrier events
3378     //         so we set-up the necessary frame/return addresses.
3379     omp_frame_t *ompt_frame;
3380     if (ompt_enabled.enabled) {
3381       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3382       if (ompt_frame->enter_frame == NULL)
3383         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3384       OMPT_STORE_RETURN_ADDRESS(global_tid);
3385     }
3386 #endif
3387 #if USE_ITT_NOTIFY
3388     __kmp_threads[global_tid]->th.th_ident = loc;
3389 #endif
3390     retval =
3391         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3392                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
3393     retval = (retval != 0) ? (0) : (1);
3394 #if OMPT_SUPPORT && OMPT_OPTIONAL
3395     if (ompt_enabled.enabled) {
3396       ompt_frame->enter_frame = NULL;
3397     }
3398 #endif
3399 
3400     // all other workers except master should do this pop here
3401     //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
3402     if (__kmp_env_consistency_check) {
3403       if (retval == 0) {
3404         __kmp_pop_sync(global_tid, ct_reduce, loc);
3405       }
3406     }
3407 
3408   } else {
3409 
3410     // should never reach this block
3411     KMP_ASSERT(0); // "unexpected method"
3412   }
3413 #if OMP_40_ENABLED
3414   if (teams_swapped) {
3415     __kmp_restore_swapped_teams(th, team, task_state);
3416   }
3417 #endif
3418   KA_TRACE(
3419       10,
3420       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
3421        global_tid, packed_reduction_method, retval));
3422 
3423   return retval;
3424 }
3425 
3426 /*!
3427 @ingroup SYNCHRONIZATION
3428 @param loc source location information
3429 @param global_tid global thread id.
3430 @param lck pointer to the unique lock data structure
3431 
3432 Finish the execution of a reduce nowait.
3433 */
3434 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3435                               kmp_critical_name *lck) {
3436 
3437   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3438 
3439   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3440 
3441   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3442 
3443   if (packed_reduction_method == critical_reduce_block) {
3444 
3445     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3446 
3447   } else if (packed_reduction_method == empty_reduce_block) {
3448 
3449     // usage: if team size == 1, no synchronization is required ( on Intel
3450     // platforms only )
3451 
3452   } else if (packed_reduction_method == atomic_reduce_block) {
3453 
3454     // neither master nor other workers should get here
3455     //     (code gen does not generate this call in case 2: atomic reduce block)
3456     // actually it's better to remove this elseif at all;
3457     // after removal this value will checked by the 'else' and will assert
3458 
3459   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3460                                    tree_reduce_block)) {
3461 
3462     // only master gets here
3463 
3464   } else {
3465 
3466     // should never reach this block
3467     KMP_ASSERT(0); // "unexpected method"
3468   }
3469 
3470   if (__kmp_env_consistency_check)
3471     __kmp_pop_sync(global_tid, ct_reduce, loc);
3472 
3473   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3474                 global_tid, packed_reduction_method));
3475 
3476   return;
3477 }
3478 
3479 /* 2.a.ii. Reduce Block with a terminating barrier */
3480 
3481 /*!
3482 @ingroup SYNCHRONIZATION
3483 @param loc source location information
3484 @param global_tid global thread number
3485 @param num_vars number of items (variables) to be reduced
3486 @param reduce_size size of data in bytes to be reduced
3487 @param reduce_data pointer to data to be reduced
3488 @param reduce_func callback function providing reduction operation on two
3489 operands and returning result of reduction in lhs_data
3490 @param lck pointer to the unique lock data structure
3491 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3492 threads if atomic reduction needed
3493 
3494 A blocking reduce that includes an implicit barrier.
3495 */
3496 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3497                         size_t reduce_size, void *reduce_data,
3498                         void (*reduce_func)(void *lhs_data, void *rhs_data),
3499                         kmp_critical_name *lck) {
3500   KMP_COUNT_BLOCK(REDUCE_wait);
3501   int retval = 0;
3502   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3503 #if OMP_40_ENABLED
3504   kmp_info_t *th;
3505   kmp_team_t *team;
3506   int teams_swapped = 0, task_state;
3507 #endif
3508 
3509   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3510 
3511   // why do we need this initialization here at all?
3512   // Reduction clause can not be a stand-alone directive.
3513 
3514   // do not call __kmp_serial_initialize(), it will be called by
3515   // __kmp_parallel_initialize() if needed
3516   // possible detection of false-positive race by the threadchecker ???
3517   if (!TCR_4(__kmp_init_parallel))
3518     __kmp_parallel_initialize();
3519 
3520 // check correctness of reduce block nesting
3521 #if KMP_USE_DYNAMIC_LOCK
3522   if (__kmp_env_consistency_check)
3523     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3524 #else
3525   if (__kmp_env_consistency_check)
3526     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3527 #endif
3528 
3529 #if OMP_40_ENABLED
3530   th = __kmp_thread_from_gtid(global_tid);
3531   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3532 #endif // OMP_40_ENABLED
3533 
3534   packed_reduction_method = __kmp_determine_reduction_method(
3535       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3536   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3537 
3538   if (packed_reduction_method == critical_reduce_block) {
3539 
3540     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3541     retval = 1;
3542 
3543   } else if (packed_reduction_method == empty_reduce_block) {
3544 
3545     // usage: if team size == 1, no synchronization is required ( Intel
3546     // platforms only )
3547     retval = 1;
3548 
3549   } else if (packed_reduction_method == atomic_reduce_block) {
3550 
3551     retval = 2;
3552 
3553   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3554                                    tree_reduce_block)) {
3555 
3556 // case tree_reduce_block:
3557 // this barrier should be visible to a customer and to the threading profile
3558 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3559 #if OMPT_SUPPORT
3560     omp_frame_t *ompt_frame;
3561     if (ompt_enabled.enabled) {
3562       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3563       if (ompt_frame->enter_frame == NULL)
3564         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3565       OMPT_STORE_RETURN_ADDRESS(global_tid);
3566     }
3567 #endif
3568 #if USE_ITT_NOTIFY
3569     __kmp_threads[global_tid]->th.th_ident =
3570         loc; // needed for correct notification of frames
3571 #endif
3572     retval =
3573         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3574                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3575     retval = (retval != 0) ? (0) : (1);
3576 #if OMPT_SUPPORT && OMPT_OPTIONAL
3577     if (ompt_enabled.enabled) {
3578       ompt_frame->enter_frame = NULL;
3579     }
3580 #endif
3581 
3582     // all other workers except master should do this pop here
3583     // ( none of other workers except master will enter __kmpc_end_reduce() )
3584     if (__kmp_env_consistency_check) {
3585       if (retval == 0) { // 0: all other workers; 1: master
3586         __kmp_pop_sync(global_tid, ct_reduce, loc);
3587       }
3588     }
3589 
3590   } else {
3591 
3592     // should never reach this block
3593     KMP_ASSERT(0); // "unexpected method"
3594   }
3595 #if OMP_40_ENABLED
3596   if (teams_swapped) {
3597     __kmp_restore_swapped_teams(th, team, task_state);
3598   }
3599 #endif
3600 
3601   KA_TRACE(10,
3602            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3603             global_tid, packed_reduction_method, retval));
3604 
3605   return retval;
3606 }
3607 
3608 /*!
3609 @ingroup SYNCHRONIZATION
3610 @param loc source location information
3611 @param global_tid global thread id.
3612 @param lck pointer to the unique lock data structure
3613 
3614 Finish the execution of a blocking reduce.
3615 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3616 start function.
3617 */
3618 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3619                        kmp_critical_name *lck) {
3620 
3621   PACKED_REDUCTION_METHOD_T packed_reduction_method;
3622 #if OMP_40_ENABLED
3623   kmp_info_t *th;
3624   kmp_team_t *team;
3625   int teams_swapped = 0, task_state;
3626 #endif
3627 
3628   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3629 
3630 #if OMP_40_ENABLED
3631   th = __kmp_thread_from_gtid(global_tid);
3632   teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3633 #endif // OMP_40_ENABLED
3634 
3635   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3636 
3637   // this barrier should be visible to a customer and to the threading profile
3638   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3639 
3640   if (packed_reduction_method == critical_reduce_block) {
3641 
3642     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3643 
3644 // TODO: implicit barrier: should be exposed
3645 #if OMPT_SUPPORT
3646     omp_frame_t *ompt_frame;
3647     if (ompt_enabled.enabled) {
3648       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3649       if (ompt_frame->enter_frame == NULL)
3650         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3651       OMPT_STORE_RETURN_ADDRESS(global_tid);
3652     }
3653 #endif
3654 #if USE_ITT_NOTIFY
3655     __kmp_threads[global_tid]->th.th_ident = loc;
3656 #endif
3657     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3658 #if OMPT_SUPPORT && OMPT_OPTIONAL
3659     if (ompt_enabled.enabled) {
3660       ompt_frame->enter_frame = NULL;
3661     }
3662 #endif
3663 
3664   } else if (packed_reduction_method == empty_reduce_block) {
3665 
3666 // usage: if team size==1, no synchronization is required (Intel platforms only)
3667 
3668 // TODO: implicit barrier: should be exposed
3669 #if OMPT_SUPPORT
3670     omp_frame_t *ompt_frame;
3671     if (ompt_enabled.enabled) {
3672       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3673       if (ompt_frame->enter_frame == NULL)
3674         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3675       OMPT_STORE_RETURN_ADDRESS(global_tid);
3676     }
3677 #endif
3678 #if USE_ITT_NOTIFY
3679     __kmp_threads[global_tid]->th.th_ident = loc;
3680 #endif
3681     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3682 #if OMPT_SUPPORT && OMPT_OPTIONAL
3683     if (ompt_enabled.enabled) {
3684       ompt_frame->enter_frame = NULL;
3685     }
3686 #endif
3687 
3688   } else if (packed_reduction_method == atomic_reduce_block) {
3689 
3690 #if OMPT_SUPPORT
3691     omp_frame_t *ompt_frame;
3692     if (ompt_enabled.enabled) {
3693       __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3694       if (ompt_frame->enter_frame == NULL)
3695         ompt_frame->enter_frame = OMPT_GET_FRAME_ADDRESS(1);
3696       OMPT_STORE_RETURN_ADDRESS(global_tid);
3697     }
3698 #endif
3699 // TODO: implicit barrier: should be exposed
3700 #if USE_ITT_NOTIFY
3701     __kmp_threads[global_tid]->th.th_ident = loc;
3702 #endif
3703     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3704 #if OMPT_SUPPORT && OMPT_OPTIONAL
3705     if (ompt_enabled.enabled) {
3706       ompt_frame->enter_frame = NULL;
3707     }
3708 #endif
3709 
3710   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3711                                    tree_reduce_block)) {
3712 
3713     // only master executes here (master releases all other workers)
3714     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3715                             global_tid);
3716 
3717   } else {
3718 
3719     // should never reach this block
3720     KMP_ASSERT(0); // "unexpected method"
3721   }
3722 #if OMP_40_ENABLED
3723   if (teams_swapped) {
3724     __kmp_restore_swapped_teams(th, team, task_state);
3725   }
3726 #endif
3727 
3728   if (__kmp_env_consistency_check)
3729     __kmp_pop_sync(global_tid, ct_reduce, loc);
3730 
3731   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3732                 global_tid, packed_reduction_method));
3733 
3734   return;
3735 }
3736 
3737 #undef __KMP_GET_REDUCTION_METHOD
3738 #undef __KMP_SET_REDUCTION_METHOD
3739 
3740 /* end of interface to fast scalable reduce routines */
3741 
3742 kmp_uint64 __kmpc_get_taskid() {
3743 
3744   kmp_int32 gtid;
3745   kmp_info_t *thread;
3746 
3747   gtid = __kmp_get_gtid();
3748   if (gtid < 0) {
3749     return 0;
3750   }
3751   thread = __kmp_thread_from_gtid(gtid);
3752   return thread->th.th_current_task->td_task_id;
3753 
3754 } // __kmpc_get_taskid
3755 
3756 kmp_uint64 __kmpc_get_parent_taskid() {
3757 
3758   kmp_int32 gtid;
3759   kmp_info_t *thread;
3760   kmp_taskdata_t *parent_task;
3761 
3762   gtid = __kmp_get_gtid();
3763   if (gtid < 0) {
3764     return 0;
3765   }
3766   thread = __kmp_thread_from_gtid(gtid);
3767   parent_task = thread->th.th_current_task->td_parent;
3768   return (parent_task == NULL ? 0 : parent_task->td_task_id);
3769 
3770 } // __kmpc_get_parent_taskid
3771 
3772 #if OMP_45_ENABLED
3773 /*!
3774 @ingroup WORK_SHARING
3775 @param loc  source location information.
3776 @param gtid  global thread number.
3777 @param num_dims  number of associated doacross loops.
3778 @param dims  info on loops bounds.
3779 
3780 Initialize doacross loop information.
3781 Expect compiler send us inclusive bounds,
3782 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3783 */
3784 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3785                           const struct kmp_dim *dims) {
3786   int j, idx;
3787   kmp_int64 last, trace_count;
3788   kmp_info_t *th = __kmp_threads[gtid];
3789   kmp_team_t *team = th->th.th_team;
3790   kmp_uint32 *flags;
3791   kmp_disp_t *pr_buf = th->th.th_dispatch;
3792   dispatch_shared_info_t *sh_buf;
3793 
3794   KA_TRACE(
3795       20,
3796       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3797        gtid, num_dims, !team->t.t_serialized));
3798   KMP_DEBUG_ASSERT(dims != NULL);
3799   KMP_DEBUG_ASSERT(num_dims > 0);
3800 
3801   if (team->t.t_serialized) {
3802     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3803     return; // no dependencies if team is serialized
3804   }
3805   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3806   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3807   // the next loop
3808   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3809 
3810   // Save bounds info into allocated private buffer
3811   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3812   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3813       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3814   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3815   pr_buf->th_doacross_info[0] =
3816       (kmp_int64)num_dims; // first element is number of dimensions
3817   // Save also address of num_done in order to access it later without knowing
3818   // the buffer index
3819   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3820   pr_buf->th_doacross_info[2] = dims[0].lo;
3821   pr_buf->th_doacross_info[3] = dims[0].up;
3822   pr_buf->th_doacross_info[4] = dims[0].st;
3823   last = 5;
3824   for (j = 1; j < num_dims; ++j) {
3825     kmp_int64
3826         range_length; // To keep ranges of all dimensions but the first dims[0]
3827     if (dims[j].st == 1) { // most common case
3828       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3829       range_length = dims[j].up - dims[j].lo + 1;
3830     } else {
3831       if (dims[j].st > 0) {
3832         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3833         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3834       } else { // negative increment
3835         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3836         range_length =
3837             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3838       }
3839     }
3840     pr_buf->th_doacross_info[last++] = range_length;
3841     pr_buf->th_doacross_info[last++] = dims[j].lo;
3842     pr_buf->th_doacross_info[last++] = dims[j].up;
3843     pr_buf->th_doacross_info[last++] = dims[j].st;
3844   }
3845 
3846   // Compute total trip count.
3847   // Start with range of dims[0] which we don't need to keep in the buffer.
3848   if (dims[0].st == 1) { // most common case
3849     trace_count = dims[0].up - dims[0].lo + 1;
3850   } else if (dims[0].st > 0) {
3851     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3852     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3853   } else { // negative increment
3854     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3855     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3856   }
3857   for (j = 1; j < num_dims; ++j) {
3858     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3859   }
3860   KMP_DEBUG_ASSERT(trace_count > 0);
3861 
3862   // Check if shared buffer is not occupied by other loop (idx -
3863   // __kmp_dispatch_num_buffers)
3864   if (idx != sh_buf->doacross_buf_idx) {
3865     // Shared buffer is occupied, wait for it to be free
3866     __kmp_wait_yield_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3867                        __kmp_eq_4, NULL);
3868   }
3869 #if KMP_32_BIT_ARCH
3870   // Check if we are the first thread. After the CAS the first thread gets 0,
3871   // others get 1 if initialization is in progress, allocated pointer otherwise.
3872   // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3873   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3874       (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3875 #else
3876   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3877       (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3878 #endif
3879   if (flags == NULL) {
3880     // we are the first thread, allocate the array of flags
3881     size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3882     flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3883     KMP_MB();
3884     sh_buf->doacross_flags = flags;
3885   } else if (flags == (kmp_uint32 *)1) {
3886 #if KMP_32_BIT_ARCH
3887     // initialization is still in progress, need to wait
3888     while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3889 #else
3890     while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3891 #endif
3892       KMP_YIELD(TRUE);
3893     KMP_MB();
3894   } else {
3895     KMP_MB();
3896   }
3897   KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3898   pr_buf->th_doacross_flags =
3899       sh_buf->doacross_flags; // save private copy in order to not
3900   // touch shared buffer on each iteration
3901   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3902 }
3903 
3904 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3905   kmp_int32 shft, num_dims, i;
3906   kmp_uint32 flag;
3907   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3908   kmp_info_t *th = __kmp_threads[gtid];
3909   kmp_team_t *team = th->th.th_team;
3910   kmp_disp_t *pr_buf;
3911   kmp_int64 lo, up, st;
3912 
3913   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3914   if (team->t.t_serialized) {
3915     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3916     return; // no dependencies if team is serialized
3917   }
3918 
3919   // calculate sequential iteration number and check out-of-bounds condition
3920   pr_buf = th->th.th_dispatch;
3921   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3922   num_dims = pr_buf->th_doacross_info[0];
3923   lo = pr_buf->th_doacross_info[2];
3924   up = pr_buf->th_doacross_info[3];
3925   st = pr_buf->th_doacross_info[4];
3926   if (st == 1) { // most common case
3927     if (vec[0] < lo || vec[0] > up) {
3928       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3929                     "bounds [%lld,%lld]\n",
3930                     gtid, vec[0], lo, up));
3931       return;
3932     }
3933     iter_number = vec[0] - lo;
3934   } else if (st > 0) {
3935     if (vec[0] < lo || vec[0] > up) {
3936       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3937                     "bounds [%lld,%lld]\n",
3938                     gtid, vec[0], lo, up));
3939       return;
3940     }
3941     iter_number = (kmp_uint64)(vec[0] - lo) / st;
3942   } else { // negative increment
3943     if (vec[0] > lo || vec[0] < up) {
3944       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3945                     "bounds [%lld,%lld]\n",
3946                     gtid, vec[0], lo, up));
3947       return;
3948     }
3949     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3950   }
3951   for (i = 1; i < num_dims; ++i) {
3952     kmp_int64 iter, ln;
3953     kmp_int32 j = i * 4;
3954     ln = pr_buf->th_doacross_info[j + 1];
3955     lo = pr_buf->th_doacross_info[j + 2];
3956     up = pr_buf->th_doacross_info[j + 3];
3957     st = pr_buf->th_doacross_info[j + 4];
3958     if (st == 1) {
3959       if (vec[i] < lo || vec[i] > up) {
3960         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3961                       "bounds [%lld,%lld]\n",
3962                       gtid, vec[i], lo, up));
3963         return;
3964       }
3965       iter = vec[i] - lo;
3966     } else if (st > 0) {
3967       if (vec[i] < lo || vec[i] > up) {
3968         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3969                       "bounds [%lld,%lld]\n",
3970                       gtid, vec[i], lo, up));
3971         return;
3972       }
3973       iter = (kmp_uint64)(vec[i] - lo) / st;
3974     } else { // st < 0
3975       if (vec[i] > lo || vec[i] < up) {
3976         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3977                       "bounds [%lld,%lld]\n",
3978                       gtid, vec[i], lo, up));
3979         return;
3980       }
3981       iter = (kmp_uint64)(lo - vec[i]) / (-st);
3982     }
3983     iter_number = iter + ln * iter_number;
3984   }
3985   shft = iter_number % 32; // use 32-bit granularity
3986   iter_number >>= 5; // divided by 32
3987   flag = 1 << shft;
3988   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
3989     KMP_YIELD(TRUE);
3990   }
3991   KMP_MB();
3992   KA_TRACE(20,
3993            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
3994             gtid, (iter_number << 5) + shft));
3995 }
3996 
3997 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
3998   kmp_int32 shft, num_dims, i;
3999   kmp_uint32 flag;
4000   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4001   kmp_info_t *th = __kmp_threads[gtid];
4002   kmp_team_t *team = th->th.th_team;
4003   kmp_disp_t *pr_buf;
4004   kmp_int64 lo, st;
4005 
4006   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4007   if (team->t.t_serialized) {
4008     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4009     return; // no dependencies if team is serialized
4010   }
4011 
4012   // calculate sequential iteration number (same as in "wait" but no
4013   // out-of-bounds checks)
4014   pr_buf = th->th.th_dispatch;
4015   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4016   num_dims = pr_buf->th_doacross_info[0];
4017   lo = pr_buf->th_doacross_info[2];
4018   st = pr_buf->th_doacross_info[4];
4019   if (st == 1) { // most common case
4020     iter_number = vec[0] - lo;
4021   } else if (st > 0) {
4022     iter_number = (kmp_uint64)(vec[0] - lo) / st;
4023   } else { // negative increment
4024     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4025   }
4026   for (i = 1; i < num_dims; ++i) {
4027     kmp_int64 iter, ln;
4028     kmp_int32 j = i * 4;
4029     ln = pr_buf->th_doacross_info[j + 1];
4030     lo = pr_buf->th_doacross_info[j + 2];
4031     st = pr_buf->th_doacross_info[j + 4];
4032     if (st == 1) {
4033       iter = vec[i] - lo;
4034     } else if (st > 0) {
4035       iter = (kmp_uint64)(vec[i] - lo) / st;
4036     } else { // st < 0
4037       iter = (kmp_uint64)(lo - vec[i]) / (-st);
4038     }
4039     iter_number = iter + ln * iter_number;
4040   }
4041   shft = iter_number % 32; // use 32-bit granularity
4042   iter_number >>= 5; // divided by 32
4043   flag = 1 << shft;
4044   KMP_MB();
4045   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4046     KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4047   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4048                 (iter_number << 5) + shft));
4049 }
4050 
4051 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4052   kmp_int32 num_done;
4053   kmp_info_t *th = __kmp_threads[gtid];
4054   kmp_team_t *team = th->th.th_team;
4055   kmp_disp_t *pr_buf = th->th.th_dispatch;
4056 
4057   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4058   if (team->t.t_serialized) {
4059     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4060     return; // nothing to do
4061   }
4062   num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4063   if (num_done == th->th.th_team_nproc) {
4064     // we are the last thread, need to free shared resources
4065     int idx = pr_buf->th_doacross_buf_idx - 1;
4066     dispatch_shared_info_t *sh_buf =
4067         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4068     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4069                      (kmp_int64)&sh_buf->doacross_num_done);
4070     KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4071     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4072     __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4073     sh_buf->doacross_flags = NULL;
4074     sh_buf->doacross_num_done = 0;
4075     sh_buf->doacross_buf_idx +=
4076         __kmp_dispatch_num_buffers; // free buffer for future re-use
4077   }
4078   // free private resources (need to keep buffer index forever)
4079   pr_buf->th_doacross_flags = NULL;
4080   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4081   pr_buf->th_doacross_info = NULL;
4082   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4083 }
4084 #endif
4085 
4086 #if OMP_50_ENABLED
4087 int __kmpc_get_target_offload(void) {
4088   if (!__kmp_init_serial) {
4089     __kmp_serial_initialize();
4090   }
4091   return __kmp_target_offload;
4092 }
4093 #endif // OMP_50_ENABLED
4094 
4095 // end of file //
4096