1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "omp.h" /* extern "C" declarations of user-visible routines */
17 #include "kmp.h"
18 #include "kmp_error.h"
19 #include "kmp_i18n.h"
20 #include "kmp_itt.h"
21 #include "kmp_lock.h"
22 #include "kmp_stats.h"
23 
24 #if OMPT_SUPPORT
25 #include "ompt-internal.h"
26 #include "ompt-specific.h"
27 #endif
28 
29 #define MAX_MESSAGE 512
30 
31 // flags will be used in future, e.g. to implement openmp_strict library
32 // restrictions
33 
34 /*!
35  * @ingroup STARTUP_SHUTDOWN
36  * @param loc   in   source location information
37  * @param flags in   for future use (currently ignored)
38  *
39  * Initialize the runtime library. This call is optional; if it is not made then
40  * it will be implicitly called by attempts to use other library functions.
41  */
42 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
  // By default __kmpc_begin() is a no-op.
44   char *env;
45   if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
46       __kmp_str_match_true(env)) {
47     __kmp_middle_initialize();
48     KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
49   } else if (__kmp_ignore_mppbeg() == FALSE) {
50     // By default __kmp_ignore_mppbeg() returns TRUE.
51     __kmp_internal_begin();
52     KC_TRACE(10, ("__kmpc_begin: called\n"));
53   }
54 }
55 
56 /*!
57  * @ingroup STARTUP_SHUTDOWN
58  * @param loc source location information
59  *
 * Shut down the runtime library. This call is also optional, and even if made
 * it will do nothing unless the `KMP_IGNORE_MPPEND` environment variable is
 * set to zero.
63  */
64 void __kmpc_end(ident_t *loc) {
  // By default, __kmp_ignore_mppend() returns TRUE, which makes the
  // __kmpc_end() call a no-op. However, this can be overridden with the
  // KMP_IGNORE_MPPEND environment variable. If KMP_IGNORE_MPPEND is 0,
  // __kmp_ignore_mppend() returns FALSE and __kmpc_end() will unregister this
  // root (which can cause library shutdown).
70   if (__kmp_ignore_mppend() == FALSE) {
71     KC_TRACE(10, ("__kmpc_end: called\n"));
72     KA_TRACE(30, ("__kmpc_end\n"));
73 
74     __kmp_internal_end_thread(-1);
75   }
76 }
77 
78 /*!
79 @ingroup THREAD_STATES
80 @param loc Source location information.
81 @return The global thread index of the active thread.
82 
83 This function can be called in any context.
84 
If the runtime has only been entered at the outermost level from a
86 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
87 that which would be returned by omp_get_thread_num() in the outermost
88 active parallel construct. (Or zero if there is no active parallel
89 construct, since the master thread is necessarily thread zero).
90 
91 If multiple non-OpenMP threads all enter an OpenMP construct then this
92 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
94 OpenMP thread ids returned by omp_get_thread_num()).
95 */
96 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
97   kmp_int32 gtid = __kmp_entry_gtid();
98 
99   KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
100 
101   return gtid;
102 }
103 
104 /*!
105 @ingroup THREAD_STATES
106 @param loc Source location information.
107 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
108 
109 This function can be called in any context.
110 It returns the total number of threads under the control of the OpenMP runtime.
111 That is not a number that can be determined by any OpenMP standard calls, since
112 the library may be called from more than one non-OpenMP thread, and this
reflects the total over all such calls. Similarly, since the runtime maintains
underlying threads even when they are not active (because the cost of creating
and destroying OS threads is high), this call counts all such threads whether
or not they are currently doing work.
117 */
118 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
119   KC_TRACE(10,
120            ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
121 
122   return TCR_4(__kmp_all_nth);
123 }
124 
125 /*!
126 @ingroup THREAD_STATES
127 @param loc Source location information.
128 @return The thread number of the calling thread in the innermost active parallel
129 construct.
130 */
131 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
132   KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
133   return __kmp_tid_from_gtid(__kmp_entry_gtid());
134 }
135 
136 /*!
137 @ingroup THREAD_STATES
138 @param loc Source location information.
139 @return The number of threads in the innermost active parallel construct.
140 */
141 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
142   KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
143 
144   return __kmp_entry_thread()->th.th_team->t.t_nproc;
145 }
146 
147 /*!
148  * @ingroup DEPRECATED
149  * @param loc location description
150  *
151  * This function need not be called. It always returns TRUE.
152  */
153 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
154 #ifndef KMP_DEBUG
155 
156   return TRUE;
157 
158 #else
159 
160   const char *semi2;
161   const char *semi3;
162   int line_no;
163 
164   if (__kmp_par_range == 0) {
165     return TRUE;
166   }
167   semi2 = loc->psource;
168   if (semi2 == NULL) {
169     return TRUE;
170   }
171   semi2 = strchr(semi2, ';');
172   if (semi2 == NULL) {
173     return TRUE;
174   }
175   semi2 = strchr(semi2 + 1, ';');
176   if (semi2 == NULL) {
177     return TRUE;
178   }
179   if (__kmp_par_range_filename[0]) {
180     const char *name = semi2 - 1;
181     while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
182       name--;
183     }
184     if ((*name == '/') || (*name == ';')) {
185       name++;
186     }
187     if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
188       return __kmp_par_range < 0;
189     }
190   }
191   semi3 = strchr(semi2 + 1, ';');
192   if (__kmp_par_range_routine[0]) {
193     if ((semi3 != NULL) && (semi3 > semi2) &&
194         (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
195       return __kmp_par_range < 0;
196     }
197   }
198   if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
199     if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
200       return __kmp_par_range > 0;
201     }
202     return __kmp_par_range < 0;
203   }
204   return TRUE;
205 
206 #endif /* KMP_DEBUG */
207 }
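
// Illustrative note for the KMP_PAR_RANGE parsing above: loc->psource is a
// semicolon-separated source description (see ident_t in kmp.h). The exact
// layout shown here is an assumption, but a typical value looks like
//
//   ";file.c;routine_name;42;0;;"
//
// i.e. the file name sits between the first and second ';', the routine name
// between the second and third, and the line number follows the third, which
// is what the filename / routine / line-range checks above rely on.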
208 
209 /*!
210 @ingroup THREAD_STATES
211 @param loc Source location information.
212 @return 1 if this thread is executing inside an active parallel region, zero if
213 not.
214 */
215 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
216   return __kmp_entry_thread()->th.th_root->r.r_active;
217 }
218 
219 /*!
220 @ingroup PARALLEL
221 @param loc source location information
222 @param global_tid global thread number
223 @param num_threads number of threads requested for this parallel construct
224 
225 Set the number of threads to be used by the next fork spawned by this thread.
226 This call is only required if the parallel construct has a `num_threads` clause.
227 */
228 void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
229                              kmp_int32 num_threads) {
230   KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
231                 global_tid, num_threads));
232 
233   __kmp_push_num_threads(loc, global_tid, num_threads);
234 }
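
// A minimal usage sketch (not lifted from any particular compiler): a
// num_threads clause is expected to be pushed immediately before the matching
// fork. The outlined routine name `outlined_body` is hypothetical.
//
//   // #pragma omp parallel num_threads(4)
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_push_num_threads(&loc, gtid, 4);
//   __kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_body);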
235 
236 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
237   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
238 
239   /* the num_threads are automatically popped */
240 }
241 
242 #if OMP_40_ENABLED
243 
244 void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
245                            kmp_int32 proc_bind) {
246   KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
247                 proc_bind));
248 
249   __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
250 }
251 
252 #endif /* OMP_40_ENABLED */
253 
254 /*!
255 @ingroup PARALLEL
256 @param loc  source location information
257 @param argc  total number of arguments in the ellipsis
258 @param microtask  pointer to callback routine consisting of outlined parallel
259 construct
260 @param ...  pointers to shared variables that aren't global
261 
262 Do the actual fork and call the microtask in the relevant number of threads.
263 */
264 void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
265   int gtid = __kmp_entry_gtid();
266 
267 #if (KMP_STATS_ENABLED)
268   int inParallel = __kmpc_in_parallel(loc);
269   if (inParallel) {
270     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
271   } else {
272     KMP_COUNT_BLOCK(OMP_PARALLEL);
273   }
274 #endif
275 
  // Maybe saving thr_state is enough here.
277   {
278     va_list ap;
279     va_start(ap, microtask);
280 
281 #if OMPT_SUPPORT
282     ompt_frame_t *ompt_frame;
283     if (ompt_enabled) {
284       kmp_info_t *master_th = __kmp_threads[gtid];
285       kmp_team_t *parent_team = master_th->th.th_team;
286       ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
287       if (lwt)
288         ompt_frame = &(lwt->ompt_task_info.frame);
289       else {
290         int tid = __kmp_tid_from_gtid(gtid);
291         ompt_frame = &(
292             parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
293       }
294       ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
295     }
296 #endif
297 
298 #if INCLUDE_SSC_MARKS
299     SSC_MARK_FORKING();
300 #endif
301     __kmp_fork_call(loc, gtid, fork_context_intel, argc,
302 #if OMPT_SUPPORT
303                     VOLATILE_CAST(void *) microtask, // "unwrapped" task
304 #endif
305                     VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
306                     VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
307 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
308 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
309                     &ap
310 #else
311                     ap
312 #endif
313                     );
314 #if INCLUDE_SSC_MARKS
315     SSC_MARK_JOINING();
316 #endif
317     __kmp_join_call(loc, gtid
318 #if OMPT_SUPPORT
319                     ,
320                     fork_context_intel
321 #endif
322                     );
323 
324     va_end(ap);
325   }
326 }
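
// A hedged sketch of the expected calling convention for __kmpc_fork_call: the
// compiler outlines the region body into a routine matching kmpc_micro (global
// and bound thread-id pointers first, then one pointer per variable passed in
// the ellipsis). The names `outlined` and `x` are hypothetical.
//
//   static void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *x) {
//     /* body of the parallel region, operating on *x */
//   }
//
//   // #pragma omp parallel shared(x)
//   int x = 0;
//   __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);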
327 
328 #if OMP_40_ENABLED
329 /*!
330 @ingroup PARALLEL
331 @param loc source location information
332 @param global_tid global thread number
333 @param num_teams number of teams requested for the teams construct
334 @param num_threads number of threads per team requested for the teams construct
335 
336 Set the number of teams to be used by the teams construct.
337 This call is only required if the teams construct has a `num_teams` clause
338 or a `thread_limit` clause (or both).
339 */
340 void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
341                            kmp_int32 num_teams, kmp_int32 num_threads) {
342   KA_TRACE(20,
343            ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
344             global_tid, num_teams, num_threads));
345 
346   __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
347 }
348 
349 /*!
350 @ingroup PARALLEL
351 @param loc  source location information
352 @param argc  total number of arguments in the ellipsis
353 @param microtask  pointer to callback routine consisting of outlined teams
354 construct
355 @param ...  pointers to shared variables that aren't global
356 
357 Do the actual fork and call the microtask in the relevant number of threads.
358 */
359 void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
360                        ...) {
361   int gtid = __kmp_entry_gtid();
362   kmp_info_t *this_thr = __kmp_threads[gtid];
363   va_list ap;
364   va_start(ap, microtask);
365 
366   KMP_COUNT_BLOCK(OMP_TEAMS);
367 
368   // remember teams entry point and nesting level
369   this_thr->th.th_teams_microtask = microtask;
370   this_thr->th.th_teams_level =
371       this_thr->th.th_team->t.t_level; // AC: can be >0 on host
372 
373 #if OMPT_SUPPORT
374   kmp_team_t *parent_team = this_thr->th.th_team;
375   int tid = __kmp_tid_from_gtid(gtid);
376   if (ompt_enabled) {
377     parent_team->t.t_implicit_task_taskdata[tid]
378         .ompt_task_info.frame.reenter_runtime_frame =
379         __builtin_frame_address(1);
380   }
381 #endif
382 
  // Check whether __kmpc_push_num_teams was called; if not, set the default
  // number of teams.
385   if (this_thr->th.th_teams_size.nteams == 0) {
386     __kmp_push_num_teams(loc, gtid, 0, 0);
387   }
388   KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
389   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
390   KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
391 
392   __kmp_fork_call(loc, gtid, fork_context_intel, argc,
393 #if OMPT_SUPPORT
394                   VOLATILE_CAST(void *) microtask, // "unwrapped" task
395 #endif
396                   VOLATILE_CAST(microtask_t)
397                       __kmp_teams_master, // "wrapped" task
398                   VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
399 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
400                   &ap
401 #else
402                   ap
403 #endif
404                   );
405   __kmp_join_call(loc, gtid
406 #if OMPT_SUPPORT
407                   ,
408                   fork_context_intel
409 #endif
410                   );
411 
412   this_thr->th.th_teams_microtask = NULL;
413   this_thr->th.th_teams_level = 0;
414   *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
415   va_end(ap);
416 }
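
// A minimal sketch of how a teams construct with clauses is expected to reach
// these entry points; `teams_outlined` and `shared_var` are hypothetical.
//
//   // #pragma omp teams num_teams(2) thread_limit(8)
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   __kmpc_push_num_teams(&loc, gtid, 2, 8);
//   __kmpc_fork_teams(&loc, 1, (kmpc_micro)teams_outlined, &shared_var);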
417 #endif /* OMP_40_ENABLED */
418 
419 // I don't think this function should ever have been exported.
420 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
421 // openmp code ever called it, but it's been exported from the RTL for so
422 // long that I'm afraid to remove the definition.
423 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
424 
425 /*!
426 @ingroup PARALLEL
427 @param loc  source location information
428 @param global_tid  global thread number
429 
430 Enter a serialized parallel construct. This interface is used to handle a
431 conditional parallel region, like this,
432 @code
433 #pragma omp parallel if (condition)
434 @endcode
435 when the condition is false.
436 */
437 void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
438   // The implementation is now in kmp_runtime.cpp so that it can share static
439   // functions with kmp_fork_call since the tasks to be done are similar in
440   // each case.
441   __kmp_serialized_parallel(loc, global_tid);
442 }
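
// One plausible lowering of the conditional parallel region described above
// (illustration only; `outlined`, `x` and `cond` are hypothetical). When the
// condition is false the outlined body runs on the encountering thread,
// bracketed by the serialized-parallel calls.
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   if (cond) {
//     __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);
//   } else {
//     __kmpc_serialized_parallel(&loc, gtid);
//     kmp_int32 btid = 0;
//     outlined(&gtid, &btid, &x);
//     __kmpc_end_serialized_parallel(&loc, gtid);
//   }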
443 
444 /*!
445 @ingroup PARALLEL
446 @param loc  source location information
447 @param global_tid  global thread number
448 
449 Leave a serialized parallel construct.
450 */
451 void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
452   kmp_internal_control_t *top;
453   kmp_info_t *this_thr;
454   kmp_team_t *serial_team;
455 
456   KC_TRACE(10,
457            ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
458 
459   /* skip all this code for autopar serialized loops since it results in
460      unacceptable overhead */
461   if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
462     return;
463 
464   // Not autopar code
465   if (!TCR_4(__kmp_init_parallel))
466     __kmp_parallel_initialize();
467 
468   this_thr = __kmp_threads[global_tid];
469   serial_team = this_thr->th.th_serial_team;
470 
471 #if OMP_45_ENABLED
472   kmp_task_team_t *task_team = this_thr->th.th_task_team;
473 
474   // we need to wait for the proxy tasks before finishing the thread
475   if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
476     __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
477 #endif
478 
479   KMP_MB();
480   KMP_DEBUG_ASSERT(serial_team);
481   KMP_ASSERT(serial_team->t.t_serialized);
482   KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
483   KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
484   KMP_DEBUG_ASSERT(serial_team->t.t_threads);
485   KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
486 
487   /* If necessary, pop the internal control stack values and replace the team
488    * values */
489   top = serial_team->t.t_control_stack_top;
490   if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
491     copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
492     serial_team->t.t_control_stack_top = top->next;
493     __kmp_free(top);
494   }
495 
496   // if( serial_team -> t.t_serialized > 1 )
497   serial_team->t.t_level--;
498 
499   /* pop dispatch buffers stack */
500   KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
501   {
502     dispatch_private_info_t *disp_buffer =
503         serial_team->t.t_dispatch->th_disp_buffer;
504     serial_team->t.t_dispatch->th_disp_buffer =
505         serial_team->t.t_dispatch->th_disp_buffer->next;
506     __kmp_free(disp_buffer);
507   }
508 
509   --serial_team->t.t_serialized;
510   if (serial_team->t.t_serialized == 0) {
511 
512 /* return to the parallel section */
513 
514 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
515     if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
516       __kmp_clear_x87_fpu_status_word();
517       __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
518       __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
519     }
520 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
521 
522     this_thr->th.th_team = serial_team->t.t_parent;
523     this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
524 
525     /* restore values cached in the thread */
526     this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
527     this_thr->th.th_team_master =
528         serial_team->t.t_parent->t.t_threads[0]; /* JPH */
529     this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
530 
531     /* TODO the below shouldn't need to be adjusted for serialized teams */
532     this_thr->th.th_dispatch =
533         &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
534 
535     __kmp_pop_current_task_from_thread(this_thr);
536 
537     KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
538     this_thr->th.th_current_task->td_flags.executing = 1;
539 
540     if (__kmp_tasking_mode != tskm_immediate_exec) {
541       // Copy the task team from the new child / old parent team to the thread.
542       this_thr->th.th_task_team =
543           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
544       KA_TRACE(20,
545                ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
546                 "team %p\n",
547                 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
548     }
549   } else {
550     if (__kmp_tasking_mode != tskm_immediate_exec) {
551       KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
552                     "depth of serial team %p to %d\n",
553                     global_tid, serial_team, serial_team->t.t_serialized));
554     }
555   }
556 
557   if (__kmp_env_consistency_check)
558     __kmp_pop_parallel(global_tid, NULL);
559 }
560 
561 /*!
562 @ingroup SYNCHRONIZATION
563 @param loc  source location information.
564 
565 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
566 depending on the memory ordering convention obeyed by the compiler
567 even that may not be necessary).
568 */
569 void __kmpc_flush(ident_t *loc) {
570   KC_TRACE(10, ("__kmpc_flush: called\n"));
571 
572   /* need explicit __mf() here since use volatile instead in library */
573   KMP_MB(); /* Flush all pending memory write invalidates.  */
574 
575 #if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
576 #if KMP_MIC
577 // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
578 // We shouldn't need it, though, since the ABI rules require that
579 // * If the compiler generates NGO stores it also generates the fence
580 // * If users hand-code NGO stores they should insert the fence
581 // therefore no incomplete unordered stores should be visible.
582 #else
583   // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is also addressed (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is an SSE2 instruction. Do not execute it if the CPU does not
  // support SSE2.
589   if (!__kmp_cpuinfo.initialized) {
590     __kmp_query_cpuid(&__kmp_cpuinfo);
591   }; // if
592   if (!__kmp_cpuinfo.sse2) {
593     // CPU cannot execute SSE2 instructions.
594   } else {
595 #if KMP_COMPILER_ICC
596     _mm_mfence();
597 #elif KMP_COMPILER_MSVC
598     MemoryBarrier();
599 #else
600     __sync_synchronize();
601 #endif // KMP_COMPILER_ICC
602   }; // if
603 #endif // KMP_MIC
604 #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
// Nothing to see here; move along.
606 #elif KMP_ARCH_PPC64
607 // Nothing needed here (we have a real MB above).
608 #if KMP_OS_CNK
609   // The flushing thread needs to yield here; this prevents a
610   // busy-waiting thread from saturating the pipeline. flush is
611   // often used in loops like this:
612   // while (!flag) {
613   //   #pragma omp flush(flag)
614   // }
615   // and adding the yield here is good for at least a 10x speedup
616   // when running >2 threads per core (on the NAS LU benchmark).
617   __kmp_yield(TRUE);
618 #endif
619 #else
620 #error Unknown or unsupported architecture
621 #endif
622 }
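
// Usage sketch: a flush directive is expected to lower to a single call; note
// that this entry point takes no flush list, so any list in the directive does
// not change the call.
//
//   // #pragma omp flush
//   __kmpc_flush(&loc);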
623 
624 /* -------------------------------------------------------------------------- */
625 /*!
626 @ingroup SYNCHRONIZATION
627 @param loc source location information
628 @param global_tid thread id.
629 
630 Execute a barrier.
631 */
632 void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
633   KMP_COUNT_BLOCK(OMP_BARRIER);
634   KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
635 
636   if (!TCR_4(__kmp_init_parallel))
637     __kmp_parallel_initialize();
638 
639   if (__kmp_env_consistency_check) {
640     if (loc == 0) {
641       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
642     }; // if
643 
644     __kmp_check_barrier(global_tid, ct_barrier, loc);
645   }
646 
647 #if OMPT_SUPPORT && OMPT_TRACE
648   ompt_frame_t *ompt_frame;
649   if (ompt_enabled) {
650     ompt_frame = __ompt_get_task_frame_internal(0);
651     if (ompt_frame->reenter_runtime_frame == NULL)
652       ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
653   }
654 #endif
655   __kmp_threads[global_tid]->th.th_ident = loc;
656   // TODO: explicit barrier_wait_id:
  //   this function is called when a 'barrier' directive is present or at the
  //   implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 3) no sync is required
662 
663   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
664 #if OMPT_SUPPORT && OMPT_TRACE
665   if (ompt_enabled) {
666     ompt_frame->reenter_runtime_frame = NULL;
667   }
668 #endif
669 }
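
// Usage sketch (illustrative): an explicit barrier directive lowers to a
// single call; the thread id can be obtained from the runtime if the compiler
// has not already cached it.
//
//   // #pragma omp barrier
//   __kmpc_barrier(&loc, __kmpc_global_thread_num(&loc));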
670 
671 /* The BARRIER for a MASTER section is always explicit   */
672 /*!
673 @ingroup WORK_SHARING
674 @param loc  source location information.
@param global_tid  global thread number.
676 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
677 */
678 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
679   int status = 0;
680 
681   KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
682 
683   if (!TCR_4(__kmp_init_parallel))
684     __kmp_parallel_initialize();
685 
686   if (KMP_MASTER_GTID(global_tid)) {
687     KMP_COUNT_BLOCK(OMP_MASTER);
688     KMP_PUSH_PARTITIONED_TIMER(OMP_master);
689     status = 1;
690   }
691 
692 #if OMPT_SUPPORT && OMPT_TRACE
693   if (status) {
694     if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
695       kmp_info_t *this_thr = __kmp_threads[global_tid];
696       kmp_team_t *team = this_thr->th.th_team;
697 
698       int tid = __kmp_tid_from_gtid(global_tid);
699       ompt_callbacks.ompt_callback(ompt_event_master_begin)(
700           team->t.ompt_team_info.parallel_id,
701           team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
702     }
703   }
704 #endif
705 
706   if (__kmp_env_consistency_check) {
707 #if KMP_USE_DYNAMIC_LOCK
708     if (status)
709       __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
710     else
711       __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
712 #else
713     if (status)
714       __kmp_push_sync(global_tid, ct_master, loc, NULL);
715     else
716       __kmp_check_sync(global_tid, ct_master, loc, NULL);
717 #endif
718   }
719 
720   return status;
721 }
722 
723 /*!
724 @ingroup WORK_SHARING
725 @param loc  source location information.
@param global_tid  global thread number.
727 
728 Mark the end of a <tt>master</tt> region. This should only be called by the
729 thread that executes the <tt>master</tt> region.
730 */
731 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
732   KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
733 
734   KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
735   KMP_POP_PARTITIONED_TIMER();
736 
737 #if OMPT_SUPPORT && OMPT_TRACE
738   kmp_info_t *this_thr = __kmp_threads[global_tid];
739   kmp_team_t *team = this_thr->th.th_team;
740   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_master_end)) {
741     int tid = __kmp_tid_from_gtid(global_tid);
742     ompt_callbacks.ompt_callback(ompt_event_master_end)(
743         team->t.ompt_team_info.parallel_id,
744         team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
745   }
746 #endif
747 
748   if (__kmp_env_consistency_check) {
749     if (global_tid < 0)
750       KMP_WARNING(ThreadIdentInvalid);
751 
752     if (KMP_MASTER_GTID(global_tid))
753       __kmp_pop_sync(global_tid, ct_master, loc);
754   }
755 }
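
// A hedged sketch of the expected call pattern for a master region: only the
// thread for which __kmpc_master() returns 1 executes the block, and only that
// thread calls __kmpc_end_master().
//
//   // #pragma omp master
//   if (__kmpc_master(&loc, gtid)) {
//     /* master-only code */
//     __kmpc_end_master(&loc, gtid);
//   }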
756 
757 /*!
758 @ingroup WORK_SHARING
759 @param loc  source location information.
760 @param gtid  global thread number.
761 
762 Start execution of an <tt>ordered</tt> construct.
763 */
764 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
765   int cid = 0;
766   kmp_info_t *th;
767   KMP_DEBUG_ASSERT(__kmp_init_serial);
768 
769   KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
770 
771   if (!TCR_4(__kmp_init_parallel))
772     __kmp_parallel_initialize();
773 
774 #if USE_ITT_BUILD
775   __kmp_itt_ordered_prep(gtid);
776 // TODO: ordered_wait_id
777 #endif /* USE_ITT_BUILD */
778 
779   th = __kmp_threads[gtid];
780 
781 #if OMPT_SUPPORT && OMPT_TRACE
782   if (ompt_enabled) {
783     /* OMPT state update */
784     th->th.ompt_thread_info.wait_id = (uint64_t)loc;
785     th->th.ompt_thread_info.state = ompt_state_wait_ordered;
786 
787     /* OMPT event callback */
788     if (ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
789       ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
790           th->th.ompt_thread_info.wait_id);
791     }
792   }
793 #endif
794 
795   if (th->th.th_dispatch->th_deo_fcn != 0)
796     (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
797   else
798     __kmp_parallel_deo(&gtid, &cid, loc);
799 
800 #if OMPT_SUPPORT && OMPT_TRACE
801   if (ompt_enabled) {
802     /* OMPT state update */
803     th->th.ompt_thread_info.state = ompt_state_work_parallel;
804     th->th.ompt_thread_info.wait_id = 0;
805 
806     /* OMPT event callback */
807     if (ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
808       ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
809           th->th.ompt_thread_info.wait_id);
810     }
811   }
812 #endif
813 
814 #if USE_ITT_BUILD
815   __kmp_itt_ordered_start(gtid);
816 #endif /* USE_ITT_BUILD */
817 }
818 
819 /*!
820 @ingroup WORK_SHARING
821 @param loc  source location information.
822 @param gtid  global thread number.
823 
824 End execution of an <tt>ordered</tt> construct.
825 */
826 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
827   int cid = 0;
828   kmp_info_t *th;
829 
830   KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
831 
832 #if USE_ITT_BUILD
833   __kmp_itt_ordered_end(gtid);
834 // TODO: ordered_wait_id
835 #endif /* USE_ITT_BUILD */
836 
837   th = __kmp_threads[gtid];
838 
839   if (th->th.th_dispatch->th_dxo_fcn != 0)
840     (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
841   else
842     __kmp_parallel_dxo(&gtid, &cid, loc);
843 
844 #if OMPT_SUPPORT && OMPT_BLAME
845   if (ompt_enabled &&
846       ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
847     ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
848         th->th.ompt_thread_info.wait_id);
849   }
850 #endif
851 }
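
// A minimal sketch of the expected use of the ordered entry points; the loop
// scheduling calls around them are omitted here.
//
//   // inside the chunk loop of a '#pragma omp for ordered' construct
//   __kmpc_ordered(&loc, gtid);
//   /* ordered body */
//   __kmpc_end_ordered(&loc, gtid);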
852 
853 #if KMP_USE_DYNAMIC_LOCK
854 
855 static __forceinline void
856 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
857                           kmp_int32 gtid, kmp_indirect_locktag_t tag) {
858   // Pointer to the allocated indirect lock is written to crit, while indexing
859   // is ignored.
860   void *idx;
861   kmp_indirect_lock_t **lck;
862   lck = (kmp_indirect_lock_t **)crit;
863   kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
864   KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
865   KMP_SET_I_LOCK_LOCATION(ilk, loc);
866   KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
867   KA_TRACE(20,
868            ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
869 #if USE_ITT_BUILD
870   __kmp_itt_critical_creating(ilk->lock, loc);
871 #endif
872   int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk);
873   if (status == 0) {
874 #if USE_ITT_BUILD
875     __kmp_itt_critical_destroyed(ilk->lock);
876 #endif
877     // We don't really need to destroy the unclaimed lock here since it will be
878     // cleaned up at program exit.
879     // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
880   }
881   KMP_DEBUG_ASSERT(*lck != NULL);
882 }
883 
884 // Fast-path acquire tas lock
885 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
886   {                                                                            \
887     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
888     if (l->lk.poll != KMP_LOCK_FREE(tas) ||                                    \
889         !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),        \
890                                      KMP_LOCK_BUSY(gtid + 1, tas))) {          \
891       kmp_uint32 spins;                                                        \
892       KMP_FSYNC_PREPARE(l);                                                    \
893       KMP_INIT_YIELD(spins);                                                   \
894       if (TCR_4(__kmp_nth) >                                                   \
895           (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {               \
896         KMP_YIELD(TRUE);                                                       \
897       } else {                                                                 \
898         KMP_YIELD_SPIN(spins);                                                 \
899       }                                                                        \
900       kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
901       while (l->lk.poll != KMP_LOCK_FREE(tas) ||                               \
902              !KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),   \
903                                           KMP_LOCK_BUSY(gtid + 1, tas))) {     \
904         __kmp_spin_backoff(&backoff);                                          \
905         if (TCR_4(__kmp_nth) >                                                 \
906             (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
907           KMP_YIELD(TRUE);                                                     \
908         } else {                                                               \
909           KMP_YIELD_SPIN(spins);                                               \
910         }                                                                      \
911       }                                                                        \
912     }                                                                          \
913     KMP_FSYNC_ACQUIRED(l);                                                     \
914   }
915 
916 // Fast-path test tas lock
917 #define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
918   {                                                                            \
919     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
920     rc = l->lk.poll == KMP_LOCK_FREE(tas) &&                                   \
921          KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas),        \
922                                      KMP_LOCK_BUSY(gtid + 1, tas));            \
923   }
924 
925 // Fast-path release tas lock
926 #define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
927   {                                                                            \
928     TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas));              \
929     KMP_MB();                                                                  \
930   }
931 
932 #if KMP_USE_FUTEX
933 
934 #include <sys/syscall.h>
935 #include <unistd.h>
936 #ifndef FUTEX_WAIT
937 #define FUTEX_WAIT 0
938 #endif
939 #ifndef FUTEX_WAKE
940 #define FUTEX_WAKE 1
941 #endif
942 
943 // Fast-path acquire futex lock
944 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
945   {                                                                            \
946     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
947     kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
948     KMP_MB();                                                                  \
949     KMP_FSYNC_PREPARE(ftx);                                                    \
950     kmp_int32 poll_val;                                                        \
951     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
952                 &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
953                 KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
954       kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
955       if (!cond) {                                                             \
956         if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
957                                          poll_val |                            \
958                                              KMP_LOCK_BUSY(1, futex))) {       \
959           continue;                                                            \
960         }                                                                      \
961         poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
962       }                                                                        \
963       kmp_int32 rc;                                                            \
964       if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
965                         NULL, NULL, 0)) != 0) {                                \
966         continue;                                                              \
967       }                                                                        \
968       gtid_code |= 1;                                                          \
969     }                                                                          \
970     KMP_FSYNC_ACQUIRED(ftx);                                                   \
971   }
972 
973 // Fast-path test futex lock
974 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
975   {                                                                            \
976     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
977     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
978                                     KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
979       KMP_FSYNC_ACQUIRED(ftx);                                                 \
980       rc = TRUE;                                                               \
981     } else {                                                                   \
982       rc = FALSE;                                                              \
983     }                                                                          \
984   }
985 
986 // Fast-path release futex lock
987 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
988   {                                                                            \
989     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
990     KMP_MB();                                                                  \
991     KMP_FSYNC_RELEASING(ftx);                                                  \
992     kmp_int32 poll_val =                                                       \
993         KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
994     if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
995       syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
996               KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
997     }                                                                          \
998     KMP_MB();                                                                  \
999     KMP_YIELD(TCR_4(__kmp_nth) >                                               \
1000               (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));            \
1001   }
1002 
1003 #endif // KMP_USE_FUTEX
1004 
1005 #else // KMP_USE_DYNAMIC_LOCK
1006 
1007 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1008                                                       ident_t const *loc,
1009                                                       kmp_int32 gtid) {
1010   kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1011 
1012   // Because of the double-check, the following load doesn't need to be volatile
1013   kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1014 
1015   if (lck == NULL) {
1016     void *idx;
1017 
1018     // Allocate & initialize the lock.
1019     // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1020     lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1021     __kmp_init_user_lock_with_checks(lck);
1022     __kmp_set_user_lock_location(lck, loc);
1023 #if USE_ITT_BUILD
1024     __kmp_itt_critical_creating(lck);
// __kmp_itt_critical_creating() should be called *before* the first usage of
// the underlying lock. This is the only place where we can guarantee it. The
// lock may be destroyed without ever being used, but that is not a problem:
// this is not a real event seen by the user, it merely sets a name for the
// object (the lock). See kmp_itt.h for more details.
1030 #endif /* USE_ITT_BUILD */
1031 
1032     // Use a cmpxchg instruction to slam the start of the critical section with
1033     // the lock pointer.  If another thread beat us to it, deallocate the lock,
1034     // and use the lock that the other thread allocated.
1035     int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1036 
1037     if (status == 0) {
1038 // Deallocate the lock and reload the value.
1039 #if USE_ITT_BUILD
1040       __kmp_itt_critical_destroyed(lck);
1041 // Let ITT know the lock is destroyed and the same memory location may be reused
1042 // for another purpose.
1043 #endif /* USE_ITT_BUILD */
1044       __kmp_destroy_user_lock_with_checks(lck);
1045       __kmp_user_lock_free(&idx, gtid, lck);
1046       lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1047       KMP_DEBUG_ASSERT(lck != NULL);
1048     }
1049   }
1050   return lck;
1051 }
1052 
1053 #endif // KMP_USE_DYNAMIC_LOCK
1054 
1055 /*!
1056 @ingroup WORK_SHARING
1057 @param loc  source location information.
@param global_tid  global thread number.
1059 @param crit identity of the critical section. This could be a pointer to a lock
1060 associated with the critical section, or some other suitably unique value.
1061 
1062 Enter code protected by a `critical` construct.
1063 This function blocks until the executing thread can enter the critical section.
1064 */
1065 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1066                      kmp_critical_name *crit) {
1067 #if KMP_USE_DYNAMIC_LOCK
1068   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1069 #else
1070   KMP_COUNT_BLOCK(OMP_CRITICAL);
1071   KMP_TIME_PARTITIONED_BLOCK(
1072       OMP_critical_wait); /* Time spent waiting to enter the critical section */
1073   kmp_user_lock_p lck;
1074 
1075   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1076 
1077   // TODO: add THR_OVHD_STATE
1078 
1079   KMP_CHECK_USER_LOCK_INIT();
1080 
1081   if ((__kmp_user_lock_kind == lk_tas) &&
1082       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1083     lck = (kmp_user_lock_p)crit;
1084   }
1085 #if KMP_USE_FUTEX
1086   else if ((__kmp_user_lock_kind == lk_futex) &&
1087            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1088     lck = (kmp_user_lock_p)crit;
1089   }
1090 #endif
1091   else { // ticket, queuing or drdpa
1092     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1093   }
1094 
1095   if (__kmp_env_consistency_check)
1096     __kmp_push_sync(global_tid, ct_critical, loc, lck);
1097 
// Since the critical directive binds to all threads, not just the current
// team, we have to check this even if we are in a serialized team. Also, even
// if we are the uber thread, we still have to acquire the lock, since we may
// have to contend with sibling threads.
1102 
1103 #if USE_ITT_BUILD
1104   __kmp_itt_critical_acquiring(lck);
1105 #endif /* USE_ITT_BUILD */
1106   // Value of 'crit' should be good for using as a critical_id of the critical
1107   // section directive.
1108   __kmp_acquire_user_lock_with_checks(lck, global_tid);
1109 
1110 #if USE_ITT_BUILD
1111   __kmp_itt_critical_acquired(lck);
1112 #endif /* USE_ITT_BUILD */
1113 
1114   KMP_START_EXPLICIT_TIMER(OMP_critical);
1115   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1116 #endif // KMP_USE_DYNAMIC_LOCK
1117 }
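
// A hedged sketch of the expected lowering of a named critical construct. The
// kmp_critical_name object is a zero-initialized variable with static storage
// that the compiler emits per critical name; `crit_name_lock` is hypothetical.
//
//   static kmp_critical_name crit_name_lock = {0};
//
//   // #pragma omp critical (name)
//   __kmpc_critical(&loc, gtid, &crit_name_lock);
//   /* protected code */
//   __kmpc_end_critical(&loc, gtid, &crit_name_lock);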
1118 
1119 #if KMP_USE_DYNAMIC_LOCK
1120 
1121 // Converts the given hint to an internal lock implementation
1122 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1123 #if KMP_USE_TSX
1124 #define KMP_TSX_LOCK(seq) lockseq_##seq
1125 #else
1126 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1127 #endif
1128 
1129 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1130 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1131 #else
1132 #define KMP_CPUINFO_RTM 0
1133 #endif
1134 
1135   // Hints that do not require further logic
1136   if (hint & kmp_lock_hint_hle)
1137     return KMP_TSX_LOCK(hle);
1138   if (hint & kmp_lock_hint_rtm)
1139     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1140   if (hint & kmp_lock_hint_adaptive)
1141     return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1142 
1143   // Rule out conflicting hints first by returning the default lock
1144   if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1145     return __kmp_user_lock_seq;
1146   if ((hint & omp_lock_hint_speculative) &&
1147       (hint & omp_lock_hint_nonspeculative))
1148     return __kmp_user_lock_seq;
1149 
1150   // Do not even consider speculation when it appears to be contended
1151   if (hint & omp_lock_hint_contended)
1152     return lockseq_queuing;
1153 
1154   // Uncontended lock without speculation
1155   if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1156     return lockseq_tas;
1157 
1158   // HLE lock for speculation
1159   if (hint & omp_lock_hint_speculative)
1160     return KMP_TSX_LOCK(hle);
1161 
1162   return __kmp_user_lock_seq;
1163 }
1164 
1165 /*!
1166 @ingroup WORK_SHARING
1167 @param loc  source location information.
1168 @param global_tid  global thread number.
1169 @param crit identity of the critical section. This could be a pointer to a lock
1170 associated with the critical section, or some other suitably unique value.
1171 @param hint the lock hint.
1172 
1173 Enter code protected by a `critical` construct with a hint. The hint value is
1174 used to suggest a lock implementation. This function blocks until the executing
1175 thread can enter the critical section unless the hint suggests use of
1176 speculative execution and the hardware supports it.
1177 */
1178 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1179                                kmp_critical_name *crit, uintptr_t hint) {
1180   KMP_COUNT_BLOCK(OMP_CRITICAL);
1181   kmp_user_lock_p lck;
1182 
1183   KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1184 
1185   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1186   // Check if it is initialized.
1187   if (*lk == 0) {
1188     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1189     if (KMP_IS_D_LOCK(lckseq)) {
1190       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1191                                   KMP_GET_D_TAG(lckseq));
1192     } else {
1193       __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1194     }
1195   }
1196   // Branch for accessing the actual lock object and set operation. This
1197   // branching is inevitable since this lock initialization does not follow the
1198   // normal dispatch path (lock table is not used).
1199   if (KMP_EXTRACT_D_TAG(lk) != 0) {
1200     lck = (kmp_user_lock_p)lk;
1201     if (__kmp_env_consistency_check) {
1202       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1203                       __kmp_map_hint_to_lock(hint));
1204     }
1205 #if USE_ITT_BUILD
1206     __kmp_itt_critical_acquiring(lck);
1207 #endif
1208 #if KMP_USE_INLINED_TAS
1209     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1210       KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1211     } else
1212 #elif KMP_USE_INLINED_FUTEX
1213     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1214       KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1215     } else
1216 #endif
1217     {
1218       KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1219     }
1220   } else {
1221     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1222     lck = ilk->lock;
1223     if (__kmp_env_consistency_check) {
1224       __kmp_push_sync(global_tid, ct_critical, loc, lck,
1225                       __kmp_map_hint_to_lock(hint));
1226     }
1227 #if USE_ITT_BUILD
1228     __kmp_itt_critical_acquiring(lck);
1229 #endif
1230     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1231   }
1232 
1233 #if USE_ITT_BUILD
1234   __kmp_itt_critical_acquired(lck);
1235 #endif /* USE_ITT_BUILD */
1236 
1237   KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1238   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1239 } // __kmpc_critical_with_hint
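
// Usage sketch (hedged): with an OpenMP 4.5 hint clause the compiler is
// expected to call this variant instead of __kmpc_critical; the hint constants
// come from omp.h and `crit_name_lock` is hypothetical.
//
//   // #pragma omp critical (name) hint(omp_lock_hint_speculative)
//   __kmpc_critical_with_hint(&loc, gtid, &crit_name_lock,
//                             omp_lock_hint_speculative);
//   /* protected code */
//   __kmpc_end_critical(&loc, gtid, &crit_name_lock);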
1240 
1241 #endif // KMP_USE_DYNAMIC_LOCK
1242 
1243 /*!
1244 @ingroup WORK_SHARING
1245 @param loc  source location information.
@param global_tid  global thread number.
1247 @param crit identity of the critical section. This could be a pointer to a lock
1248 associated with the critical section, or some other suitably unique value.
1249 
1250 Leave a critical section, releasing any lock that was held during its execution.
1251 */
1252 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1253                          kmp_critical_name *crit) {
1254   kmp_user_lock_p lck;
1255 
1256   KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1257 
1258 #if KMP_USE_DYNAMIC_LOCK
1259   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1260     lck = (kmp_user_lock_p)crit;
1261     KMP_ASSERT(lck != NULL);
1262     if (__kmp_env_consistency_check) {
1263       __kmp_pop_sync(global_tid, ct_critical, loc);
1264     }
1265 #if USE_ITT_BUILD
1266     __kmp_itt_critical_releasing(lck);
1267 #endif
1268 #if KMP_USE_INLINED_TAS
1269     if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1270       KMP_RELEASE_TAS_LOCK(lck, global_tid);
1271     } else
1272 #elif KMP_USE_INLINED_FUTEX
1273     if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1274       KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1275     } else
1276 #endif
1277     {
1278       KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1279     }
1280   } else {
1281     kmp_indirect_lock_t *ilk =
1282         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1283     KMP_ASSERT(ilk != NULL);
1284     lck = ilk->lock;
1285     if (__kmp_env_consistency_check) {
1286       __kmp_pop_sync(global_tid, ct_critical, loc);
1287     }
1288 #if USE_ITT_BUILD
1289     __kmp_itt_critical_releasing(lck);
1290 #endif
1291     KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1292   }
1293 
1294 #else // KMP_USE_DYNAMIC_LOCK
1295 
1296   if ((__kmp_user_lock_kind == lk_tas) &&
1297       (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1298     lck = (kmp_user_lock_p)crit;
1299   }
1300 #if KMP_USE_FUTEX
1301   else if ((__kmp_user_lock_kind == lk_futex) &&
1302            (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1303     lck = (kmp_user_lock_p)crit;
1304   }
1305 #endif
1306   else { // ticket, queuing or drdpa
1307     lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1308   }
1309 
1310   KMP_ASSERT(lck != NULL);
1311 
1312   if (__kmp_env_consistency_check)
1313     __kmp_pop_sync(global_tid, ct_critical, loc);
1314 
1315 #if USE_ITT_BUILD
1316   __kmp_itt_critical_releasing(lck);
1317 #endif /* USE_ITT_BUILD */
1318   // Value of 'crit' should be good for using as a critical_id of the critical
1319   // section directive.
1320   __kmp_release_user_lock_with_checks(lck, global_tid);
1321 
1322 #if OMPT_SUPPORT && OMPT_BLAME
1323   if (ompt_enabled &&
1324       ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
1325     ompt_callbacks.ompt_callback(ompt_event_release_critical)((uint64_t)lck);
1326   }
1327 #endif
1328 
1329 #endif // KMP_USE_DYNAMIC_LOCK
1330   KMP_POP_PARTITIONED_TIMER();
1331   KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1332 }
1333 
1334 /*!
1335 @ingroup SYNCHRONIZATION
1336 @param loc source location information
1337 @param global_tid thread id.
1338 @return one if the thread should execute the master block, zero otherwise
1339 
1340 Start execution of a combined barrier and master. The barrier is executed inside
1341 this function.
1342 */
1343 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1344   int status;
1345 
1346   KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1347 
1348   if (!TCR_4(__kmp_init_parallel))
1349     __kmp_parallel_initialize();
1350 
1351   if (__kmp_env_consistency_check)
1352     __kmp_check_barrier(global_tid, ct_barrier, loc);
1353 
1354 #if USE_ITT_NOTIFY
1355   __kmp_threads[global_tid]->th.th_ident = loc;
1356 #endif
1357   status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1358 
1359   return (status != 0) ? 0 : 1;
1360 }
1361 
1362 /*!
1363 @ingroup SYNCHRONIZATION
1364 @param loc source location information
1365 @param global_tid thread id.
1366 
1367 Complete the execution of a combined barrier and master. This function should
1368 only be called at the completion of the <tt>master</tt> code. Other threads will
1369 still be waiting at the barrier and this call releases them.
1370 */
1371 void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1372   KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
1373 
1374   __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1375 }
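
// A hedged sketch of the expected pairing: every thread calls
// __kmpc_barrier_master(), but only the thread that gets 1 back runs the block
// and then releases the others with __kmpc_end_barrier_master().
//
//   if (__kmpc_barrier_master(&loc, gtid)) {
//     /* master-only code */
//     __kmpc_end_barrier_master(&loc, gtid);
//   }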
1376 
1377 /*!
1378 @ingroup SYNCHRONIZATION
1379 @param loc source location information
1380 @param global_tid thread id.
1381 @return one if the thread should execute the master block, zero otherwise
1382 
1383 Start execution of a combined barrier and master(nowait) construct.
1384 The barrier is executed inside this function.
There is no equivalent "end" function, since the actions that
__kmpc_end_master would otherwise perform are carried out inside this call.
1386 */
1387 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1388   kmp_int32 ret;
1389 
1390   KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1391 
1392   if (!TCR_4(__kmp_init_parallel))
1393     __kmp_parallel_initialize();
1394 
1395   if (__kmp_env_consistency_check) {
1396     if (loc == 0) {
1397       KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1398     }
1399     __kmp_check_barrier(global_tid, ct_barrier, loc);
1400   }
1401 
1402 #if USE_ITT_NOTIFY
1403   __kmp_threads[global_tid]->th.th_ident = loc;
1404 #endif
1405   __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1406 
1407   ret = __kmpc_master(loc, global_tid);
1408 
1409   if (__kmp_env_consistency_check) {
1410     /*  there's no __kmpc_end_master called; so the (stats) */
1411     /*  actions of __kmpc_end_master are done here          */
1412 
1413     if (global_tid < 0) {
1414       KMP_WARNING(ThreadIdentInvalid);
1415     }
1416     if (ret) {
1417       /* only one thread should do the pop since only */
1418       /* one did the push (see __kmpc_master())       */
1419 
1420       __kmp_pop_sync(global_tid, ct_master, loc);
1421     }
1422   }
1423 
1424   return (ret);
1425 }
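
// Call-pattern sketch derived from the description above: the barrier runs
// inside the call, only the thread that gets 1 back executes the block, and no
// matching "end" call is made.
//
//   if (__kmpc_barrier_master_nowait(&loc, gtid)) {
//     /* code executed by one thread only */
//   }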
1426 
1427 /* The BARRIER for a SINGLE process section is always explicit   */
1428 /*!
1429 @ingroup WORK_SHARING
1430 @param loc  source location information
1431 @param global_tid  global thread number
1432 @return One if this thread should execute the single construct, zero otherwise.
1433 
1434 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler
should introduce an explicit barrier if one is required.
1437 */
1438 
1439 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1440   kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1441 
1442   if (rc) {
1443     // We are going to execute the single statement, so we should count it.
1444     KMP_COUNT_BLOCK(OMP_SINGLE);
1445     KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1446   }
1447 
1448 #if OMPT_SUPPORT && OMPT_TRACE
1449   kmp_info_t *this_thr = __kmp_threads[global_tid];
1450   kmp_team_t *team = this_thr->th.th_team;
1451   int tid = __kmp_tid_from_gtid(global_tid);
1452 
1453   if (ompt_enabled) {
1454     if (rc) {
1455       if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
1456         ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
1457             team->t.ompt_team_info.parallel_id,
1458             team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
1459             team->t.ompt_team_info.microtask);
1460       }
1461     } else {
1462       if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
1463         ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
1464             team->t.ompt_team_info.parallel_id,
1465             team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1466       }
1467       this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
1468     }
1469   }
1470 #endif
1471 
1472   return rc;
1473 }
1474 
1475 /*!
1476 @ingroup WORK_SHARING
1477 @param loc  source location information
1478 @param global_tid  global thread number
1479 
1480 Mark the end of a <tt>single</tt> construct.  This function should
1481 only be called by the thread that executed the block of code protected
1482 by the `single` construct.
1483 */
1484 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1485   __kmp_exit_single(global_tid);
1486   KMP_POP_PARTITIONED_TIMER();
1487 
1488 #if OMPT_SUPPORT && OMPT_TRACE
1489   kmp_info_t *this_thr = __kmp_threads[global_tid];
1490   kmp_team_t *team = this_thr->th.th_team;
1491   int tid = __kmp_tid_from_gtid(global_tid);
1492 
1493   if (ompt_enabled &&
1494       ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
1495     ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
1496         team->t.ompt_team_info.parallel_id,
1497         team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1498   }
1499 #endif
1500 }
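
/* Hedged sketch of how a compiler typically lowers "#pragma omp single" onto
   the two entry points above; loc_ref and do_single_work() are illustrative.
   As noted before __kmpc_single(), the trailing barrier is emitted explicitly
   by the compiler and is omitted when the construct carries "nowait".

     if (__kmpc_single(loc_ref, gtid)) {
       do_single_work();              // executed by exactly one thread
       __kmpc_end_single(loc_ref, gtid);
     }
     __kmpc_barrier(loc_ref, gtid);   // explicit barrier, dropped for nowait
*/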
1501 
1502 /*!
1503 @ingroup WORK_SHARING
1504 @param loc Source location
1505 @param global_tid Global thread id
1506 
1507 Mark the end of a statically scheduled loop.
1508 */
1509 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1510   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1511 
1512 #if OMPT_SUPPORT && OMPT_TRACE
1513   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
1514     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1515     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1516     ompt_callbacks.ompt_callback(ompt_event_loop_end)(team_info->parallel_id,
1517                                                       task_info->task_id);
1518   }
1519 #endif
1520 
1521   if (__kmp_env_consistency_check)
1522     __kmp_pop_workshare(global_tid, ct_pdo, loc);
1523 }
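
/* Hedged illustration: a statically scheduled user loop such as

     #pragma omp parallel for schedule(static)
     for (int i = 0; i < n; ++i)
       a[i] = b[i] + c[i];

   is bracketed by the compiler with a __kmpc_for_static_init_* call (defined
   elsewhere in the runtime) before the chunked loop and with the
   __kmpc_for_static_fini() call above after it. */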
1524 
// User routines that take C-style (by-value) arguments, unlike the
// corresponding Fortran routines, which pass their arguments by reference.
1527 
1528 void ompc_set_num_threads(int arg) {
1529   // !!!!! TODO: check the per-task binding
1530   __kmp_set_num_threads(arg, __kmp_entry_gtid());
1531 }
1532 
1533 void ompc_set_dynamic(int flag) {
1534   kmp_info_t *thread;
1535 
1536   /* For the thread-private implementation of the internal controls */
1537   thread = __kmp_entry_thread();
1538 
1539   __kmp_save_internal_controls(thread);
1540 
1541   set__dynamic(thread, flag ? TRUE : FALSE);
1542 }
1543 
1544 void ompc_set_nested(int flag) {
1545   kmp_info_t *thread;
1546 
1547   /* For the thread-private internal controls implementation */
1548   thread = __kmp_entry_thread();
1549 
1550   __kmp_save_internal_controls(thread);
1551 
1552   set__nested(thread, flag ? TRUE : FALSE);
1553 }
1554 
1555 void ompc_set_max_active_levels(int max_active_levels) {
1556   /* TO DO */
1557   /* we want per-task implementation of this internal control */
1558 
1559   /* For the per-thread internal controls implementation */
1560   __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1561 }
1562 
1563 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1564   // !!!!! TODO: check the per-task binding
1565   __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1566 }
1567 
1568 int ompc_get_ancestor_thread_num(int level) {
1569   return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1570 }
1571 
1572 int ompc_get_team_size(int level) {
1573   return __kmp_get_team_size(__kmp_entry_gtid(), level);
1574 }
1575 
1576 void kmpc_set_stacksize(int arg) {
1577   // __kmp_aux_set_stacksize initializes the library if needed
1578   __kmp_aux_set_stacksize(arg);
1579 }
1580 
1581 void kmpc_set_stacksize_s(size_t arg) {
1582   // __kmp_aux_set_stacksize initializes the library if needed
1583   __kmp_aux_set_stacksize(arg);
1584 }
1585 
1586 void kmpc_set_blocktime(int arg) {
1587   int gtid, tid;
1588   kmp_info_t *thread;
1589 
1590   gtid = __kmp_entry_gtid();
1591   tid = __kmp_tid_from_gtid(gtid);
1592   thread = __kmp_thread_from_gtid(gtid);
1593 
1594   __kmp_aux_set_blocktime(arg, thread, tid);
1595 }
1596 
1597 void kmpc_set_library(int arg) {
1598   // __kmp_user_set_library initializes the library if needed
1599   __kmp_user_set_library((enum library_type)arg);
1600 }
1601 
1602 void kmpc_set_defaults(char const *str) {
1603   // __kmp_aux_set_defaults initializes the library if needed
1604   __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1605 }
1606 
1607 void kmpc_set_disp_num_buffers(int arg) {
1608   // ignore after initialization because some teams have already
1609   // allocated dispatch buffers
1610   if (__kmp_init_serial == 0 && arg > 0)
1611     __kmp_dispatch_num_buffers = arg;
1612 }
1613 
1614 int kmpc_set_affinity_mask_proc(int proc, void **mask) {
1615 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1616   return -1;
1617 #else
1618   if (!TCR_4(__kmp_init_middle)) {
1619     __kmp_middle_initialize();
1620   }
1621   return __kmp_aux_set_affinity_mask_proc(proc, mask);
1622 #endif
1623 }
1624 
1625 int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
1626 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1627   return -1;
1628 #else
1629   if (!TCR_4(__kmp_init_middle)) {
1630     __kmp_middle_initialize();
1631   }
1632   return __kmp_aux_unset_affinity_mask_proc(proc, mask);
1633 #endif
1634 }
1635 
1636 int kmpc_get_affinity_mask_proc(int proc, void **mask) {
1637 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1638   return -1;
1639 #else
1640   if (!TCR_4(__kmp_init_middle)) {
1641     __kmp_middle_initialize();
1642   }
1643   return __kmp_aux_get_affinity_mask_proc(proc, mask);
1644 #endif
1645 }
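
/* Hedged usage sketch of the user-visible affinity-mask extension that the
   three entry points above back. The kmp_* names below come from the omp.h
   extension API; treat the exact spellings and return-value conventions as
   assumptions when targeting another runtime.

     kmp_affinity_mask_t mask;
     kmp_create_affinity_mask(&mask);
     kmp_set_affinity_mask_proc(0, &mask); // add logical proc 0 to the mask
     if (kmp_set_affinity(&mask) != 0) {
       // the mask could not be applied (e.g. affinity is not supported)
     }
     kmp_destroy_affinity_mask(&mask);
*/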
1646 
1647 /* -------------------------------------------------------------------------- */
1648 /*!
1649 @ingroup THREADPRIVATE
1650 @param loc       source location information
1651 @param gtid      global thread number
1652 @param cpy_size  size of the cpy_data buffer
1653 @param cpy_data  pointer to data to be copied
1654 @param cpy_func  helper function to call for copying data
1655 @param didit     flag variable: 1=single thread; 0=not single thread
1656 
1657 __kmpc_copyprivate implements the interface for the private data broadcast
1658 needed for the copyprivate clause associated with a single region in an
1659 OpenMP<sup>*</sup> program (both C and Fortran).
1660 All threads participating in the parallel region call this routine.
1661 One of the threads (called the single thread) should have the <tt>didit</tt>
1662 variable set to 1 and all other threads should have that variable set to 0.
1663 All threads pass a pointer to a data buffer (cpy_data) that they have built.
1664 
1665 The OpenMP specification forbids the use of nowait on the single region when a
1666 copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
1667 barrier internally to avoid race conditions, so the code generation for the
1668 single region should avoid generating a barrier after the call to @ref
1669 __kmpc_copyprivate.
1670 
1671 The <tt>gtid</tt> parameter is the global thread id for the current thread.
1672 The <tt>loc</tt> parameter is a pointer to source location information.
1673 
Internal implementation: the single thread first copies its descriptor
address (cpy_data) to a team-private location; each of the other threads then
calls the function pointed to by cpy_func, which copies the data described by
the source and destination cpy_data buffers.
1678 
1679 The cpy_func routine used for the copy and the contents of the data area defined
1680 by cpy_data and cpy_size may be built in any fashion that will allow the copy
1681 to be done. For instance, the cpy_data buffer can hold the actual data to be
1682 copied or it may hold a list of pointers to the data. The cpy_func routine must
1683 interpret the cpy_data buffer appropriately.
1684 
1685 The interface to cpy_func is as follows:
1686 @code
1687 void cpy_func( void *destination, void *source )
1688 @endcode
1689 where void *destination is the cpy_data pointer for the thread being copied to
1690 and void *source is the cpy_data pointer for the thread being copied from.
1691 */
1692 void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
1693                         void *cpy_data, void (*cpy_func)(void *, void *),
1694                         kmp_int32 didit) {
1695   void **data_ptr;
1696 
1697   KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
1698 
1699   KMP_MB();
1700 
1701   data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
1702 
1703   if (__kmp_env_consistency_check) {
1704     if (loc == 0) {
1705       KMP_WARNING(ConstructIdentInvalid);
1706     }
1707   }
1708 
1709   // ToDo: Optimize the following two barriers into some kind of split barrier
1710 
1711   if (didit)
1712     *data_ptr = cpy_data;
1713 
1714 /* This barrier is not a barrier region boundary */
1715 #if USE_ITT_NOTIFY
1716   __kmp_threads[gtid]->th.th_ident = loc;
1717 #endif
1718   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1719 
1720   if (!didit)
1721     (*cpy_func)(cpy_data, *data_ptr);
1722 
1723 // Consider next barrier a user-visible barrier for barrier region boundaries
1724 // Nesting checks are already handled by the single construct checks
1725 
1726 #if USE_ITT_NOTIFY
1727   __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
1728 // tasks can overwrite the location)
1729 #endif
1730   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1731 }
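
/* Hedged user-level sketch of the construct __kmpc_copyprivate() implements;
   compute_seed() is an illustrative placeholder.

     int x;
     #pragma omp parallel private(x)
     {
       #pragma omp single copyprivate(x)
       x = compute_seed();   // one thread produces the value
       // after the single region, every thread's private x holds that value
     }
*/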
1732 
1733 /* -------------------------------------------------------------------------- */
1734 
1735 #define INIT_LOCK __kmp_init_user_lock_with_checks
1736 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
1737 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
1738 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
1739 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
1740 #define ACQUIRE_NESTED_LOCK_TIMED                                              \
1741   __kmp_acquire_nested_user_lock_with_checks_timed
1742 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
1743 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
1744 #define TEST_LOCK __kmp_test_user_lock_with_checks
1745 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
1746 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
1747 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
1748 
1749 // TODO: Make check abort messages use location info & pass it into
1750 // with_checks routines
1751 
1752 #if KMP_USE_DYNAMIC_LOCK
1753 
1754 // internal lock initializer
1755 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
1756                                                     kmp_dyna_lockseq_t seq) {
1757   if (KMP_IS_D_LOCK(seq)) {
1758     KMP_INIT_D_LOCK(lock, seq);
1759 #if USE_ITT_BUILD
1760     __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
1761 #endif
1762   } else {
1763     KMP_INIT_I_LOCK(lock, seq);
1764 #if USE_ITT_BUILD
1765     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1766     __kmp_itt_lock_creating(ilk->lock, loc);
1767 #endif
1768   }
1769 }
1770 
1771 // internal nest lock initializer
1772 static __forceinline void
1773 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
1774                                kmp_dyna_lockseq_t seq) {
1775 #if KMP_USE_TSX
1776   // Don't have nested lock implementation for speculative locks
1777   if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
1778     seq = __kmp_user_lock_seq;
1779 #endif
1780   switch (seq) {
1781   case lockseq_tas:
1782     seq = lockseq_nested_tas;
1783     break;
1784 #if KMP_USE_FUTEX
1785   case lockseq_futex:
1786     seq = lockseq_nested_futex;
1787     break;
1788 #endif
1789   case lockseq_ticket:
1790     seq = lockseq_nested_ticket;
1791     break;
1792   case lockseq_queuing:
1793     seq = lockseq_nested_queuing;
1794     break;
1795   case lockseq_drdpa:
1796     seq = lockseq_nested_drdpa;
1797     break;
1798   default:
1799     seq = lockseq_nested_queuing;
1800   }
1801   KMP_INIT_I_LOCK(lock, seq);
1802 #if USE_ITT_BUILD
1803   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1804   __kmp_itt_lock_creating(ilk->lock, loc);
1805 #endif
1806 }
1807 
1808 /* initialize the lock with a hint */
1809 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
1810                                 uintptr_t hint) {
1811   KMP_DEBUG_ASSERT(__kmp_init_serial);
1812   if (__kmp_env_consistency_check && user_lock == NULL) {
1813     KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
1814   }
1815 
1816   __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1817 }
1818 
1819 /* initialize the lock with a hint */
1820 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
1821                                      void **user_lock, uintptr_t hint) {
1822   KMP_DEBUG_ASSERT(__kmp_init_serial);
1823   if (__kmp_env_consistency_check && user_lock == NULL) {
1824     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
1825   }
1826 
1827   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1828 }
1829 
1830 #endif // KMP_USE_DYNAMIC_LOCK
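
/* Hedged sketch of the user-level OpenMP 4.5 API that the two *_with_hint
   entry points above serve; the hint constant shown is one of the standard
   omp_lock_hint_* values, and how a given compiler routes the call onto these
   entry points is an assumption here.

     omp_lock_t l;
     omp_init_lock_with_hint(&l, omp_lock_hint_speculative);
     omp_set_lock(&l);
     // ... work under the lock ...
     omp_unset_lock(&l);
     omp_destroy_lock(&l);
*/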
1831 
1832 /* initialize the lock */
1833 void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
1834 #if KMP_USE_DYNAMIC_LOCK
1835 
1836   KMP_DEBUG_ASSERT(__kmp_init_serial);
1837   if (__kmp_env_consistency_check && user_lock == NULL) {
1838     KMP_FATAL(LockIsUninitialized, "omp_init_lock");
1839   }
1840   __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1841 
1842 #else // KMP_USE_DYNAMIC_LOCK
1843 
1844   static char const *const func = "omp_init_lock";
1845   kmp_user_lock_p lck;
1846   KMP_DEBUG_ASSERT(__kmp_init_serial);
1847 
1848   if (__kmp_env_consistency_check) {
1849     if (user_lock == NULL) {
1850       KMP_FATAL(LockIsUninitialized, func);
1851     }
1852   }
1853 
1854   KMP_CHECK_USER_LOCK_INIT();
1855 
1856   if ((__kmp_user_lock_kind == lk_tas) &&
1857       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
1858     lck = (kmp_user_lock_p)user_lock;
1859   }
1860 #if KMP_USE_FUTEX
1861   else if ((__kmp_user_lock_kind == lk_futex) &&
1862            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
1863     lck = (kmp_user_lock_p)user_lock;
1864   }
1865 #endif
1866   else {
1867     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
1868   }
1869   INIT_LOCK(lck);
1870   __kmp_set_user_lock_location(lck, loc);
1871 
1872 #if OMPT_SUPPORT && OMPT_TRACE
1873   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_init_lock)) {
1874     ompt_callbacks.ompt_callback(ompt_event_init_lock)((uint64_t)lck);
1875   }
1876 #endif
1877 
1878 #if USE_ITT_BUILD
1879   __kmp_itt_lock_creating(lck);
1880 #endif /* USE_ITT_BUILD */
1881 
1882 #endif // KMP_USE_DYNAMIC_LOCK
1883 } // __kmpc_init_lock
1884 
1885 /* initialize the lock */
1886 void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
1887 #if KMP_USE_DYNAMIC_LOCK
1888 
1889   KMP_DEBUG_ASSERT(__kmp_init_serial);
1890   if (__kmp_env_consistency_check && user_lock == NULL) {
1891     KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
1892   }
1893   __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1894 
1895 #else // KMP_USE_DYNAMIC_LOCK
1896 
1897   static char const *const func = "omp_init_nest_lock";
1898   kmp_user_lock_p lck;
1899   KMP_DEBUG_ASSERT(__kmp_init_serial);
1900 
1901   if (__kmp_env_consistency_check) {
1902     if (user_lock == NULL) {
1903       KMP_FATAL(LockIsUninitialized, func);
1904     }
1905   }
1906 
1907   KMP_CHECK_USER_LOCK_INIT();
1908 
1909   if ((__kmp_user_lock_kind == lk_tas) &&
1910       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
1911        OMP_NEST_LOCK_T_SIZE)) {
1912     lck = (kmp_user_lock_p)user_lock;
1913   }
1914 #if KMP_USE_FUTEX
1915   else if ((__kmp_user_lock_kind == lk_futex) &&
1916            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
1917             OMP_NEST_LOCK_T_SIZE)) {
1918     lck = (kmp_user_lock_p)user_lock;
1919   }
1920 #endif
1921   else {
1922     lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
1923   }
1924 
1925   INIT_NESTED_LOCK(lck);
1926   __kmp_set_user_lock_location(lck, loc);
1927 
1928 #if OMPT_SUPPORT && OMPT_TRACE
1929   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)) {
1930     ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)((uint64_t)lck);
1931   }
1932 #endif
1933 
1934 #if USE_ITT_BUILD
1935   __kmp_itt_lock_creating(lck);
1936 #endif /* USE_ITT_BUILD */
1937 
1938 #endif // KMP_USE_DYNAMIC_LOCK
1939 } // __kmpc_init_nest_lock
1940 
1941 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
1942 #if KMP_USE_DYNAMIC_LOCK
1943 
1944 #if USE_ITT_BUILD
1945   kmp_user_lock_p lck;
1946   if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
1947     lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
1948   } else {
1949     lck = (kmp_user_lock_p)user_lock;
1950   }
1951   __kmp_itt_lock_destroyed(lck);
1952 #endif
1953   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
1954 #else
1955   kmp_user_lock_p lck;
1956 
1957   if ((__kmp_user_lock_kind == lk_tas) &&
1958       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
1959     lck = (kmp_user_lock_p)user_lock;
1960   }
1961 #if KMP_USE_FUTEX
1962   else if ((__kmp_user_lock_kind == lk_futex) &&
1963            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
1964     lck = (kmp_user_lock_p)user_lock;
1965   }
1966 #endif
1967   else {
1968     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
1969   }
1970 
1971 #if OMPT_SUPPORT && OMPT_TRACE
1972   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_destroy_lock)) {
1973     ompt_callbacks.ompt_callback(ompt_event_destroy_lock)((uint64_t)lck);
1974   }
1975 #endif
1976 
1977 #if USE_ITT_BUILD
1978   __kmp_itt_lock_destroyed(lck);
1979 #endif /* USE_ITT_BUILD */
1980   DESTROY_LOCK(lck);
1981 
1982   if ((__kmp_user_lock_kind == lk_tas) &&
1983       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
1984     ;
1985   }
1986 #if KMP_USE_FUTEX
1987   else if ((__kmp_user_lock_kind == lk_futex) &&
1988            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
1989     ;
1990   }
1991 #endif
1992   else {
1993     __kmp_user_lock_free(user_lock, gtid, lck);
1994   }
1995 #endif // KMP_USE_DYNAMIC_LOCK
1996 } // __kmpc_destroy_lock
1997 
1998 /* destroy the lock */
1999 void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2000 #if KMP_USE_DYNAMIC_LOCK
2001 
2002 #if USE_ITT_BUILD
2003   kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2004   __kmp_itt_lock_destroyed(ilk->lock);
2005 #endif
2006   KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2007 
2008 #else // KMP_USE_DYNAMIC_LOCK
2009 
2010   kmp_user_lock_p lck;
2011 
2012   if ((__kmp_user_lock_kind == lk_tas) &&
2013       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2014        OMP_NEST_LOCK_T_SIZE)) {
2015     lck = (kmp_user_lock_p)user_lock;
2016   }
2017 #if KMP_USE_FUTEX
2018   else if ((__kmp_user_lock_kind == lk_futex) &&
2019            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2020             OMP_NEST_LOCK_T_SIZE)) {
2021     lck = (kmp_user_lock_p)user_lock;
2022   }
2023 #endif
2024   else {
2025     lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
2026   }
2027 
2028 #if OMPT_SUPPORT && OMPT_TRACE
2029   if (ompt_enabled &&
2030       ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)) {
2031     ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)((uint64_t)lck);
2032   }
2033 #endif
2034 
2035 #if USE_ITT_BUILD
2036   __kmp_itt_lock_destroyed(lck);
2037 #endif /* USE_ITT_BUILD */
2038 
2039   DESTROY_NESTED_LOCK(lck);
2040 
2041   if ((__kmp_user_lock_kind == lk_tas) &&
2042       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2043        OMP_NEST_LOCK_T_SIZE)) {
2044     ;
2045   }
2046 #if KMP_USE_FUTEX
2047   else if ((__kmp_user_lock_kind == lk_futex) &&
2048            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2049             OMP_NEST_LOCK_T_SIZE)) {
2050     ;
2051   }
2052 #endif
2053   else {
2054     __kmp_user_lock_free(user_lock, gtid, lck);
2055   }
2056 #endif // KMP_USE_DYNAMIC_LOCK
2057 } // __kmpc_destroy_nest_lock
2058 
2059 void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2060   KMP_COUNT_BLOCK(OMP_set_lock);
2061 #if KMP_USE_DYNAMIC_LOCK
2062   int tag = KMP_EXTRACT_D_TAG(user_lock);
2063 #if USE_ITT_BUILD
2064   __kmp_itt_lock_acquiring(
2065       (kmp_user_lock_p)
2066           user_lock); // itt function will get to the right lock object.
2067 #endif
2068 #if KMP_USE_INLINED_TAS
2069   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2070     KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2071   } else
2072 #elif KMP_USE_INLINED_FUTEX
2073   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2074     KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2075   } else
2076 #endif
2077   {
2078     __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2079   }
2080 #if USE_ITT_BUILD
2081   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2082 #endif
2083 
2084 #else // KMP_USE_DYNAMIC_LOCK
2085 
2086   kmp_user_lock_p lck;
2087 
2088   if ((__kmp_user_lock_kind == lk_tas) &&
2089       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2090     lck = (kmp_user_lock_p)user_lock;
2091   }
2092 #if KMP_USE_FUTEX
2093   else if ((__kmp_user_lock_kind == lk_futex) &&
2094            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2095     lck = (kmp_user_lock_p)user_lock;
2096   }
2097 #endif
2098   else {
2099     lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
2100   }
2101 
2102 #if USE_ITT_BUILD
2103   __kmp_itt_lock_acquiring(lck);
2104 #endif /* USE_ITT_BUILD */
2105 
2106   ACQUIRE_LOCK(lck, gtid);
2107 
2108 #if USE_ITT_BUILD
2109   __kmp_itt_lock_acquired(lck);
2110 #endif /* USE_ITT_BUILD */
2111 
2112 #if OMPT_SUPPORT && OMPT_TRACE
2113   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_acquired_lock)) {
2114     ompt_callbacks.ompt_callback(ompt_event_acquired_lock)((uint64_t)lck);
2115   }
2116 #endif
2117 
2118 #endif // KMP_USE_DYNAMIC_LOCK
2119 }
2120 
2121 void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2122 #if KMP_USE_DYNAMIC_LOCK
2123 
2124 #if USE_ITT_BUILD
2125   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2126 #endif
2127   KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2128 #if USE_ITT_BUILD
2129   __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2130 #endif
2131 
2132 #if OMPT_SUPPORT && OMPT_TRACE
2133   if (ompt_enabled) {
2134     // missing support here: need to know whether acquired first or not
2135   }
2136 #endif
2137 
2138 #else // KMP_USE_DYNAMIC_LOCK
2139   int acquire_status;
2140   kmp_user_lock_p lck;
2141 
2142   if ((__kmp_user_lock_kind == lk_tas) &&
2143       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2144        OMP_NEST_LOCK_T_SIZE)) {
2145     lck = (kmp_user_lock_p)user_lock;
2146   }
2147 #if KMP_USE_FUTEX
2148   else if ((__kmp_user_lock_kind == lk_futex) &&
2149            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2150             OMP_NEST_LOCK_T_SIZE)) {
2151     lck = (kmp_user_lock_p)user_lock;
2152   }
2153 #endif
2154   else {
2155     lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
2156   }
2157 
2158 #if USE_ITT_BUILD
2159   __kmp_itt_lock_acquiring(lck);
2160 #endif /* USE_ITT_BUILD */
2161 
2162   ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
2163 
2164 #if USE_ITT_BUILD
2165   __kmp_itt_lock_acquired(lck);
2166 #endif /* USE_ITT_BUILD */
2167 
2168 #if OMPT_SUPPORT && OMPT_TRACE
2169   if (ompt_enabled) {
2170     if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2171       if (ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first))
2172         ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)(
2173             (uint64_t)lck);
2174     } else {
2175       if (ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next))
2176         ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)(
2177             (uint64_t)lck);
2178     }
2179   }
2180 #endif
2181 
2182 #endif // KMP_USE_DYNAMIC_LOCK
2183 }
2184 
2185 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2186 #if KMP_USE_DYNAMIC_LOCK
2187 
2188   int tag = KMP_EXTRACT_D_TAG(user_lock);
2189 #if USE_ITT_BUILD
2190   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2191 #endif
2192 #if KMP_USE_INLINED_TAS
2193   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2194     KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2195   } else
2196 #elif KMP_USE_INLINED_FUTEX
2197   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2198     KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2199   } else
2200 #endif
2201   {
2202     __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2203   }
2204 
2205 #else // KMP_USE_DYNAMIC_LOCK
2206 
2207   kmp_user_lock_p lck;
2208 
2209   /* Can't use serial interval since not block structured */
2210   /* release the lock */
2211 
2212   if ((__kmp_user_lock_kind == lk_tas) &&
2213       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2214 #if KMP_OS_LINUX &&                                                            \
2215     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2216 // "fast" path implemented to fix customer performance issue
2217 #if USE_ITT_BUILD
2218     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2219 #endif /* USE_ITT_BUILD */
2220     TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2221     KMP_MB();
2222     return;
2223 #else
2224     lck = (kmp_user_lock_p)user_lock;
2225 #endif
2226   }
2227 #if KMP_USE_FUTEX
2228   else if ((__kmp_user_lock_kind == lk_futex) &&
2229            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2230     lck = (kmp_user_lock_p)user_lock;
2231   }
2232 #endif
2233   else {
2234     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2235   }
2236 
2237 #if USE_ITT_BUILD
2238   __kmp_itt_lock_releasing(lck);
2239 #endif /* USE_ITT_BUILD */
2240 
2241   RELEASE_LOCK(lck, gtid);
2242 
2243 #if OMPT_SUPPORT && OMPT_BLAME
2244   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
2245     ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t)lck);
2246   }
2247 #endif
2248 
2249 #endif // KMP_USE_DYNAMIC_LOCK
2250 }
2251 
2252 /* release the lock */
2253 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2254 #if KMP_USE_DYNAMIC_LOCK
2255 
2256 #if USE_ITT_BUILD
2257   __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2258 #endif
2259   KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2260 
2261 #else // KMP_USE_DYNAMIC_LOCK
2262 
2263   kmp_user_lock_p lck;
2264 
2265   /* Can't use serial interval since not block structured */
2266 
2267   if ((__kmp_user_lock_kind == lk_tas) &&
2268       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2269        OMP_NEST_LOCK_T_SIZE)) {
2270 #if KMP_OS_LINUX &&                                                            \
2271     (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2272     // "fast" path implemented to fix customer performance issue
2273     kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2274 #if USE_ITT_BUILD
2275     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2276 #endif /* USE_ITT_BUILD */
2277     if (--(tl->lk.depth_locked) == 0) {
2278       TCW_4(tl->lk.poll, 0);
2279     }
2280     KMP_MB();
2281     return;
2282 #else
2283     lck = (kmp_user_lock_p)user_lock;
2284 #endif
2285   }
2286 #if KMP_USE_FUTEX
2287   else if ((__kmp_user_lock_kind == lk_futex) &&
2288            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2289             OMP_NEST_LOCK_T_SIZE)) {
2290     lck = (kmp_user_lock_p)user_lock;
2291   }
2292 #endif
2293   else {
2294     lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2295   }
2296 
2297 #if USE_ITT_BUILD
2298   __kmp_itt_lock_releasing(lck);
2299 #endif /* USE_ITT_BUILD */
2300 
2301   int release_status;
2302   release_status = RELEASE_NESTED_LOCK(lck, gtid);
2303 #if OMPT_SUPPORT && OMPT_BLAME
2304   if (ompt_enabled) {
2305     if (release_status == KMP_LOCK_RELEASED) {
2306       if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
2307         ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
2308             (uint64_t)lck);
2309       }
2310     } else if (ompt_callbacks.ompt_callback(
2311                    ompt_event_release_nest_lock_prev)) {
2312       ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
2313           (uint64_t)lck);
2314     }
2315   }
2316 #endif
2317 
2318 #endif // KMP_USE_DYNAMIC_LOCK
2319 }
2320 
2321 /* try to acquire the lock */
2322 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2323   KMP_COUNT_BLOCK(OMP_test_lock);
2324 
2325 #if KMP_USE_DYNAMIC_LOCK
2326   int rc;
2327   int tag = KMP_EXTRACT_D_TAG(user_lock);
2328 #if USE_ITT_BUILD
2329   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2330 #endif
2331 #if KMP_USE_INLINED_TAS
2332   if (tag == locktag_tas && !__kmp_env_consistency_check) {
2333     KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2334   } else
2335 #elif KMP_USE_INLINED_FUTEX
2336   if (tag == locktag_futex && !__kmp_env_consistency_check) {
2337     KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2338   } else
2339 #endif
2340   {
2341     rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2342   }
2343   if (rc) {
2344 #if USE_ITT_BUILD
2345     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2346 #endif
2347     return FTN_TRUE;
2348   } else {
2349 #if USE_ITT_BUILD
2350     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2351 #endif
2352     return FTN_FALSE;
2353   }
2354 
2355 #else // KMP_USE_DYNAMIC_LOCK
2356 
2357   kmp_user_lock_p lck;
2358   int rc;
2359 
2360   if ((__kmp_user_lock_kind == lk_tas) &&
2361       (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2362     lck = (kmp_user_lock_p)user_lock;
2363   }
2364 #if KMP_USE_FUTEX
2365   else if ((__kmp_user_lock_kind == lk_futex) &&
2366            (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2367     lck = (kmp_user_lock_p)user_lock;
2368   }
2369 #endif
2370   else {
2371     lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
2372   }
2373 
2374 #if USE_ITT_BUILD
2375   __kmp_itt_lock_acquiring(lck);
2376 #endif /* USE_ITT_BUILD */
2377 
2378   rc = TEST_LOCK(lck, gtid);
2379 #if USE_ITT_BUILD
2380   if (rc) {
2381     __kmp_itt_lock_acquired(lck);
2382   } else {
2383     __kmp_itt_lock_cancelled(lck);
2384   }
2385 #endif /* USE_ITT_BUILD */
2386   return (rc ? FTN_TRUE : FTN_FALSE);
2387 
2388 /* Can't use serial interval since not block structured */
2389 
2390 #endif // KMP_USE_DYNAMIC_LOCK
2391 }
2392 
2393 /* try to acquire the lock */
2394 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2395 #if KMP_USE_DYNAMIC_LOCK
2396   int rc;
2397 #if USE_ITT_BUILD
2398   __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2399 #endif
2400   rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
2401 #if USE_ITT_BUILD
2402   if (rc) {
2403     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2404   } else {
2405     __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2406   }
2407 #endif
2408   return rc;
2409 
2410 #else // KMP_USE_DYNAMIC_LOCK
2411 
2412   kmp_user_lock_p lck;
2413   int rc;
2414 
2415   if ((__kmp_user_lock_kind == lk_tas) &&
2416       (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2417        OMP_NEST_LOCK_T_SIZE)) {
2418     lck = (kmp_user_lock_p)user_lock;
2419   }
2420 #if KMP_USE_FUTEX
2421   else if ((__kmp_user_lock_kind == lk_futex) &&
2422            (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2423             OMP_NEST_LOCK_T_SIZE)) {
2424     lck = (kmp_user_lock_p)user_lock;
2425   }
2426 #endif
2427   else {
2428     lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
2429   }
2430 
2431 #if USE_ITT_BUILD
2432   __kmp_itt_lock_acquiring(lck);
2433 #endif /* USE_ITT_BUILD */
2434 
2435   rc = TEST_NESTED_LOCK(lck, gtid);
2436 #if USE_ITT_BUILD
2437   if (rc) {
2438     __kmp_itt_lock_acquired(lck);
2439   } else {
2440     __kmp_itt_lock_cancelled(lck);
2441   }
2442 #endif /* USE_ITT_BUILD */
2443   return rc;
2444 
2445 /* Can't use serial interval since not block structured */
2446 
2447 #endif // KMP_USE_DYNAMIC_LOCK
2448 }
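
/* Hedged user-level sketch of the lock API that the __kmpc_*_lock and
   __kmpc_*_nest_lock entry points above serve; whether a compiler lowers the
   omp_* calls directly onto these entry points or onto the runtime's own
   wrappers is an assumption here.

     omp_lock_t l;
     omp_init_lock(&l);
     if (omp_test_lock(&l)) { // non-blocking acquire, cf. __kmpc_test_lock
       // ... exclusive work ...
       omp_unset_lock(&l);
     }
     omp_destroy_lock(&l);

     omp_nest_lock_t nl;      // nestable variant: the owner may re-acquire
     omp_init_nest_lock(&nl);
     omp_set_nest_lock(&nl);
     omp_set_nest_lock(&nl);  // legal: nesting depth becomes 2
     omp_unset_nest_lock(&nl);
     omp_unset_nest_lock(&nl);
     omp_destroy_nest_lock(&nl);
*/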
2449 
2450 // Interface to fast scalable reduce methods routines
2451 
2452 // keep the selected method in a thread local structure for cross-function
2453 // usage: will be used in __kmpc_end_reduce* functions;
2454 // another solution: to re-determine the method one more time in
2455 // __kmpc_end_reduce* functions (new prototype required then)
2456 // AT: which solution is better?
2457 #define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
2458   ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
2459 
2460 #define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
2461   (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
2462 
2463 // description of the packed_reduction_method variable: look at the macros in
2464 // kmp.h
2465 
2466 // used in a critical section reduce block
2467 static __forceinline void
2468 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
2469                                           kmp_critical_name *crit) {
2470 
2471   // this lock was visible to a customer and to the threading profile tool as a
2472   // serial overhead span (although it's used for an internal purpose only)
2473   //            why was it visible in previous implementation?
2474   //            should we keep it visible in new reduce block?
2475   kmp_user_lock_p lck;
2476 
2477 #if KMP_USE_DYNAMIC_LOCK
2478 
2479   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
2480   // Check if it is initialized.
2481   if (*lk == 0) {
2482     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2483       KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
2484                                   KMP_GET_D_TAG(__kmp_user_lock_seq));
2485     } else {
2486       __kmp_init_indirect_csptr(crit, loc, global_tid,
2487                                 KMP_GET_I_TAG(__kmp_user_lock_seq));
2488     }
2489   }
2490   // Branch for accessing the actual lock object and set operation. This
2491   // branching is inevitable since this lock initialization does not follow the
2492   // normal dispatch path (lock table is not used).
2493   if (KMP_EXTRACT_D_TAG(lk) != 0) {
2494     lck = (kmp_user_lock_p)lk;
2495     KMP_DEBUG_ASSERT(lck != NULL);
2496     if (__kmp_env_consistency_check) {
2497       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2498     }
2499     KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
2500   } else {
2501     kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
2502     lck = ilk->lock;
2503     KMP_DEBUG_ASSERT(lck != NULL);
2504     if (__kmp_env_consistency_check) {
2505       __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2506     }
2507     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
2508   }
2509 
2510 #else // KMP_USE_DYNAMIC_LOCK
2511 
2512   // We know that the fast reduction code is only emitted by Intel compilers
2513   // with 32 byte critical sections. If there isn't enough space, then we
2514   // have to use a pointer.
2515   if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
2516     lck = (kmp_user_lock_p)crit;
2517   } else {
2518     lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
2519   }
2520   KMP_DEBUG_ASSERT(lck != NULL);
2521 
2522   if (__kmp_env_consistency_check)
2523     __kmp_push_sync(global_tid, ct_critical, loc, lck);
2524 
2525   __kmp_acquire_user_lock_with_checks(lck, global_tid);
2526 
2527 #endif // KMP_USE_DYNAMIC_LOCK
2528 }
2529 
2530 // used in a critical section reduce block
2531 static __forceinline void
2532 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
2533                                         kmp_critical_name *crit) {
2534 
2535   kmp_user_lock_p lck;
2536 
2537 #if KMP_USE_DYNAMIC_LOCK
2538 
2539   if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2540     lck = (kmp_user_lock_p)crit;
2541     if (__kmp_env_consistency_check)
2542       __kmp_pop_sync(global_tid, ct_critical, loc);
2543     KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
2544   } else {
2545     kmp_indirect_lock_t *ilk =
2546         (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
2547     if (__kmp_env_consistency_check)
2548       __kmp_pop_sync(global_tid, ct_critical, loc);
2549     KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
2550   }
2551 
2552 #else // KMP_USE_DYNAMIC_LOCK
2553 
2554   // We know that the fast reduction code is only emitted by Intel compilers
2555   // with 32 byte critical sections. If there isn't enough space, then we have
2556   // to use a pointer.
2557   if (__kmp_base_user_lock_size > 32) {
2558     lck = *((kmp_user_lock_p *)crit);
2559     KMP_ASSERT(lck != NULL);
2560   } else {
2561     lck = (kmp_user_lock_p)crit;
2562   }
2563 
2564   if (__kmp_env_consistency_check)
2565     __kmp_pop_sync(global_tid, ct_critical, loc);
2566 
2567   __kmp_release_user_lock_with_checks(lck, global_tid);
2568 
2569 #endif // KMP_USE_DYNAMIC_LOCK
2570 } // __kmp_end_critical_section_reduce_block
2571 
2572 /* 2.a.i. Reduce Block without a terminating barrier */
2573 /*!
2574 @ingroup SYNCHRONIZATION
2575 @param loc source location information
2576 @param global_tid global thread number
2577 @param num_vars number of items (variables) to be reduced
2578 @param reduce_size size of data in bytes to be reduced
2579 @param reduce_data pointer to data to be reduced
2580 @param reduce_func callback function providing reduction operation on two
2581 operands and returning result of reduction in lhs_data
2582 @param lck pointer to the unique lock data structure
2583 @result 1 for the master thread, 0 for all other team threads, 2 for all team
2584 threads if atomic reduction needed
2585 
2586 The nowait version is used for a reduce clause with the nowait argument.
2587 */
2588 kmp_int32
2589 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
2590                      size_t reduce_size, void *reduce_data,
2591                      void (*reduce_func)(void *lhs_data, void *rhs_data),
2592                      kmp_critical_name *lck) {
2593 
2594   KMP_COUNT_BLOCK(REDUCE_nowait);
2595   int retval = 0;
2596   PACKED_REDUCTION_METHOD_T packed_reduction_method;
2597 #if OMP_40_ENABLED
2598   kmp_team_t *team;
2599   kmp_info_t *th;
2600   int teams_swapped = 0, task_state;
2601 #endif
2602   KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
2603 
2604   // why do we need this initialization here at all?
2605   // Reduction clause can not be used as a stand-alone directive.
2606 
2607   // do not call __kmp_serial_initialize(), it will be called by
2608   // __kmp_parallel_initialize() if needed
2609   // possible detection of false-positive race by the threadchecker ???
2610   if (!TCR_4(__kmp_init_parallel))
2611     __kmp_parallel_initialize();
2612 
2613 // check correctness of reduce block nesting
2614 #if KMP_USE_DYNAMIC_LOCK
2615   if (__kmp_env_consistency_check)
2616     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
2617 #else
2618   if (__kmp_env_consistency_check)
2619     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
2620 #endif
2621 
2622 #if OMP_40_ENABLED
2623   th = __kmp_thread_from_gtid(global_tid);
2624   if (th->th.th_teams_microtask) { // AC: check if we are inside the teams
2625     // construct?
2626     team = th->th.th_team;
2627     if (team->t.t_level == th->th.th_teams_level) {
2628       // this is reduction at teams construct
2629       KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
2630       // Let's swap teams temporarily for the reduction barrier
2631       teams_swapped = 1;
2632       th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2633       th->th.th_team = team->t.t_parent;
2634       th->th.th_team_nproc = th->th.th_team->t.t_nproc;
2635       th->th.th_task_team = th->th.th_team->t.t_task_team[0];
2636       task_state = th->th.th_task_state;
2637       th->th.th_task_state = 0;
2638     }
2639   }
2640 #endif // OMP_40_ENABLED
2641 
  // the packed_reduction_method value will be reused by the __kmpc_end_reduce*
  // functions, so it should be kept in a variable
2644   // the variable should be either a construct-specific or thread-specific
2645   // property, not a team specific property
2646   //     (a thread can reach the next reduce block on the next construct, reduce
2647   //     method may differ on the next construct)
2648   // an ident_t "loc" parameter could be used as a construct-specific property
2649   // (what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unnecessary extra synchronization would be needed)
2652   // a thread-specific variable is better regarding two issues above (next
2653   // construct and extra syncs)
2654   // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes the 'determine' and 'set' lines (there is no need for a
  // single thread to do this, which avoids unnecessary extra synchronization)
2657 
2658   packed_reduction_method = __kmp_determine_reduction_method(
2659       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
2660   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
2661 
2662   if (packed_reduction_method == critical_reduce_block) {
2663 
2664     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
2665     retval = 1;
2666 
2667   } else if (packed_reduction_method == empty_reduce_block) {
2668 
2669     // usage: if team size == 1, no synchronization is required ( Intel
2670     // platforms only )
2671     retval = 1;
2672 
2673   } else if (packed_reduction_method == atomic_reduce_block) {
2674 
2675     retval = 2;
2676 
2677     // all threads should do this pop here (because __kmpc_end_reduce_nowait()
2678     // won't be called by the code gen)
    //     (this is not ideal: the checking block is closed by this 'pop' even
    //      though the atomic operation has not executed yet; it will execute
    //      slightly later, essentially on the next instruction)
2683     if (__kmp_env_consistency_check)
2684       __kmp_pop_sync(global_tid, ct_reduce, loc);
2685 
2686   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
2687                                    tree_reduce_block)) {
2688 
2689 // AT: performance issue: a real barrier here
2690 // AT:     (if master goes slow, other threads are blocked here waiting for the
2691 // master to come and release them)
2692 // AT:     (it's not what a customer might expect specifying NOWAIT clause)
2693 // AT:     (specifying NOWAIT won't result in improvement of performance, it'll
2694 // be confusing to a customer)
2695 // AT: another implementation of *barrier_gather*nowait() (or some other design)
2696 // might go faster and be more in line with sense of NOWAIT
2697 // AT: TO DO: do epcc test and compare times
2698 
2699 // this barrier should be invisible to a customer and to the threading profile
2700 // tool (it's neither a terminating barrier nor customer's code, it's
2701 // used for an internal purpose)
2702 #if USE_ITT_NOTIFY
2703     __kmp_threads[global_tid]->th.th_ident = loc;
2704 #endif
2705     retval =
2706         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
2707                       global_tid, FALSE, reduce_size, reduce_data, reduce_func);
2708     retval = (retval != 0) ? (0) : (1);
2709 
    // all workers except the master should do this pop here
    //     (none of them will get to __kmpc_end_reduce_nowait())
2712     if (__kmp_env_consistency_check) {
2713       if (retval == 0) {
2714         __kmp_pop_sync(global_tid, ct_reduce, loc);
2715       }
2716     }
2717 
2718   } else {
2719 
2720     // should never reach this block
2721     KMP_ASSERT(0); // "unexpected method"
2722   }
2723 #if OMP_40_ENABLED
2724   if (teams_swapped) {
2725     // Restore thread structure
2726     th->th.th_info.ds.ds_tid = 0;
2727     th->th.th_team = team;
2728     th->th.th_team_nproc = team->t.t_nproc;
2729     th->th.th_task_team = team->t.t_task_team[task_state];
2730     th->th.th_task_state = task_state;
2731   }
2732 #endif
2733   KA_TRACE(
2734       10,
2735       ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
2736        global_tid, packed_reduction_method, retval));
2737 
2738   return retval;
2739 }
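
/* Hedged sketch of the code-generation pattern this entry point expects for a
   nowait reduction such as "reduction(+:sum)". The names reduce_fn, sum_local,
   loc_ref and crit are illustrative only; the structure follows the return
   values documented above and the pops performed inside the function.

     switch (__kmpc_reduce_nowait(loc_ref, gtid, 1, sizeof(sum_local),
                                  &sum_local, reduce_fn, &crit)) {
     case 1: // critical or tree path: combine directly, then close the region
       sum += sum_local;
       __kmpc_end_reduce_nowait(loc_ref, gtid, &crit);
       break;
     case 2: // atomic path: combine atomically; no "end" call is generated
       #pragma omp atomic
       sum += sum_local;
       break;
     case 0: // tree-method worker: its contribution was combined in the barrier
       break;
     }
*/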
2740 
2741 /*!
2742 @ingroup SYNCHRONIZATION
2743 @param loc source location information
2744 @param global_tid global thread id.
2745 @param lck pointer to the unique lock data structure
2746 
2747 Finish the execution of a reduce nowait.
2748 */
2749 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
2750                               kmp_critical_name *lck) {
2751 
2752   PACKED_REDUCTION_METHOD_T packed_reduction_method;
2753 
2754   KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
2755 
2756   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
2757 
2758   if (packed_reduction_method == critical_reduce_block) {
2759 
2760     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
2761 
2762   } else if (packed_reduction_method == empty_reduce_block) {
2763 
2764     // usage: if team size == 1, no synchronization is required ( on Intel
2765     // platforms only )
2766 
2767   } else if (packed_reduction_method == atomic_reduce_block) {
2768 
2769     // neither master nor other workers should get here
2770     //     (code gen does not generate this call in case 2: atomic reduce block)
    // actually it would be better to remove this else-if branch entirely;
    // after removal this value would be caught by the 'else' and would assert
2773 
2774   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
2775                                    tree_reduce_block)) {
2776 
2777     // only master gets here
2778 
2779   } else {
2780 
2781     // should never reach this block
2782     KMP_ASSERT(0); // "unexpected method"
2783   }
2784 
2785   if (__kmp_env_consistency_check)
2786     __kmp_pop_sync(global_tid, ct_reduce, loc);
2787 
2788   KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
2789                 global_tid, packed_reduction_method));
2790 
2791   return;
2792 }
2793 
2794 /* 2.a.ii. Reduce Block with a terminating barrier */
2795 
2796 /*!
2797 @ingroup SYNCHRONIZATION
2798 @param loc source location information
2799 @param global_tid global thread number
2800 @param num_vars number of items (variables) to be reduced
2801 @param reduce_size size of data in bytes to be reduced
2802 @param reduce_data pointer to data to be reduced
2803 @param reduce_func callback function providing reduction operation on two
2804 operands and returning result of reduction in lhs_data
2805 @param lck pointer to the unique lock data structure
2806 @result 1 for the master thread, 0 for all other team threads, 2 for all team
2807 threads if atomic reduction needed
2808 
2809 A blocking reduce that includes an implicit barrier.
2810 */
2811 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
2812                         size_t reduce_size, void *reduce_data,
2813                         void (*reduce_func)(void *lhs_data, void *rhs_data),
2814                         kmp_critical_name *lck) {
2815   KMP_COUNT_BLOCK(REDUCE_wait);
2816   int retval = 0;
2817   PACKED_REDUCTION_METHOD_T packed_reduction_method;
2818 
2819   KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
2820 
2821   // why do we need this initialization here at all?
2822   // Reduction clause can not be a stand-alone directive.
2823 
2824   // do not call __kmp_serial_initialize(), it will be called by
2825   // __kmp_parallel_initialize() if needed
2826   // possible detection of false-positive race by the threadchecker ???
2827   if (!TCR_4(__kmp_init_parallel))
2828     __kmp_parallel_initialize();
2829 
2830 // check correctness of reduce block nesting
2831 #if KMP_USE_DYNAMIC_LOCK
2832   if (__kmp_env_consistency_check)
2833     __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
2834 #else
2835   if (__kmp_env_consistency_check)
2836     __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
2837 #endif
2838 
2839   packed_reduction_method = __kmp_determine_reduction_method(
2840       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
2841   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
2842 
2843   if (packed_reduction_method == critical_reduce_block) {
2844 
2845     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
2846     retval = 1;
2847 
2848   } else if (packed_reduction_method == empty_reduce_block) {
2849 
2850     // usage: if team size == 1, no synchronization is required ( Intel
2851     // platforms only )
2852     retval = 1;
2853 
2854   } else if (packed_reduction_method == atomic_reduce_block) {
2855 
2856     retval = 2;
2857 
2858   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
2859                                    tree_reduce_block)) {
2860 
2861 // case tree_reduce_block:
2862 // this barrier should be visible to a customer and to the threading profile
2863 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
2864 #if USE_ITT_NOTIFY
2865     __kmp_threads[global_tid]->th.th_ident =
2866         loc; // needed for correct notification of frames
2867 #endif
2868     retval =
2869         __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
2870                       global_tid, TRUE, reduce_size, reduce_data, reduce_func);
2871     retval = (retval != 0) ? (0) : (1);
2872 
    // all workers except the master should do this pop here
    // (none of them will enter __kmpc_end_reduce())
2875     if (__kmp_env_consistency_check) {
2876       if (retval == 0) { // 0: all other workers; 1: master
2877         __kmp_pop_sync(global_tid, ct_reduce, loc);
2878       }
2879     }
2880 
2881   } else {
2882 
2883     // should never reach this block
2884     KMP_ASSERT(0); // "unexpected method"
2885   }
2886 
2887   KA_TRACE(10,
2888            ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
2889             global_tid, packed_reduction_method, retval));
2890 
2891   return retval;
2892 }
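
/* Hedged sketch for the blocking form: the selection values match the nowait
   variant above, but here threads that receive 1 or 2 also call
   __kmpc_end_reduce(), which supplies the terminating barrier; tree-method
   workers (return value 0) already took their barrier inside __kmpc_reduce().
   Names are illustrative, as in the nowait sketch.

     int r = __kmpc_reduce(loc_ref, gtid, 1, sizeof(sum_local), &sum_local,
                           reduce_fn, &crit);
     if (r == 1) {
       sum += sum_local;
       __kmpc_end_reduce(loc_ref, gtid, &crit);
     } else if (r == 2) {
       #pragma omp atomic
       sum += sum_local;
       __kmpc_end_reduce(loc_ref, gtid, &crit);
     }
*/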
2893 
2894 /*!
2895 @ingroup SYNCHRONIZATION
2896 @param loc source location information
2897 @param global_tid global thread id.
2898 @param lck pointer to the unique lock data structure
2899 
2900 Finish the execution of a blocking reduce.
2901 The <tt>lck</tt> pointer must be the same as that used in the corresponding
2902 start function.
2903 */
2904 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
2905                        kmp_critical_name *lck) {
2906 
2907   PACKED_REDUCTION_METHOD_T packed_reduction_method;
2908 
2909   KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
2910 
2911   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
2912 
2913   // this barrier should be visible to a customer and to the threading profile
2914   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
2915 
2916   if (packed_reduction_method == critical_reduce_block) {
2917 
2918     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
2919 
2920 // TODO: implicit barrier: should be exposed
2921 #if USE_ITT_NOTIFY
2922     __kmp_threads[global_tid]->th.th_ident = loc;
2923 #endif
2924     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
2925 
2926   } else if (packed_reduction_method == empty_reduce_block) {
2927 
2928 // usage: if team size==1, no synchronization is required (Intel platforms only)
2929 
2930 // TODO: implicit barrier: should be exposed
2931 #if USE_ITT_NOTIFY
2932     __kmp_threads[global_tid]->th.th_ident = loc;
2933 #endif
2934     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
2935 
2936   } else if (packed_reduction_method == atomic_reduce_block) {
2937 
2938 // TODO: implicit barrier: should be exposed
2939 #if USE_ITT_NOTIFY
2940     __kmp_threads[global_tid]->th.th_ident = loc;
2941 #endif
2942     __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
2943 
2944   } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
2945                                    tree_reduce_block)) {
2946 
2947     // only master executes here (master releases all other workers)
2948     __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
2949                             global_tid);
2950 
2951   } else {
2952 
2953     // should never reach this block
2954     KMP_ASSERT(0); // "unexpected method"
2955   }
2956 
2957   if (__kmp_env_consistency_check)
2958     __kmp_pop_sync(global_tid, ct_reduce, loc);
2959 
2960   KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
2961                 global_tid, packed_reduction_method));
2962 
2963   return;
2964 }
2965 
2966 #undef __KMP_GET_REDUCTION_METHOD
2967 #undef __KMP_SET_REDUCTION_METHOD
2968 
2969 /* end of interface to fast scalable reduce routines */
2970 
2971 kmp_uint64 __kmpc_get_taskid() {
2972 
2973   kmp_int32 gtid;
2974   kmp_info_t *thread;
2975 
2976   gtid = __kmp_get_gtid();
2977   if (gtid < 0) {
2978     return 0;
  } // if
2980   thread = __kmp_thread_from_gtid(gtid);
2981   return thread->th.th_current_task->td_task_id;
2982 
2983 } // __kmpc_get_taskid
2984 
2985 kmp_uint64 __kmpc_get_parent_taskid() {
2986 
2987   kmp_int32 gtid;
2988   kmp_info_t *thread;
2989   kmp_taskdata_t *parent_task;
2990 
2991   gtid = __kmp_get_gtid();
2992   if (gtid < 0) {
2993     return 0;
  } // if
2995   thread = __kmp_thread_from_gtid(gtid);
2996   parent_task = thread->th.th_current_task->td_parent;
2997   return (parent_task == NULL ? 0 : parent_task->td_task_id);
2998 
2999 } // __kmpc_get_parent_taskid
3000 
3001 #if OMP_45_ENABLED
3002 /*!
3003 @ingroup WORK_SHARING
3004 @param loc  source location information.
3005 @param gtid  global thread number.
3006 @param num_dims  number of associated doacross loops.
3007 @param dims  info on loops bounds.
3008 
3009 Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for (i = 2; i < 9; i += 2) gives lo=2, up=8, st=2.
3012 */
3013 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3014                           struct kmp_dim *dims) {
3015   int j, idx;
3016   kmp_int64 last, trace_count;
3017   kmp_info_t *th = __kmp_threads[gtid];
3018   kmp_team_t *team = th->th.th_team;
3019   kmp_uint32 *flags;
3020   kmp_disp_t *pr_buf = th->th.th_dispatch;
3021   dispatch_shared_info_t *sh_buf;
3022 
3023   KA_TRACE(
3024       20,
3025       ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3026        gtid, num_dims, !team->t.t_serialized));
3027   KMP_DEBUG_ASSERT(dims != NULL);
3028   KMP_DEBUG_ASSERT(num_dims > 0);
3029 
3030   if (team->t.t_serialized) {
3031     KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3032     return; // no dependencies if team is serialized
3033   }
3034   KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3035   idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3036   // the next loop
3037   sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3038 
3039   // Save bounds info into allocated private buffer
3040   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3041   pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3042       th, sizeof(kmp_int64) * (4 * num_dims + 1));
3043   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3044   pr_buf->th_doacross_info[0] =
3045       (kmp_int64)num_dims; // first element is number of dimensions
  // Also save the address of num_done so it can be accessed later without
  // knowing the buffer index
3048   pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3049   pr_buf->th_doacross_info[2] = dims[0].lo;
3050   pr_buf->th_doacross_info[3] = dims[0].up;
3051   pr_buf->th_doacross_info[4] = dims[0].st;
3052   last = 5;
3053   for (j = 1; j < num_dims; ++j) {
    // Range length of dimension j; kept for every dimension except dims[0]
    kmp_int64 range_length;
3056     if (dims[j].st == 1) { // most common case
3057       // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3058       range_length = dims[j].up - dims[j].lo + 1;
3059     } else {
3060       if (dims[j].st > 0) {
3061         KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3062         range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3063       } else { // negative increment
3064         KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3065         range_length =
3066             (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3067       }
3068     }
3069     pr_buf->th_doacross_info[last++] = range_length;
3070     pr_buf->th_doacross_info[last++] = dims[j].lo;
3071     pr_buf->th_doacross_info[last++] = dims[j].up;
3072     pr_buf->th_doacross_info[last++] = dims[j].st;
3073   }
3074 
3075   // Compute total trip count.
3076   // Start with range of dims[0] which we don't need to keep in the buffer.
3077   if (dims[0].st == 1) { // most common case
3078     trace_count = dims[0].up - dims[0].lo + 1;
3079   } else if (dims[0].st > 0) {
3080     KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3081     trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3082   } else { // negative increment
3083     KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3084     trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3085   }
3086   for (j = 1; j < num_dims; ++j) {
3087     trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3088   }
3089   KMP_DEBUG_ASSERT(trace_count > 0);
3090 
  // Check whether the shared buffer is still occupied by a previous loop
  // (the one that used buffer index idx - __kmp_dispatch_num_buffers)
3093   if (idx != sh_buf->doacross_buf_idx) {
3094     // Shared buffer is occupied, wait for it to be free
3095     __kmp_wait_yield_4((kmp_uint32 *)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4,
3096                        NULL);
3097   }
  // Check if we are the first thread: the CAS returns the old value, so the
  // first thread gets NULL, later threads get 1 while initialization is in
  // progress, or the allocated pointer once it is done.
3100   flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3101       (kmp_int64 *)&sh_buf->doacross_flags, NULL, (kmp_int64)1);
3102   if (flags == NULL) {
3103     // we are the first thread, allocate the array of flags
3104     kmp_int64 size =
3105         trace_count / 8 + 8; // in bytes, use single bit per iteration
3106     sh_buf->doacross_flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3107   } else if ((kmp_int64)flags == 1) {
    // Initialization is still in progress: wait, re-reading the flags pointer
    // through a volatile lvalue so the load is not hoisted out of the loop
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1) {
3110       KMP_YIELD(TRUE);
3111     }
3112   }
3113   KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags >
3114                    1); // check value of pointer
  // Save a private copy so that the shared buffer is not touched on each
  // iteration
  pr_buf->th_doacross_flags = sh_buf->doacross_flags;
3118   KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3119 }
3120 
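// Block the calling thread until the iteration identified by vec (one value
// per doacross dimension) has been posted via __kmpc_doacross_post().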
3121 void __kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec) {
3122   kmp_int32 shft, num_dims, i;
3123   kmp_uint32 flag;
3124   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3125   kmp_info_t *th = __kmp_threads[gtid];
3126   kmp_team_t *team = th->th.th_team;
3127   kmp_disp_t *pr_buf;
3128   kmp_int64 lo, up, st;
3129 
3130   KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3131   if (team->t.t_serialized) {
3132     KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
3133     return; // no dependencies if team is serialized
3134   }
3135 
3136   // calculate sequential iteration number and check out-of-bounds condition
3137   pr_buf = th->th.th_dispatch;
3138   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3139   num_dims = pr_buf->th_doacross_info[0];
3140   lo = pr_buf->th_doacross_info[2];
3141   up = pr_buf->th_doacross_info[3];
3142   st = pr_buf->th_doacross_info[4];
3143   if (st == 1) { // most common case
3144     if (vec[0] < lo || vec[0] > up) {
3145       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3146                     "bounds [%lld,%lld]\n",
3147                     gtid, vec[0], lo, up));
3148       return;
3149     }
3150     iter_number = vec[0] - lo;
3151   } else if (st > 0) {
3152     if (vec[0] < lo || vec[0] > up) {
3153       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3154                     "bounds [%lld,%lld]\n",
3155                     gtid, vec[0], lo, up));
3156       return;
3157     }
3158     iter_number = (kmp_uint64)(vec[0] - lo) / st;
3159   } else { // negative increment
3160     if (vec[0] > lo || vec[0] < up) {
3161       KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3162                     "bounds [%lld,%lld]\n",
3163                     gtid, vec[0], lo, up));
3164       return;
3165     }
3166     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3167   }
3168   for (i = 1; i < num_dims; ++i) {
3169     kmp_int64 iter, ln;
3170     kmp_int32 j = i * 4;
3171     ln = pr_buf->th_doacross_info[j + 1];
3172     lo = pr_buf->th_doacross_info[j + 2];
3173     up = pr_buf->th_doacross_info[j + 3];
3174     st = pr_buf->th_doacross_info[j + 4];
3175     if (st == 1) {
3176       if (vec[i] < lo || vec[i] > up) {
3177         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3178                       "bounds [%lld,%lld]\n",
3179                       gtid, vec[i], lo, up));
3180         return;
3181       }
3182       iter = vec[i] - lo;
3183     } else if (st > 0) {
3184       if (vec[i] < lo || vec[i] > up) {
3185         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3186                       "bounds [%lld,%lld]\n",
3187                       gtid, vec[i], lo, up));
3188         return;
3189       }
3190       iter = (kmp_uint64)(vec[i] - lo) / st;
3191     } else { // st < 0
3192       if (vec[i] > lo || vec[i] < up) {
3193         KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
3194                       "bounds [%lld,%lld]\n",
3195                       gtid, vec[i], lo, up));
3196         return;
3197       }
3198       iter = (kmp_uint64)(lo - vec[i]) / (-st);
3199     }
3200     iter_number = iter + ln * iter_number;
3201   }
3202   shft = iter_number % 32; // use 32-bit granularity
3203   iter_number >>= 5; // divided by 32
3204   flag = 1 << shft;
3205   while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
3206     KMP_YIELD(TRUE);
3207   }
3208   KA_TRACE(20,
3209            ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
3210             gtid, (iter_number << 5) + shft));
3211 }
3212 
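// Mark the iteration identified by vec as finished, allowing threads blocked
// in __kmpc_doacross_wait() on that iteration to proceed.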
3213 void __kmpc_doacross_post(ident_t *loc, int gtid, long long *vec) {
3214   kmp_int32 shft, num_dims, i;
3215   kmp_uint32 flag;
3216   kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3217   kmp_info_t *th = __kmp_threads[gtid];
3218   kmp_team_t *team = th->th.th_team;
3219   kmp_disp_t *pr_buf;
3220   kmp_int64 lo, st;
3221 
3222   KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
3223   if (team->t.t_serialized) {
3224     KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
3225     return; // no dependencies if team is serialized
3226   }
3227 
3228   // calculate sequential iteration number (same as in "wait" but no
3229   // out-of-bounds checks)
3230   pr_buf = th->th.th_dispatch;
3231   KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3232   num_dims = pr_buf->th_doacross_info[0];
3233   lo = pr_buf->th_doacross_info[2];
3234   st = pr_buf->th_doacross_info[4];
3235   if (st == 1) { // most common case
3236     iter_number = vec[0] - lo;
3237   } else if (st > 0) {
3238     iter_number = (kmp_uint64)(vec[0] - lo) / st;
3239   } else { // negative increment
3240     iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3241   }
3242   for (i = 1; i < num_dims; ++i) {
3243     kmp_int64 iter, ln;
3244     kmp_int32 j = i * 4;
3245     ln = pr_buf->th_doacross_info[j + 1];
3246     lo = pr_buf->th_doacross_info[j + 2];
3247     st = pr_buf->th_doacross_info[j + 4];
3248     if (st == 1) {
3249       iter = vec[i] - lo;
3250     } else if (st > 0) {
3251       iter = (kmp_uint64)(vec[i] - lo) / st;
3252     } else { // st < 0
3253       iter = (kmp_uint64)(lo - vec[i]) / (-st);
3254     }
3255     iter_number = iter + ln * iter_number;
3256   }
3257   shft = iter_number % 32; // use 32-bit granularity
3258   iter_number >>= 5; // divided by 32
3259   flag = 1 << shft;
3260   if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
3261     KMP_TEST_THEN_OR32((kmp_int32 *)&pr_buf->th_doacross_flags[iter_number],
3262                        (kmp_int32)flag);
3263   KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
3264                 (iter_number << 5) + shft));
3265 }
3266 
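// Finish the doacross loop on this thread; the last thread to arrive also
// frees the shared flag array and recycles the shared dispatch buffer.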
3267 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
3268   kmp_int64 num_done;
3269   kmp_info_t *th = __kmp_threads[gtid];
3270   kmp_team_t *team = th->th.th_team;
3271   kmp_disp_t *pr_buf = th->th.th_dispatch;
3272 
3273   KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
3274   if (team->t.t_serialized) {
3275     KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
3276     return; // nothing to do
3277   }
3278   num_done = KMP_TEST_THEN_INC64((kmp_int64 *)pr_buf->th_doacross_info[1]) + 1;
3279   if (num_done == th->th.th_team_nproc) {
3280     // we are the last thread, need to free shared resources
3281     int idx = pr_buf->th_doacross_buf_idx - 1;
3282     dispatch_shared_info_t *sh_buf =
3283         &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3284     KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
3285                      (kmp_int64)&sh_buf->doacross_num_done);
3286     KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
3287     KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
3288     __kmp_thread_free(th, (void *)sh_buf->doacross_flags);
3289     sh_buf->doacross_flags = NULL;
3290     sh_buf->doacross_num_done = 0;
3291     sh_buf->doacross_buf_idx +=
3292         __kmp_dispatch_num_buffers; // free buffer for future re-use
3293   }
3294   // free private resources (need to keep buffer index forever)
3295   __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
3296   pr_buf->th_doacross_info = NULL;
3297   KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
3298 }
3299 #endif
3300 
3301 // end of file //
3302