1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "omp.h"        /* extern "C" declarations of user-visible routines */
17 #include "kmp.h"
18 #include "kmp_i18n.h"
19 #include "kmp_itt.h"
20 #include "kmp_lock.h"
21 #include "kmp_error.h"
22 #include "kmp_stats.h"
23 
24 #if OMPT_SUPPORT
25 #include "ompt-internal.h"
26 #include "ompt-specific.h"
27 #endif
28 
29 #define MAX_MESSAGE 512
30 
31 /* ------------------------------------------------------------------------ */
32 /* ------------------------------------------------------------------------ */
33 
34 /*  flags will be used in future, e.g., to implement */
35 /*  openmp_strict library restrictions               */
36 
37 /*!
38  * @ingroup STARTUP_SHUTDOWN
39  * @param loc   in   source location information
40  * @param flags in   for future use (currently ignored)
41  *
42  * Initialize the runtime library. This call is optional; if it is not made then
43  * it will be implicitly called by attempts to use other library functions.
44  *
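 *
 * For illustration only, a minimal sketch of an explicit call (here `loc` stands for a
 * compiler-generated ident_t, and 0 is passed for the currently ignored flags argument):
 * @code
 * __kmpc_begin(&loc, 0);
 * @endcode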
45  */
46 void
47 __kmpc_begin(ident_t *loc, kmp_int32 flags)
48 {
    // By default __kmpc_begin() is a no-op.
50     char *env;
51     if ((env = getenv( "KMP_INITIAL_THREAD_BIND" )) != NULL &&
52         __kmp_str_match_true( env )) {
53         __kmp_middle_initialize();
54         KC_TRACE(10, ("__kmpc_begin: middle initialization called\n" ));
55     } else if (__kmp_ignore_mppbeg() == FALSE) {
56         // By default __kmp_ignore_mppbeg() returns TRUE.
57         __kmp_internal_begin();
58         KC_TRACE( 10, ("__kmpc_begin: called\n" ) );
59     }
60 }
61 
62 /*!
63  * @ingroup STARTUP_SHUTDOWN
64  * @param loc source location information
65  *
 * Shut down the runtime library. This call is also optional, and even if made it will
 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to zero.
 */
69 void
70 __kmpc_end(ident_t *loc)
71 {
    // By default, __kmp_ignore_mppend() returns TRUE, which makes the __kmpc_end() call a no-op.
    // However, this can be overridden with the KMP_IGNORE_MPPEND environment variable.
    // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end()
    // will unregister this root (which can cause the library to shut down).
76     if (__kmp_ignore_mppend() == FALSE) {
77         KC_TRACE( 10, ("__kmpc_end: called\n" ) );
78         KA_TRACE( 30, ("__kmpc_end\n" ));
79 
80         __kmp_internal_end_thread( -1 );
81     }
82 }
83 
84 /*!
85 @ingroup THREAD_STATES
86 @param loc Source location information.
87 @return The global thread index of the active thread.
88 
89 This function can be called in any context.
90 
If the runtime has only been entered at the outermost level from a
92 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is that
93 which would be returned by omp_get_thread_num() in the outermost
94 active parallel construct. (Or zero if there is no active parallel
95 construct, since the master thread is necessarily thread zero).
96 
97 If multiple non-OpenMP threads all enter an OpenMP construct then this
98 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
100 OpenMP thread ids returned by omp_get_thread_num()).
101 
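For illustration only (not generated code), a direct query might look like the sketch
below; the loc argument is only used for tracing here, so passing a null ident_t is
assumed to be acceptable.
@code
kmp_int32 gtid = __kmpc_global_thread_num(NULL);
@endcode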
102 */
103 kmp_int32
104 __kmpc_global_thread_num(ident_t *loc)
105 {
106     kmp_int32 gtid = __kmp_entry_gtid();
107 
108     KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) );
109 
110     return gtid;
111 }
112 
113 /*!
114 @ingroup THREAD_STATES
115 @param loc Source location information.
116 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
117 
118 This function can be called in any context.
119 It returns the total number of threads under the control of the OpenMP runtime. That is
120 not a number that can be determined by any OpenMP standard calls, since the library may be
121 called from more than one non-OpenMP thread, and this reflects the total over all such calls.
Similarly, the runtime maintains underlying threads even when they are not active (since the cost
of creating and destroying OS threads is high); this call counts all such threads, even those that
are merely waiting for work.
125 */
126 kmp_int32
127 __kmpc_global_num_threads(ident_t *loc)
128 {
129     KC_TRACE(10,("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
130 
131     return TCR_4(__kmp_all_nth);
132 }
133 
134 /*!
135 @ingroup THREAD_STATES
136 @param loc Source location information.
137 @return The thread number of the calling thread in the innermost active parallel construct.
138 
139 */
140 kmp_int32
141 __kmpc_bound_thread_num(ident_t *loc)
142 {
143     KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) );
144     return __kmp_tid_from_gtid( __kmp_entry_gtid() );
145 }
146 
147 /*!
148 @ingroup THREAD_STATES
149 @param loc Source location information.
150 @return The number of threads in the innermost active parallel construct.
151 */
152 kmp_int32
153 __kmpc_bound_num_threads(ident_t *loc)
154 {
155     KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) );
156 
157     return __kmp_entry_thread() -> th.th_team -> t.t_nproc;
158 }
159 
160 /*!
161  * @ingroup DEPRECATED
162  * @param loc location description
163  *
164  * This function need not be called. It always returns TRUE.
165  */
166 kmp_int32
167 __kmpc_ok_to_fork(ident_t *loc)
168 {
169 #ifndef KMP_DEBUG
170 
171     return TRUE;
172 
173 #else
174 
175     const char *semi2;
176     const char *semi3;
177     int line_no;
178 
179     if (__kmp_par_range == 0) {
180         return TRUE;
181     }
182     semi2 = loc->psource;
183     if (semi2 == NULL) {
184         return TRUE;
185     }
186     semi2 = strchr(semi2, ';');
187     if (semi2 == NULL) {
188         return TRUE;
189     }
190     semi2 = strchr(semi2 + 1, ';');
191     if (semi2 == NULL) {
192         return TRUE;
193     }
194     if (__kmp_par_range_filename[0]) {
195         const char *name = semi2 - 1;
196         while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
197             name--;
198         }
199         if ((*name == '/') || (*name == ';')) {
200             name++;
201         }
202         if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
203             return __kmp_par_range < 0;
204         }
205     }
206     semi3 = strchr(semi2 + 1, ';');
207     if (__kmp_par_range_routine[0]) {
208         if ((semi3 != NULL) && (semi3 > semi2)
209           && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
210             return __kmp_par_range < 0;
211         }
212     }
    if ((semi3 != NULL) && (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1)) {
214         if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
215             return __kmp_par_range > 0;
216         }
217         return __kmp_par_range < 0;
218     }
219     return TRUE;
220 
221 #endif /* KMP_DEBUG */
222 
223 }
224 
225 /*!
226 @ingroup THREAD_STATES
227 @param loc Source location information.
228 @return 1 if this thread is executing inside an active parallel region, zero if not.
229 */
230 kmp_int32
231 __kmpc_in_parallel( ident_t *loc )
232 {
233     return __kmp_entry_thread() -> th.th_root -> r.r_active;
234 }
235 
236 /*!
237 @ingroup PARALLEL
238 @param loc source location information
239 @param global_tid global thread number
240 @param num_threads number of threads requested for this parallel construct
241 
242 Set the number of threads to be used by the next fork spawned by this thread.
243 This call is only required if the parallel construct has a `num_threads` clause.
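As an illustrative sketch (the exact lowering is compiler specific; `loc`, `gtid` and
`outlined_fn` are assumed names), a parallel construct with a `num_threads` clause might be
translated into:
@code
// #pragma omp parallel num_threads(4)
__kmpc_push_num_threads(&loc, gtid, 4);
__kmpc_fork_call(&loc, 0, (kmpc_micro)outlined_fn);
@endcode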
244 */
245 void
246 __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads )
247 {
248     KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
249       global_tid, num_threads ) );
250 
251     __kmp_push_num_threads( loc, global_tid, num_threads );
252 }
253 
254 void
255 __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid )
256 {
257     KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) );
258 
259     /* the num_threads are automatically popped */
260 }
261 
262 
263 #if OMP_40_ENABLED
264 
265 void
266 __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind )
267 {
268     KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n",
269       global_tid, proc_bind ) );
270 
271     __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind );
272 }
273 
274 #endif /* OMP_40_ENABLED */
275 
276 
277 /*!
278 @ingroup PARALLEL
279 @param loc  source location information
280 @param argc  total number of arguments in the ellipsis
281 @param microtask  pointer to callback routine consisting of outlined parallel construct
282 @param ...  pointers to shared variables that aren't global
283 
284 Do the actual fork and call the microtask in the relevant number of threads.
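A rough sketch of the expected calling pattern (not exact compiler output; `outlined_fn`,
`loc`, `a` and `b` are assumed names). The microtask receives the global and bound thread
ids by reference, followed by the shared-variable pointers passed through the ellipsis:
@code
// void outlined_fn(kmp_int32 *gtid, kmp_int32 *btid, int *a, int *b) { ... }
// #pragma omp parallel shared(a, b)
__kmpc_fork_call(&loc, 2, (kmpc_micro)outlined_fn, &a, &b);
@endcode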
285 */
286 void
287 __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
288 {
289   int         gtid = __kmp_entry_gtid();
290 
291 #if (KMP_STATS_ENABLED)
292   int inParallel = __kmpc_in_parallel(loc);
293   if (inParallel)
294   {
295       KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
296   }
297   else
298   {
299       KMP_COUNT_BLOCK(OMP_PARALLEL);
300   }
301 #endif
302 
  // Perhaps saving thr_state would be enough here.
304   {
305     va_list     ap;
306     va_start(   ap, microtask );
307 
308 #if OMPT_SUPPORT
309     ompt_frame_t* ompt_frame;
310     if (ompt_enabled) {
311        kmp_info_t *master_th = __kmp_threads[ gtid ];
312        kmp_team_t *parent_team = master_th->th.th_team;
313        ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
314        if (lwt)
315          ompt_frame = &(lwt->ompt_task_info.frame);
316        else
317        {
318          int tid = __kmp_tid_from_gtid( gtid );
319          ompt_frame = &(parent_team->t.t_implicit_task_taskdata[tid].
320          ompt_task_info.frame);
321        }
322        ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
323     }
324 #endif
325 
326 #if INCLUDE_SSC_MARKS
327     SSC_MARK_FORKING();
328 #endif
329     __kmp_fork_call( loc, gtid, fork_context_intel,
330             argc,
331 #if OMPT_SUPPORT
332             VOLATILE_CAST(void *) microtask,      // "unwrapped" task
333 #endif
334             VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
335             VOLATILE_CAST(launch_t)    __kmp_invoke_task_func,
336 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
337 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
338             &ap
339 #else
340             ap
341 #endif
342             );
343 #if INCLUDE_SSC_MARKS
344     SSC_MARK_JOINING();
345 #endif
346     __kmp_join_call( loc, gtid
347 #if OMPT_SUPPORT
348         , fork_context_intel
349 #endif
350     );
351 
352     va_end( ap );
353 
354   }
355 }
356 
357 #if OMP_40_ENABLED
358 /*!
359 @ingroup PARALLEL
360 @param loc source location information
361 @param global_tid global thread number
362 @param num_teams number of teams requested for the teams construct
363 @param num_threads number of threads per team requested for the teams construct
364 
365 Set the number of teams to be used by the teams construct.
366 This call is only required if the teams construct has a `num_teams` clause
367 or a `thread_limit` clause (or both).
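A sketch of one possible call sequence (compiler specific; `loc`, `gtid` and
`outlined_teams_fn` are assumed names):
@code
// #pragma omp teams num_teams(2) thread_limit(8)
__kmpc_push_num_teams(&loc, gtid, 2, 8);
__kmpc_fork_teams(&loc, 0, (kmpc_micro)outlined_teams_fn);
@endcode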
368 */
369 void
370 __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads )
371 {
372     KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
373       global_tid, num_teams, num_threads ) );
374 
375     __kmp_push_num_teams( loc, global_tid, num_teams, num_threads );
376 }
377 
378 /*!
379 @ingroup PARALLEL
380 @param loc  source location information
381 @param argc  total number of arguments in the ellipsis
382 @param microtask  pointer to callback routine consisting of outlined teams construct
383 @param ...  pointers to shared variables that aren't global
384 
385 Do the actual fork and call the microtask in the relevant number of threads.
386 */
387 void
388 __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
389 {
390     int         gtid = __kmp_entry_gtid();
391     kmp_info_t *this_thr = __kmp_threads[ gtid ];
392     va_list     ap;
393     va_start(   ap, microtask );
394 
395     KMP_COUNT_BLOCK(OMP_TEAMS);
396 
397     // remember teams entry point and nesting level
398     this_thr->th.th_teams_microtask = microtask;
399     this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
400 
401 #if OMPT_SUPPORT
402     kmp_team_t *parent_team = this_thr->th.th_team;
403     int tid = __kmp_tid_from_gtid( gtid );
404     if (ompt_enabled) {
405         parent_team->t.t_implicit_task_taskdata[tid].
406            ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(1);
407     }
408 #endif
409 
    // If __kmpc_push_num_teams was not called, set the default number of teams.
411     if ( this_thr->th.th_teams_size.nteams == 0 ) {
412         __kmp_push_num_teams( loc, gtid, 0, 0 );
413     }
414     KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
415     KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
416     KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
417 
418     __kmp_fork_call( loc, gtid, fork_context_intel,
419             argc,
420 #if OMPT_SUPPORT
421             VOLATILE_CAST(void *) microtask,               // "unwrapped" task
422 #endif
423             VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
424             VOLATILE_CAST(launch_t)    __kmp_invoke_teams_master,
425 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
426             &ap
427 #else
428             ap
429 #endif
430             );
431     __kmp_join_call( loc, gtid
432 #if OMPT_SUPPORT
433         , fork_context_intel
434 #endif
435     );
436 
437     this_thr->th.th_teams_microtask = NULL;
438     this_thr->th.th_teams_level = 0;
439     *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L;
440     va_end( ap );
441 }
442 #endif /* OMP_40_ENABLED */
443 
444 
445 //
446 // I don't think this function should ever have been exported.
447 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
448 // openmp code ever called it, but it's been exported from the RTL for so
449 // long that I'm afraid to remove the definition.
450 //
451 int
452 __kmpc_invoke_task_func( int gtid )
453 {
454     return __kmp_invoke_task_func( gtid );
455 }
456 
457 /*!
458 @ingroup PARALLEL
459 @param loc  source location information
460 @param global_tid  global thread number
461 
462 Enter a serialized parallel construct. This interface is used to handle a
463 conditional parallel region, like this,
464 @code
465 #pragma omp parallel if (condition)
466 @endcode
467 when the condition is false.
468 */
469 void
470 __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
471 {
472     // The implementation is now in kmp_runtime.cpp so that it can share static
473     // functions with kmp_fork_call since the tasks to be done are similar in
474     // each case.
475     __kmp_serialized_parallel(loc, global_tid);
476 }
477 
478 /*!
479 @ingroup PARALLEL
480 @param loc  source location information
481 @param global_tid  global thread number
482 
483 Leave a serialized parallel construct.
484 */
485 void
486 __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
487 {
488     kmp_internal_control_t *top;
489     kmp_info_t *this_thr;
490     kmp_team_t *serial_team;
491 
492     KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) );
493 
494     /* skip all this code for autopar serialized loops since it results in
495        unacceptable overhead */
496     if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
497         return;
498 
499     // Not autopar code
500     if( ! TCR_4( __kmp_init_parallel ) )
501         __kmp_parallel_initialize();
502 
503     this_thr    = __kmp_threads[ global_tid ];
504     serial_team = this_thr->th.th_serial_team;
505 
#if OMP_45_ENABLED
    kmp_task_team_t *task_team = this_thr->th.th_task_team;

    // we need to wait for the proxy tasks before finishing the thread
    if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
        __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL) ); // is an ITT object needed here?
#endif
513 
514     KMP_MB();
515     KMP_DEBUG_ASSERT( serial_team );
516     KMP_ASSERT(       serial_team -> t.t_serialized );
517     KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team );
518     KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team );
519     KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
520     KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
521 
522     /* If necessary, pop the internal control stack values and replace the team values */
523     top = serial_team -> t.t_control_stack_top;
524     if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) {
525         copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top );
526         serial_team -> t.t_control_stack_top = top -> next;
527         __kmp_free(top);
528     }
529 
530     //if( serial_team -> t.t_serialized > 1 )
531     serial_team -> t.t_level--;
532 
533     /* pop dispatch buffers stack */
534     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
535     {
536         dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer;
537         serial_team->t.t_dispatch->th_disp_buffer =
538             serial_team->t.t_dispatch->th_disp_buffer->next;
539         __kmp_free( disp_buffer );
540     }
541 
542     -- serial_team -> t.t_serialized;
543     if ( serial_team -> t.t_serialized == 0 ) {
544 
545         /* return to the parallel section */
546 
547 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
548         if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) {
549             __kmp_clear_x87_fpu_status_word();
550             __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
551             __kmp_load_mxcsr( &serial_team->t.t_mxcsr );
552         }
553 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
554 
555         this_thr -> th.th_team           = serial_team -> t.t_parent;
556         this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid;
557 
558         /* restore values cached in the thread */
559         this_thr -> th.th_team_nproc     = serial_team -> t.t_parent -> t.t_nproc;          /*  JPH */
560         this_thr -> th.th_team_master    = serial_team -> t.t_parent -> t.t_threads[0];     /* JPH */
561         this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized;
562 
563         /* TODO the below shouldn't need to be adjusted for serialized teams */
564         this_thr -> th.th_dispatch       = & this_thr -> th.th_team ->
565             t.t_dispatch[ serial_team -> t.t_master_tid ];
566 
567         __kmp_pop_current_task_from_thread( this_thr );
568 
569         KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 );
570         this_thr -> th.th_current_task -> td_flags.executing = 1;
571 
572         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
573             // Copy the task team from the new child / old parent team to the thread.
574             this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
575             KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n",
576                             global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
577         }
578     } else {
579         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
580             KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n",
581                             global_tid, serial_team, serial_team -> t.t_serialized ) );
582         }
583     }
584 
585     if ( __kmp_env_consistency_check )
586         __kmp_pop_parallel( global_tid, NULL );
587 }
588 
589 /*!
590 @ingroup SYNCHRONIZATION
591 @param loc  source location information.
592 
593 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
594 depending on the memory ordering convention obeyed by the compiler
595 even that may not be necessary).
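
A sketch of the corresponding lowering (the variable list of the flush directive, if any,
is not passed to the runtime; only the source location is):
@code
// #pragma omp flush
__kmpc_flush(&loc);
@endcode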
596 */
597 void
598 __kmpc_flush(ident_t *loc)
599 {
600     KC_TRACE( 10, ("__kmpc_flush: called\n" ) );
601 
    /* need an explicit __mf() here since the library uses volatile instead */
603     KMP_MB();       /* Flush all pending memory write invalidates.  */
604 
605     #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
606         #if KMP_MIC
607             // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
608             // We shouldn't need it, though, since the ABI rules require that
609             // * If the compiler generates NGO stores it also generates the fence
610             // * If users hand-code NGO stores they should insert the fence
611             // therefore no incomplete unordered stores should be visible.
612         #else
613             // C74404
614             // This is to address non-temporal store instructions (sfence needed).
            // The clflush instruction is also addressed (mfence needed).
            // Probably the non-temporal load movntdqa instruction should also be addressed.
617             // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
618             if ( ! __kmp_cpuinfo.initialized ) {
619                 __kmp_query_cpuid( & __kmp_cpuinfo );
620             }; // if
621             if ( ! __kmp_cpuinfo.sse2 ) {
622                 // CPU cannot execute SSE2 instructions.
623             } else {
624                 #if KMP_COMPILER_ICC
625                 _mm_mfence();
626                 #elif KMP_COMPILER_MSVC
627                 MemoryBarrier();
628                 #else
629                 __sync_synchronize();
630                 #endif // KMP_COMPILER_ICC
631             }; // if
632         #endif // KMP_MIC
633     #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
634         // Nothing to see here move along
635     #elif KMP_ARCH_PPC64
636         // Nothing needed here (we have a real MB above).
        #if KMP_OS_CNK
            // The flushing thread needs to yield here; this prevents a
            // busy-waiting thread from saturating the pipeline. flush is
            // often used in loops like this:
            //   while (!flag) {
            //     #pragma omp flush(flag)
            //   }
            // and adding the yield here is good for at least a 10x speedup
            // when running >2 threads per core (on the NAS LU benchmark).
            __kmp_yield(TRUE);
        #endif
648     #else
649         #error Unknown or unsupported architecture
650     #endif
651 
652 }
653 
654 /* -------------------------------------------------------------------------- */
655 
656 /* -------------------------------------------------------------------------- */
657 
658 /*!
659 @ingroup SYNCHRONIZATION
660 @param loc source location information
661 @param global_tid thread id.
662 
663 Execute a barrier.
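
A sketch of the typical use (the thread id is normally obtained via
__kmpc_global_thread_num; `loc` is an assumed ident_t):
@code
// #pragma omp barrier
__kmpc_barrier(&loc, __kmpc_global_thread_num(&loc));
@endcode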
664 */
665 void
666 __kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
667 {
668     KMP_COUNT_BLOCK(OMP_BARRIER);
669     KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
670 
671     if (! TCR_4(__kmp_init_parallel))
672         __kmp_parallel_initialize();
673 
674     if ( __kmp_env_consistency_check ) {
675         if ( loc == 0 ) {
676             KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
677         }; // if
678 
679         __kmp_check_barrier( global_tid, ct_barrier, loc );
680     }
681 
682 #if OMPT_SUPPORT && OMPT_TRACE
683     ompt_frame_t * ompt_frame;
684     if (ompt_enabled ) {
685         ompt_frame = __ompt_get_task_frame_internal(0);
686         if ( ompt_frame->reenter_runtime_frame == NULL )
687             ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
688     }
689 #endif
690     __kmp_threads[ global_tid ]->th.th_ident = loc;
691     // TODO: explicit barrier_wait_id:
692     //   this function is called when 'barrier' directive is present or
693     //   implicit barrier at the end of a worksharing construct.
694     // 1) better to add a per-thread barrier counter to a thread data structure
695     // 2) set to 0 when a new team is created
    // 3) no sync is required
697 
698     __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
699 #if OMPT_SUPPORT && OMPT_TRACE
700     if (ompt_enabled ) {
701         ompt_frame->reenter_runtime_frame = NULL;
702     }
703 #endif
704 }
705 
706 /* The BARRIER for a MASTER section is always explicit   */
707 /*!
708 @ingroup WORK_SHARING
709 @param loc  source location information.
@param global_tid  global thread number.
711 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
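
A sketch of the expected usage pattern (compiler specific; `loc` and `gtid` are assumed
to be in scope):
@code
if (__kmpc_master(&loc, gtid)) {
    // ... body of the master region, executed by the master thread only ...
    __kmpc_end_master(&loc, gtid);
}
@endcode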
712 */
713 kmp_int32
714 __kmpc_master(ident_t *loc, kmp_int32 global_tid)
715 {
716     int status = 0;
717 
718     KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) );
719 
720     if( ! TCR_4( __kmp_init_parallel ) )
721         __kmp_parallel_initialize();
722 
723     if( KMP_MASTER_GTID( global_tid )) {
724         KMP_COUNT_BLOCK(OMP_MASTER);
725         KMP_PUSH_PARTITIONED_TIMER(OMP_master);
726         status = 1;
727     }
728 
729 #if OMPT_SUPPORT && OMPT_TRACE
730     if (status) {
731         if (ompt_enabled &&
732             ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
733             kmp_info_t  *this_thr        = __kmp_threads[ global_tid ];
734             kmp_team_t  *team            = this_thr -> th.th_team;
735 
736             int  tid = __kmp_tid_from_gtid( global_tid );
737             ompt_callbacks.ompt_callback(ompt_event_master_begin)(
738                 team->t.ompt_team_info.parallel_id,
739                 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
740         }
741     }
742 #endif
743 
744     if ( __kmp_env_consistency_check ) {
745 #if KMP_USE_DYNAMIC_LOCK
746         if (status)
747             __kmp_push_sync( global_tid, ct_master, loc, NULL, 0 );
748         else
749             __kmp_check_sync( global_tid, ct_master, loc, NULL, 0 );
750 #else
751         if (status)
752             __kmp_push_sync( global_tid, ct_master, loc, NULL );
753         else
754             __kmp_check_sync( global_tid, ct_master, loc, NULL );
755 #endif
756     }
757 
758     return status;
759 }
760 
761 /*!
762 @ingroup WORK_SHARING
763 @param loc  source location information.
@param global_tid  global thread number.
765 
766 Mark the end of a <tt>master</tt> region. This should only be called by the thread
767 that executes the <tt>master</tt> region.
768 */
769 void
770 __kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
771 {
772     KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
773 
774     KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
775     KMP_POP_PARTITIONED_TIMER();
776 
777 #if OMPT_SUPPORT && OMPT_TRACE
778     kmp_info_t  *this_thr        = __kmp_threads[ global_tid ];
779     kmp_team_t  *team            = this_thr -> th.th_team;
780     if (ompt_enabled &&
781         ompt_callbacks.ompt_callback(ompt_event_master_end)) {
782         int  tid = __kmp_tid_from_gtid( global_tid );
783         ompt_callbacks.ompt_callback(ompt_event_master_end)(
784             team->t.ompt_team_info.parallel_id,
785             team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
786     }
787 #endif
788 
789     if ( __kmp_env_consistency_check ) {
790         if( global_tid < 0 )
791             KMP_WARNING( ThreadIdentInvalid );
792 
793         if( KMP_MASTER_GTID( global_tid ))
794             __kmp_pop_sync( global_tid, ct_master, loc );
795     }
796 }
797 
798 /*!
799 @ingroup WORK_SHARING
800 @param loc  source location information.
801 @param gtid  global thread number.
802 
803 Start execution of an <tt>ordered</tt> construct.
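
A sketch of the usage pattern inside a loop with an ordered clause (`loc` and `gtid`
assumed to be in scope):
@code
__kmpc_ordered(&loc, gtid);
// ... body of #pragma omp ordered ...
__kmpc_end_ordered(&loc, gtid);
@endcode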
804 */
805 void
806 __kmpc_ordered( ident_t * loc, kmp_int32 gtid )
807 {
808     int cid = 0;
809     kmp_info_t *th;
810     KMP_DEBUG_ASSERT( __kmp_init_serial );
811 
812     KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid ));
813 
814     if (! TCR_4(__kmp_init_parallel))
815         __kmp_parallel_initialize();
816 
817 #if USE_ITT_BUILD
818     __kmp_itt_ordered_prep( gtid );
819     // TODO: ordered_wait_id
820 #endif /* USE_ITT_BUILD */
821 
822     th = __kmp_threads[ gtid ];
823 
824 #if OMPT_SUPPORT && OMPT_TRACE
825     if (ompt_enabled) {
826         /* OMPT state update */
827         th->th.ompt_thread_info.wait_id = (uint64_t) loc;
828         th->th.ompt_thread_info.state = ompt_state_wait_ordered;
829 
830         /* OMPT event callback */
831         if (ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
832             ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
833                 th->th.ompt_thread_info.wait_id);
834         }
835     }
836 #endif
837 
838     if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
839         (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
840     else
841         __kmp_parallel_deo( & gtid, & cid, loc );
842 
843 #if OMPT_SUPPORT && OMPT_TRACE
844     if (ompt_enabled) {
845         /* OMPT state update */
846         th->th.ompt_thread_info.state = ompt_state_work_parallel;
847         th->th.ompt_thread_info.wait_id = 0;
848 
849         /* OMPT event callback */
850         if (ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
851             ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
852                 th->th.ompt_thread_info.wait_id);
853         }
854     }
855 #endif
856 
857 #if USE_ITT_BUILD
858     __kmp_itt_ordered_start( gtid );
859 #endif /* USE_ITT_BUILD */
860 }
861 
862 /*!
863 @ingroup WORK_SHARING
864 @param loc  source location information.
865 @param gtid  global thread number.
866 
867 End execution of an <tt>ordered</tt> construct.
868 */
869 void
870 __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
871 {
872     int cid = 0;
873     kmp_info_t *th;
874 
875     KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) );
876 
877 #if USE_ITT_BUILD
878     __kmp_itt_ordered_end( gtid );
879     // TODO: ordered_wait_id
880 #endif /* USE_ITT_BUILD */
881 
882     th = __kmp_threads[ gtid ];
883 
884     if ( th -> th.th_dispatch -> th_dxo_fcn != 0 )
885         (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
886     else
887         __kmp_parallel_dxo( & gtid, & cid, loc );
888 
889 #if OMPT_SUPPORT && OMPT_BLAME
890     if (ompt_enabled &&
891         ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
892         ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
893             th->th.ompt_thread_info.wait_id);
894     }
895 #endif
896 }
897 
898 #if KMP_USE_DYNAMIC_LOCK
899 
900 static __forceinline void
901 __kmp_init_indirect_csptr(kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid, kmp_indirect_locktag_t tag)
902 {
    // A pointer to the allocated indirect lock is written to crit; the lock index returned by the allocator is ignored.
904     void *idx;
905     kmp_indirect_lock_t **lck;
906     lck = (kmp_indirect_lock_t **)crit;
907     kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
908     KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
909     KMP_SET_I_LOCK_LOCATION(ilk, loc);
910     KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
911     KA_TRACE(20, ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
912 #if USE_ITT_BUILD
913     __kmp_itt_critical_creating(ilk->lock, loc);
914 #endif
915     int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk);
916     if (status == 0) {
917 #if USE_ITT_BUILD
918         __kmp_itt_critical_destroyed(ilk->lock);
919 #endif
920         // We don't really need to destroy the unclaimed lock here since it will be cleaned up at program exit.
921         //KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
922     }
923     KMP_DEBUG_ASSERT(*lck != NULL);
924 }
925 
926 // Fast-path acquire tas lock
927 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) {                                                                       \
928     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                                                  \
929     if (l->lk.poll != KMP_LOCK_FREE(tas) ||                                                                      \
930             ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) {      \
931         kmp_uint32 spins;                                                                                        \
932         KMP_FSYNC_PREPARE(l);                                                                                    \
933         KMP_INIT_YIELD(spins);                                                                                   \
934         if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {                            \
935             KMP_YIELD(TRUE);                                                                                     \
936         } else {                                                                                                 \
937             KMP_YIELD_SPIN(spins);                                                                               \
938         }                                                                                                        \
939         kmp_backoff_t backoff = __kmp_spin_backoff_params;                                                       \
940         while (l->lk.poll != KMP_LOCK_FREE(tas) ||                                                               \
941                ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) {   \
942             __kmp_spin_backoff(&backoff);                                                                        \
943             if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {                        \
944                 KMP_YIELD(TRUE);                                                                                 \
945             } else {                                                                                             \
946                 KMP_YIELD_SPIN(spins);                                                                           \
947             }                                                                                                    \
948         }                                                                                                        \
949     }                                                                                                            \
950     KMP_FSYNC_ACQUIRED(l);                                                                                       \
951 }
952 
953 // Fast-path test tas lock
954 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) {                                                            \
955     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                                        \
956     rc = l->lk.poll == KMP_LOCK_FREE(tas) &&                                                           \
957          KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas));   \
958 }
959 
960 // Fast-path release tas lock
961 #define KMP_RELEASE_TAS_LOCK(lock, gtid) {                          \
962     TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas));   \
963     KMP_MB();                                                       \
964 }
965 
966 #if KMP_USE_FUTEX
967 
968 # include <unistd.h>
969 # include <sys/syscall.h>
970 # ifndef FUTEX_WAIT
971 #  define FUTEX_WAIT 0
972 # endif
973 # ifndef FUTEX_WAKE
974 #  define FUTEX_WAKE 1
975 # endif
976 
977 // Fast-path acquire futex lock
978 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) {                                                                        \
979     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                                                               \
980     kmp_int32 gtid_code = (gtid+1) << 1;                                                                            \
981     KMP_MB();                                                                                                       \
982     KMP_FSYNC_PREPARE(ftx);                                                                                         \
983     kmp_int32 poll_val;                                                                                             \
984     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),                           \
985                                                    KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {     \
986         kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                                                              \
987         if (!cond) {                                                                                                \
988             if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, poll_val | KMP_LOCK_BUSY(1, futex))) {      \
989                 continue;                                                                                           \
990             }                                                                                                       \
991             poll_val |= KMP_LOCK_BUSY(1, futex);                                                                    \
992         }                                                                                                           \
993         kmp_int32 rc;                                                                                               \
994         if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) {                \
995             continue;                                                                                               \
996         }                                                                                                           \
997         gtid_code |= 1;                                                                                             \
998     }                                                                                                               \
999     KMP_FSYNC_ACQUIRED(ftx);                                                                                        \
1000 }
1001 
1002 // Fast-path test futex lock
1003 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) {                                                                       \
1004     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                                                               \
1005     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), KMP_LOCK_BUSY(gtid+1 << 1, futex))) {    \
1006         KMP_FSYNC_ACQUIRED(ftx);                                                                                    \
1007         rc = TRUE;                                                                                                  \
1008     } else {                                                                                                        \
1009         rc = FALSE;                                                                                                 \
1010     }                                                                                                               \
1011 }
1012 
1013 // Fast-path release futex lock
1014 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) {                                                        \
1015     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                                               \
1016     KMP_MB();                                                                                       \
1017     KMP_FSYNC_RELEASING(ftx);                                                                       \
1018     kmp_int32 poll_val = KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));                   \
1019     if (KMP_LOCK_STRIP(poll_val) & 1) {                                                             \
1020         syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);   \
1021     }                                                                                               \
1022     KMP_MB();                                                                                       \
1023     KMP_YIELD(TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));              \
1024 }
1025 
1026 #endif // KMP_USE_FUTEX
1027 
1028 #else // KMP_USE_DYNAMIC_LOCK
1029 
1030 static kmp_user_lock_p
1031 __kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid )
1032 {
1033     kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1034 
1035     //
1036     // Because of the double-check, the following load
1037     // doesn't need to be volatile.
1038     //
1039     kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1040 
1041     if ( lck == NULL ) {
1042         void * idx;
1043 
1044         // Allocate & initialize the lock.
1045         // Remember allocated locks in table in order to free them in __kmp_cleanup()
1046         lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section );
1047         __kmp_init_user_lock_with_checks( lck );
1048         __kmp_set_user_lock_location( lck, loc );
1049 #if USE_ITT_BUILD
1050         __kmp_itt_critical_creating( lck );
            // __kmp_itt_critical_creating() should be called *before* the first usage of the underlying
            // lock. It is the only place where we can guarantee it. There is a chance the lock will be
            // destroyed without ever being used, but that is not a problem, because this is not a real event
            // seen by the user but rather a way of setting a name for the object (lock). See more details in kmp_itt.h.
1055 #endif /* USE_ITT_BUILD */
1056 
1057         //
1058         // Use a cmpxchg instruction to slam the start of the critical
1059         // section with the lock pointer.  If another thread beat us
1060         // to it, deallocate the lock, and use the lock that the other
1061         // thread allocated.
1062         //
1063         int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck );
1064 
1065         if ( status == 0 ) {
1066             // Deallocate the lock and reload the value.
1067 #if USE_ITT_BUILD
1068             __kmp_itt_critical_destroyed( lck );
1069                 // Let ITT know the lock is destroyed and the same memory location may be reused for
1070                 // another purpose.
1071 #endif /* USE_ITT_BUILD */
1072             __kmp_destroy_user_lock_with_checks( lck );
1073             __kmp_user_lock_free( &idx, gtid, lck );
1074             lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1075             KMP_DEBUG_ASSERT( lck != NULL );
1076         }
1077     }
1078     return lck;
1079 }
1080 
1081 #endif // KMP_USE_DYNAMIC_LOCK
1082 
1083 /*!
1084 @ingroup WORK_SHARING
1085 @param loc  source location information.
@param global_tid  global thread number.
1087 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or
1088 some other suitably unique value.
1089 
1090 Enter code protected by a `critical` construct.
1091 This function blocks until the executing thread can enter the critical section.
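
A sketch of the expected pairing (the compiler typically reserves one kmp_critical_name
per critical-section name; `crit_name`, `loc` and `gtid` are assumed names):
@code
__kmpc_critical(&loc, gtid, &crit_name);
// ... code protected by #pragma omp critical (name) ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode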
1092 */
1093 void
1094 __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit )
1095 {
1096 #if KMP_USE_DYNAMIC_LOCK
1097     __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1098 #else
1099     KMP_COUNT_BLOCK(OMP_CRITICAL);
1100     KMP_TIME_PARTITIONED_BLOCK(OMP_critical_wait);        /* Time spent waiting to enter the critical section */
1101     kmp_user_lock_p lck;
1102 
1103     KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
1104 
1105     //TODO: add THR_OVHD_STATE
1106 
1107     KMP_CHECK_USER_LOCK_INIT();
1108 
1109     if ( ( __kmp_user_lock_kind == lk_tas )
1110       && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1111         lck = (kmp_user_lock_p)crit;
1112     }
1113 #if KMP_USE_FUTEX
1114     else if ( ( __kmp_user_lock_kind == lk_futex )
1115       && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1116         lck = (kmp_user_lock_p)crit;
1117     }
1118 #endif
1119     else { // ticket, queuing or drdpa
1120         lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
1121     }
1122 
1123     if ( __kmp_env_consistency_check )
1124         __kmp_push_sync( global_tid, ct_critical, loc, lck );
1125 
    /* since the critical directive binds to all threads, not just the
     * current team, we have to check this even if we are in a
     * serialized team */
    /* also, even if we are the uber thread, we still have to acquire the lock,
     * as we may have to contend with sibling threads */
1131 
1132 #if USE_ITT_BUILD
1133     __kmp_itt_critical_acquiring( lck );
1134 #endif /* USE_ITT_BUILD */
    // The value of 'crit' is suitable for use as the critical_id of the critical section directive.
1136     __kmp_acquire_user_lock_with_checks( lck, global_tid );
1137 
1138 #if USE_ITT_BUILD
1139     __kmp_itt_critical_acquired( lck );
1140 #endif /* USE_ITT_BUILD */
1141 
1142     KMP_START_EXPLICIT_TIMER(OMP_critical);
1143     KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
1144 #endif // KMP_USE_DYNAMIC_LOCK
1145 }
1146 
1147 #if KMP_USE_DYNAMIC_LOCK
1148 
1149 // Converts the given hint to an internal lock implementation
1150 static __forceinline kmp_dyna_lockseq_t
1151 __kmp_map_hint_to_lock(uintptr_t hint)
1152 {
1153 #if KMP_USE_TSX
1154 # define KMP_TSX_LOCK(seq) lockseq_##seq
1155 #else
1156 # define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1157 #endif
1158 
1159 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1160 # define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1161 #else
1162 # define KMP_CPUINFO_RTM 0
1163 #endif
1164 
1165     // Hints that do not require further logic
1166     if (hint & kmp_lock_hint_hle)
1167         return KMP_TSX_LOCK(hle);
1168     if (hint & kmp_lock_hint_rtm)
1169         return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm): __kmp_user_lock_seq;
1170     if (hint & kmp_lock_hint_adaptive)
1171         return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive): __kmp_user_lock_seq;
1172 
1173     // Rule out conflicting hints first by returning the default lock
1174     if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1175         return __kmp_user_lock_seq;
1176     if ((hint & omp_lock_hint_speculative) && (hint & omp_lock_hint_nonspeculative))
1177         return __kmp_user_lock_seq;
1178 
1179     // Do not even consider speculation when it appears to be contended
1180     if (hint & omp_lock_hint_contended)
1181         return lockseq_queuing;
1182 
1183     // Uncontended lock without speculation
1184     if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1185         return lockseq_tas;
1186 
1187     // HLE lock for speculation
1188     if (hint & omp_lock_hint_speculative)
1189         return KMP_TSX_LOCK(hle);
1190 
1191     return __kmp_user_lock_seq;
1192 }
1193 
1194 /*!
1195 @ingroup WORK_SHARING
1196 @param loc  source location information.
1197 @param global_tid  global thread number.
1198 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section,
1199 or some other suitably unique value.
1200 @param hint the lock hint.
1201 
1202 Enter code protected by a `critical` construct with a hint. The hint value is used to suggest a lock implementation.
1203 This function blocks until the executing thread can enter the critical section unless the hint suggests use of
1204 speculative execution and the hardware supports it.
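
A sketch of the expected pairing, assuming the OpenMP 4.5 hint clause (`crit_name`, `loc`
and `gtid` are assumed names; the hint constants come from omp.h):
@code
// #pragma omp critical (name) hint(omp_lock_hint_speculative)
__kmpc_critical_with_hint(&loc, gtid, &crit_name, omp_lock_hint_speculative);
// ... protected code ...
__kmpc_end_critical(&loc, gtid, &crit_name);
@endcode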
1205 */
1206 void
1207 __kmpc_critical_with_hint( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit, uintptr_t hint )
1208 {
1209     KMP_COUNT_BLOCK(OMP_CRITICAL);
1210     kmp_user_lock_p lck;
1211 
1212     KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
1213 
1214     kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1215     // Check if it is initialized.
1216     if (*lk == 0) {
1217         kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1218         if (KMP_IS_D_LOCK(lckseq)) {
1219             KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(lckseq));
1220         } else {
1221             __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1222         }
1223     }
1224     // Branch for accessing the actual lock object and set operation. This branching is inevitable since
1225     // this lock initialization does not follow the normal dispatch path (lock table is not used).
1226     if (KMP_EXTRACT_D_TAG(lk) != 0) {
1227         lck = (kmp_user_lock_p)lk;
1228         if (__kmp_env_consistency_check) {
1229             __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint));
1230         }
1231 # if USE_ITT_BUILD
1232         __kmp_itt_critical_acquiring(lck);
1233 # endif
1234 # if KMP_USE_INLINED_TAS
1235         if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1236             KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1237         } else
1238 # elif KMP_USE_INLINED_FUTEX
1239         if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1240             KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1241         } else
1242 # endif
1243         {
1244             KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1245         }
1246     } else {
1247         kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1248         lck = ilk->lock;
1249         if (__kmp_env_consistency_check) {
1250             __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint));
1251         }
1252 # if USE_ITT_BUILD
1253         __kmp_itt_critical_acquiring(lck);
1254 # endif
1255         KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1256     }
1257 
1258 #if USE_ITT_BUILD
1259     __kmp_itt_critical_acquired( lck );
1260 #endif /* USE_ITT_BUILD */
1261 
1262     KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1263     KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
1264 } // __kmpc_critical_with_hint
1265 
1266 #endif // KMP_USE_DYNAMIC_LOCK
1267 
1268 /*!
1269 @ingroup WORK_SHARING
1270 @param loc  source location information.
@param global_tid  global thread number.
1272 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or
1273 some other suitably unique value.
1274 
1275 Leave a critical section, releasing any lock that was held during its execution.
1276 */
1277 void
1278 __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
1279 {
1280     kmp_user_lock_p lck;
1281 
1282     KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid ));
1283 
1284 #if KMP_USE_DYNAMIC_LOCK
1285     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1286         lck = (kmp_user_lock_p)crit;
1287         KMP_ASSERT(lck != NULL);
1288         if (__kmp_env_consistency_check) {
1289             __kmp_pop_sync(global_tid, ct_critical, loc);
1290         }
1291 # if USE_ITT_BUILD
1292         __kmp_itt_critical_releasing( lck );
1293 # endif
1294 # if KMP_USE_INLINED_TAS
1295         if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1296             KMP_RELEASE_TAS_LOCK(lck, global_tid);
1297         } else
1298 # elif KMP_USE_INLINED_FUTEX
1299         if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1300             KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1301         } else
1302 # endif
1303         {
1304             KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1305         }
1306     } else {
1307         kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1308         KMP_ASSERT(ilk != NULL);
1309         lck = ilk->lock;
1310         if (__kmp_env_consistency_check) {
1311             __kmp_pop_sync(global_tid, ct_critical, loc);
1312         }
1313 # if USE_ITT_BUILD
1314         __kmp_itt_critical_releasing( lck );
1315 # endif
1316         KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1317     }
1318 
1319 #else // KMP_USE_DYNAMIC_LOCK
1320 
1321     if ( ( __kmp_user_lock_kind == lk_tas )
1322       && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1323         lck = (kmp_user_lock_p)crit;
1324     }
1325 #if KMP_USE_FUTEX
1326     else if ( ( __kmp_user_lock_kind == lk_futex )
1327       && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1328         lck = (kmp_user_lock_p)crit;
1329     }
1330 #endif
1331     else { // ticket, queuing or drdpa
1332         lck = (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit));
1333     }
1334 
1335     KMP_ASSERT(lck != NULL);
1336 
1337     if ( __kmp_env_consistency_check )
1338         __kmp_pop_sync( global_tid, ct_critical, loc );
1339 
1340 #if USE_ITT_BUILD
1341     __kmp_itt_critical_releasing( lck );
1342 #endif /* USE_ITT_BUILD */
    // The value of 'crit' is suitable for use as the critical_id of the critical section directive.
1344     __kmp_release_user_lock_with_checks( lck, global_tid );
1345 
1346 #if OMPT_SUPPORT && OMPT_BLAME
1347     if (ompt_enabled &&
1348         ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
1349         ompt_callbacks.ompt_callback(ompt_event_release_critical)(
1350             (uint64_t) lck);
1351     }
1352 #endif
1353 
1354 #endif // KMP_USE_DYNAMIC_LOCK
1355     KMP_POP_PARTITIONED_TIMER();
1356     KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
1357 }
1358 
1359 /*!
1360 @ingroup SYNCHRONIZATION
1361 @param loc source location information
1362 @param global_tid thread id.
1363 @return one if the thread should execute the master block, zero otherwise
1364 
1365 Start execution of a combined barrier and master. The barrier is executed inside this function.
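
A sketch of the expected usage (`loc` and `gtid` assumed to be in scope); the other threads
wait until the matching __kmpc_end_barrier_master releases them:
@code
if (__kmpc_barrier_master(&loc, gtid)) {
    // ... code executed only by the master thread ...
    __kmpc_end_barrier_master(&loc, gtid);
}
@endcode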
1366 */
1367 kmp_int32
1368 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid)
1369 {
1370     int status;
1371 
1372     KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) );
1373 
1374     if (! TCR_4(__kmp_init_parallel))
1375         __kmp_parallel_initialize();
1376 
1377     if ( __kmp_env_consistency_check )
1378         __kmp_check_barrier( global_tid, ct_barrier, loc );
1379 
1380 #if USE_ITT_NOTIFY
1381     __kmp_threads[global_tid]->th.th_ident = loc;
1382 #endif
1383     status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL );
1384 
1385     return (status != 0) ? 0 : 1;
1386 }
1387 
1388 /*!
1389 @ingroup SYNCHRONIZATION
1390 @param loc source location information
1391 @param global_tid thread id.
1392 
1393 Complete the execution of a combined barrier and master. This function should
1394 only be called at the completion of the <tt>master</tt> code. Other threads will
1395 still be waiting at the barrier and this call releases them.
1396 */
1397 void
1398 __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid)
1399 {
1400     KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid ));
1401 
1402     __kmp_end_split_barrier ( bs_plain_barrier, global_tid );
1403 }
1404 
1405 /*!
1406 @ingroup SYNCHRONIZATION
1407 @param loc source location information
1408 @param global_tid thread id.
1409 @return one if the thread should execute the master block, zero otherwise
1410 
1411 Start execution of a combined barrier and master(nowait) construct.
1412 The barrier is executed inside this function.
There is no equivalent "end" function, since no further synchronization is needed after the
master block; the bookkeeping that __kmpc_end_master would normally do is performed inside this call.
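
A sketch of the expected usage (`loc` and `gtid` assumed to be in scope):
@code
if (__kmpc_barrier_master_nowait(&loc, gtid)) {
    // ... code executed only by the master thread; no end call is required ...
}
@endcode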
1414 */
1415 kmp_int32
1416 __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid )
1417 {
1418     kmp_int32 ret;
1419 
1420     KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid ));
1421 
1422     if (! TCR_4(__kmp_init_parallel))
1423         __kmp_parallel_initialize();
1424 
1425     if ( __kmp_env_consistency_check ) {
1426         if ( loc == 0 ) {
1427             KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
1428         }
1429         __kmp_check_barrier( global_tid, ct_barrier, loc );
1430     }
1431 
1432 #if USE_ITT_NOTIFY
1433     __kmp_threads[global_tid]->th.th_ident = loc;
1434 #endif
1435     __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
1436 
1437     ret = __kmpc_master (loc, global_tid);
1438 
1439     if ( __kmp_env_consistency_check ) {
1440         /*  there's no __kmpc_end_master called; so the (stats) */
1441         /*  actions of __kmpc_end_master are done here          */
1442 
1443         if ( global_tid < 0 ) {
1444             KMP_WARNING( ThreadIdentInvalid );
1445         }
1446         if (ret) {
1447             /* only one thread should do the pop since only */
1448             /* one did the push (see __kmpc_master())       */
1449 
1450             __kmp_pop_sync( global_tid, ct_master, loc );
1451         }
1452     }
1453 
1454     return (ret);
1455 }
1456 
1457 /* The BARRIER for a SINGLE process section is always explicit   */
1458 /*!
1459 @ingroup WORK_SHARING
1460 @param loc  source location information
1461 @param global_tid  global thread number
1462 @return One if this thread should execute the single construct, zero otherwise.
1463 
1464 Test whether to execute a <tt>single</tt> construct.
1465 There are no implicit barriers in the two "single" calls; rather, the compiler should
1466 introduce an explicit barrier if one is required.
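
A sketch of the expected calling pattern (illustrative only; the trailing barrier
is emitted by the compiler only when the construct has no <tt>nowait</tt> clause):
@code
if ( __kmpc_single( loc, gtid ) ) {
    // ... body of the single construct, executed by exactly one thread ...
    __kmpc_end_single( loc, gtid );
}
__kmpc_barrier( loc, gtid );   // explicit barrier, omitted for 'nowait'
@endcode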
1467 */
1468 
1469 kmp_int32
1470 __kmpc_single(ident_t *loc, kmp_int32 global_tid)
1471 {
1472     kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
1473 
1474     if (rc) {
1475         // We are going to execute the single statement, so we should count it.
1476         KMP_COUNT_BLOCK(OMP_SINGLE);
1477         KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1478     }
1479 
1480 #if OMPT_SUPPORT && OMPT_TRACE
1481     kmp_info_t *this_thr        = __kmp_threads[ global_tid ];
1482     kmp_team_t *team            = this_thr -> th.th_team;
1483     int tid = __kmp_tid_from_gtid( global_tid );
1484 
1485     if (ompt_enabled) {
1486         if (rc) {
1487             if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
1488                 ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
1489                     team->t.ompt_team_info.parallel_id,
1490                     team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
1491                     team->t.ompt_team_info.microtask);
1492             }
1493         } else {
1494             if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
1495                 ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
1496                     team->t.ompt_team_info.parallel_id,
1497                     team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1498             }
1499             this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
1500         }
1501     }
1502 #endif
1503 
1504     return rc;
1505 }
1506 
1507 /*!
1508 @ingroup WORK_SHARING
1509 @param loc  source location information
1510 @param global_tid  global thread number
1511 
1512 Mark the end of a <tt>single</tt> construct.  This function should
1513 only be called by the thread that executed the block of code protected
1514 by the `single` construct.
1515 */
1516 void
1517 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
1518 {
1519     __kmp_exit_single( global_tid );
1520     KMP_POP_PARTITIONED_TIMER();
1521 
1522 #if OMPT_SUPPORT && OMPT_TRACE
1523     kmp_info_t *this_thr        = __kmp_threads[ global_tid ];
1524     kmp_team_t *team            = this_thr -> th.th_team;
1525     int tid = __kmp_tid_from_gtid( global_tid );
1526 
1527     if (ompt_enabled &&
1528         ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
1529         ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
1530             team->t.ompt_team_info.parallel_id,
1531             team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1532     }
1533 #endif
1534 }
1535 
1536 /*!
1537 @ingroup WORK_SHARING
1538 @param loc Source location
1539 @param global_tid Global thread id
1540 
1541 Mark the end of a statically scheduled loop.
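
Typically paired with a preceding <tt>__kmpc_for_static_init_*</tt> call; a hedged
sketch of the usual pattern (the init entry point is defined elsewhere in the runtime
and its arguments are shown here only for orientation):
@code
// N is the iteration count of the original loop (assumption)
kmp_int32 last, lower = 0, upper = N - 1, stride = 1;
__kmpc_for_static_init_4( loc, gtid, kmp_sch_static, &last,
                          &lower, &upper, &stride, 1, 0 );
for ( kmp_int32 i = lower; i <= upper; ++i ) {
    // ... loop body ...
}
__kmpc_for_static_fini( loc, gtid );
@endcode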
1542 */
1543 void
1544 __kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid )
1545 {
1546     KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1547 
1548 #if OMPT_SUPPORT && OMPT_TRACE
1549     if (ompt_enabled &&
1550         ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
1551         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1552         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1553         ompt_callbacks.ompt_callback(ompt_event_loop_end)(
1554             team_info->parallel_id, task_info->task_id);
1555     }
1556 #endif
1557 
1558     if ( __kmp_env_consistency_check )
1559      __kmp_pop_workshare( global_tid, ct_pdo, loc );
1560 }
1561 
1562 /*
1563  * User routines which take C-style arguments (call by value),
1564  * as distinct from the equivalent Fortran routines.
1565  */
1566 
1567 void
1568 ompc_set_num_threads( int arg )
1569 {
1570 // !!!!! TODO: check the per-task binding
1571     __kmp_set_num_threads( arg, __kmp_entry_gtid() );
1572 }
1573 
1574 void
1575 ompc_set_dynamic( int flag )
1576 {
1577     kmp_info_t *thread;
1578 
1579     /* For the thread-private implementation of the internal controls */
1580     thread = __kmp_entry_thread();
1581 
1582     __kmp_save_internal_controls( thread );
1583 
1584     set__dynamic( thread, flag ? TRUE : FALSE );
1585 }
1586 
1587 void
1588 ompc_set_nested( int flag )
1589 {
1590     kmp_info_t *thread;
1591 
1592     /* For the thread-private internal controls implementation */
1593     thread = __kmp_entry_thread();
1594 
1595     __kmp_save_internal_controls( thread );
1596 
1597     set__nested( thread, flag ? TRUE : FALSE );
1598 }
1599 
1600 void
1601 ompc_set_max_active_levels( int max_active_levels )
1602 {
1603     /* TO DO */
1604     /* we want per-task implementation of this internal control */
1605 
1606     /* For the per-thread internal controls implementation */
1607     __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels );
1608 }
1609 
1610 void
1611 ompc_set_schedule( omp_sched_t kind, int modifier )
1612 {
1613 // !!!!! TODO: check the per-task binding
1614     __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier );
1615 }
1616 
1617 int
1618 ompc_get_ancestor_thread_num( int level )
1619 {
1620     return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level );
1621 }
1622 
1623 int
1624 ompc_get_team_size( int level )
1625 {
1626     return __kmp_get_team_size( __kmp_entry_gtid(), level );
1627 }
1628 
1629 void
1630 kmpc_set_stacksize( int arg )
1631 {
1632     // __kmp_aux_set_stacksize initializes the library if needed
1633     __kmp_aux_set_stacksize( arg );
1634 }
1635 
1636 void
1637 kmpc_set_stacksize_s( size_t arg )
1638 {
1639     // __kmp_aux_set_stacksize initializes the library if needed
1640     __kmp_aux_set_stacksize( arg );
1641 }
1642 
1643 void
1644 kmpc_set_blocktime( int arg )
1645 {
1646     int gtid, tid;
1647     kmp_info_t *thread;
1648 
1649     gtid = __kmp_entry_gtid();
1650     tid = __kmp_tid_from_gtid(gtid);
1651     thread = __kmp_thread_from_gtid(gtid);
1652 
1653     __kmp_aux_set_blocktime( arg, thread, tid );
1654 }
1655 
1656 void
1657 kmpc_set_library( int arg )
1658 {
1659     // __kmp_user_set_library initializes the library if needed
1660     __kmp_user_set_library( (enum library_type)arg );
1661 }
1662 
1663 void
1664 kmpc_set_defaults( char const * str )
1665 {
1666     // __kmp_aux_set_defaults initializes the library if needed
1667     __kmp_aux_set_defaults( str, KMP_STRLEN( str ) );
1668 }
1669 
1670 void
1671 kmpc_set_disp_num_buffers( int arg )
1672 {
1673     // ignore after initialization because some teams have already
1674     // allocated dispatch buffers
1675     if( __kmp_init_serial == 0 && arg > 0 )
1676         __kmp_dispatch_num_buffers = arg;
1677 }
1678 
1679 int
1680 kmpc_set_affinity_mask_proc( int proc, void **mask )
1681 {
1682 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1683     return -1;
1684 #else
1685     if ( ! TCR_4(__kmp_init_middle) ) {
1686         __kmp_middle_initialize();
1687     }
1688     return __kmp_aux_set_affinity_mask_proc( proc, mask );
1689 #endif
1690 }
1691 
1692 int
1693 kmpc_unset_affinity_mask_proc( int proc, void **mask )
1694 {
1695 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1696     return -1;
1697 #else
1698     if ( ! TCR_4(__kmp_init_middle) ) {
1699         __kmp_middle_initialize();
1700     }
1701     return __kmp_aux_unset_affinity_mask_proc( proc, mask );
1702 #endif
1703 }
1704 
1705 int
1706 kmpc_get_affinity_mask_proc( int proc, void **mask )
1707 {
1708 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1709     return -1;
1710 #else
1711     if ( ! TCR_4(__kmp_init_middle) ) {
1712         __kmp_middle_initialize();
1713     }
1714     return __kmp_aux_get_affinity_mask_proc( proc, mask );
1715 #endif
1716 }
1717 
1718 
1719 /* -------------------------------------------------------------------------- */
1720 /*!
1721 @ingroup THREADPRIVATE
1722 @param loc       source location information
1723 @param gtid      global thread number
1724 @param cpy_size  size of the cpy_data buffer
1725 @param cpy_data  pointer to data to be copied
1726 @param cpy_func  helper function to call for copying data
1727 @param didit     flag variable: 1=single thread; 0=not single thread
1728 
1729 __kmpc_copyprivate implements the interface for the private data broadcast needed for
1730 the copyprivate clause associated with a single region in an OpenMP<sup>*</sup> program (both C and Fortran).
1731 All threads participating in the parallel region call this routine.
1732 One of the threads (called the single thread) should have the <tt>didit</tt> variable set to 1
1733 and all other threads should have that variable set to 0.
1734 All threads pass a pointer to a data buffer (cpy_data) that they have built.
1735 
1736 The OpenMP specification forbids the use of nowait on the single region when a copyprivate
1737 clause is present. However, @ref __kmpc_copyprivate implements a barrier internally to avoid
1738 race conditions, so the code generation for the single region should avoid generating a barrier
1739 after the call to @ref __kmpc_copyprivate.
1740 
1741 The <tt>gtid</tt> parameter is the global thread id for the current thread.
1742 The <tt>loc</tt> parameter is a pointer to source location information.
1743 
1744 Internal implementation: The single thread will first copy its descriptor address (cpy_data)
1745 to a team-private location, then each of the other threads will call the function pointed to by
1746 the parameter cpy_func, which copies the data by way of the cpy_data buffer.
1747 
1748 The cpy_func routine used for the copy and the contents of the data area defined by cpy_data
1749 and cpy_size may be built in any fashion that will allow the copy to be done. For instance,
1750 the cpy_data buffer can hold the actual data to be copied or it may hold a list of pointers
1751 to the data. The cpy_func routine must interpret the cpy_data buffer appropriately.
1752 
1753 The interface to cpy_func is as follows:
1754 @code
1755 void cpy_func( void *destination, void *source )
1756 @endcode
1757 where void *destination is the cpy_data pointer for the thread being copied to
1758 and void *source is the cpy_data pointer for the thread being copied from.
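
A minimal sketch of how a compiler might lower a single construct with a
copyprivate(x) clause (illustrative only; the helper name <tt>copy_x</tt>, the variable
layout and <tt>compute_value()</tt> are assumptions, not part of this interface):
@code
static void copy_x( void *dest, void *src )
{
    *(int *)dest = *(int *)src;   // broadcast the single thread's value
}

// per thread, inside the parallel region:
int x;                                          // private copy of x
kmp_int32 didit = __kmpc_single( loc, gtid );   // 1 on the single thread only
if ( didit ) {
    x = compute_value();                        // the single thread produces the value
    __kmpc_end_single( loc, gtid );
}
__kmpc_copyprivate( loc, gtid, sizeof(x), &x, copy_x, didit );
@endcode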
1759 */
1760 void
1761 __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit )
1762 {
1763     void **data_ptr;
1764 
1765     KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid ));
1766 
1767     KMP_MB();
1768 
1769     data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data;
1770 
1771     if ( __kmp_env_consistency_check ) {
1772         if ( loc == 0 ) {
1773             KMP_WARNING( ConstructIdentInvalid );
1774         }
1775     }
1776 
1777     /* ToDo: Optimize the following two barriers into some kind of split barrier */
1778 
1779     if (didit) *data_ptr = cpy_data;
1780 
1781     /* This barrier is not a barrier region boundary */
1782 #if USE_ITT_NOTIFY
1783     __kmp_threads[gtid]->th.th_ident = loc;
1784 #endif
1785     __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1786 
1787     if (! didit) (*cpy_func)( cpy_data, *data_ptr );
1788 
1789     /* Consider next barrier the user-visible barrier for barrier region boundaries */
1790     /* Nesting checks are already handled by the single construct checks */
1791 
1792 #if USE_ITT_NOTIFY
1793     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. tasks can overwrite the location)
1794 #endif
1795     __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1796 }
1797 
1798 /* -------------------------------------------------------------------------- */
1799 
1800 #define INIT_LOCK                 __kmp_init_user_lock_with_checks
1801 #define INIT_NESTED_LOCK          __kmp_init_nested_user_lock_with_checks
1802 #define ACQUIRE_LOCK              __kmp_acquire_user_lock_with_checks
1803 #define ACQUIRE_LOCK_TIMED        __kmp_acquire_user_lock_with_checks_timed
1804 #define ACQUIRE_NESTED_LOCK       __kmp_acquire_nested_user_lock_with_checks
1805 #define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed
1806 #define RELEASE_LOCK              __kmp_release_user_lock_with_checks
1807 #define RELEASE_NESTED_LOCK       __kmp_release_nested_user_lock_with_checks
1808 #define TEST_LOCK                 __kmp_test_user_lock_with_checks
1809 #define TEST_NESTED_LOCK          __kmp_test_nested_user_lock_with_checks
1810 #define DESTROY_LOCK              __kmp_destroy_user_lock_with_checks
1811 #define DESTROY_NESTED_LOCK       __kmp_destroy_nested_user_lock_with_checks
1812 
1813 
1814 /*
1815  * TODO: Make check abort messages use location info & pass it
1816  * into with_checks routines
1817  */
1818 
1819 #if KMP_USE_DYNAMIC_LOCK
1820 
1821 // internal lock initializer
1822 static __forceinline void
1823 __kmp_init_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq)
1824 {
1825     if (KMP_IS_D_LOCK(seq)) {
1826         KMP_INIT_D_LOCK(lock, seq);
1827 #if USE_ITT_BUILD
1828         __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
1829 #endif
1830     } else {
1831         KMP_INIT_I_LOCK(lock, seq);
1832 #if USE_ITT_BUILD
1833         kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1834         __kmp_itt_lock_creating(ilk->lock, loc);
1835 #endif
1836     }
1837 }
1838 
1839 // internal nest lock initializer
1840 static __forceinline void
1841 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq)
1842 {
1843 #if KMP_USE_TSX
1844     // Don't have nested lock implementation for speculative locks
1845     if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
1846         seq = __kmp_user_lock_seq;
1847 #endif
1848     switch (seq) {
1849         case lockseq_tas:
1850             seq = lockseq_nested_tas;
1851             break;
1852 #if KMP_USE_FUTEX
1853         case lockseq_futex:
1854             seq = lockseq_nested_futex;
1855             break;
1856 #endif
1857         case lockseq_ticket:
1858             seq = lockseq_nested_ticket;
1859             break;
1860         case lockseq_queuing:
1861             seq = lockseq_nested_queuing;
1862             break;
1863         case lockseq_drdpa:
1864             seq = lockseq_nested_drdpa;
1865             break;
1866         default:
1867             seq = lockseq_nested_queuing;
1868     }
1869     KMP_INIT_I_LOCK(lock, seq);
1870 #if USE_ITT_BUILD
1871     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1872     __kmp_itt_lock_creating(ilk->lock, loc);
1873 #endif
1874 }
1875 
1876 /* initialize the lock with a hint */
1877 void
1878 __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint)
1879 {
1880     KMP_DEBUG_ASSERT(__kmp_init_serial);
1881     if (__kmp_env_consistency_check && user_lock == NULL) {
1882         KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
1883     }
1884 
1885     __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1886 }
1887 
1888 /* initialize the lock with a hint */
1889 void
1890 __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint)
1891 {
1892     KMP_DEBUG_ASSERT(__kmp_init_serial);
1893     if (__kmp_env_consistency_check && user_lock == NULL) {
1894         KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
1895     }
1896 
1897     __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1898 }
1899 
1900 #endif // KMP_USE_DYNAMIC_LOCK
1901 
1902 /* initialize the lock */
1903 void
1904 __kmpc_init_lock( ident_t * loc, kmp_int32 gtid,  void ** user_lock ) {
1905 #if KMP_USE_DYNAMIC_LOCK
1906     KMP_DEBUG_ASSERT(__kmp_init_serial);
1907     if (__kmp_env_consistency_check && user_lock == NULL) {
1908         KMP_FATAL(LockIsUninitialized, "omp_init_lock");
1909     }
1910     __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1911 
1912 #else // KMP_USE_DYNAMIC_LOCK
1913 
1914     static char const * const func = "omp_init_lock";
1915     kmp_user_lock_p lck;
1916     KMP_DEBUG_ASSERT( __kmp_init_serial );
1917 
1918     if ( __kmp_env_consistency_check ) {
1919         if ( user_lock == NULL ) {
1920             KMP_FATAL( LockIsUninitialized, func );
1921         }
1922     }
1923 
1924     KMP_CHECK_USER_LOCK_INIT();
1925 
1926     if ( ( __kmp_user_lock_kind == lk_tas )
1927       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1928         lck = (kmp_user_lock_p)user_lock;
1929     }
1930 #if KMP_USE_FUTEX
1931     else if ( ( __kmp_user_lock_kind == lk_futex )
1932       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1933         lck = (kmp_user_lock_p)user_lock;
1934     }
1935 #endif
1936     else {
1937         lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1938     }
1939     INIT_LOCK( lck );
1940     __kmp_set_user_lock_location( lck, loc );
1941 
1942 #if OMPT_SUPPORT && OMPT_TRACE
1943     if (ompt_enabled &&
1944         ompt_callbacks.ompt_callback(ompt_event_init_lock)) {
1945         ompt_callbacks.ompt_callback(ompt_event_init_lock)((uint64_t) lck);
1946     }
1947 #endif
1948 
1949 #if USE_ITT_BUILD
1950     __kmp_itt_lock_creating( lck );
1951 #endif /* USE_ITT_BUILD */
1952 
1953 #endif // KMP_USE_DYNAMIC_LOCK
1954 } // __kmpc_init_lock
1955 
1956 /* initialize the lock */
1957 void
1958 __kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1959 #if KMP_USE_DYNAMIC_LOCK
1960 
1961     KMP_DEBUG_ASSERT(__kmp_init_serial);
1962     if (__kmp_env_consistency_check && user_lock == NULL) {
1963         KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
1964     }
1965     __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1966 
1967 #else // KMP_USE_DYNAMIC_LOCK
1968 
1969     static char const * const func = "omp_init_nest_lock";
1970     kmp_user_lock_p lck;
1971     KMP_DEBUG_ASSERT( __kmp_init_serial );
1972 
1973     if ( __kmp_env_consistency_check ) {
1974         if ( user_lock == NULL ) {
1975             KMP_FATAL( LockIsUninitialized, func );
1976         }
1977     }
1978 
1979     KMP_CHECK_USER_LOCK_INIT();
1980 
1981     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1982       + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1983         lck = (kmp_user_lock_p)user_lock;
1984     }
1985 #if KMP_USE_FUTEX
1986     else if ( ( __kmp_user_lock_kind == lk_futex )
1987      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1988      <= OMP_NEST_LOCK_T_SIZE ) ) {
1989         lck = (kmp_user_lock_p)user_lock;
1990     }
1991 #endif
1992     else {
1993         lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1994     }
1995 
1996     INIT_NESTED_LOCK( lck );
1997     __kmp_set_user_lock_location( lck, loc );
1998 
1999 #if OMPT_SUPPORT && OMPT_TRACE
2000     if (ompt_enabled &&
2001         ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)) {
2002         ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)((uint64_t) lck);
2003     }
2004 #endif
2005 
2006 #if USE_ITT_BUILD
2007     __kmp_itt_lock_creating( lck );
2008 #endif /* USE_ITT_BUILD */
2009 
2010 #endif // KMP_USE_DYNAMIC_LOCK
2011 } // __kmpc_init_nest_lock
2012 
2013 void
2014 __kmpc_destroy_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2015 #if KMP_USE_DYNAMIC_LOCK
2016 
2017 # if USE_ITT_BUILD
2018     kmp_user_lock_p lck;
2019     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2020         lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2021     } else {
2022         lck = (kmp_user_lock_p)user_lock;
2023     }
2024     __kmp_itt_lock_destroyed(lck);
2025 # endif
2026     KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2027 #else
2028     kmp_user_lock_p lck;
2029 
2030     if ( ( __kmp_user_lock_kind == lk_tas )
2031       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2032         lck = (kmp_user_lock_p)user_lock;
2033     }
2034 #if KMP_USE_FUTEX
2035     else if ( ( __kmp_user_lock_kind == lk_futex )
2036       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2037         lck = (kmp_user_lock_p)user_lock;
2038     }
2039 #endif
2040     else {
2041         lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" );
2042     }
2043 
2044 #if OMPT_SUPPORT && OMPT_TRACE
2045     if (ompt_enabled &&
2046         ompt_callbacks.ompt_callback(ompt_event_destroy_lock)) {
2047         ompt_callbacks.ompt_callback(ompt_event_destroy_lock)((uint64_t) lck);
2048     }
2049 #endif
2050 
2051 #if USE_ITT_BUILD
2052     __kmp_itt_lock_destroyed( lck );
2053 #endif /* USE_ITT_BUILD */
2054     DESTROY_LOCK( lck );
2055 
2056     if ( ( __kmp_user_lock_kind == lk_tas )
2057       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2058         ;
2059     }
2060 #if KMP_USE_FUTEX
2061     else if ( ( __kmp_user_lock_kind == lk_futex )
2062       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2063         ;
2064     }
2065 #endif
2066     else {
2067         __kmp_user_lock_free( user_lock, gtid, lck );
2068     }
2069 #endif // KMP_USE_DYNAMIC_LOCK
2070 } // __kmpc_destroy_lock
2071 
2072 /* destroy the lock */
2073 void
2074 __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2075 #if KMP_USE_DYNAMIC_LOCK
2076 
2077 # if USE_ITT_BUILD
2078     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2079     __kmp_itt_lock_destroyed(ilk->lock);
2080 # endif
2081     KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2082 
2083 #else // KMP_USE_DYNAMIC_LOCK
2084 
2085     kmp_user_lock_p lck;
2086 
2087     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2088       + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2089         lck = (kmp_user_lock_p)user_lock;
2090     }
2091 #if KMP_USE_FUTEX
2092     else if ( ( __kmp_user_lock_kind == lk_futex )
2093      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2094      <= OMP_NEST_LOCK_T_SIZE ) ) {
2095         lck = (kmp_user_lock_p)user_lock;
2096     }
2097 #endif
2098     else {
2099         lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" );
2100     }
2101 
2102 #if OMPT_SUPPORT && OMPT_TRACE
2103     if (ompt_enabled &&
2104         ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)) {
2105         ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)((uint64_t) lck);
2106     }
2107 #endif
2108 
2109 #if USE_ITT_BUILD
2110     __kmp_itt_lock_destroyed( lck );
2111 #endif /* USE_ITT_BUILD */
2112 
2113     DESTROY_NESTED_LOCK( lck );
2114 
2115     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2116      + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2117         ;
2118     }
2119 #if KMP_USE_FUTEX
2120     else if ( ( __kmp_user_lock_kind == lk_futex )
2121      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2122      <= OMP_NEST_LOCK_T_SIZE ) ) {
2123         ;
2124     }
2125 #endif
2126     else {
2127         __kmp_user_lock_free( user_lock, gtid, lck );
2128     }
2129 #endif // KMP_USE_DYNAMIC_LOCK
2130 } // __kmpc_destroy_nest_lock
2131 
2132 void
2133 __kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2134     KMP_COUNT_BLOCK(OMP_set_lock);
2135 #if KMP_USE_DYNAMIC_LOCK
2136     int tag = KMP_EXTRACT_D_TAG(user_lock);
2137 # if USE_ITT_BUILD
2138    __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); // itt function will get to the right lock object.
2139 # endif
2140 # if KMP_USE_INLINED_TAS
2141     if (tag == locktag_tas && !__kmp_env_consistency_check) {
2142         KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2143     } else
2144 # elif KMP_USE_INLINED_FUTEX
2145     if (tag == locktag_futex && !__kmp_env_consistency_check) {
2146         KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2147     } else
2148 # endif
2149     {
2150         __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2151     }
2152 # if USE_ITT_BUILD
2153     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2154 # endif
2155 
2156 #else // KMP_USE_DYNAMIC_LOCK
2157 
2158     kmp_user_lock_p lck;
2159 
2160     if ( ( __kmp_user_lock_kind == lk_tas )
2161       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2162         lck = (kmp_user_lock_p)user_lock;
2163     }
2164 #if KMP_USE_FUTEX
2165     else if ( ( __kmp_user_lock_kind == lk_futex )
2166       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2167         lck = (kmp_user_lock_p)user_lock;
2168     }
2169 #endif
2170     else {
2171         lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" );
2172     }
2173 
2174 #if USE_ITT_BUILD
2175     __kmp_itt_lock_acquiring( lck );
2176 #endif /* USE_ITT_BUILD */
2177 
2178     ACQUIRE_LOCK( lck, gtid );
2179 
2180 #if USE_ITT_BUILD
2181     __kmp_itt_lock_acquired( lck );
2182 #endif /* USE_ITT_BUILD */
2183 
2184 #if OMPT_SUPPORT && OMPT_TRACE
2185     if (ompt_enabled &&
2186         ompt_callbacks.ompt_callback(ompt_event_acquired_lock)) {
2187         ompt_callbacks.ompt_callback(ompt_event_acquired_lock)((uint64_t) lck);
2188     }
2189 #endif
2190 
2191 #endif // KMP_USE_DYNAMIC_LOCK
2192 }
2193 
2194 void
2195 __kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2196 #if KMP_USE_DYNAMIC_LOCK
2197 
2198 # if USE_ITT_BUILD
2199     __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2200 # endif
2201     KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2202 # if USE_ITT_BUILD
2203     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2204 #endif
2205 
2206 #if OMPT_SUPPORT && OMPT_TRACE
2207     if (ompt_enabled) {
2208         // missing support here: need to know whether acquired first or not
2209     }
2210 #endif
2211 
2212 #else // KMP_USE_DYNAMIC_LOCK
2213     int acquire_status;
2214     kmp_user_lock_p lck;
2215 
2216     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2217       + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2218         lck = (kmp_user_lock_p)user_lock;
2219     }
2220 #if KMP_USE_FUTEX
2221     else if ( ( __kmp_user_lock_kind == lk_futex )
2222      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2223      <= OMP_NEST_LOCK_T_SIZE ) ) {
2224         lck = (kmp_user_lock_p)user_lock;
2225     }
2226 #endif
2227     else {
2228         lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" );
2229     }
2230 
2231 #if USE_ITT_BUILD
2232     __kmp_itt_lock_acquiring( lck );
2233 #endif /* USE_ITT_BUILD */
2234 
2235     ACQUIRE_NESTED_LOCK( lck, gtid, &acquire_status );
2236 
2237 #if USE_ITT_BUILD
2238     __kmp_itt_lock_acquired( lck );
2239 #endif /* USE_ITT_BUILD */
2240 
2241 #if OMPT_SUPPORT && OMPT_TRACE
2242     if (ompt_enabled) {
2243         if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2244            if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first))
2245               ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)((uint64_t) lck);
2246         } else {
2247            if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next))
2248               ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)((uint64_t) lck);
2249         }
2250     }
2251 #endif
2252 
2253 #endif // KMP_USE_DYNAMIC_LOCK
2254 }
2255 
2256 void
2257 __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2258 {
2259 #if KMP_USE_DYNAMIC_LOCK
2260 
2261     int tag = KMP_EXTRACT_D_TAG(user_lock);
2262 # if USE_ITT_BUILD
2263     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2264 # endif
2265 # if KMP_USE_INLINED_TAS
2266     if (tag == locktag_tas && !__kmp_env_consistency_check) {
2267         KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2268     } else
2269 # elif KMP_USE_INLINED_FUTEX
2270     if (tag == locktag_futex && !__kmp_env_consistency_check) {
2271         KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2272     } else
2273 # endif
2274     {
2275         __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2276     }
2277 
2278 #else // KMP_USE_DYNAMIC_LOCK
2279 
2280     kmp_user_lock_p lck;
2281 
2282     /* Can't use serial interval since not block structured */
2283     /* release the lock */
2284 
2285     if ( ( __kmp_user_lock_kind == lk_tas )
2286       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2287 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2288         // "fast" path implemented to fix customer performance issue
2289 #if USE_ITT_BUILD
2290         __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2291 #endif /* USE_ITT_BUILD */
2292         TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2293         KMP_MB();
2294         return;
2295 #else
2296         lck = (kmp_user_lock_p)user_lock;
2297 #endif
2298     }
2299 #if KMP_USE_FUTEX
2300     else if ( ( __kmp_user_lock_kind == lk_futex )
2301       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2302         lck = (kmp_user_lock_p)user_lock;
2303     }
2304 #endif
2305     else {
2306         lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" );
2307     }
2308 
2309 #if USE_ITT_BUILD
2310     __kmp_itt_lock_releasing( lck );
2311 #endif /* USE_ITT_BUILD */
2312 
2313     RELEASE_LOCK( lck, gtid );
2314 
2315 #if OMPT_SUPPORT && OMPT_BLAME
2316     if (ompt_enabled &&
2317         ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
2318         ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck);
2319     }
2320 #endif
2321 
2322 #endif // KMP_USE_DYNAMIC_LOCK
2323 }
2324 
2325 /* release the lock */
2326 void
2327 __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2328 {
2329 #if KMP_USE_DYNAMIC_LOCK
2330 
2331 # if USE_ITT_BUILD
2332     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2333 # endif
2334     KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2335 
2336 #else // KMP_USE_DYNAMIC_LOCK
2337 
2338     kmp_user_lock_p lck;
2339 
2340     /* Can't use serial interval since not block structured */
2341 
2342     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2343       + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2344 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2345         // "fast" path implemented to fix customer performance issue
2346         kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock;
2347 #if USE_ITT_BUILD
2348         __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2349 #endif /* USE_ITT_BUILD */
2350         if ( --(tl->lk.depth_locked) == 0 ) {
2351             TCW_4(tl->lk.poll, 0);
2352         }
2353         KMP_MB();
2354         return;
2355 #else
2356         lck = (kmp_user_lock_p)user_lock;
2357 #endif
2358     }
2359 #if KMP_USE_FUTEX
2360     else if ( ( __kmp_user_lock_kind == lk_futex )
2361      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2362      <= OMP_NEST_LOCK_T_SIZE ) ) {
2363         lck = (kmp_user_lock_p)user_lock;
2364     }
2365 #endif
2366     else {
2367         lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" );
2368     }
2369 
2370 #if USE_ITT_BUILD
2371     __kmp_itt_lock_releasing( lck );
2372 #endif /* USE_ITT_BUILD */
2373 
2374     int release_status;
2375     release_status = RELEASE_NESTED_LOCK( lck, gtid );
2376 #if OMPT_SUPPORT && OMPT_BLAME
2377     if (ompt_enabled) {
2378         if (release_status == KMP_LOCK_RELEASED) {
2379             if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
2380                 ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
2381                     (uint64_t) lck);
2382             }
2383         } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) {
2384             ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
2385                 (uint64_t) lck);
2386         }
2387     }
2388 #endif
2389 
2390 #endif // KMP_USE_DYNAMIC_LOCK
2391 }
2392 
2393 /* try to acquire the lock */
2394 int
2395 __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2396 {
2397     KMP_COUNT_BLOCK(OMP_test_lock);
2398 
2399 #if KMP_USE_DYNAMIC_LOCK
2400     int rc;
2401     int tag = KMP_EXTRACT_D_TAG(user_lock);
2402 # if USE_ITT_BUILD
2403     __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2404 # endif
2405 # if KMP_USE_INLINED_TAS
2406     if (tag == locktag_tas && !__kmp_env_consistency_check) {
2407         KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2408     } else
2409 # elif KMP_USE_INLINED_FUTEX
2410     if (tag == locktag_futex && !__kmp_env_consistency_check) {
2411         KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2412     } else
2413 # endif
2414     {
2415         rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2416     }
2417     if (rc) {
2418 # if USE_ITT_BUILD
2419         __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2420 # endif
2421         return FTN_TRUE;
2422     } else {
2423 # if USE_ITT_BUILD
2424         __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2425 # endif
2426         return FTN_FALSE;
2427     }
2428 
2429 #else // KMP_USE_DYNAMIC_LOCK
2430 
2431     kmp_user_lock_p lck;
2432     int          rc;
2433 
2434     if ( ( __kmp_user_lock_kind == lk_tas )
2435       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2436         lck = (kmp_user_lock_p)user_lock;
2437     }
2438 #if KMP_USE_FUTEX
2439     else if ( ( __kmp_user_lock_kind == lk_futex )
2440       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2441         lck = (kmp_user_lock_p)user_lock;
2442     }
2443 #endif
2444     else {
2445         lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" );
2446     }
2447 
2448 #if USE_ITT_BUILD
2449     __kmp_itt_lock_acquiring( lck );
2450 #endif /* USE_ITT_BUILD */
2451 
2452     rc = TEST_LOCK( lck, gtid );
2453 #if USE_ITT_BUILD
2454     if ( rc ) {
2455         __kmp_itt_lock_acquired( lck );
2456     } else {
2457         __kmp_itt_lock_cancelled( lck );
2458     }
2459 #endif /* USE_ITT_BUILD */
2460     return ( rc ? FTN_TRUE : FTN_FALSE );
2461 
2462     /* Can't use serial interval since not block structured */
2463 
2464 #endif // KMP_USE_DYNAMIC_LOCK
2465 }
2466 
2467 /* try to acquire the lock */
2468 int
2469 __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2470 {
2471 #if KMP_USE_DYNAMIC_LOCK
2472     int rc;
2473 # if USE_ITT_BUILD
2474     __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2475 # endif
2476     rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
2477 # if USE_ITT_BUILD
2478     if (rc) {
2479         __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2480     } else {
2481         __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2482     }
2483 # endif
2484     return rc;
2485 
2486 #else // KMP_USE_DYNAMIC_LOCK
2487 
2488     kmp_user_lock_p lck;
2489     int          rc;
2490 
2491     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2492       + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2493         lck = (kmp_user_lock_p)user_lock;
2494     }
2495 #if KMP_USE_FUTEX
2496     else if ( ( __kmp_user_lock_kind == lk_futex )
2497      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2498      <= OMP_NEST_LOCK_T_SIZE ) ) {
2499         lck = (kmp_user_lock_p)user_lock;
2500     }
2501 #endif
2502     else {
2503         lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" );
2504     }
2505 
2506 #if USE_ITT_BUILD
2507     __kmp_itt_lock_acquiring( lck );
2508 #endif /* USE_ITT_BUILD */
2509 
2510     rc = TEST_NESTED_LOCK( lck, gtid );
2511 #if USE_ITT_BUILD
2512     if ( rc ) {
2513         __kmp_itt_lock_acquired( lck );
2514     } else {
2515         __kmp_itt_lock_cancelled( lck );
2516     }
2517 #endif /* USE_ITT_BUILD */
2518     return rc;
2519 
2520     /* Can't use serial interval since not block structured */
2521 
2522 #endif // KMP_USE_DYNAMIC_LOCK
2523 }
2524 
2525 
2526 /*--------------------------------------------------------------------------------------------------------------------*/
2527 
2528 /*
2529  * Interface to fast scalable reduce methods routines
2530  */
2531 
2532 // keep the selected method in a thread-local structure for cross-function usage: it will be used in the __kmpc_end_reduce* functions;
2533 // an alternative would be to re-determine the method one more time in the __kmpc_end_reduce* functions (a new prototype would then be required)
2534 // AT: which solution is better?
2535 #define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \
2536                    ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) )
2537 
2538 #define __KMP_GET_REDUCTION_METHOD(gtid) \
2539                    ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method )
2540 
2541 // description of the packed_reduction_method variable: look at the macros in kmp.h
2542 
2543 
2544 // used in a critical section reduce block
2545 static __forceinline void
2546 __kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2547 
2548     // this lock was visible to a customer and to the threading profile tool as a serial overhead span
2549     //            (although it's used for an internal purpose only)
2550     //            why was it visible in the previous implementation?
2551     //            should we keep it visible in the new reduce block?
2552     kmp_user_lock_p lck;
2553 
2554 #if KMP_USE_DYNAMIC_LOCK
2555 
2556     kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
2557     // Check if it is initialized.
2558     if (*lk == 0) {
2559         if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2560             KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(__kmp_user_lock_seq));
2561         } else {
2562             __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(__kmp_user_lock_seq));
2563         }
2564     }
2565     // Branch for accessing the actual lock object and set operation. This branching is inevitable since
2566     // this lock initialization does not follow the normal dispatch path (lock table is not used).
2567     if (KMP_EXTRACT_D_TAG(lk) != 0) {
2568         lck = (kmp_user_lock_p)lk;
2569         KMP_DEBUG_ASSERT(lck != NULL);
2570         if (__kmp_env_consistency_check) {
2571             __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2572         }
2573         KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
2574     } else {
2575         kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
2576         lck = ilk->lock;
2577         KMP_DEBUG_ASSERT(lck != NULL);
2578         if (__kmp_env_consistency_check) {
2579             __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2580         }
2581         KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
2582     }
2583 
2584 #else // KMP_USE_DYNAMIC_LOCK
2585 
2586     // We know that the fast reduction code is only emitted by Intel compilers
2587     // with 32 byte critical sections. If there isn't enough space, then we
2588     // have to use a pointer.
2589     if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) {
2590         lck = (kmp_user_lock_p)crit;
2591     }
2592     else {
2593         lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
2594     }
2595     KMP_DEBUG_ASSERT( lck != NULL );
2596 
2597     if ( __kmp_env_consistency_check )
2598         __kmp_push_sync( global_tid, ct_critical, loc, lck );
2599 
2600     __kmp_acquire_user_lock_with_checks( lck, global_tid );
2601 
2602 #endif // KMP_USE_DYNAMIC_LOCK
2603 }
2604 
2605 // used in a critical section reduce block
2606 static __forceinline void
2607 __kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2608 
2609     kmp_user_lock_p lck;
2610 
2611 #if KMP_USE_DYNAMIC_LOCK
2612 
2613     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2614         lck = (kmp_user_lock_p)crit;
2615         if (__kmp_env_consistency_check)
2616             __kmp_pop_sync(global_tid, ct_critical, loc);
2617         KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
2618     } else {
2619         kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
2620         if (__kmp_env_consistency_check)
2621             __kmp_pop_sync(global_tid, ct_critical, loc);
2622         KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
2623     }
2624 
2625 #else // KMP_USE_DYNAMIC_LOCK
2626 
2627     // We know that the fast reduction code is only emitted by Intel compilers with 32 byte critical
2628     // sections. If there isn't enough space, then we have to use a pointer.
2629     if ( __kmp_base_user_lock_size > 32 ) {
2630         lck = *( (kmp_user_lock_p *) crit );
2631         KMP_ASSERT( lck != NULL );
2632     } else {
2633         lck = (kmp_user_lock_p) crit;
2634     }
2635 
2636     if ( __kmp_env_consistency_check )
2637         __kmp_pop_sync( global_tid, ct_critical, loc );
2638 
2639     __kmp_release_user_lock_with_checks( lck, global_tid );
2640 
2641 #endif // KMP_USE_DYNAMIC_LOCK
2642 } // __kmp_end_critical_section_reduce_block
2643 
2644 
2645 /* 2.a.i. Reduce Block without a terminating barrier */
2646 /*!
2647 @ingroup SYNCHRONIZATION
2648 @param loc source location information
2649 @param global_tid global thread number
2650 @param num_vars number of items (variables) to be reduced
2651 @param reduce_size size of data in bytes to be reduced
2652 @param reduce_data pointer to data to be reduced
2653 @param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data
2654 @param lck pointer to the unique lock data structure
2655 @result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed
2656 
2657 The nowait version is used for a reduce clause with the nowait argument.
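
A minimal sketch of the calling pattern a compiler might generate for a
<tt>reduction(+:sum) nowait</tt> clause (illustrative only; the reducer add_int, the
variables and the atomic combine shown are assumptions, not a prescribed lowering):
@code
static void add_int( void *lhs, void *rhs )   // reduce_func: *lhs += *rhs
{
    *(int *)lhs += *(int *)rhs;
}

// per thread, with int local_sum computed and kmp_int32 shared_sum shared:
static kmp_critical_name crit;                // zero-initialized lock storage
switch ( __kmpc_reduce_nowait( loc, gtid, 1, sizeof(int),
                               &local_sum, add_int, &crit ) ) {
    case 1:   // this thread combines its partial result directly
        shared_sum += local_sum;
        __kmpc_end_reduce_nowait( loc, gtid, &crit );
        break;
    case 2:   // combine using an atomic operation instead
        __kmpc_atomic_fixed4_add( loc, gtid, &shared_sum, local_sum );
        break;
    default:  // 0: nothing left to do on this thread
        break;
}
@endcode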
2658 */
2659 kmp_int32
2660 __kmpc_reduce_nowait(
2661     ident_t *loc, kmp_int32 global_tid,
2662     kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
2663     kmp_critical_name *lck ) {
2664 
2665     KMP_COUNT_BLOCK(REDUCE_nowait);
2666     int retval = 0;
2667     PACKED_REDUCTION_METHOD_T packed_reduction_method;
2668 #if OMP_40_ENABLED
2669     kmp_team_t *team;
2670     kmp_info_t *th;
2671     int teams_swapped = 0, task_state;
2672 #endif
2673     KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) );
2674 
2675     // why do we need this initialization here at all?
2676     // Reduction clause cannot be used as a stand-alone directive.
2677 
2678     // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2679     // possible detection of false-positive race by the threadchecker ???
2680     if( ! TCR_4( __kmp_init_parallel ) )
2681         __kmp_parallel_initialize();
2682 
2683     // check correctness of reduce block nesting
2684 #if KMP_USE_DYNAMIC_LOCK
2685     if ( __kmp_env_consistency_check )
2686         __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2687 #else
2688     if ( __kmp_env_consistency_check )
2689         __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2690 #endif
2691 
2692 #if OMP_40_ENABLED
2693     th = __kmp_thread_from_gtid(global_tid);
2694     if( th->th.th_teams_microtask ) {   // AC: check if we are inside the teams construct?
2695         team = th->th.th_team;
2696         if( team->t.t_level == th->th.th_teams_level ) {
2697             // this is reduction at teams construct
2698             KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid);  // AC: check that tid == 0
2699             // Let's swap teams temporarily for the reduction barrier
2700             teams_swapped = 1;
2701             th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2702             th->th.th_team = team->t.t_parent;
2703             th->th.th_team_nproc = th->th.th_team->t.t_nproc;
2704             th->th.th_task_team = th->th.th_team->t.t_task_team[0];
2705             task_state = th->th.th_task_state;
2706             th->th.th_task_state = 0;
2707         }
2708     }
2709 #endif // OMP_40_ENABLED
2710 
2711     // packed_reduction_method value will be reused by __kmp_end_reduce* function, the value should be kept in a variable
2712     // the variable should be either a construct-specific or thread-specific property, not a team specific property
2713     //     (a thread can reach the next reduce block on the next construct, reduce method may differ on the next construct)
2714     // an ident_t "loc" parameter could be used as a construct-specific property (what if loc == 0?)
2715     //     (if both construct-specific and team-specific variables were shared, then unnecessary extra syncs would be needed)
2716     // a thread-specific variable is better regarding two issues above (next construct and extra syncs)
2717     // a thread-specific "th_local.reduction_method" variable is used currently
2718     // each thread executes the 'determine' and 'set' lines (there is no need to restrict this to one thread, which avoids unnecessary extra syncs)
2719 
2720     packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2721     __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2722 
2723     if( packed_reduction_method == critical_reduce_block ) {
2724 
2725         __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2726         retval = 1;
2727 
2728     } else if( packed_reduction_method == empty_reduce_block ) {
2729 
2730         // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2731         retval = 1;
2732 
2733     } else if( packed_reduction_method == atomic_reduce_block ) {
2734 
2735         retval = 2;
2736 
2737         // all threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen)
2738         //     (this is not ideal, because the checking block has been closed by this 'pop'
2739         //      while the atomic operation has not been executed yet; it will execute slightly later, literally on the next instruction)
2740         if ( __kmp_env_consistency_check )
2741             __kmp_pop_sync( global_tid, ct_reduce, loc );
2742 
2743     } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2744 
2745         //AT: performance issue: a real barrier here
2746         //AT:     (if the master is slow, the other threads are blocked here waiting for the master to arrive and release them)
2747         //AT:     (this is not what a customer might expect when specifying the NOWAIT clause)
2748         //AT:     (specifying NOWAIT won't result in a performance improvement and will be confusing to a customer)
2749         //AT: another implementation of *barrier_gather*nowait() (or some other design) might go faster
2750         //        and be more in line with the sense of NOWAIT
2751         //AT: TO DO: do epcc test and compare times
2752 
2753         // this barrier should be invisible to a customer and to the threading profile tool
2754         //              (it's neither a terminating barrier nor customer's code, it's used for an internal purpose)
2755 #if USE_ITT_NOTIFY
2756         __kmp_threads[global_tid]->th.th_ident = loc;
2757 #endif
2758         retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func );
2759         retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2760 
2761         // all workers except the master should do this pop here
2762         //     ( none of the other workers will get to __kmpc_end_reduce_nowait() )
2763         if ( __kmp_env_consistency_check ) {
2764             if( retval == 0 ) {
2765                 __kmp_pop_sync( global_tid, ct_reduce, loc );
2766             }
2767         }
2768 
2769     } else {
2770 
2771         // should never reach this block
2772         KMP_ASSERT( 0 ); // "unexpected method"
2773 
2774     }
2775 #if OMP_40_ENABLED
2776     if( teams_swapped ) {
2777         // Restore thread structure
2778         th->th.th_info.ds.ds_tid = 0;
2779         th->th.th_team = team;
2780         th->th.th_team_nproc = team->t.t_nproc;
2781         th->th.th_task_team = team->t.t_task_team[task_state];
2782         th->th.th_task_state = task_state;
2783     }
2784 #endif
2785     KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2786 
2787     return retval;
2788 }
2789 
2790 /*!
2791 @ingroup SYNCHRONIZATION
2792 @param loc source location information
2793 @param global_tid global thread id.
2794 @param lck pointer to the unique lock data structure
2795 
2796 Finish the execution of a reduce nowait.
2797 */
2798 void
2799 __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2800 
2801     PACKED_REDUCTION_METHOD_T packed_reduction_method;
2802 
2803     KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) );
2804 
2805     packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2806 
2807     if( packed_reduction_method == critical_reduce_block ) {
2808 
2809         __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2810 
2811     } else if( packed_reduction_method == empty_reduce_block ) {
2812 
2813         // usage: if team size == 1, no synchronization is required ( on Intel platforms only )
2814 
2815     } else if( packed_reduction_method == atomic_reduce_block ) {
2816 
2817         // neither the master nor the other workers should get here
2818         //     (code gen does not generate this call in case 2: atomic reduce block)
2819         // actually it would be better to remove this else-if entirely;
2820         // after removal this value would be checked by the 'else' branch and would assert
2821 
2822     } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2823 
2824         // only master gets here
2825 
2826     } else {
2827 
2828         // should never reach this block
2829         KMP_ASSERT( 0 ); // "unexpected method"
2830 
2831     }
2832 
2833     if ( __kmp_env_consistency_check )
2834         __kmp_pop_sync( global_tid, ct_reduce, loc );
2835 
2836     KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2837 
2838     return;
2839 }
2840 
2841 /* 2.a.ii. Reduce Block with a terminating barrier */
2842 
2843 /*!
2844 @ingroup SYNCHRONIZATION
2845 @param loc source location information
2846 @param global_tid global thread number
2847 @param num_vars number of items (variables) to be reduced
2848 @param reduce_size size of data in bytes to be reduced
2849 @param reduce_data pointer to data to be reduced
2850 @param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data
2851 @param lck pointer to the unique lock data structure
2852 @result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed
2853 
2854 A blocking reduce that includes an implicit barrier.
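
The calling pattern mirrors that of @ref __kmpc_reduce_nowait, except that
@ref __kmpc_end_reduce must also be reached in the atomic case so that the
terminating barrier is executed (a hedged sketch, under the same assumptions as
the nowait example above):
@code
switch ( __kmpc_reduce( loc, gtid, 1, sizeof(int),
                        &local_sum, add_int, &crit ) ) {
    case 1:
        shared_sum += local_sum;
        __kmpc_end_reduce( loc, gtid, &crit );
        break;
    case 2:
        __kmpc_atomic_fixed4_add( loc, gtid, &shared_sum, local_sum );
        __kmpc_end_reduce( loc, gtid, &crit );   // executes the terminating barrier
        break;
    default:
        break;
}
@endcode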
2855 */
2856 kmp_int32
2857 __kmpc_reduce(
2858     ident_t *loc, kmp_int32 global_tid,
2859     kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
2860     void (*reduce_func)(void *lhs_data, void *rhs_data),
2861     kmp_critical_name *lck )
2862 {
2863     KMP_COUNT_BLOCK(REDUCE_wait);
2864     int retval = 0;
2865     PACKED_REDUCTION_METHOD_T packed_reduction_method;
2866 
2867     KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) );
2868 
2869     // why do we need this initialization here at all?
2870     // Reduction clause cannot be a stand-alone directive.
2871 
2872     // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2873     // possible detection of false-positive race by the threadchecker ???
2874     if( ! TCR_4( __kmp_init_parallel ) )
2875         __kmp_parallel_initialize();
2876 
2877     // check correctness of reduce block nesting
2878 #if KMP_USE_DYNAMIC_LOCK
2879     if ( __kmp_env_consistency_check )
2880         __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2881 #else
2882     if ( __kmp_env_consistency_check )
2883         __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2884 #endif
2885 
2886     packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2887     __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2888 
2889     if( packed_reduction_method == critical_reduce_block ) {
2890 
2891         __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2892         retval = 1;
2893 
2894     } else if( packed_reduction_method == empty_reduce_block ) {
2895 
2896         // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2897         retval = 1;
2898 
2899     } else if( packed_reduction_method == atomic_reduce_block ) {
2900 
2901         retval = 2;
2902 
2903     } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2904 
2905         //case tree_reduce_block:
2906         // this barrier should be visible to a customer and to the threading profile tool
2907         //              (it's a terminating barrier on constructs if NOWAIT not specified)
2908 #if USE_ITT_NOTIFY
2909         __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames
2910 #endif
2911         retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func );
2912         retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2913 
2914         // all workers except the master should do this pop here
2915         //     ( none of the workers other than the master will enter __kmpc_end_reduce() )
        if ( __kmp_env_consistency_check ) {
            if( retval == 0 ) { // 0: all other workers; 1: master
                __kmp_pop_sync( global_tid, ct_reduce, loc );
            }
        }

    } else {

        // should never reach this block
        KMP_ASSERT( 0 ); // "unexpected method"

    }

    KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );

    return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a blocking reduce.
The <tt>lck</tt> pointer must be the same as that used in the corresponding start function.
*/
void
__kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {

    PACKED_REDUCTION_METHOD_T packed_reduction_method;

    KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) );

    packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );

    // This barrier should be visible to the user and to threading profile tools
    //     (it is the terminating barrier of the construct when NOWAIT is not specified).

    if( packed_reduction_method == critical_reduce_block ) {

        __kmp_end_critical_section_reduce_block( loc, global_tid, lck );

        // TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
        __kmp_threads[global_tid]->th.th_ident = loc;
#endif
        __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );

    } else if( packed_reduction_method == empty_reduce_block ) {

        // usage: if team size == 1, no synchronization is required ( Intel platforms only )

        // TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
        __kmp_threads[global_tid]->th.th_ident = loc;
#endif
        __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );

    } else if( packed_reduction_method == atomic_reduce_block ) {

        // TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
        __kmp_threads[global_tid]->th.th_ident = loc;
#endif
        __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );

    } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {

        // only master executes here (master releases all other workers)
        __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid );

    } else {

        // should never reach this block
        KMP_ASSERT( 0 ); // "unexpected method"

    }

    if ( __kmp_env_consistency_check )
        __kmp_pop_sync( global_tid, ct_reduce, loc );

    KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );

    return;
}
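
/*
    Illustrative sketch (not part of the runtime): a compiler might lower a
    blocking "reduction(+:sum)" clause onto __kmpc_reduce/__kmpc_end_reduce
    roughly as below, dispatching on the result code documented above.  The
    names red_add, reduce_lock, sum and global_sum are hypothetical, loc/gtid
    come from the enclosing outlined region, and the exact lowering (in
    particular the handling of the atomic case) varies between compilers;
    real compilers typically pass a list of pointers rather than a single
    variable as reduce_data.

        static kmp_critical_name reduce_lock;
        static void red_add(void *lhs, void *rhs) {
            *(int *)lhs += *(int *)rhs;            // combine two partial sums
        }
        // ... after the thread's local loop, sum holds its partial result:
        switch (__kmpc_reduce(loc, gtid, 1, sizeof(int), &sum, red_add,
                              &reduce_lock)) {
        case 1:   // this thread finishes the reduction (critical or tree master)
            global_sum += sum;
            __kmpc_end_reduce(loc, gtid, &reduce_lock);
            break;
        case 2:   // atomic method: every thread combines its own contribution
            __kmpc_atomic_fixed4_add(loc, gtid, &global_sum, sum);
            __kmpc_end_reduce(loc, gtid, &reduce_lock);
            break;
        default:  // 0: tree-reduction worker, nothing more to do
            break;
        }
*/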

#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/

kmp_uint64
__kmpc_get_taskid() {

    kmp_int32    gtid;
    kmp_info_t * thread;

    gtid = __kmp_get_gtid();
    if ( gtid < 0 ) {
        return 0;
    }; // if
    thread = __kmp_thread_from_gtid( gtid );
    return thread->th.th_current_task->td_task_id;

} // __kmpc_get_taskid


kmp_uint64
__kmpc_get_parent_taskid() {

    kmp_int32        gtid;
    kmp_info_t *     thread;
    kmp_taskdata_t * parent_task;

    gtid = __kmp_get_gtid();
    if ( gtid < 0 ) {
        return 0;
    }; // if
    thread      = __kmp_thread_from_gtid( gtid );
    parent_task = thread->th.th_current_task->td_parent;
    return ( parent_task == NULL ? 0 : parent_task->td_task_id );

} // __kmpc_get_parent_taskid
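
/*
    Illustrative use (not part of the runtime): tool or compiler-generated
    code might log the current and parent task ids; both entry points return
    0 when called on a thread the runtime does not know about.  The helper
    name is hypothetical and the sketch assumes the entry points are declared
    (e.g. via the runtime's own headers) and <cstdio> is included.

        static void log_task_ids() {
            std::printf( "task %llu (parent %llu)\n",
                         (unsigned long long)__kmpc_get_taskid(),
                         (unsigned long long)__kmpc_get_parent_taskid() );
        }
*/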

#if OMP_45_ENABLED
/*!
@ingroup WORK_SHARING
@param loc  source location information.
@param gtid  global thread number.
@param num_dims  number of associated doacross loops.
@param dims  info on loop bounds.

Initialize doacross loop information.
The compiler is expected to pass inclusive bounds,
e.g. for for(i=2;i<9;i+=2) the runtime receives lo=2, up=8, st=2.
*/
void
__kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims)
{
    int j, idx;
    kmp_int64 last, trace_count;
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_team_t *team = th->th.th_team;
    kmp_uint32 *flags;
    kmp_disp_t *pr_buf = th->th.th_dispatch;
    dispatch_shared_info_t *sh_buf;

    KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
                 gtid, num_dims, !team->t.t_serialized));
    KMP_DEBUG_ASSERT(dims != NULL);
    KMP_DEBUG_ASSERT(num_dims > 0);

    if( team->t.t_serialized ) {
        KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n"));
        return; // no dependencies if team is serialized
    }
    KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
    idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop
    sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

    // Save bounds info into allocated private buffer
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
    pr_buf->th_doacross_info =
        (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1));
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
    pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions
    // Also save the address of num_done so it can be accessed later without knowing the buffer index
    pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
    pr_buf->th_doacross_info[2] = dims[0].lo;
    pr_buf->th_doacross_info[3] = dims[0].up;
    pr_buf->th_doacross_info[4] = dims[0].st;
    last = 5;
    for( j = 1; j < num_dims; ++j ) {
        kmp_int64 range_length; // keeps the range of each dimension except the first (dims[0])
        if( dims[j].st == 1 ) { // most common case
            // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
            range_length = dims[j].up - dims[j].lo + 1;
        } else {
            if( dims[j].st > 0 ) {
                KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
                range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
            } else {            // negative increment
                KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
                range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
            }
        }
        pr_buf->th_doacross_info[last++] = range_length;
        pr_buf->th_doacross_info[last++] = dims[j].lo;
        pr_buf->th_doacross_info[last++] = dims[j].up;
        pr_buf->th_doacross_info[last++] = dims[j].st;
    }

    // Compute total trip count.
    // Start with range of dims[0] which we don't need to keep in the buffer.
    if( dims[0].st == 1 ) { // most common case
        trace_count = dims[0].up - dims[0].lo + 1;
    } else if( dims[0].st > 0 ) {
        KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
        trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
    } else {   // negative increment
        KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
        trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
    }
    for( j = 1; j < num_dims; ++j ) {
        trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
    }
    KMP_DEBUG_ASSERT(trace_count > 0);

    // Check whether the shared buffer is still occupied by a previous loop (the one that used index idx - __kmp_dispatch_num_buffers)
    if( idx != sh_buf->doacross_buf_idx ) {
        // Shared buffer is occupied, wait for it to be free
        __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL );
    }
    // Check whether we are the first thread. The CAS returns the old value: the first
    // thread sees NULL, later threads see 1 while initialization is in progress, or the
    // allocated pointer once initialization is complete.
    flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64(
        (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1);
    if( flags == NULL ) {
        // we are the first thread, allocate the array of flags
        kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration
        sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1);
    } else if( (kmp_int64)flags == 1 ) {
        // initialization is still in progress, need to wait
        while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) {
            KMP_YIELD(TRUE);
        }
    }
    KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer
    pr_buf->th_doacross_flags = sh_buf->doacross_flags;      // save a private copy so that we do not
                                                             // touch the shared buffer on each iteration
    KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
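
/*
    Illustrative sketch (not part of the runtime): for a doacross nest such as

        #pragma omp for ordered(2)
        for (i = 2; i < 9; i += 2)
            for (j = 0; j < n; j++) { ... }

    a compiler might fill the dims array with inclusive bounds and have each
    thread call the init entry point; loc, gtid and n are assumed to come
    from the enclosing outlined region:

        struct kmp_dim dims[2];
        dims[0].lo = 2;  dims[0].up = 8;      // i runs 2..8 inclusive
        dims[0].st = 2;
        dims[1].lo = 0;  dims[1].up = n - 1;  // j runs 0..n-1 inclusive
        dims[1].st = 1;
        __kmpc_doacross_init(loc, gtid, 2, dims);
*/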

void
__kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec)
{
    kmp_int32 shft, num_dims, i;
    kmp_uint32 flag;
    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_team_t *team = th->th.th_team;
    kmp_disp_t *pr_buf;
    kmp_int64 lo, up, st;

    KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
    if( team->t.t_serialized ) {
        KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n"));
        return; // no dependencies if team is serialized
    }

    // calculate sequential iteration number and check out-of-bounds condition
    pr_buf = th->th.th_dispatch;
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
    num_dims = pr_buf->th_doacross_info[0];
    lo = pr_buf->th_doacross_info[2];
    up = pr_buf->th_doacross_info[3];
    st = pr_buf->th_doacross_info[4];
    if( st == 1 ) { // most common case
        if( vec[0] < lo || vec[0] > up ) {
            KA_TRACE(20,(
                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                gtid, vec[0], lo, up));
            return;
        }
        iter_number = vec[0] - lo;
    } else if( st > 0 ) {
        if( vec[0] < lo || vec[0] > up ) {
            KA_TRACE(20,(
                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                gtid, vec[0], lo, up));
            return;
        }
        iter_number = (kmp_uint64)(vec[0] - lo) / st;
    } else {        // negative increment
        if( vec[0] > lo || vec[0] < up ) {
            KA_TRACE(20,(
                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                gtid, vec[0], lo, up));
            return;
        }
        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
    }
    for( i = 1; i < num_dims; ++i ) {
        kmp_int64 iter, ln;
        kmp_int32 j = i * 4;
        ln = pr_buf->th_doacross_info[j + 1];
        lo = pr_buf->th_doacross_info[j + 2];
        up = pr_buf->th_doacross_info[j + 3];
        st = pr_buf->th_doacross_info[j + 4];
        if( st == 1 ) {
            if( vec[i] < lo || vec[i] > up ) {
                KA_TRACE(20,(
                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                    gtid, vec[i], lo, up));
                return;
            }
            iter = vec[i] - lo;
        } else if( st > 0 ) {
            if( vec[i] < lo || vec[i] > up ) {
                KA_TRACE(20,(
                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                    gtid, vec[i], lo, up));
                return;
            }
            iter = (kmp_uint64)(vec[i] - lo) / st;
        } else {   // st < 0
            if( vec[i] > lo || vec[i] < up ) {
                KA_TRACE(20,(
                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                    gtid, vec[i], lo, up));
                return;
            }
            iter = (kmp_uint64)(lo - vec[i]) / (-st);
        }
        iter_number = iter + ln * iter_number;
    }
    shft = iter_number % 32; // use 32-bit granularity
    iter_number >>= 5;       // divided by 32
    flag = 1 << shft;
    while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
        KMP_YIELD(TRUE);
    }
    KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
                 gtid, (iter_number<<5)+shft));
}
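
/*
    Illustrative lowering (not part of the runtime): inside the loop body of
    the nest sketched after __kmpc_doacross_init, an
    "#pragma omp ordered depend(sink: i-2, j)" might become a wait on that
    iteration vector, in the same dimension order as the dims passed to init
    (loc and gtid again come from the enclosing outlined region):

        long long vec[2];
        vec[0] = i - 2;     // the iteration this one depends on
        vec[1] = j;
        __kmpc_doacross_wait(loc, gtid, vec);

    Sink vectors that fall outside the loop bounds (e.g. i-2 before the first
    iteration) return immediately without waiting, as handled above.
*/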

void
__kmpc_doacross_post(ident_t *loc, int gtid, long long *vec)
{
    kmp_int32 shft, num_dims, i;
    kmp_uint32 flag;
    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_team_t *team = th->th.th_team;
    kmp_disp_t *pr_buf;
    kmp_int64 lo, st;

    KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid));
    if( team->t.t_serialized ) {
        KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n"));
        return; // no dependencies if team is serialized
    }

    // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks)
    pr_buf = th->th.th_dispatch;
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
    num_dims = pr_buf->th_doacross_info[0];
    lo = pr_buf->th_doacross_info[2];
    st = pr_buf->th_doacross_info[4];
    if( st == 1 ) { // most common case
        iter_number = vec[0] - lo;
    } else if( st > 0 ) {
        iter_number = (kmp_uint64)(vec[0] - lo) / st;
    } else {        // negative increment
        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
    }
    for( i = 1; i < num_dims; ++i ) {
        kmp_int64 iter, ln;
        kmp_int32 j = i * 4;
        ln = pr_buf->th_doacross_info[j + 1];
        lo = pr_buf->th_doacross_info[j + 2];
        st = pr_buf->th_doacross_info[j + 4];
        if( st == 1 ) {
            iter = vec[i] - lo;
        } else if( st > 0 ) {
            iter = (kmp_uint64)(vec[i] - lo) / st;
        } else {   // st < 0
            iter = (kmp_uint64)(lo - vec[i]) / (-st);
        }
        iter_number = iter + ln * iter_number;
    }
    shft = iter_number % 32; // use 32-bit granularity
    iter_number >>= 5;       // divided by 32
    flag = 1 << shft;
    if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 )
        KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag );
    KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n",
                 gtid, (iter_number<<5)+shft));
}
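
/*
    Illustrative lowering (not part of the runtime): the matching
    "#pragma omp ordered depend(source)" posts the current iteration vector
    once its work is done:

        long long vec[2] = { i, j };
        __kmpc_doacross_post(loc, gtid, vec);

    With the dimensions from the init sketch (i: lo=2, st=2; j: lo=0, st=1,
    range n), iteration (i=6, j=3) linearizes to (6-2)/2 * n + (3-0) = 2n+3,
    so post sets bit (2n+3) % 32 of flags word (2n+3) / 32, and the matching
    wait spins on that same bit.
*/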

void
__kmpc_doacross_fini(ident_t *loc, int gtid)
{
    kmp_int64 num_done;
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_team_t *team = th->th.th_team;
    kmp_disp_t *pr_buf = th->th.th_dispatch;

    KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
    if( team->t.t_serialized ) {
        KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team));
        return; // nothing to do
    }
    num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1;
    if( num_done == th->th.th_team_nproc ) {
        // we are the last thread, need to free shared resources
        int idx = pr_buf->th_doacross_buf_idx - 1;
        dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
        KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done);
        KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
        KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
        __kmp_thread_free(th, (void*)sh_buf->doacross_flags);
        sh_buf->doacross_flags = NULL;
        sh_buf->doacross_num_done = 0;
        sh_buf->doacross_buf_idx += __kmp_dispatch_num_buffers; // free buffer for future re-use
    }
    // free private resources (need to keep buffer index forever)
    __kmp_thread_free(th, (void*)pr_buf->th_doacross_info);
    pr_buf->th_doacross_info = NULL;
    KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
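
/*
    Illustrative timeline (not part of the runtime), assuming a dispatch
    buffer count of 7 (__kmp_dispatch_num_buffers is configurable): the loop
    whose private index was 10 used shared slot 10 % 7 == 3; when its last
    thread reaches this point it bumps that slot's doacross_buf_idx from 10
    to 17, so a later doacross loop whose private index is 17 (hashing to the
    same slot) stops waiting in __kmpc_doacross_init and reuses the buffer.
*/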
#endif

// end of file //