1 /*
2  * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "omp.h"        /* extern "C" declarations of user-visible routines */
17 #include "kmp.h"
18 #include "kmp_i18n.h"
19 #include "kmp_itt.h"
20 #include "kmp_lock.h"
21 #include "kmp_error.h"
22 #include "kmp_stats.h"
23 
24 #if OMPT_SUPPORT
25 #include "ompt-internal.h"
26 #include "ompt-specific.h"
27 #endif
28 
29 #define MAX_MESSAGE 512
30 
31 /* ------------------------------------------------------------------------ */
32 /* ------------------------------------------------------------------------ */
33 
34 /*  flags will be used in future, e.g., to implement */
35 /*  openmp_strict library restrictions               */
36 
37 /*!
38  * @ingroup STARTUP_SHUTDOWN
39  * @param loc   in   source location information
40  * @param flags in   for future use (currently ignored)
41  *
42  * Initialize the runtime library. This call is optional; if it is not made then
43  * it will be implicitly called by attempts to use other library functions.
44  *
45  */
46 void
47 __kmpc_begin(ident_t *loc, kmp_int32 flags)
48 {
49     // By default __kmp_ignore_mppbeg() returns TRUE.
50     if (__kmp_ignore_mppbeg() == FALSE) {
51         __kmp_internal_begin();
52 
53         KC_TRACE( 10, ("__kmpc_begin: called\n" ) );
54     }
55 }
56 
57 /*!
58  * @ingroup STARTUP_SHUTDOWN
59  * @param loc source location information
60  *
61  * Shutdown the runtime library. This is also optional, and even if called will not
62  * do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to zero.
63   */
64 void
65 __kmpc_end(ident_t *loc)
66 {
67     // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end() call no-op.
68     // However, this can be overridden with KMP_IGNORE_MPPEND environment variable.
69     // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end()
70     // will unregister this root (it can cause library shut down).
71     if (__kmp_ignore_mppend() == FALSE) {
72         KC_TRACE( 10, ("__kmpc_end: called\n" ) );
73         KA_TRACE( 30, ("__kmpc_end\n" ));
74 
75         __kmp_internal_end_thread( -1 );
76     }
77 }
78 
79 /*!
80 @ingroup THREAD_STATES
81 @param loc Source location information.
82 @return The global thread index of the active thread.
83 
84 This function can be called in any context.
85 
If the runtime has only been entered at the outermost level from a
87 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is that
88 which would be returned by omp_get_thread_num() in the outermost
89 active parallel construct. (Or zero if there is no active parallel
90 construct, since the master thread is necessarily thread zero).
91 
92 If multiple non-OpenMP threads all enter an OpenMP construct then this
93 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
95 OpenMP thread ids returned by omp_get_thread_num()).
96 
97 */
98 kmp_int32
99 __kmpc_global_thread_num(ident_t *loc)
100 {
101     kmp_int32 gtid = __kmp_entry_gtid();
102 
103     KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) );
104 
105     return gtid;
106 }
107 
108 /*!
109 @ingroup THREAD_STATES
110 @param loc Source location information.
111 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
112 
113 This function can be called in any context.
114 It returns the total number of threads under the control of the OpenMP runtime. That is
115 not a number that can be determined by any OpenMP standard calls, since the library may be
116 called from more than one non-OpenMP thread, and this reflects the total over all such calls.
117 Similarly the runtime maintains underlying threads even when they are not active (since the cost
of creating and destroying OS threads is high); this call counts all such threads even if they are not
119 waiting for work.
120 */
121 kmp_int32
122 __kmpc_global_num_threads(ident_t *loc)
123 {
124     KC_TRACE(10,("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
125 
126     return TCR_4(__kmp_all_nth);
127 }
128 
129 /*!
130 @ingroup THREAD_STATES
131 @param loc Source location information.
132 @return The thread number of the calling thread in the innermost active parallel construct.
133 
134 */
135 kmp_int32
136 __kmpc_bound_thread_num(ident_t *loc)
137 {
138     KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) );
139     return __kmp_tid_from_gtid( __kmp_entry_gtid() );
140 }
141 
142 /*!
143 @ingroup THREAD_STATES
144 @param loc Source location information.
145 @return The number of threads in the innermost active parallel construct.
146 */
147 kmp_int32
148 __kmpc_bound_num_threads(ident_t *loc)
149 {
150     KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) );
151 
152     return __kmp_entry_thread() -> th.th_team -> t.t_nproc;
153 }
154 
155 /*!
156  * @ingroup DEPRECATED
157  * @param loc location description
158  *
159  * This function need not be called. It always returns TRUE.
160  */
161 kmp_int32
162 __kmpc_ok_to_fork(ident_t *loc)
163 {
164 #ifndef KMP_DEBUG
165 
166     return TRUE;
167 
168 #else
169 
170     const char *semi2;
171     const char *semi3;
172     int line_no;
173 
174     if (__kmp_par_range == 0) {
175         return TRUE;
176     }
177     semi2 = loc->psource;
178     if (semi2 == NULL) {
179         return TRUE;
180     }
181     semi2 = strchr(semi2, ';');
182     if (semi2 == NULL) {
183         return TRUE;
184     }
185     semi2 = strchr(semi2 + 1, ';');
186     if (semi2 == NULL) {
187         return TRUE;
188     }
189     if (__kmp_par_range_filename[0]) {
190         const char *name = semi2 - 1;
191         while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
192             name--;
193         }
194         if ((*name == '/') || (*name == ';')) {
195             name++;
196         }
197         if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
198             return __kmp_par_range < 0;
199         }
200     }
201     semi3 = strchr(semi2 + 1, ';');
202     if (__kmp_par_range_routine[0]) {
203         if ((semi3 != NULL) && (semi3 > semi2)
204           && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
205             return __kmp_par_range < 0;
206         }
207     }
208     if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
209         if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
210             return __kmp_par_range > 0;
211         }
212         return __kmp_par_range < 0;
213     }
214     return TRUE;
215 
216 #endif /* KMP_DEBUG */
217 
218 }
219 
220 /*!
221 @ingroup THREAD_STATES
222 @param loc Source location information.
223 @return 1 if this thread is executing inside an active parallel region, zero if not.
224 */
225 kmp_int32
226 __kmpc_in_parallel( ident_t *loc )
227 {
228     return __kmp_entry_thread() -> th.th_root -> r.r_active;
229 }
230 
231 /*!
232 @ingroup PARALLEL
233 @param loc source location information
234 @param global_tid global thread number
235 @param num_threads number of threads requested for this parallel construct
236 
237 Set the number of threads to be used by the next fork spawned by this thread.
238 This call is only required if the parallel construct has a `num_threads` clause.
239 */
240 void
241 __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads )
242 {
243     KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
244       global_tid, num_threads ) );
245 
246     __kmp_push_num_threads( loc, global_tid, num_threads );
247 }
248 
249 void
250 __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid )
251 {
252     KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) );
253 
254     /* the num_threads are automatically popped */
255 }
256 
257 
258 #if OMP_40_ENABLED
259 
260 void
261 __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind )
262 {
263     KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n",
264       global_tid, proc_bind ) );
265 
266     __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind );
267 }
268 
269 #endif /* OMP_40_ENABLED */
270 
271 
272 /*!
273 @ingroup PARALLEL
274 @param loc  source location information
275 @param argc  total number of arguments in the ellipsis
276 @param microtask  pointer to callback routine consisting of outlined parallel construct
277 @param ...  pointers to shared variables that aren't global
278 
279 Do the actual fork and call the microtask in the relevant number of threads.
280 */
281 void
282 __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
283 {
284   int         gtid = __kmp_entry_gtid();
285 
286 #if (KMP_STATS_ENABLED)
287   int inParallel = __kmpc_in_parallel(loc);
288   if (inParallel)
289   {
290       KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
291   }
292   else
293   {
294       KMP_COUNT_BLOCK(OMP_PARALLEL);
295   }
296 #endif
297 
  // maybe saving thr_state is enough here
299   {
300     va_list     ap;
301     va_start(   ap, microtask );
302 
303 #if OMPT_SUPPORT
304     ompt_frame_t* ompt_frame;
305     if (ompt_enabled) {
306        kmp_info_t *master_th = __kmp_threads[ gtid ];
307        kmp_team_t *parent_team = master_th->th.th_team;
308        ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
309        if (lwt)
310          ompt_frame = &(lwt->ompt_task_info.frame);
311        else
312        {
313          int tid = __kmp_tid_from_gtid( gtid );
314          ompt_frame = &(parent_team->t.t_implicit_task_taskdata[tid].
315          ompt_task_info.frame);
316        }
317        ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
318     }
319 #endif
320 
321 #if INCLUDE_SSC_MARKS
322     SSC_MARK_FORKING();
323 #endif
324     __kmp_fork_call( loc, gtid, fork_context_intel,
325             argc,
326 #if OMPT_SUPPORT
327             VOLATILE_CAST(void *) microtask,      // "unwrapped" task
328 #endif
329             VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
330             VOLATILE_CAST(launch_t)    __kmp_invoke_task_func,
331 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
332 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
333             &ap
334 #else
335             ap
336 #endif
337             );
338 #if INCLUDE_SSC_MARKS
339     SSC_MARK_JOINING();
340 #endif
341     __kmp_join_call( loc, gtid
342 #if OMPT_SUPPORT
343         , fork_context_intel
344 #endif
345     );
346 
347     va_end( ap );
348 
349   }
350 }
351 
352 #if OMP_40_ENABLED
353 /*!
354 @ingroup PARALLEL
355 @param loc source location information
356 @param global_tid global thread number
357 @param num_teams number of teams requested for the teams construct
358 @param num_threads number of threads per team requested for the teams construct
359 
360 Set the number of teams to be used by the teams construct.
361 This call is only required if the teams construct has a `num_teams` clause
362 or a `thread_limit` clause (or both).
363 */
364 void
365 __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads )
366 {
367     KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
368       global_tid, num_teams, num_threads ) );
369 
370     __kmp_push_num_teams( loc, global_tid, num_teams, num_threads );
371 }
372 
373 /*!
374 @ingroup PARALLEL
375 @param loc  source location information
376 @param argc  total number of arguments in the ellipsis
377 @param microtask  pointer to callback routine consisting of outlined teams construct
378 @param ...  pointers to shared variables that aren't global
379 
380 Do the actual fork and call the microtask in the relevant number of threads.
381 */
382 void
383 __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
384 {
385     int         gtid = __kmp_entry_gtid();
386     kmp_info_t *this_thr = __kmp_threads[ gtid ];
387     va_list     ap;
388     va_start(   ap, microtask );
389 
390     KMP_COUNT_BLOCK(OMP_TEAMS);
391 
392     // remember teams entry point and nesting level
393     this_thr->th.th_teams_microtask = microtask;
394     this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
395 
396 #if OMPT_SUPPORT
397     kmp_team_t *parent_team = this_thr->th.th_team;
398     int tid = __kmp_tid_from_gtid( gtid );
399     if (ompt_enabled) {
400         parent_team->t.t_implicit_task_taskdata[tid].
401            ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(1);
402     }
403 #endif
404 
    // check if __kmpc_push_num_teams was called; set the default number of teams otherwise
406     if ( this_thr->th.th_teams_size.nteams == 0 ) {
407         __kmp_push_num_teams( loc, gtid, 0, 0 );
408     }
409     KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
410     KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
411     KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
412 
413     __kmp_fork_call( loc, gtid, fork_context_intel,
414             argc,
415 #if OMPT_SUPPORT
416             VOLATILE_CAST(void *) microtask,               // "unwrapped" task
417 #endif
418             VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
419             VOLATILE_CAST(launch_t)    __kmp_invoke_teams_master,
420 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
421             &ap
422 #else
423             ap
424 #endif
425             );
426     __kmp_join_call( loc, gtid
427 #if OMPT_SUPPORT
428         , fork_context_intel
429 #endif
430     );
431 
432     this_thr->th.th_teams_microtask = NULL;
433     this_thr->th.th_teams_level = 0;
434     *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L;
435     va_end( ap );
436 }
437 #endif /* OMP_40_ENABLED */
438 
439 
440 //
441 // I don't think this function should ever have been exported.
442 // The __kmpc_ prefix was misapplied.  I'm fairly certain that no generated
443 // openmp code ever called it, but it's been exported from the RTL for so
444 // long that I'm afraid to remove the definition.
445 //
446 int
447 __kmpc_invoke_task_func( int gtid )
448 {
449     return __kmp_invoke_task_func( gtid );
450 }
451 
452 /*!
453 @ingroup PARALLEL
454 @param loc  source location information
455 @param global_tid  global thread number
456 
457 Enter a serialized parallel construct. This interface is used to handle a
458 conditional parallel region, like this,
459 @code
460 #pragma omp parallel if (condition)
461 @endcode
462 when the condition is false.
463 */
464 void
465 __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
466 {
467     // The implementation is now in kmp_runtime.cpp so that it can share static
468     // functions with kmp_fork_call since the tasks to be done are similar in
469     // each case.
470     __kmp_serialized_parallel(loc, global_tid);
471 }
472 
473 /*!
474 @ingroup PARALLEL
475 @param loc  source location information
476 @param global_tid  global thread number
477 
478 Leave a serialized parallel construct.
479 */
480 void
481 __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
482 {
483     kmp_internal_control_t *top;
484     kmp_info_t *this_thr;
485     kmp_team_t *serial_team;
486 
487     KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) );
488 
489     /* skip all this code for autopar serialized loops since it results in
490        unacceptable overhead */
491     if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
492         return;
493 
494     // Not autopar code
495     if( ! TCR_4( __kmp_init_parallel ) )
496         __kmp_parallel_initialize();
497 
498     this_thr    = __kmp_threads[ global_tid ];
499     serial_team = this_thr->th.th_serial_team;
500 
#if OMP_45_ENABLED
    kmp_task_team_t *task_team = this_thr->th.th_task_team;

    // we need to wait for the proxy tasks before finishing the thread
    if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
        __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL) ); // is an ITT object needed here?
#endif
508 
509     KMP_MB();
510     KMP_DEBUG_ASSERT( serial_team );
511     KMP_ASSERT(       serial_team -> t.t_serialized );
512     KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team );
513     KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team );
514     KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
515     KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
516 
517     /* If necessary, pop the internal control stack values and replace the team values */
518     top = serial_team -> t.t_control_stack_top;
519     if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) {
520         copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top );
521         serial_team -> t.t_control_stack_top = top -> next;
522         __kmp_free(top);
523     }
524 
525     //if( serial_team -> t.t_serialized > 1 )
526     serial_team -> t.t_level--;
527 
528     /* pop dispatch buffers stack */
529     KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
530     {
531         dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer;
532         serial_team->t.t_dispatch->th_disp_buffer =
533             serial_team->t.t_dispatch->th_disp_buffer->next;
534         __kmp_free( disp_buffer );
535     }
536 
537     -- serial_team -> t.t_serialized;
538     if ( serial_team -> t.t_serialized == 0 ) {
539 
540         /* return to the parallel section */
541 
542 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
543         if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) {
544             __kmp_clear_x87_fpu_status_word();
545             __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
546             __kmp_load_mxcsr( &serial_team->t.t_mxcsr );
547         }
548 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
549 
550         this_thr -> th.th_team           = serial_team -> t.t_parent;
551         this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid;
552 
553         /* restore values cached in the thread */
554         this_thr -> th.th_team_nproc     = serial_team -> t.t_parent -> t.t_nproc;          /*  JPH */
555         this_thr -> th.th_team_master    = serial_team -> t.t_parent -> t.t_threads[0];     /* JPH */
556         this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized;
557 
558         /* TODO the below shouldn't need to be adjusted for serialized teams */
559         this_thr -> th.th_dispatch       = & this_thr -> th.th_team ->
560             t.t_dispatch[ serial_team -> t.t_master_tid ];
561 
562         __kmp_pop_current_task_from_thread( this_thr );
563 
564         KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 );
565         this_thr -> th.th_current_task -> td_flags.executing = 1;
566 
567         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
568             // Copy the task team from the new child / old parent team to the thread.
569             this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
570             KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n",
571                             global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
572         }
573     } else {
574         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
575             KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n",
576                             global_tid, serial_team, serial_team -> t.t_serialized ) );
577         }
578     }
579 
580     if ( __kmp_env_consistency_check )
581         __kmp_pop_parallel( global_tid, NULL );
582 }
583 
584 /*!
585 @ingroup SYNCHRONIZATION
586 @param loc  source location information.
587 
588 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
589 depending on the memory ordering convention obeyed by the compiler
590 even that may not be necessary).
591 */
592 void
593 __kmpc_flush(ident_t *loc)
594 {
595     KC_TRACE( 10, ("__kmpc_flush: called\n" ) );
596 
597     /* need explicit __mf() here since use volatile instead in library */
598     KMP_MB();       /* Flush all pending memory write invalidates.  */
599 
600     #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
601         #if KMP_MIC
602             // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
603             // We shouldn't need it, though, since the ABI rules require that
604             // * If the compiler generates NGO stores it also generates the fence
605             // * If users hand-code NGO stores they should insert the fence
606             // therefore no incomplete unordered stores should be visible.
607         #else
            // C74404
            // This is to address non-temporal store instructions (sfence needed).
            // The clflush instruction also needs to be addressed (mfence needed).
            // Probably the non-temporal load movntdqa should also be addressed.
            // mfence is an SSE2 instruction. Do not execute it if the CPU does not support SSE2.
613             if ( ! __kmp_cpuinfo.initialized ) {
614                 __kmp_query_cpuid( & __kmp_cpuinfo );
615             }; // if
616             if ( ! __kmp_cpuinfo.sse2 ) {
617                 // CPU cannot execute SSE2 instructions.
618             } else {
619                 #if KMP_COMPILER_ICC
620                 _mm_mfence();
621                 #elif KMP_COMPILER_MSVC
622                 MemoryBarrier();
623                 #else
624                 __sync_synchronize();
625                 #endif // KMP_COMPILER_ICC
626             }; // if
627         #endif // KMP_MIC
628     #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
        // Nothing to see here; move along
630     #elif KMP_ARCH_PPC64
631         // Nothing needed here (we have a real MB above).
632         #if KMP_OS_CNK
            // The flushing thread needs to yield here; this prevents a
            // busy-waiting thread from saturating the pipeline. flush is
            // often used in loops like this:
            //   while (!flag) {
            //     #pragma omp flush(flag)
            //   }
            // and adding the yield here is good for at least a 10x speedup
            // when running >2 threads per core (on the NAS LU benchmark).
641             __kmp_yield(TRUE);
642         #endif
643     #else
644         #error Unknown or unsupported architecture
645     #endif
646 
647 }
648 
649 /* -------------------------------------------------------------------------- */
650 
651 /* -------------------------------------------------------------------------- */
652 
653 /*!
654 @ingroup SYNCHRONIZATION
655 @param loc source location information
656 @param global_tid thread id.
657 
658 Execute a barrier.
659 */
660 void
661 __kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
662 {
663     KMP_COUNT_BLOCK(OMP_BARRIER);
664     KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
665 
666     if (! TCR_4(__kmp_init_parallel))
667         __kmp_parallel_initialize();
668 
669     if ( __kmp_env_consistency_check ) {
670         if ( loc == 0 ) {
671             KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
672         }; // if
673 
674         __kmp_check_barrier( global_tid, ct_barrier, loc );
675     }
676 
677 #if OMPT_SUPPORT && OMPT_TRACE
678     ompt_frame_t * ompt_frame;
679     if (ompt_enabled ) {
680         ompt_frame = __ompt_get_task_frame_internal(0);
681         if ( ompt_frame->reenter_runtime_frame == NULL )
682             ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
683     }
684 #endif
685     __kmp_threads[ global_tid ]->th.th_ident = loc;
686     // TODO: explicit barrier_wait_id:
687     //   this function is called when 'barrier' directive is present or
688     //   implicit barrier at the end of a worksharing construct.
689     // 1) better to add a per-thread barrier counter to a thread data structure
690     // 2) set to 0 when a new team is created
    // 3) no sync is required
692 
693     __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
694 #if OMPT_SUPPORT && OMPT_TRACE
695     if (ompt_enabled ) {
696         ompt_frame->reenter_runtime_frame = NULL;
697     }
698 #endif
699 }
700 
701 /* The BARRIER for a MASTER section is always explicit   */
702 /*!
703 @ingroup WORK_SHARING
704 @param loc  source location information.
@param global_tid  global thread number.
706 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
707 */
708 kmp_int32
709 __kmpc_master(ident_t *loc, kmp_int32 global_tid)
710 {
711     int status = 0;
712 
713     KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) );
714 
715     if( ! TCR_4( __kmp_init_parallel ) )
716         __kmp_parallel_initialize();
717 
718     if( KMP_MASTER_GTID( global_tid )) {
719         KMP_COUNT_BLOCK(OMP_MASTER);
720         KMP_PUSH_PARTITIONED_TIMER(OMP_master);
721         status = 1;
722     }
723 
724 #if OMPT_SUPPORT && OMPT_TRACE
725     if (status) {
726         if (ompt_enabled &&
727             ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
728             kmp_info_t  *this_thr        = __kmp_threads[ global_tid ];
729             kmp_team_t  *team            = this_thr -> th.th_team;
730 
731             int  tid = __kmp_tid_from_gtid( global_tid );
732             ompt_callbacks.ompt_callback(ompt_event_master_begin)(
733                 team->t.ompt_team_info.parallel_id,
734                 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
735         }
736     }
737 #endif
738 
739     if ( __kmp_env_consistency_check ) {
740 #if KMP_USE_DYNAMIC_LOCK
741         if (status)
742             __kmp_push_sync( global_tid, ct_master, loc, NULL, 0 );
743         else
744             __kmp_check_sync( global_tid, ct_master, loc, NULL, 0 );
745 #else
746         if (status)
747             __kmp_push_sync( global_tid, ct_master, loc, NULL );
748         else
749             __kmp_check_sync( global_tid, ct_master, loc, NULL );
750 #endif
751     }
752 
753     return status;
754 }
755 
756 /*!
757 @ingroup WORK_SHARING
758 @param loc  source location information.
@param global_tid  global thread number.
760 
761 Mark the end of a <tt>master</tt> region. This should only be called by the thread
762 that executes the <tt>master</tt> region.
763 */
764 void
765 __kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
766 {
767     KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
768 
769     KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
770     KMP_POP_PARTITIONED_TIMER();
771 
772 #if OMPT_SUPPORT && OMPT_TRACE
773     kmp_info_t  *this_thr        = __kmp_threads[ global_tid ];
774     kmp_team_t  *team            = this_thr -> th.th_team;
775     if (ompt_enabled &&
776         ompt_callbacks.ompt_callback(ompt_event_master_end)) {
777         int  tid = __kmp_tid_from_gtid( global_tid );
778         ompt_callbacks.ompt_callback(ompt_event_master_end)(
779             team->t.ompt_team_info.parallel_id,
780             team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
781     }
782 #endif
783 
784     if ( __kmp_env_consistency_check ) {
785         if( global_tid < 0 )
786             KMP_WARNING( ThreadIdentInvalid );
787 
788         if( KMP_MASTER_GTID( global_tid ))
789             __kmp_pop_sync( global_tid, ct_master, loc );
790     }
791 }
792 
793 /*!
794 @ingroup WORK_SHARING
795 @param loc  source location information.
796 @param gtid  global thread number.
797 
798 Start execution of an <tt>ordered</tt> construct.
799 */
800 void
801 __kmpc_ordered( ident_t * loc, kmp_int32 gtid )
802 {
803     int cid = 0;
804     kmp_info_t *th;
805     KMP_DEBUG_ASSERT( __kmp_init_serial );
806 
807     KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid ));
808 
809     if (! TCR_4(__kmp_init_parallel))
810         __kmp_parallel_initialize();
811 
812 #if USE_ITT_BUILD
813     __kmp_itt_ordered_prep( gtid );
814     // TODO: ordered_wait_id
815 #endif /* USE_ITT_BUILD */
816 
817     th = __kmp_threads[ gtid ];
818 
819 #if OMPT_SUPPORT && OMPT_TRACE
820     if (ompt_enabled) {
821         /* OMPT state update */
822         th->th.ompt_thread_info.wait_id = (uint64_t) loc;
823         th->th.ompt_thread_info.state = ompt_state_wait_ordered;
824 
825         /* OMPT event callback */
826         if (ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
827             ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
828                 th->th.ompt_thread_info.wait_id);
829         }
830     }
831 #endif
832 
833     if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
834         (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
835     else
836         __kmp_parallel_deo( & gtid, & cid, loc );
837 
838 #if OMPT_SUPPORT && OMPT_TRACE
839     if (ompt_enabled) {
840         /* OMPT state update */
841         th->th.ompt_thread_info.state = ompt_state_work_parallel;
842         th->th.ompt_thread_info.wait_id = 0;
843 
844         /* OMPT event callback */
845         if (ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
846             ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
847                 th->th.ompt_thread_info.wait_id);
848         }
849     }
850 #endif
851 
852 #if USE_ITT_BUILD
853     __kmp_itt_ordered_start( gtid );
854 #endif /* USE_ITT_BUILD */
855 }
856 
857 /*!
858 @ingroup WORK_SHARING
859 @param loc  source location information.
860 @param gtid  global thread number.
861 
862 End execution of an <tt>ordered</tt> construct.
863 */
864 void
865 __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
866 {
867     int cid = 0;
868     kmp_info_t *th;
869 
870     KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) );
871 
872 #if USE_ITT_BUILD
873     __kmp_itt_ordered_end( gtid );
874     // TODO: ordered_wait_id
875 #endif /* USE_ITT_BUILD */
876 
877     th = __kmp_threads[ gtid ];
878 
879     if ( th -> th.th_dispatch -> th_dxo_fcn != 0 )
880         (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
881     else
882         __kmp_parallel_dxo( & gtid, & cid, loc );
883 
884 #if OMPT_SUPPORT && OMPT_BLAME
885     if (ompt_enabled &&
886         ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
887         ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
888             th->th.ompt_thread_info.wait_id);
889     }
890 #endif
891 }
892 
893 #if KMP_USE_DYNAMIC_LOCK
894 
895 static __forceinline void
896 __kmp_init_indirect_csptr(kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid, kmp_indirect_locktag_t tag)
897 {
898     // Pointer to the allocated indirect lock is written to crit, while indexing is ignored.
899     void *idx;
900     kmp_indirect_lock_t **lck;
901     lck = (kmp_indirect_lock_t **)crit;
902     kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
903     KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
904     KMP_SET_I_LOCK_LOCATION(ilk, loc);
905     KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
906     KA_TRACE(20, ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
907 #if USE_ITT_BUILD
908     __kmp_itt_critical_creating(ilk->lock, loc);
909 #endif
910     int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk);
911     if (status == 0) {
912 #if USE_ITT_BUILD
913         __kmp_itt_critical_destroyed(ilk->lock);
914 #endif
915         // We don't really need to destroy the unclaimed lock here since it will be cleaned up at program exit.
916         //KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
917     }
918     KMP_DEBUG_ASSERT(*lck != NULL);
919 }
920 
921 // Fast-path acquire tas lock
922 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) {                                                                       \
923     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                                                  \
924     if (l->lk.poll != KMP_LOCK_FREE(tas) ||                                                                      \
925             ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) {      \
926         kmp_uint32 spins;                                                                                        \
927         KMP_FSYNC_PREPARE(l);                                                                                    \
928         KMP_INIT_YIELD(spins);                                                                                   \
929         if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {                            \
930             KMP_YIELD(TRUE);                                                                                     \
931         } else {                                                                                                 \
932             KMP_YIELD_SPIN(spins);                                                                               \
933         }                                                                                                        \
934         kmp_backoff_t backoff = __kmp_spin_backoff_params;                                                       \
935         while (l->lk.poll != KMP_LOCK_FREE(tas) ||                                                               \
936                ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) {   \
937             __kmp_spin_backoff(&backoff);                                                                        \
938             if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {                        \
939                 KMP_YIELD(TRUE);                                                                                 \
940             } else {                                                                                             \
941                 KMP_YIELD_SPIN(spins);                                                                           \
942             }                                                                                                    \
943         }                                                                                                        \
944     }                                                                                                            \
945     KMP_FSYNC_ACQUIRED(l);                                                                                       \
946 }
947 
948 // Fast-path test tas lock
949 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) {                                                            \
950     kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                                        \
951     rc = l->lk.poll == KMP_LOCK_FREE(tas) &&                                                           \
952          KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas));   \
953 }
954 
955 // Fast-path release tas lock
956 #define KMP_RELEASE_TAS_LOCK(lock, gtid) {                          \
957     TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas));   \
958     KMP_MB();                                                       \
959 }
960 
961 #if KMP_USE_FUTEX
962 
963 # include <unistd.h>
964 # include <sys/syscall.h>
965 # ifndef FUTEX_WAIT
966 #  define FUTEX_WAIT 0
967 # endif
968 # ifndef FUTEX_WAKE
969 #  define FUTEX_WAKE 1
970 # endif
971 
972 // Fast-path acquire futex lock
973 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) {                                                                        \
974     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                                                               \
975     kmp_int32 gtid_code = (gtid+1) << 1;                                                                            \
976     KMP_MB();                                                                                                       \
977     KMP_FSYNC_PREPARE(ftx);                                                                                         \
978     kmp_int32 poll_val;                                                                                             \
979     while ((poll_val = KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),                           \
980                                                    KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {     \
981         kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                                                              \
982         if (!cond) {                                                                                                \
983             if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, poll_val | KMP_LOCK_BUSY(1, futex))) {      \
984                 continue;                                                                                           \
985             }                                                                                                       \
986             poll_val |= KMP_LOCK_BUSY(1, futex);                                                                    \
987         }                                                                                                           \
988         kmp_int32 rc;                                                                                               \
989         if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) {                \
990             continue;                                                                                               \
991         }                                                                                                           \
992         gtid_code |= 1;                                                                                             \
993     }                                                                                                               \
994     KMP_FSYNC_ACQUIRED(ftx);                                                                                        \
995 }
996 
997 // Fast-path test futex lock
998 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) {                                                                       \
999     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                                                               \
1000     if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), KMP_LOCK_BUSY(gtid+1 << 1, futex))) {    \
1001         KMP_FSYNC_ACQUIRED(ftx);                                                                                    \
1002         rc = TRUE;                                                                                                  \
1003     } else {                                                                                                        \
1004         rc = FALSE;                                                                                                 \
1005     }                                                                                                               \
1006 }
1007 
1008 // Fast-path release futex lock
1009 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) {                                                        \
1010     kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                                               \
1011     KMP_MB();                                                                                       \
1012     KMP_FSYNC_RELEASING(ftx);                                                                       \
1013     kmp_int32 poll_val = KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));                   \
1014     if (KMP_LOCK_STRIP(poll_val) & 1) {                                                             \
1015         syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);   \
1016     }                                                                                               \
1017     KMP_MB();                                                                                       \
1018     KMP_YIELD(TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));              \
1019 }
1020 
1021 #endif // KMP_USE_FUTEX
1022 
1023 #else // KMP_USE_DYNAMIC_LOCK
1024 
1025 static kmp_user_lock_p
1026 __kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid )
1027 {
1028     kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1029 
1030     //
1031     // Because of the double-check, the following load
1032     // doesn't need to be volatile.
1033     //
1034     kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1035 
1036     if ( lck == NULL ) {
1037         void * idx;
1038 
1039         // Allocate & initialize the lock.
1040         // Remember allocated locks in table in order to free them in __kmp_cleanup()
1041         lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section );
1042         __kmp_init_user_lock_with_checks( lck );
1043         __kmp_set_user_lock_location( lck, loc );
1044 #if USE_ITT_BUILD
1045         __kmp_itt_critical_creating( lck );
            // __kmp_itt_critical_creating() should be called *before* the first usage of the underlying
            // lock. It is the only place where we can guarantee it. There is a chance the lock will be
            // destroyed with no usage, but that is not a problem, because this is not a real event seen
            // by the user but rather the setting of a name for the object (lock). See more details in kmp_itt.h.
1050 #endif /* USE_ITT_BUILD */
1051 
1052         //
1053         // Use a cmpxchg instruction to slam the start of the critical
1054         // section with the lock pointer.  If another thread beat us
1055         // to it, deallocate the lock, and use the lock that the other
1056         // thread allocated.
1057         //
1058         int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck );
1059 
1060         if ( status == 0 ) {
1061             // Deallocate the lock and reload the value.
1062 #if USE_ITT_BUILD
1063             __kmp_itt_critical_destroyed( lck );
1064                 // Let ITT know the lock is destroyed and the same memory location may be reused for
1065                 // another purpose.
1066 #endif /* USE_ITT_BUILD */
1067             __kmp_destroy_user_lock_with_checks( lck );
1068             __kmp_user_lock_free( &idx, gtid, lck );
1069             lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1070             KMP_DEBUG_ASSERT( lck != NULL );
1071         }
1072     }
1073     return lck;
1074 }
1075 
1076 #endif // KMP_USE_DYNAMIC_LOCK
1077 
1078 /*!
1079 @ingroup WORK_SHARING
1080 @param loc  source location information.
@param global_tid  global thread number.
1082 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or
1083 some other suitably unique value.
1084 
1085 Enter code protected by a `critical` construct.
1086 This function blocks until the executing thread can enter the critical section.
1087 */
1088 void
1089 __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit )
1090 {
1091 #if KMP_USE_DYNAMIC_LOCK
1092     __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1093 #else
1094     KMP_COUNT_BLOCK(OMP_CRITICAL);
1095     KMP_TIME_PARTITIONED_BLOCK(OMP_critical_wait);        /* Time spent waiting to enter the critical section */
1096     kmp_user_lock_p lck;
1097 
1098     KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
1099 
1100     //TODO: add THR_OVHD_STATE
1101 
1102     KMP_CHECK_USER_LOCK_INIT();
1103 
1104     if ( ( __kmp_user_lock_kind == lk_tas )
1105       && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1106         lck = (kmp_user_lock_p)crit;
1107     }
1108 #if KMP_USE_FUTEX
1109     else if ( ( __kmp_user_lock_kind == lk_futex )
1110       && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1111         lck = (kmp_user_lock_p)crit;
1112     }
1113 #endif
1114     else { // ticket, queuing or drdpa
1115         lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
1116     }
1117 
1118     if ( __kmp_env_consistency_check )
1119         __kmp_push_sync( global_tid, ct_critical, loc, lck );
1120 
1121     /* since the critical directive binds to all threads, not just
1122      * the current team we have to check this even if we are in a
1123      * serialized team */
1124     /* also, even if we are the uber thread, we still have to conduct the lock,
1125      * as we have to contend with sibling threads */
1126 
1127 #if USE_ITT_BUILD
1128     __kmp_itt_critical_acquiring( lck );
1129 #endif /* USE_ITT_BUILD */
    // The value of 'crit' should be usable as the critical_id of the critical section directive.
1131     __kmp_acquire_user_lock_with_checks( lck, global_tid );
1132 
1133 #if USE_ITT_BUILD
1134     __kmp_itt_critical_acquired( lck );
1135 #endif /* USE_ITT_BUILD */
1136 
1137     KMP_START_EXPLICIT_TIMER(OMP_critical);
1138     KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
1139 #endif // KMP_USE_DYNAMIC_LOCK
1140 }
1141 
1142 #if KMP_USE_DYNAMIC_LOCK
1143 
1144 // Converts the given hint to an internal lock implementation
1145 static __forceinline kmp_dyna_lockseq_t
1146 __kmp_map_hint_to_lock(uintptr_t hint)
1147 {
1148 #if KMP_USE_TSX
1149 # define KMP_TSX_LOCK(seq) lockseq_##seq
1150 #else
1151 # define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1152 #endif
1153 
1154 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1155 # define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1156 #else
1157 # define KMP_CPUINFO_RTM 0
1158 #endif
1159 
1160     // Hints that do not require further logic
1161     if (hint & kmp_lock_hint_hle)
1162         return KMP_TSX_LOCK(hle);
1163     if (hint & kmp_lock_hint_rtm)
1164         return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm): __kmp_user_lock_seq;
1165     if (hint & kmp_lock_hint_adaptive)
1166         return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive): __kmp_user_lock_seq;
1167 
1168     // Rule out conflicting hints first by returning the default lock
1169     if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1170         return __kmp_user_lock_seq;
1171     if ((hint & omp_lock_hint_speculative) && (hint & omp_lock_hint_nonspeculative))
1172         return __kmp_user_lock_seq;
1173 
1174     // Do not even consider speculation when it appears to be contended
1175     if (hint & omp_lock_hint_contended)
1176         return lockseq_queuing;
1177 
1178     // Uncontended lock without speculation
1179     if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1180         return lockseq_tas;
1181 
1182     // HLE lock for speculation
1183     if (hint & omp_lock_hint_speculative)
1184         return KMP_TSX_LOCK(hle);
1185 
1186     return __kmp_user_lock_seq;
1187 }
1188 
1189 /*!
1190 @ingroup WORK_SHARING
1191 @param loc  source location information.
1192 @param global_tid  global thread number.
1193 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section,
1194 or some other suitably unique value.
1195 @param hint the lock hint.
1196 
1197 Enter code protected by a `critical` construct with a hint. The hint value is used to suggest a lock implementation.
1198 This function blocks until the executing thread can enter the critical section unless the hint suggests use of
1199 speculative execution and the hardware supports it.
1200 */
1201 void
1202 __kmpc_critical_with_hint( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit, uintptr_t hint )
1203 {
1204     KMP_COUNT_BLOCK(OMP_CRITICAL);
1205     kmp_user_lock_p lck;
1206 
1207     KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
1208 
1209     kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1210     // Check if it is initialized.
1211     if (*lk == 0) {
1212         kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1213         if (KMP_IS_D_LOCK(lckseq)) {
1214             KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(lckseq));
1215         } else {
1216             __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1217         }
1218     }
1219     // Branch for accessing the actual lock object and set operation. This branching is inevitable since
1220     // this lock initialization does not follow the normal dispatch path (lock table is not used).
1221     if (KMP_EXTRACT_D_TAG(lk) != 0) {
1222         lck = (kmp_user_lock_p)lk;
1223         if (__kmp_env_consistency_check) {
1224             __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint));
1225         }
1226 # if USE_ITT_BUILD
1227         __kmp_itt_critical_acquiring(lck);
1228 # endif
1229 # if KMP_USE_INLINED_TAS
1230         if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1231             KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1232         } else
1233 # elif KMP_USE_INLINED_FUTEX
1234         if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1235             KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1236         } else
1237 # endif
1238         {
1239             KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1240         }
1241     } else {
1242         kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1243         lck = ilk->lock;
1244         if (__kmp_env_consistency_check) {
1245             __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint));
1246         }
1247 # if USE_ITT_BUILD
1248         __kmp_itt_critical_acquiring(lck);
1249 # endif
1250         KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1251     }
1252 
1253 #if USE_ITT_BUILD
1254     __kmp_itt_critical_acquired( lck );
1255 #endif /* USE_ITT_BUILD */
1256 
1257     KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1258     KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
1259 } // __kmpc_critical_with_hint
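
/* A sketch of the lowering of
       #pragma omp critical (name) hint(omp_lock_hint_speculative)
   The static lock word is zero-initialized and shared by all uses of the same
   critical name; the ident_t and the names below are hypothetical:

   static kmp_critical_name crit_name_speculative;

   static void lower_critical_hint(ident_t *loc, kmp_int32 gtid) {
       __kmpc_critical_with_hint(loc, gtid, &crit_name_speculative,
                                 omp_lock_hint_speculative);
       // ... protected code ...
       __kmpc_end_critical(loc, gtid, &crit_name_speculative);
   }
*/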
1260 
1261 #endif // KMP_USE_DYNAMIC_LOCK
1262 
1263 /*!
1264 @ingroup WORK_SHARING
1265 @param loc  source location information.
@param global_tid  global thread number.
1267 @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or
1268 some other suitably unique value.
1269 
1270 Leave a critical section, releasing any lock that was held during its execution.
1271 */
1272 void
1273 __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
1274 {
1275     kmp_user_lock_p lck;
1276 
1277     KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid ));
1278 
1279 #if KMP_USE_DYNAMIC_LOCK
1280     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1281         lck = (kmp_user_lock_p)crit;
1282         KMP_ASSERT(lck != NULL);
1283         if (__kmp_env_consistency_check) {
1284             __kmp_pop_sync(global_tid, ct_critical, loc);
1285         }
1286 # if USE_ITT_BUILD
1287         __kmp_itt_critical_releasing( lck );
1288 # endif
1289 # if KMP_USE_INLINED_TAS
1290         if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1291             KMP_RELEASE_TAS_LOCK(lck, global_tid);
1292         } else
1293 # elif KMP_USE_INLINED_FUTEX
1294         if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1295             KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1296         } else
1297 # endif
1298         {
1299             KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1300         }
1301     } else {
1302         kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1303         KMP_ASSERT(ilk != NULL);
1304         lck = ilk->lock;
1305         if (__kmp_env_consistency_check) {
1306             __kmp_pop_sync(global_tid, ct_critical, loc);
1307         }
1308 # if USE_ITT_BUILD
1309         __kmp_itt_critical_releasing( lck );
1310 # endif
1311         KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1312     }
1313 
1314 #else // KMP_USE_DYNAMIC_LOCK
1315 
1316     if ( ( __kmp_user_lock_kind == lk_tas )
1317       && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1318         lck = (kmp_user_lock_p)crit;
1319     }
1320 #if KMP_USE_FUTEX
1321     else if ( ( __kmp_user_lock_kind == lk_futex )
1322       && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1323         lck = (kmp_user_lock_p)crit;
1324     }
1325 #endif
1326     else { // ticket, queuing or drdpa
1327         lck = (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit));
1328     }
1329 
1330     KMP_ASSERT(lck != NULL);
1331 
1332     if ( __kmp_env_consistency_check )
1333         __kmp_pop_sync( global_tid, ct_critical, loc );
1334 
1335 #if USE_ITT_BUILD
1336     __kmp_itt_critical_releasing( lck );
1337 #endif /* USE_ITT_BUILD */
    // The value of 'crit' should be usable as the critical_id of the critical section directive.
1339     __kmp_release_user_lock_with_checks( lck, global_tid );
1340 
1341 #if OMPT_SUPPORT && OMPT_BLAME
1342     if (ompt_enabled &&
1343         ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
1344         ompt_callbacks.ompt_callback(ompt_event_release_critical)(
1345             (uint64_t) lck);
1346     }
1347 #endif
1348 
1349 #endif // KMP_USE_DYNAMIC_LOCK
1350     KMP_POP_PARTITIONED_TIMER();
1351     KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
1352 }
1353 
1354 /*!
1355 @ingroup SYNCHRONIZATION
1356 @param loc source location information
1357 @param global_tid thread id.
1358 @return one if the thread should execute the master block, zero otherwise
1359 
1360 Start execution of a combined barrier and master. The barrier is executed inside this function.
1361 */
1362 kmp_int32
1363 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid)
1364 {
1365     int status;
1366 
1367     KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) );
1368 
1369     if (! TCR_4(__kmp_init_parallel))
1370         __kmp_parallel_initialize();
1371 
1372     if ( __kmp_env_consistency_check )
1373         __kmp_check_barrier( global_tid, ct_barrier, loc );
1374 
1375 #if USE_ITT_NOTIFY
1376     __kmp_threads[global_tid]->th.th_ident = loc;
1377 #endif
1378     status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL );
1379 
1380     return (status != 0) ? 0 : 1;
1381 }
1382 
1383 /*!
1384 @ingroup SYNCHRONIZATION
1385 @param loc source location information
1386 @param global_tid thread id.
1387 
1388 Complete the execution of a combined barrier and master. This function should
1389 only be called at the completion of the <tt>master</tt> code. Other threads will
1390 still be waiting at the barrier and this call releases them.
1391 */
1392 void
1393 __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid)
1394 {
1395     KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid ));
1396 
1397     __kmp_end_split_barrier ( bs_plain_barrier, global_tid );
1398 }
1399 
1400 /*!
1401 @ingroup SYNCHRONIZATION
1402 @param loc source location information
1403 @param global_tid thread id.
1404 @return one if the thread should execute the master block, zero otherwise
1405 
1406 Start execution of a combined barrier and master(nowait) construct.
1407 The barrier is executed inside this function.
There is no equivalent "end" function, since the other threads do not wait for the
master section to finish (it is a nowait construct), so there is nothing to release.
1409 */
1410 kmp_int32
1411 __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid )
1412 {
1413     kmp_int32 ret;
1414 
1415     KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid ));
1416 
1417     if (! TCR_4(__kmp_init_parallel))
1418         __kmp_parallel_initialize();
1419 
1420     if ( __kmp_env_consistency_check ) {
1421         if ( loc == 0 ) {
1422             KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
1423         }
1424         __kmp_check_barrier( global_tid, ct_barrier, loc );
1425     }
1426 
1427 #if USE_ITT_NOTIFY
1428     __kmp_threads[global_tid]->th.th_ident = loc;
1429 #endif
1430     __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
1431 
1432     ret = __kmpc_master (loc, global_tid);
1433 
1434     if ( __kmp_env_consistency_check ) {
        /*  __kmpc_end_master is not called for this construct, so the  */
        /*  bookkeeping actions of __kmpc_end_master are done here      */
1437 
1438         if ( global_tid < 0 ) {
1439             KMP_WARNING( ThreadIdentInvalid );
1440         }
1441         if (ret) {
1442             /* only one thread should do the pop since only */
1443             /* one did the push (see __kmpc_master())       */
1444 
1445             __kmp_pop_sync( global_tid, ct_master, loc );
1446         }
1447     }
1448 
1449     return (ret);
1450 }
1451 
1452 /* The BARRIER for a SINGLE process section is always explicit   */
1453 /*!
1454 @ingroup WORK_SHARING
1455 @param loc  source location information
1456 @param global_tid  global thread number
1457 @return One if this thread should execute the single construct, zero otherwise.
1458 
1459 Test whether to execute a <tt>single</tt> construct.
There are no implicit barriers in the two "single" calls; rather, the compiler should
introduce an explicit barrier if one is required.
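
A sketch of the expected usage (an assumed code-generation pattern; the trailing
@ref __kmpc_barrier call is emitted only when the construct has no <tt>nowait</tt>
clause):
@code
if ( __kmpc_single( &loc, gtid ) ) {
    /* ... single block ... */
    __kmpc_end_single( &loc, gtid );
}
__kmpc_barrier( &loc, gtid );   /* explicit barrier, if required */
@endcode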
1462 */
1463 
1464 kmp_int32
1465 __kmpc_single(ident_t *loc, kmp_int32 global_tid)
1466 {
1467     kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
1468 
1469     if (rc) {
1470         // We are going to execute the single statement, so we should count it.
1471         KMP_COUNT_BLOCK(OMP_SINGLE);
1472         KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1473     }
1474 
1475 #if OMPT_SUPPORT && OMPT_TRACE
1476     kmp_info_t *this_thr        = __kmp_threads[ global_tid ];
1477     kmp_team_t *team            = this_thr -> th.th_team;
1478     int tid = __kmp_tid_from_gtid( global_tid );
1479 
1480     if (ompt_enabled) {
1481         if (rc) {
1482             if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
1483                 ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
1484                     team->t.ompt_team_info.parallel_id,
1485                     team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
1486                     team->t.ompt_team_info.microtask);
1487             }
1488         } else {
1489             if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
1490                 ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
1491                     team->t.ompt_team_info.parallel_id,
1492                     team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1493             }
1494             this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
1495         }
1496     }
1497 #endif
1498 
1499     return rc;
1500 }
1501 
1502 /*!
1503 @ingroup WORK_SHARING
1504 @param loc  source location information
1505 @param global_tid  global thread number
1506 
1507 Mark the end of a <tt>single</tt> construct.  This function should
1508 only be called by the thread that executed the block of code protected
1509 by the `single` construct.
1510 */
1511 void
1512 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
1513 {
1514     __kmp_exit_single( global_tid );
1515     KMP_POP_PARTITIONED_TIMER();
1516 
1517 #if OMPT_SUPPORT && OMPT_TRACE
1518     kmp_info_t *this_thr        = __kmp_threads[ global_tid ];
1519     kmp_team_t *team            = this_thr -> th.th_team;
1520     int tid = __kmp_tid_from_gtid( global_tid );
1521 
1522     if (ompt_enabled &&
1523         ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
1524         ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
1525             team->t.ompt_team_info.parallel_id,
1526             team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1527     }
1528 #endif
1529 }
1530 
1531 /*!
1532 @ingroup WORK_SHARING
1533 @param loc Source location
1534 @param global_tid Global thread id
1535 
1536 Mark the end of a statically scheduled loop.
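
A sketch of the expected pairing with the static-loop init call (an assumed
code-generation pattern; <tt>__kmpc_for_static_init_4</tt> is defined elsewhere
in the runtime, and <tt>N</tt> is the trip count):
@code
kmp_int32 lower = 0, upper = N - 1, stride = 1, last = 0;
__kmpc_for_static_init_4( &loc, gtid, kmp_sch_static, &last,
                          &lower, &upper, &stride, 1, 1 );
for ( kmp_int32 i = lower; i <= upper; ++i ) {
    /* ... loop body ... */
}
__kmpc_for_static_fini( &loc, gtid );
@endcode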
1537 */
1538 void
1539 __kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid )
1540 {
1541     KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1542 
1543 #if OMPT_SUPPORT && OMPT_TRACE
1544     if (ompt_enabled &&
1545         ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
1546         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1547         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1548         ompt_callbacks.ompt_callback(ompt_event_loop_end)(
1549             team_info->parallel_id, task_info->task_id);
1550     }
1551 #endif
1552 
1553     if ( __kmp_env_consistency_check )
1554      __kmp_pop_workshare( global_tid, ct_pdo, loc );
1555 }
1556 
1557 /*
1558  * User routines which take C-style arguments (call by value)
1559  * different from the Fortran equivalent routines
1560  */
1561 
1562 void
1563 ompc_set_num_threads( int arg )
1564 {
1565 // !!!!! TODO: check the per-task binding
1566     __kmp_set_num_threads( arg, __kmp_entry_gtid() );
1567 }
1568 
1569 void
1570 ompc_set_dynamic( int flag )
1571 {
1572     kmp_info_t *thread;
1573 
1574     /* For the thread-private implementation of the internal controls */
1575     thread = __kmp_entry_thread();
1576 
1577     __kmp_save_internal_controls( thread );
1578 
1579     set__dynamic( thread, flag ? TRUE : FALSE );
1580 }
1581 
1582 void
1583 ompc_set_nested( int flag )
1584 {
1585     kmp_info_t *thread;
1586 
1587     /* For the thread-private internal controls implementation */
1588     thread = __kmp_entry_thread();
1589 
1590     __kmp_save_internal_controls( thread );
1591 
1592     set__nested( thread, flag ? TRUE : FALSE );
1593 }
1594 
1595 void
1596 ompc_set_max_active_levels( int max_active_levels )
1597 {
1598     /* TO DO */
1599     /* we want per-task implementation of this internal control */
1600 
1601     /* For the per-thread internal controls implementation */
1602     __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels );
1603 }
1604 
1605 void
1606 ompc_set_schedule( omp_sched_t kind, int modifier )
1607 {
1608 // !!!!! TODO: check the per-task binding
1609     __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier );
1610 }
1611 
1612 int
1613 ompc_get_ancestor_thread_num( int level )
1614 {
1615     return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level );
1616 }
1617 
1618 int
1619 ompc_get_team_size( int level )
1620 {
1621     return __kmp_get_team_size( __kmp_entry_gtid(), level );
1622 }
1623 
1624 void
1625 kmpc_set_stacksize( int arg )
1626 {
1627     // __kmp_aux_set_stacksize initializes the library if needed
1628     __kmp_aux_set_stacksize( arg );
1629 }
1630 
1631 void
1632 kmpc_set_stacksize_s( size_t arg )
1633 {
1634     // __kmp_aux_set_stacksize initializes the library if needed
1635     __kmp_aux_set_stacksize( arg );
1636 }
1637 
1638 void
1639 kmpc_set_blocktime( int arg )
1640 {
1641     int gtid, tid;
1642     kmp_info_t *thread;
1643 
1644     gtid = __kmp_entry_gtid();
1645     tid = __kmp_tid_from_gtid(gtid);
1646     thread = __kmp_thread_from_gtid(gtid);
1647 
1648     __kmp_aux_set_blocktime( arg, thread, tid );
1649 }
1650 
1651 void
1652 kmpc_set_library( int arg )
1653 {
1654     // __kmp_user_set_library initializes the library if needed
1655     __kmp_user_set_library( (enum library_type)arg );
1656 }
1657 
1658 void
1659 kmpc_set_defaults( char const * str )
1660 {
1661     // __kmp_aux_set_defaults initializes the library if needed
1662     __kmp_aux_set_defaults( str, KMP_STRLEN( str ) );
1663 }
1664 
1665 void
1666 kmpc_set_disp_num_buffers( int arg )
1667 {
1668     // ignore after initialization because some teams have already
1669     // allocated dispatch buffers
1670     if( __kmp_init_serial == 0 && arg > 0 )
1671         __kmp_dispatch_num_buffers = arg;
1672 }
1673 
1674 int
1675 kmpc_set_affinity_mask_proc( int proc, void **mask )
1676 {
1677 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1678     return -1;
1679 #else
1680     if ( ! TCR_4(__kmp_init_middle) ) {
1681         __kmp_middle_initialize();
1682     }
1683     return __kmp_aux_set_affinity_mask_proc( proc, mask );
1684 #endif
1685 }
1686 
1687 int
1688 kmpc_unset_affinity_mask_proc( int proc, void **mask )
1689 {
1690 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1691     return -1;
1692 #else
1693     if ( ! TCR_4(__kmp_init_middle) ) {
1694         __kmp_middle_initialize();
1695     }
1696     return __kmp_aux_unset_affinity_mask_proc( proc, mask );
1697 #endif
1698 }
1699 
1700 int
1701 kmpc_get_affinity_mask_proc( int proc, void **mask )
1702 {
1703 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1704     return -1;
1705 #else
1706     if ( ! TCR_4(__kmp_init_middle) ) {
1707         __kmp_middle_initialize();
1708     }
1709     return __kmp_aux_get_affinity_mask_proc( proc, mask );
1710 #endif
1711 }
1712 
1713 
1714 /* -------------------------------------------------------------------------- */
1715 /*!
1716 @ingroup THREADPRIVATE
1717 @param loc       source location information
1718 @param gtid      global thread number
1719 @param cpy_size  size of the cpy_data buffer
1720 @param cpy_data  pointer to data to be copied
1721 @param cpy_func  helper function to call for copying data
1722 @param didit     flag variable: 1=single thread; 0=not single thread
1723 
1724 __kmpc_copyprivate implements the interface for the private data broadcast needed for
1725 the copyprivate clause associated with a single region in an OpenMP<sup>*</sup> program (both C and Fortran).
1726 All threads participating in the parallel region call this routine.
1727 One of the threads (called the single thread) should have the <tt>didit</tt> variable set to 1
1728 and all other threads should have that variable set to 0.
1729 All threads pass a pointer to a data buffer (cpy_data) that they have built.
1730 
1731 The OpenMP specification forbids the use of nowait on the single region when a copyprivate
1732 clause is present. However, @ref __kmpc_copyprivate implements a barrier internally to avoid
1733 race conditions, so the code generation for the single region should avoid generating a barrier
1734 after the call to @ref __kmpc_copyprivate.
1735 
1736 The <tt>gtid</tt> parameter is the global thread id for the current thread.
1737 The <tt>loc</tt> parameter is a pointer to source location information.
1738 
Internal implementation: the single thread first publishes its cpy_data pointer in a
team-private location; each of the other threads then calls the function pointed to by
cpy_func, passing its own cpy_data buffer as the destination and the single thread's
buffer as the source, and cpy_func carries out the actual copy.
1742 
1743 The cpy_func routine used for the copy and the contents of the data area defined by cpy_data
1744 and cpy_size may be built in any fashion that will allow the copy to be done. For instance,
1745 the cpy_data buffer can hold the actual data to be copied or it may hold a list of pointers
1746 to the data. The cpy_func routine must interpret the cpy_data buffer appropriately.
1747 
1748 The interface to cpy_func is as follows:
1749 @code
1750 void cpy_func( void *destination, void *source )
1751 @endcode
1752 where void *destination is the cpy_data pointer for the thread being copied to
1753 and void *source is the cpy_data pointer for the thread being copied from.
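
For example, a minimal sketch of a helper for the simple case where cpy_data points
directly at the block of data to be broadcast (the type <tt>my_block_t</tt> is a
hypothetical placeholder for whatever layout the compiler chooses):
@code
static void example_cpy_func( void *destination, void *source )
{
    /* copy the single thread's block into this thread's block */
    memcpy( destination, source, sizeof( my_block_t ) );
}
@endcode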
1754 */
1755 void
1756 __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit )
1757 {
1758     void **data_ptr;
1759 
1760     KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid ));
1761 
1762     KMP_MB();
1763 
1764     data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data;
1765 
1766     if ( __kmp_env_consistency_check ) {
1767         if ( loc == 0 ) {
1768             KMP_WARNING( ConstructIdentInvalid );
1769         }
1770     }
1771 
1772     /* ToDo: Optimize the following two barriers into some kind of split barrier */
1773 
1774     if (didit) *data_ptr = cpy_data;
1775 
1776     /* This barrier is not a barrier region boundary */
1777 #if USE_ITT_NOTIFY
1778     __kmp_threads[gtid]->th.th_ident = loc;
1779 #endif
1780     __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1781 
1782     if (! didit) (*cpy_func)( cpy_data, *data_ptr );
1783 
1784     /* Consider next barrier the user-visible barrier for barrier region boundaries */
1785     /* Nesting checks are already handled by the single construct checks */
1786 
1787 #if USE_ITT_NOTIFY
1788     __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. tasks can overwrite the location)
1789 #endif
1790     __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1791 }
1792 
1793 /* -------------------------------------------------------------------------- */
1794 
1795 #define INIT_LOCK                 __kmp_init_user_lock_with_checks
1796 #define INIT_NESTED_LOCK          __kmp_init_nested_user_lock_with_checks
1797 #define ACQUIRE_LOCK              __kmp_acquire_user_lock_with_checks
1798 #define ACQUIRE_LOCK_TIMED        __kmp_acquire_user_lock_with_checks_timed
1799 #define ACQUIRE_NESTED_LOCK       __kmp_acquire_nested_user_lock_with_checks
1800 #define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed
1801 #define RELEASE_LOCK              __kmp_release_user_lock_with_checks
1802 #define RELEASE_NESTED_LOCK       __kmp_release_nested_user_lock_with_checks
1803 #define TEST_LOCK                 __kmp_test_user_lock_with_checks
1804 #define TEST_NESTED_LOCK          __kmp_test_nested_user_lock_with_checks
1805 #define DESTROY_LOCK              __kmp_destroy_user_lock_with_checks
1806 #define DESTROY_NESTED_LOCK       __kmp_destroy_nested_user_lock_with_checks
1807 
1808 
1809 /*
1810  * TODO: Make check abort messages use location info & pass it
1811  * into with_checks routines
1812  */
1813 
1814 #if KMP_USE_DYNAMIC_LOCK
1815 
1816 // internal lock initializer
1817 static __forceinline void
1818 __kmp_init_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq)
1819 {
1820     if (KMP_IS_D_LOCK(seq)) {
1821         KMP_INIT_D_LOCK(lock, seq);
1822 #if USE_ITT_BUILD
1823         __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
1824 #endif
1825     } else {
1826         KMP_INIT_I_LOCK(lock, seq);
1827 #if USE_ITT_BUILD
1828         kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1829         __kmp_itt_lock_creating(ilk->lock, loc);
1830 #endif
1831     }
1832 }
1833 
1834 // internal nest lock initializer
1835 static __forceinline void
1836 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq)
1837 {
1838 #if KMP_USE_TSX
1839     // Don't have nested lock implementation for speculative locks
1840     if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
1841         seq = __kmp_user_lock_seq;
1842 #endif
1843     switch (seq) {
1844         case lockseq_tas:
1845             seq = lockseq_nested_tas;
1846             break;
1847 #if KMP_USE_FUTEX
1848         case lockseq_futex:
1849             seq = lockseq_nested_futex;
1850             break;
1851 #endif
1852         case lockseq_ticket:
1853             seq = lockseq_nested_ticket;
1854             break;
1855         case lockseq_queuing:
1856             seq = lockseq_nested_queuing;
1857             break;
1858         case lockseq_drdpa:
1859             seq = lockseq_nested_drdpa;
1860             break;
1861         default:
1862             seq = lockseq_nested_queuing;
1863     }
1864     KMP_INIT_I_LOCK(lock, seq);
1865 #if USE_ITT_BUILD
1866     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1867     __kmp_itt_lock_creating(ilk->lock, loc);
1868 #endif
1869 }
1870 
1871 /* initialize the lock with a hint */
1872 void
1873 __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint)
1874 {
1875     KMP_DEBUG_ASSERT(__kmp_init_serial);
1876     if (__kmp_env_consistency_check && user_lock == NULL) {
1877         KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
1878     }
1879 
1880     __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1881 }
1882 
/* initialize the nested lock with a hint */
1884 void
1885 __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint)
1886 {
1887     KMP_DEBUG_ASSERT(__kmp_init_serial);
1888     if (__kmp_env_consistency_check && user_lock == NULL) {
1889         KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
1890     }
1891 
1892     __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1893 }
1894 
1895 #endif // KMP_USE_DYNAMIC_LOCK
1896 
1897 /* initialize the lock */
1898 void
1899 __kmpc_init_lock( ident_t * loc, kmp_int32 gtid,  void ** user_lock ) {
1900 #if KMP_USE_DYNAMIC_LOCK
1901     KMP_DEBUG_ASSERT(__kmp_init_serial);
1902     if (__kmp_env_consistency_check && user_lock == NULL) {
1903         KMP_FATAL(LockIsUninitialized, "omp_init_lock");
1904     }
1905     __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1906 
1907 #else // KMP_USE_DYNAMIC_LOCK
1908 
1909     static char const * const func = "omp_init_lock";
1910     kmp_user_lock_p lck;
1911     KMP_DEBUG_ASSERT( __kmp_init_serial );
1912 
1913     if ( __kmp_env_consistency_check ) {
1914         if ( user_lock == NULL ) {
1915             KMP_FATAL( LockIsUninitialized, func );
1916         }
1917     }
1918 
1919     KMP_CHECK_USER_LOCK_INIT();
1920 
1921     if ( ( __kmp_user_lock_kind == lk_tas )
1922       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1923         lck = (kmp_user_lock_p)user_lock;
1924     }
1925 #if KMP_USE_FUTEX
1926     else if ( ( __kmp_user_lock_kind == lk_futex )
1927       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1928         lck = (kmp_user_lock_p)user_lock;
1929     }
1930 #endif
1931     else {
1932         lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1933     }
1934     INIT_LOCK( lck );
1935     __kmp_set_user_lock_location( lck, loc );
1936 
1937 #if OMPT_SUPPORT && OMPT_TRACE
1938     if (ompt_enabled &&
1939         ompt_callbacks.ompt_callback(ompt_event_init_lock)) {
1940         ompt_callbacks.ompt_callback(ompt_event_init_lock)((uint64_t) lck);
1941     }
1942 #endif
1943 
1944 #if USE_ITT_BUILD
1945     __kmp_itt_lock_creating( lck );
1946 #endif /* USE_ITT_BUILD */
1947 
1948 #endif // KMP_USE_DYNAMIC_LOCK
1949 } // __kmpc_init_lock
1950 
/* initialize the nested lock */
1952 void
1953 __kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1954 #if KMP_USE_DYNAMIC_LOCK
1955 
1956     KMP_DEBUG_ASSERT(__kmp_init_serial);
1957     if (__kmp_env_consistency_check && user_lock == NULL) {
1958         KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
1959     }
1960     __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1961 
1962 #else // KMP_USE_DYNAMIC_LOCK
1963 
1964     static char const * const func = "omp_init_nest_lock";
1965     kmp_user_lock_p lck;
1966     KMP_DEBUG_ASSERT( __kmp_init_serial );
1967 
1968     if ( __kmp_env_consistency_check ) {
1969         if ( user_lock == NULL ) {
1970             KMP_FATAL( LockIsUninitialized, func );
1971         }
1972     }
1973 
1974     KMP_CHECK_USER_LOCK_INIT();
1975 
1976     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1977       + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1978         lck = (kmp_user_lock_p)user_lock;
1979     }
1980 #if KMP_USE_FUTEX
1981     else if ( ( __kmp_user_lock_kind == lk_futex )
1982      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1983      <= OMP_NEST_LOCK_T_SIZE ) ) {
1984         lck = (kmp_user_lock_p)user_lock;
1985     }
1986 #endif
1987     else {
1988         lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1989     }
1990 
1991     INIT_NESTED_LOCK( lck );
1992     __kmp_set_user_lock_location( lck, loc );
1993 
1994 #if OMPT_SUPPORT && OMPT_TRACE
1995     if (ompt_enabled &&
1996         ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)) {
1997         ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)((uint64_t) lck);
1998     }
1999 #endif
2000 
2001 #if USE_ITT_BUILD
2002     __kmp_itt_lock_creating( lck );
2003 #endif /* USE_ITT_BUILD */
2004 
2005 #endif // KMP_USE_DYNAMIC_LOCK
2006 } // __kmpc_init_nest_lock
2007 
2008 void
2009 __kmpc_destroy_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2010 #if KMP_USE_DYNAMIC_LOCK
2011 
2012 # if USE_ITT_BUILD
2013     kmp_user_lock_p lck;
2014     if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2015         lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2016     } else {
2017         lck = (kmp_user_lock_p)user_lock;
2018     }
2019     __kmp_itt_lock_destroyed(lck);
2020 # endif
2021     KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2022 #else
2023     kmp_user_lock_p lck;
2024 
2025     if ( ( __kmp_user_lock_kind == lk_tas )
2026       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2027         lck = (kmp_user_lock_p)user_lock;
2028     }
2029 #if KMP_USE_FUTEX
2030     else if ( ( __kmp_user_lock_kind == lk_futex )
2031       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2032         lck = (kmp_user_lock_p)user_lock;
2033     }
2034 #endif
2035     else {
2036         lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" );
2037     }
2038 
2039 #if OMPT_SUPPORT && OMPT_TRACE
2040     if (ompt_enabled &&
2041         ompt_callbacks.ompt_callback(ompt_event_destroy_lock)) {
2042         ompt_callbacks.ompt_callback(ompt_event_destroy_lock)((uint64_t) lck);
2043     }
2044 #endif
2045 
2046 #if USE_ITT_BUILD
2047     __kmp_itt_lock_destroyed( lck );
2048 #endif /* USE_ITT_BUILD */
2049     DESTROY_LOCK( lck );
2050 
2051     if ( ( __kmp_user_lock_kind == lk_tas )
2052       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2053         ;
2054     }
2055 #if KMP_USE_FUTEX
2056     else if ( ( __kmp_user_lock_kind == lk_futex )
2057       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2058         ;
2059     }
2060 #endif
2061     else {
2062         __kmp_user_lock_free( user_lock, gtid, lck );
2063     }
2064 #endif // KMP_USE_DYNAMIC_LOCK
2065 } // __kmpc_destroy_lock
2066 
/* destroy the nested lock */
2068 void
2069 __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2070 #if KMP_USE_DYNAMIC_LOCK
2071 
2072 # if USE_ITT_BUILD
2073     kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2074     __kmp_itt_lock_destroyed(ilk->lock);
2075 # endif
2076     KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2077 
2078 #else // KMP_USE_DYNAMIC_LOCK
2079 
2080     kmp_user_lock_p lck;
2081 
2082     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2083       + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2084         lck = (kmp_user_lock_p)user_lock;
2085     }
2086 #if KMP_USE_FUTEX
2087     else if ( ( __kmp_user_lock_kind == lk_futex )
2088      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2089      <= OMP_NEST_LOCK_T_SIZE ) ) {
2090         lck = (kmp_user_lock_p)user_lock;
2091     }
2092 #endif
2093     else {
2094         lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" );
2095     }
2096 
2097 #if OMPT_SUPPORT && OMPT_TRACE
2098     if (ompt_enabled &&
2099         ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)) {
2100         ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)((uint64_t) lck);
2101     }
2102 #endif
2103 
2104 #if USE_ITT_BUILD
2105     __kmp_itt_lock_destroyed( lck );
2106 #endif /* USE_ITT_BUILD */
2107 
2108     DESTROY_NESTED_LOCK( lck );
2109 
2110     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2111      + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2112         ;
2113     }
2114 #if KMP_USE_FUTEX
2115     else if ( ( __kmp_user_lock_kind == lk_futex )
2116      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2117      <= OMP_NEST_LOCK_T_SIZE ) ) {
2118         ;
2119     }
2120 #endif
2121     else {
2122         __kmp_user_lock_free( user_lock, gtid, lck );
2123     }
2124 #endif // KMP_USE_DYNAMIC_LOCK
2125 } // __kmpc_destroy_nest_lock
2126 
2127 void
2128 __kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2129     KMP_COUNT_BLOCK(OMP_set_lock);
2130 #if KMP_USE_DYNAMIC_LOCK
2131     int tag = KMP_EXTRACT_D_TAG(user_lock);
2132 # if USE_ITT_BUILD
2133    __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); // itt function will get to the right lock object.
2134 # endif
2135 # if KMP_USE_INLINED_TAS
2136     if (tag == locktag_tas && !__kmp_env_consistency_check) {
2137         KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2138     } else
2139 # elif KMP_USE_INLINED_FUTEX
2140     if (tag == locktag_futex && !__kmp_env_consistency_check) {
2141         KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2142     } else
2143 # endif
2144     {
2145         __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2146     }
2147 # if USE_ITT_BUILD
2148     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2149 # endif
2150 
2151 #else // KMP_USE_DYNAMIC_LOCK
2152 
2153     kmp_user_lock_p lck;
2154 
2155     if ( ( __kmp_user_lock_kind == lk_tas )
2156       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2157         lck = (kmp_user_lock_p)user_lock;
2158     }
2159 #if KMP_USE_FUTEX
2160     else if ( ( __kmp_user_lock_kind == lk_futex )
2161       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2162         lck = (kmp_user_lock_p)user_lock;
2163     }
2164 #endif
2165     else {
2166         lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" );
2167     }
2168 
2169 #if USE_ITT_BUILD
2170     __kmp_itt_lock_acquiring( lck );
2171 #endif /* USE_ITT_BUILD */
2172 
2173     ACQUIRE_LOCK( lck, gtid );
2174 
2175 #if USE_ITT_BUILD
2176     __kmp_itt_lock_acquired( lck );
2177 #endif /* USE_ITT_BUILD */
2178 
2179 #if OMPT_SUPPORT && OMPT_TRACE
2180     if (ompt_enabled &&
2181         ompt_callbacks.ompt_callback(ompt_event_acquired_lock)) {
2182         ompt_callbacks.ompt_callback(ompt_event_acquired_lock)((uint64_t) lck);
2183     }
2184 #endif
2185 
2186 #endif // KMP_USE_DYNAMIC_LOCK
2187 }
2188 
2189 void
2190 __kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2191 #if KMP_USE_DYNAMIC_LOCK
2192 
2193 # if USE_ITT_BUILD
2194     __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2195 # endif
2196     KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2197 # if USE_ITT_BUILD
2198     __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2199 #endif
2200 
2201 #if OMPT_SUPPORT && OMPT_TRACE
2202     if (ompt_enabled) {
2203         // missing support here: need to know whether acquired first or not
2204     }
2205 #endif
2206 
2207 #else // KMP_USE_DYNAMIC_LOCK
2208     int acquire_status;
2209     kmp_user_lock_p lck;
2210 
2211     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2212       + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2213         lck = (kmp_user_lock_p)user_lock;
2214     }
2215 #if KMP_USE_FUTEX
2216     else if ( ( __kmp_user_lock_kind == lk_futex )
2217      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2218      <= OMP_NEST_LOCK_T_SIZE ) ) {
2219         lck = (kmp_user_lock_p)user_lock;
2220     }
2221 #endif
2222     else {
2223         lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" );
2224     }
2225 
2226 #if USE_ITT_BUILD
2227     __kmp_itt_lock_acquiring( lck );
2228 #endif /* USE_ITT_BUILD */
2229 
2230     ACQUIRE_NESTED_LOCK( lck, gtid, &acquire_status );
2231 
2232 #if USE_ITT_BUILD
2233     __kmp_itt_lock_acquired( lck );
2234 #endif /* USE_ITT_BUILD */
2235 
2236 #if OMPT_SUPPORT && OMPT_TRACE
2237     if (ompt_enabled) {
2238         if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2239            if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first))
2240               ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)((uint64_t) lck);
2241         } else {
2242            if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next))
2243               ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)((uint64_t) lck);
2244         }
2245     }
2246 #endif
2247 
2248 #endif // KMP_USE_DYNAMIC_LOCK
2249 }
2250 
2251 void
2252 __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2253 {
2254 #if KMP_USE_DYNAMIC_LOCK
2255 
2256     int tag = KMP_EXTRACT_D_TAG(user_lock);
2257 # if USE_ITT_BUILD
2258     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2259 # endif
2260 # if KMP_USE_INLINED_TAS
2261     if (tag == locktag_tas && !__kmp_env_consistency_check) {
2262         KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2263     } else
2264 # elif KMP_USE_INLINED_FUTEX
2265     if (tag == locktag_futex && !__kmp_env_consistency_check) {
2266         KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2267     } else
2268 # endif
2269     {
2270         __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2271     }
2272 
2273 #else // KMP_USE_DYNAMIC_LOCK
2274 
2275     kmp_user_lock_p lck;
2276 
2277     /* Can't use serial interval since not block structured */
2278     /* release the lock */
2279 
2280     if ( ( __kmp_user_lock_kind == lk_tas )
2281       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2282 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2283         // "fast" path implemented to fix customer performance issue
2284 #if USE_ITT_BUILD
2285         __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2286 #endif /* USE_ITT_BUILD */
2287         TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2288         KMP_MB();
2289         return;
2290 #else
2291         lck = (kmp_user_lock_p)user_lock;
2292 #endif
2293     }
2294 #if KMP_USE_FUTEX
2295     else if ( ( __kmp_user_lock_kind == lk_futex )
2296       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2297         lck = (kmp_user_lock_p)user_lock;
2298     }
2299 #endif
2300     else {
2301         lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" );
2302     }
2303 
2304 #if USE_ITT_BUILD
2305     __kmp_itt_lock_releasing( lck );
2306 #endif /* USE_ITT_BUILD */
2307 
2308     RELEASE_LOCK( lck, gtid );
2309 
2310 #if OMPT_SUPPORT && OMPT_BLAME
2311     if (ompt_enabled &&
2312         ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
2313         ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck);
2314     }
2315 #endif
2316 
2317 #endif // KMP_USE_DYNAMIC_LOCK
2318 }
2319 
/* release the nested lock */
2321 void
2322 __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2323 {
2324 #if KMP_USE_DYNAMIC_LOCK
2325 
2326 # if USE_ITT_BUILD
2327     __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2328 # endif
2329     KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2330 
2331 #else // KMP_USE_DYNAMIC_LOCK
2332 
2333     kmp_user_lock_p lck;
2334 
2335     /* Can't use serial interval since not block structured */
2336 
2337     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2338       + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2339 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2340         // "fast" path implemented to fix customer performance issue
2341         kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock;
2342 #if USE_ITT_BUILD
2343         __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2344 #endif /* USE_ITT_BUILD */
2345         if ( --(tl->lk.depth_locked) == 0 ) {
2346             TCW_4(tl->lk.poll, 0);
2347         }
2348         KMP_MB();
2349         return;
2350 #else
2351         lck = (kmp_user_lock_p)user_lock;
2352 #endif
2353     }
2354 #if KMP_USE_FUTEX
2355     else if ( ( __kmp_user_lock_kind == lk_futex )
2356      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2357      <= OMP_NEST_LOCK_T_SIZE ) ) {
2358         lck = (kmp_user_lock_p)user_lock;
2359     }
2360 #endif
2361     else {
2362         lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" );
2363     }
2364 
2365 #if USE_ITT_BUILD
2366     __kmp_itt_lock_releasing( lck );
2367 #endif /* USE_ITT_BUILD */
2368 
2369     int release_status;
2370     release_status = RELEASE_NESTED_LOCK( lck, gtid );
2371 #if OMPT_SUPPORT && OMPT_BLAME
2372     if (ompt_enabled) {
2373         if (release_status == KMP_LOCK_RELEASED) {
2374             if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
2375                 ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
2376                     (uint64_t) lck);
2377             }
2378         } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) {
2379             ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
2380                 (uint64_t) lck);
2381         }
2382     }
2383 #endif
2384 
2385 #endif // KMP_USE_DYNAMIC_LOCK
2386 }
2387 
2388 /* try to acquire the lock */
2389 int
2390 __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2391 {
2392     KMP_COUNT_BLOCK(OMP_test_lock);
2393 
2394 #if KMP_USE_DYNAMIC_LOCK
2395     int rc;
2396     int tag = KMP_EXTRACT_D_TAG(user_lock);
2397 # if USE_ITT_BUILD
2398     __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2399 # endif
2400 # if KMP_USE_INLINED_TAS
2401     if (tag == locktag_tas && !__kmp_env_consistency_check) {
2402         KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2403     } else
2404 # elif KMP_USE_INLINED_FUTEX
2405     if (tag == locktag_futex && !__kmp_env_consistency_check) {
2406         KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2407     } else
2408 # endif
2409     {
2410         rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2411     }
2412     if (rc) {
2413 # if USE_ITT_BUILD
2414         __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2415 # endif
2416         return FTN_TRUE;
2417     } else {
2418 # if USE_ITT_BUILD
2419         __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2420 # endif
2421         return FTN_FALSE;
2422     }
2423 
2424 #else // KMP_USE_DYNAMIC_LOCK
2425 
2426     kmp_user_lock_p lck;
2427     int          rc;
2428 
2429     if ( ( __kmp_user_lock_kind == lk_tas )
2430       && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2431         lck = (kmp_user_lock_p)user_lock;
2432     }
2433 #if KMP_USE_FUTEX
2434     else if ( ( __kmp_user_lock_kind == lk_futex )
2435       && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2436         lck = (kmp_user_lock_p)user_lock;
2437     }
2438 #endif
2439     else {
2440         lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" );
2441     }
2442 
2443 #if USE_ITT_BUILD
2444     __kmp_itt_lock_acquiring( lck );
2445 #endif /* USE_ITT_BUILD */
2446 
2447     rc = TEST_LOCK( lck, gtid );
2448 #if USE_ITT_BUILD
2449     if ( rc ) {
2450         __kmp_itt_lock_acquired( lck );
2451     } else {
2452         __kmp_itt_lock_cancelled( lck );
2453     }
2454 #endif /* USE_ITT_BUILD */
2455     return ( rc ? FTN_TRUE : FTN_FALSE );
2456 
2457     /* Can't use serial interval since not block structured */
2458 
2459 #endif // KMP_USE_DYNAMIC_LOCK
2460 }
2461 
/* try to acquire the nested lock */
2463 int
2464 __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2465 {
2466 #if KMP_USE_DYNAMIC_LOCK
2467     int rc;
2468 # if USE_ITT_BUILD
2469     __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2470 # endif
2471     rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
2472 # if USE_ITT_BUILD
2473     if (rc) {
2474         __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2475     } else {
2476         __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2477     }
2478 # endif
2479     return rc;
2480 
2481 #else // KMP_USE_DYNAMIC_LOCK
2482 
2483     kmp_user_lock_p lck;
2484     int          rc;
2485 
2486     if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2487       + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2488         lck = (kmp_user_lock_p)user_lock;
2489     }
2490 #if KMP_USE_FUTEX
2491     else if ( ( __kmp_user_lock_kind == lk_futex )
2492      && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2493      <= OMP_NEST_LOCK_T_SIZE ) ) {
2494         lck = (kmp_user_lock_p)user_lock;
2495     }
2496 #endif
2497     else {
2498         lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" );
2499     }
2500 
2501 #if USE_ITT_BUILD
2502     __kmp_itt_lock_acquiring( lck );
2503 #endif /* USE_ITT_BUILD */
2504 
2505     rc = TEST_NESTED_LOCK( lck, gtid );
2506 #if USE_ITT_BUILD
2507     if ( rc ) {
2508         __kmp_itt_lock_acquired( lck );
2509     } else {
2510         __kmp_itt_lock_cancelled( lck );
2511     }
2512 #endif /* USE_ITT_BUILD */
2513     return rc;
2514 
2515     /* Can't use serial interval since not block structured */
2516 
2517 #endif // KMP_USE_DYNAMIC_LOCK
2518 }
2519 
2520 
2521 /*--------------------------------------------------------------------------------------------------------------------*/
2522 
2523 /*
2524  * Interface to fast scalable reduce methods routines
2525  */
2526 
2527 // keep the selected method in a thread local structure for cross-function usage: will be used in __kmpc_end_reduce* functions;
2528 // another solution: to re-determine the method one more time in __kmpc_end_reduce* functions (new prototype required then)
2529 // AT: which solution is better?
2530 #define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \
2531                    ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) )
2532 
2533 #define __KMP_GET_REDUCTION_METHOD(gtid) \
2534                    ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method )
2535 
2536 // description of the packed_reduction_method variable: look at the macros in kmp.h
2537 
2538 
2539 // used in a critical section reduce block
2540 static __forceinline void
2541 __kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2542 
    // this lock was visible to a customer and to the threading profile tool as a serial overhead span
    //            (although it is used for an internal purpose only)
    //            why was it visible in the previous implementation?
    //            should we keep it visible in the new reduce block?
2547     kmp_user_lock_p lck;
2548 
2549 #if KMP_USE_DYNAMIC_LOCK
2550 
2551     kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
2552     // Check if it is initialized.
2553     if (*lk == 0) {
2554         if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2555             KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(__kmp_user_lock_seq));
2556         } else {
2557             __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(__kmp_user_lock_seq));
2558         }
2559     }
2560     // Branch for accessing the actual lock object and set operation. This branching is inevitable since
2561     // this lock initialization does not follow the normal dispatch path (lock table is not used).
2562     if (KMP_EXTRACT_D_TAG(lk) != 0) {
2563         lck = (kmp_user_lock_p)lk;
2564         KMP_DEBUG_ASSERT(lck != NULL);
2565         if (__kmp_env_consistency_check) {
2566             __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2567         }
2568         KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
2569     } else {
2570         kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
2571         lck = ilk->lock;
2572         KMP_DEBUG_ASSERT(lck != NULL);
2573         if (__kmp_env_consistency_check) {
2574             __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2575         }
2576         KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
2577     }
2578 
2579 #else // KMP_USE_DYNAMIC_LOCK
2580 
2581     // We know that the fast reduction code is only emitted by Intel compilers
2582     // with 32 byte critical sections. If there isn't enough space, then we
2583     // have to use a pointer.
2584     if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) {
2585         lck = (kmp_user_lock_p)crit;
2586     }
2587     else {
2588         lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
2589     }
2590     KMP_DEBUG_ASSERT( lck != NULL );
2591 
2592     if ( __kmp_env_consistency_check )
2593         __kmp_push_sync( global_tid, ct_critical, loc, lck );
2594 
2595     __kmp_acquire_user_lock_with_checks( lck, global_tid );
2596 
2597 #endif // KMP_USE_DYNAMIC_LOCK
2598 }
2599 
2600 // used in a critical section reduce block
2601 static __forceinline void
2602 __kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2603 
2604     kmp_user_lock_p lck;
2605 
2606 #if KMP_USE_DYNAMIC_LOCK
2607 
2608     if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2609         lck = (kmp_user_lock_p)crit;
2610         if (__kmp_env_consistency_check)
2611             __kmp_pop_sync(global_tid, ct_critical, loc);
2612         KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
2613     } else {
2614         kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
2615         if (__kmp_env_consistency_check)
2616             __kmp_pop_sync(global_tid, ct_critical, loc);
2617         KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
2618     }
2619 
2620 #else // KMP_USE_DYNAMIC_LOCK
2621 
2622     // We know that the fast reduction code is only emitted by Intel compilers with 32 byte critical
2623     // sections. If there isn't enough space, then we have to use a pointer.
2624     if ( __kmp_base_user_lock_size > 32 ) {
2625         lck = *( (kmp_user_lock_p *) crit );
2626         KMP_ASSERT( lck != NULL );
2627     } else {
2628         lck = (kmp_user_lock_p) crit;
2629     }
2630 
2631     if ( __kmp_env_consistency_check )
2632         __kmp_pop_sync( global_tid, ct_critical, loc );
2633 
2634     __kmp_release_user_lock_with_checks( lck, global_tid );
2635 
2636 #endif // KMP_USE_DYNAMIC_LOCK
2637 } // __kmp_end_critical_section_reduce_block
2638 
2639 
2640 /* 2.a.i. Reduce Block without a terminating barrier */
2641 /*!
2642 @ingroup SYNCHRONIZATION
2643 @param loc source location information
2644 @param global_tid global thread number
2645 @param num_vars number of items (variables) to be reduced
2646 @param reduce_size size of data in bytes to be reduced
2647 @param reduce_data pointer to data to be reduced
2648 @param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data
2649 @param lck pointer to the unique lock data structure
2650 @result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed
2651 
2652 The nowait version is used for a reduce clause with the nowait argument.
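
A sketch of the expected usage (an assumed code-generation pattern; <tt>n</tt>,
<tt>size</tt>, <tt>data</tt>, <tt>reduce_func</tt> and <tt>crit</tt> stand for the
arguments the compiler prepares for the construct):
@code
switch ( __kmpc_reduce_nowait( &loc, gtid, n, size, data, reduce_func, &crit ) ) {
case 1:   /* combine the partial results directly, then ... */
    __kmpc_end_reduce_nowait( &loc, gtid, &crit );
    break;
case 2:   /* combine using atomic operations; no "end" call is generated */
    break;
default:  /* 0: nothing to do, this thread's contribution is already combined */
    break;
}
@endcode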
2653 */
2654 kmp_int32
2655 __kmpc_reduce_nowait(
2656     ident_t *loc, kmp_int32 global_tid,
2657     kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
2658     kmp_critical_name *lck ) {
2659 
2660     KMP_COUNT_BLOCK(REDUCE_nowait);
2661     int retval = 0;
2662     PACKED_REDUCTION_METHOD_T packed_reduction_method;
2663 #if OMP_40_ENABLED
2664     kmp_team_t *team;
2665     kmp_info_t *th;
2666     int teams_swapped = 0, task_state;
2667 #endif
2668     KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) );
2669 
2670     // why do we need this initialization here at all?
    // The reduction clause cannot be used as a stand-alone directive.
2672 
2673     // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2674     // possible detection of false-positive race by the threadchecker ???
2675     if( ! TCR_4( __kmp_init_parallel ) )
2676         __kmp_parallel_initialize();
2677 
2678     // check correctness of reduce block nesting
2679 #if KMP_USE_DYNAMIC_LOCK
2680     if ( __kmp_env_consistency_check )
2681         __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2682 #else
2683     if ( __kmp_env_consistency_check )
2684         __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2685 #endif
2686 
2687 #if OMP_40_ENABLED
2688     th = __kmp_thread_from_gtid(global_tid);
2689     if( th->th.th_teams_microtask ) {   // AC: check if we are inside the teams construct?
2690         team = th->th.th_team;
2691         if( team->t.t_level == th->th.th_teams_level ) {
2692             // this is reduction at teams construct
2693             KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid);  // AC: check that tid == 0
2694             // Let's swap teams temporarily for the reduction barrier
2695             teams_swapped = 1;
2696             th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2697             th->th.th_team = team->t.t_parent;
2698             th->th.th_team_nproc = th->th.th_team->t.t_nproc;
2699             th->th.th_task_team = th->th.th_team->t.t_task_team[0];
2700             task_state = th->th.th_task_state;
2701             th->th.th_task_state = 0;
2702         }
2703     }
2704 #endif // OMP_40_ENABLED
2705 
    // The packed_reduction_method value will be reused by the __kmp_end_reduce* function, so it must be kept in a variable.
    // The variable should be a construct-specific or thread-specific property, not a team-specific one
    //     (a thread can reach the next reduce block on the next construct, where the reduce method may differ).
    // An ident_t "loc" parameter could be used as a construct-specific property (but what if loc == 0?)
    //     (if both construct-specific and team-specific variables were shared, then unnecessary extra syncs would be needed).
    // A thread-specific variable is better with respect to both issues above (next construct and extra syncs);
    // the thread-specific "th_local.reduction_method" variable is used currently.
    // Each thread executes the 'determine' and 'set' lines itself (no need to restrict this to one thread; doing so would only add unnecessary syncs).
2714 
2715     packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2716     __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2717 
2718     if( packed_reduction_method == critical_reduce_block ) {
2719 
2720         __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2721         retval = 1;
2722 
2723     } else if( packed_reduction_method == empty_reduce_block ) {
2724 
2725         // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2726         retval = 1;
2727 
2728     } else if( packed_reduction_method == atomic_reduce_block ) {
2729 
2730         retval = 2;
2731 
2732         // all threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen)
        //     (this is not ideal: the checking block has been closed by this 'pop',
        //      but the atomic operation has not been executed yet; it follows slightly later, essentially on the next instruction)
2735         if ( __kmp_env_consistency_check )
2736             __kmp_pop_sync( global_tid, ct_reduce, loc );
2737 
2738     } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2739 
        //AT: performance issue: a real barrier here
        //AT:     (if the master is slow, other threads are blocked here waiting for it to arrive and release them)
        //AT:     (this is not what a customer might expect when specifying the NOWAIT clause)
        //AT:     (specifying NOWAIT won't improve performance here, which may be confusing to a customer)
        //AT: another implementation of *barrier_gather*nowait() (or some other design) might go faster
        //        and be more in line with the sense of NOWAIT
        //AT: TO DO: run the EPCC benchmark and compare times
2747 
2748         // this barrier should be invisible to a customer and to the threading profile tool
2749         //              (it's neither a terminating barrier nor customer's code, it's used for an internal purpose)
2750 #if USE_ITT_NOTIFY
2751         __kmp_threads[global_tid]->th.th_ident = loc;
2752 #endif
2753         retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func );
2754         retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2755 
2756         // all other workers except master should do this pop here
2757         //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
2758         if ( __kmp_env_consistency_check ) {
2759             if( retval == 0 ) {
2760                 __kmp_pop_sync( global_tid, ct_reduce, loc );
2761             }
2762         }
2763 
2764     } else {
2765 
2766         // should never reach this block
2767         KMP_ASSERT( 0 ); // "unexpected method"
2768 
2769     }
2770 #if OMP_40_ENABLED
2771     if( teams_swapped ) {
2772         // Restore thread structure
2773         th->th.th_info.ds.ds_tid = 0;
2774         th->th.th_team = team;
2775         th->th.th_team_nproc = team->t.t_nproc;
2776         th->th.th_task_team = team->t.t_task_team[task_state];
2777         th->th.th_task_state = task_state;
2778     }
2779 #endif
2780     KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2781 
2782     return retval;
2783 }
2784 
2785 /*!
2786 @ingroup SYNCHRONIZATION
2787 @param loc source location information
2788 @param global_tid global thread id.
2789 @param lck pointer to the unique lock data structure
2790 
2791 Finish the execution of a reduce nowait.
2792 */
2793 void
2794 __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2795 
2796     PACKED_REDUCTION_METHOD_T packed_reduction_method;
2797 
2798     KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) );
2799 
2800     packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2801 
2802     if( packed_reduction_method == critical_reduce_block ) {
2803 
2804         __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2805 
2806     } else if( packed_reduction_method == empty_reduce_block ) {
2807 
2808         // usage: if team size == 1, no synchronization is required ( on Intel platforms only )
2809 
2810     } else if( packed_reduction_method == atomic_reduce_block ) {
2811 
2812         // neither master nor other workers should get here
2813         //     (code gen does not generate this call in case 2: atomic reduce block)
        // it would actually be better to remove this else-if branch entirely;
        // after removal this value would be caught by the 'else' branch and assert
2816 
2817     } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2818 
2819         // only master gets here
2820 
2821     } else {
2822 
2823         // should never reach this block
2824         KMP_ASSERT( 0 ); // "unexpected method"
2825 
2826     }
2827 
2828     if ( __kmp_env_consistency_check )
2829         __kmp_pop_sync( global_tid, ct_reduce, loc );
2830 
2831     KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2832 
2833     return;
2834 }
2835 
2836 /* 2.a.ii. Reduce Block with a terminating barrier */
2837 
2838 /*!
2839 @ingroup SYNCHRONIZATION
2840 @param loc source location information
2841 @param global_tid global thread number
2842 @param num_vars number of items (variables) to be reduced
2843 @param reduce_size size of data in bytes to be reduced
2844 @param reduce_data pointer to data to be reduced
2845 @param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data
2846 @param lck pointer to the unique lock data structure
2847 @result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed
2848 
2849 A blocking reduce that includes an implicit barrier.
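
A sketch of the expected usage (an assumed code-generation pattern); every thread
that receives 1 or 2 is assumed to call @ref __kmpc_end_reduce afterwards with the
same <tt>lck</tt> argument:
@code
switch ( __kmpc_reduce( &loc, gtid, n, size, data, reduce_func, &crit ) ) {
case 1:   /* combine the partial results directly, then ... */
    __kmpc_end_reduce( &loc, gtid, &crit );
    break;
case 2:   /* combine using atomic operations, then ... */
    __kmpc_end_reduce( &loc, gtid, &crit );
    break;
default:  /* 0: this thread's contribution is already combined */
    break;
}
@endcode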
2850 */
2851 kmp_int32
2852 __kmpc_reduce(
2853     ident_t *loc, kmp_int32 global_tid,
2854     kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
2855     void (*reduce_func)(void *lhs_data, void *rhs_data),
2856     kmp_critical_name *lck )
2857 {
2858     KMP_COUNT_BLOCK(REDUCE_wait);
2859     int retval = 0;
2860     PACKED_REDUCTION_METHOD_T packed_reduction_method;
2861 
2862     KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) );
2863 
2864     // why do we need this initialization here at all?
    // The reduction clause cannot be used as a stand-alone directive.
2866 
2867     // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2868     // possible detection of false-positive race by the threadchecker ???
2869     if( ! TCR_4( __kmp_init_parallel ) )
2870         __kmp_parallel_initialize();
2871 
2872     // check correctness of reduce block nesting
2873 #if KMP_USE_DYNAMIC_LOCK
2874     if ( __kmp_env_consistency_check )
2875         __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2876 #else
2877     if ( __kmp_env_consistency_check )
2878         __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2879 #endif
2880 
2881     packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2882     __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2883 
2884     if( packed_reduction_method == critical_reduce_block ) {
2885 
2886         __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2887         retval = 1;
2888 
2889     } else if( packed_reduction_method == empty_reduce_block ) {
2890 
2891         // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2892         retval = 1;
2893 
2894     } else if( packed_reduction_method == atomic_reduce_block ) {
2895 
2896         retval = 2;
2897 
2898     } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2899 
2900         //case tree_reduce_block:
2901         // this barrier should be visible to a customer and to the threading profile tool
2902         //              (it's a terminating barrier on constructs if NOWAIT not specified)
2903 #if USE_ITT_NOTIFY
2904         __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames
2905 #endif
2906         retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func );
2907         retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2908 
2909         // all other workers except master should do this pop here
2910         //     ( none of other workers except master will enter __kmpc_end_reduce() )
2911         if ( __kmp_env_consistency_check ) {
2912             if( retval == 0 ) { // 0: all other workers; 1: master
2913                 __kmp_pop_sync( global_tid, ct_reduce, loc );
2914             }
2915         }
2916 
2917     } else {
2918 
2919         // should never reach this block
2920         KMP_ASSERT( 0 ); // "unexpected method"
2921 
2922     }
2923 
2924     KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2925 
2926     return retval;
2927 }

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a blocking reduce.
The <tt>lck</tt> pointer must be the same as that used in the corresponding start function.
*/
void
__kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {

    PACKED_REDUCTION_METHOD_T packed_reduction_method;

    KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) );

    packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );

    // This barrier should be visible to a customer and to the threading profile tool
    // (it is a terminating barrier on constructs if NOWAIT is not specified).

    if( packed_reduction_method == critical_reduce_block ) {

        __kmp_end_critical_section_reduce_block( loc, global_tid, lck );

        // TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
        __kmp_threads[global_tid]->th.th_ident = loc;
#endif
        __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );

    } else if( packed_reduction_method == empty_reduce_block ) {

        // usage: if team size == 1, no synchronization is required ( Intel platforms only )

        // TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
        __kmp_threads[global_tid]->th.th_ident = loc;
#endif
        __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );

    } else if( packed_reduction_method == atomic_reduce_block ) {

        // TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
        __kmp_threads[global_tid]->th.th_ident = loc;
#endif
        __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );

    } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {

        // only master executes here (master releases all other workers)
        __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid );

    } else {

        // should never reach this block
        KMP_ASSERT( 0 ); // "unexpected method"

    }

    if ( __kmp_env_consistency_check )
        __kmp_pop_sync( global_tid, ct_reduce, loc );

    KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );

    return;
}

#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/

kmp_uint64
__kmpc_get_taskid() {

    kmp_int32    gtid;
    kmp_info_t * thread;

    gtid = __kmp_get_gtid();
    if ( gtid < 0 ) {
        return 0;
    }; // if
    thread = __kmp_thread_from_gtid( gtid );
    return thread->th.th_current_task->td_task_id;

} // __kmpc_get_taskid


kmp_uint64
__kmpc_get_parent_taskid() {

    kmp_int32        gtid;
    kmp_info_t *     thread;
    kmp_taskdata_t * parent_task;

    gtid = __kmp_get_gtid();
    if ( gtid < 0 ) {
        return 0;
    }; // if
    thread      = __kmp_thread_from_gtid( gtid );
    parent_task = thread->th.th_current_task->td_parent;
    return ( parent_task == NULL ? 0 : parent_task->td_task_id );

} // __kmpc_get_parent_taskid
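
/*
 * Illustrative sketch (not part of the runtime): both entry points above are plain
 * accessors, so a tool or test linked against the library can simply call them; the
 * declarations below are written out only for the example.
 *
 *     extern "C" kmp_uint64 __kmpc_get_taskid();
 *     extern "C" kmp_uint64 __kmpc_get_parent_taskid();
 *
 *     kmp_uint64 id     = __kmpc_get_taskid();         // 0 if the caller is not an OpenMP thread
 *     kmp_uint64 parent = __kmpc_get_parent_taskid();  // 0 if the current task has no parent
 */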

void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT)
{
    if ( ! __kmp_init_serial ) {
        __kmp_serial_initialize();
    }
    __kmp_place_num_sockets = nS;
    __kmp_place_socket_offset = sO;
    __kmp_place_num_cores = nC;
    __kmp_place_core_offset = cO;
    __kmp_place_num_threads_per_core = nT;
}
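
/*
 * Illustrative sketch (not part of the runtime): __kmpc_place_threads() only records the
 * requested machine subset in the __kmp_place_* globals; the affinity code consumes them
 * later.  For example, a front end asking for 2 sockets, 4 cores per socket and 2 threads
 * per core, with no socket or core offsets, could call
 *
 *     __kmpc_place_threads( 2, 0, 4, 0, 2 );   // nS, sO, nC, cO, nT
 *
 * which is roughly the programmatic equivalent of a KMP_PLACE_THREADS=2s,4c,2t setting.
 */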

#if OMP_45_ENABLED
/*!
@ingroup WORK_SHARING
@param loc  source location information.
@param gtid  global thread number.
@param num_dims  number of associated doacross loops.
@param dims  info on loop bounds.

Initialize doacross loop information.
The compiler is expected to send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
*/
void
__kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims)
{
    int j, idx;
    kmp_int64 last, trace_count;
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_team_t *team = th->th.th_team;
    kmp_uint32 *flags;
    kmp_disp_t *pr_buf = th->th.th_dispatch;
    dispatch_shared_info_t *sh_buf;

    KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
                 gtid, num_dims, !team->t.t_serialized));
    KMP_DEBUG_ASSERT(dims != NULL);
    KMP_DEBUG_ASSERT(num_dims > 0);

    if( team->t.t_serialized ) {
        KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n"));
        return; // no dependencies if team is serialized
    }
    KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
    idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop
    sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

    // Save bounds info into allocated private buffer
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
    pr_buf->th_doacross_info =
        (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1));
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
    pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions
    // Save also address of num_done in order to access it later without knowing the buffer index
    pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
    pr_buf->th_doacross_info[2] = dims[0].lo;
    pr_buf->th_doacross_info[3] = dims[0].up;
    pr_buf->th_doacross_info[4] = dims[0].st;
    last = 5;
    for( j = 1; j < num_dims; ++j ) {
        kmp_int64 range_length; // keeps the range of every dimension except the first, dims[0]
        if( dims[j].st == 1 ) { // most common case
            // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
            range_length = dims[j].up - dims[j].lo + 1;
        } else {
            if( dims[j].st > 0 ) {
                KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
                range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
            } else {            // negative increment
                KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
                range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
            }
        }
        pr_buf->th_doacross_info[last++] = range_length;
        pr_buf->th_doacross_info[last++] = dims[j].lo;
        pr_buf->th_doacross_info[last++] = dims[j].up;
        pr_buf->th_doacross_info[last++] = dims[j].st;
    }
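
    // Resulting layout of th_doacross_info (summary):
    //   [0]                 number of dimensions (num_dims)
    //   [1]                 address of sh_buf->doacross_num_done
    //   [2], [3], [4]       lo, up, st of dims[0] (its range length is not stored)
    //   [4*j+1] .. [4*j+4]  range length, lo, up, st of dims[j], for j = 1 .. num_dims-1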

    // Compute total trip count.
    // Start with range of dims[0] which we don't need to keep in the buffer.
    if( dims[0].st == 1 ) { // most common case
        trace_count = dims[0].up - dims[0].lo + 1;
    } else if( dims[0].st > 0 ) {
        KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
        trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
    } else {   // negative increment
        KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
        trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
    }
    for( j = 1; j < num_dims; ++j ) {
        trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
    }
    KMP_DEBUG_ASSERT(trace_count > 0);

    // Check whether the shared buffer is still occupied by another loop
    // (i.e. by the loop with index idx - __kmp_dispatch_num_buffers).
    if( idx != sh_buf->doacross_buf_idx ) {
        // Shared buffer is occupied, wait for it to be freed
        __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL );
    }
    // Check if we are the first thread. After the CAS the first thread gets 0,
    // others get 1 if initialization is in progress, allocated pointer otherwise.
    flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64(
        (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1);
    if( flags == NULL ) {
        // we are the first thread, allocate the array of flags
        kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration
        sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1);
    } else if( (kmp_int64)flags == 1 ) {
        // initialization is still in progress, need to wait
        while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) {
            KMP_YIELD(TRUE);
        }
    }
    KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer
    pr_buf->th_doacross_flags = sh_buf->doacross_flags;      // save private copy in order to not
                                                             // touch shared buffer on each iteration
    KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid));
}
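
/*
 * Worked example (illustrative, not part of the runtime): the flag array allocated in
 * __kmpc_doacross_init() uses one bit per iteration of the collapsed loop nest, plus a
 * small pad.  For instance, a nest with trace_count = 1000 iterations gets
 *
 *     size = 1000 / 8 + 8 = 133 bytes  (1064 bits, enough for 1000 iterations)
 *
 * zero-initialized by __kmp_thread_calloc(); every thread then reads and sets bits in
 * this shared array through its private th_doacross_flags pointer.
 */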

void
__kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec)
{
    kmp_int32 shft, num_dims, i;
    kmp_uint32 flag;
    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_team_t *team = th->th.th_team;
    kmp_disp_t *pr_buf;
    kmp_int64 lo, up, st;

    KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
    if( team->t.t_serialized ) {
        KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n"));
        return; // no dependencies if team is serialized
    }

    // calculate sequential iteration number and check out-of-bounds condition
    pr_buf = th->th.th_dispatch;
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
    num_dims = pr_buf->th_doacross_info[0];
    lo = pr_buf->th_doacross_info[2];
    up = pr_buf->th_doacross_info[3];
    st = pr_buf->th_doacross_info[4];
    if( st == 1 ) { // most common case
        if( vec[0] < lo || vec[0] > up ) {
            KA_TRACE(20,(
                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                gtid, vec[0], lo, up));
            return;
        }
        iter_number = vec[0] - lo;
    } else if( st > 0 ) {
        if( vec[0] < lo || vec[0] > up ) {
            KA_TRACE(20,(
                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                gtid, vec[0], lo, up));
            return;
        }
        iter_number = (kmp_uint64)(vec[0] - lo) / st;
    } else {        // negative increment
        if( vec[0] > lo || vec[0] < up ) {
            KA_TRACE(20,(
                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                gtid, vec[0], lo, up));
            return;
        }
        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
    }
    for( i = 1; i < num_dims; ++i ) {
        kmp_int64 iter, ln;
        kmp_int32 j = i * 4;
        ln = pr_buf->th_doacross_info[j + 1];
        lo = pr_buf->th_doacross_info[j + 2];
        up = pr_buf->th_doacross_info[j + 3];
        st = pr_buf->th_doacross_info[j + 4];
        if( st == 1 ) {
            if( vec[i] < lo || vec[i] > up ) {
                KA_TRACE(20,(
                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                    gtid, vec[i], lo, up));
                return;
            }
            iter = vec[i] - lo;
        } else if( st > 0 ) {
            if( vec[i] < lo || vec[i] > up ) {
                KA_TRACE(20,(
                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                    gtid, vec[i], lo, up));
                return;
            }
            iter = (kmp_uint64)(vec[i] - lo) / st;
        } else {   // st < 0
            if( vec[i] > lo || vec[i] < up ) {
                KA_TRACE(20,(
                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
                    gtid, vec[i], lo, up));
                return;
            }
            iter = (kmp_uint64)(lo - vec[i]) / (-st);
        }
        iter_number = iter + ln * iter_number;
    }
    shft = iter_number % 32; // use 32-bit granularity
    iter_number >>= 5;       // divided by 32
    flag = 1 << shft;
    while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
        KMP_YIELD(TRUE);
    }
    KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
                 gtid, (iter_number<<5)+shft));
}
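
/*
 * Worked example (illustrative, not part of the runtime): the linearization above maps a
 * multi-dimensional iteration vector to a single bit index.  For a hypothetical ordered(2)
 * nest with an outer loop i = 0..9 and an inner loop j = 0..4 (both with step 1), the
 * stored inner range length is ln = 5, so vec = {3, 2} gives
 *
 *     iter_number = 2 + 5 * 3 = 17  ->  word 17 >> 5 = 0, bit 17 % 32 = 17
 *
 * i.e. __kmpc_doacross_wait() spins until bit 17 of flags word 0 has been set by the
 * matching __kmpc_doacross_post().
 */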

void
__kmpc_doacross_post(ident_t *loc, int gtid, long long *vec)
{
    kmp_int32 shft, num_dims, i;
    kmp_uint32 flag;
    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_team_t *team = th->th.th_team;
    kmp_disp_t *pr_buf;
    kmp_int64 lo, st;

    KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid));
    if( team->t.t_serialized ) {
        KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n"));
        return; // no dependencies if team is serialized
    }

    // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks)
    pr_buf = th->th.th_dispatch;
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
    num_dims = pr_buf->th_doacross_info[0];
    lo = pr_buf->th_doacross_info[2];
    st = pr_buf->th_doacross_info[4];
    if( st == 1 ) { // most common case
        iter_number = vec[0] - lo;
    } else if( st > 0 ) {
        iter_number = (kmp_uint64)(vec[0] - lo) / st;
    } else {        // negative increment
        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
    }
    for( i = 1; i < num_dims; ++i ) {
        kmp_int64 iter, ln;
        kmp_int32 j = i * 4;
        ln = pr_buf->th_doacross_info[j + 1];
        lo = pr_buf->th_doacross_info[j + 2];
        st = pr_buf->th_doacross_info[j + 4];
        if( st == 1 ) {
            iter = vec[i] - lo;
        } else if( st > 0 ) {
            iter = (kmp_uint64)(vec[i] - lo) / st;
        } else {   // st < 0
            iter = (kmp_uint64)(lo - vec[i]) / (-st);
        }
        iter_number = iter + ln * iter_number;
    }
    shft = iter_number % 32; // use 32-bit granularity
    iter_number >>= 5;       // divided by 32
    flag = 1 << shft;
    if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 )
        KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag );
    KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n",
                 gtid, (iter_number<<5)+shft));
}

void
__kmpc_doacross_fini(ident_t *loc, int gtid)
{
    kmp_int64 num_done;
    kmp_info_t *th = __kmp_threads[gtid];
    kmp_team_t *team = th->th.th_team;
    kmp_disp_t *pr_buf = th->th.th_dispatch;

    KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
    if( team->t.t_serialized ) {
        KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team));
        return; // nothing to do
    }
    num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1;
    if( num_done == th->th.th_team_nproc ) {
        // we are the last thread, need to free shared resources
        int idx = pr_buf->th_doacross_buf_idx - 1;
        dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
        KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done);
        KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
        KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
        __kmp_thread_free(th, (void*)sh_buf->doacross_flags);
        sh_buf->doacross_flags = NULL;
        sh_buf->doacross_num_done = 0;
        sh_buf->doacross_buf_idx += __kmp_dispatch_num_buffers; // free buffer for future re-use
    }
    // free private resources (need to keep buffer index forever)
    __kmp_thread_free(th, (void*)pr_buf->th_doacross_info);
    pr_buf->th_doacross_info = NULL;
    KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}
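
/*
 * Illustrative sketch (not part of the runtime): one way a compiler might lower a
 * one-dimensional doacross loop onto the four entry points above.  The bounds, the
 * dependence vector handling and the exact call sequence are hypothetical and
 * compiler-specific.
 *
 *     // #pragma omp for ordered(1)
 *     // for( i = 2; i < 9; i += 2 )  with depend(sink: i-2) ... depend(source)
 *
 *     struct kmp_dim dims = { 2, 8, 2 };              // lo, up (inclusive), st
 *     __kmpc_doacross_init( &loc, gtid, 1, &dims );
 *     // for each iteration i of the chunk assigned to this thread:
 *         long long vec = i - 2;                      // iteration named by depend(sink: i-2)
 *         __kmpc_doacross_wait( &loc, gtid, &vec );
 *         // ... loop body ...
 *         vec = i;
 *         __kmpc_doacross_post( &loc, gtid, &vec );   // depend(source)
 *     __kmpc_doacross_fini( &loc, gtid );
 */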
#endif

// end of file //
