1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  * $Revision: 42674 $
4  * $Date: 2013-09-18 11:12:49 -0500 (Wed, 18 Sep 2013) $
5  */
6 
7 
8 //===----------------------------------------------------------------------===//
9 //
10 //                     The LLVM Compiler Infrastructure
11 //
12 // This file is dual licensed under the MIT and the University of Illinois Open
13 // Source Licenses. See LICENSE.txt for details.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 
18 /*
19  * Dynamic scheduling initialization and dispatch.
20  *
 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
 *       it may change value between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, and 1 is the smallest.
24  *
25  */
26 
27 /* ------------------------------------------------------------------------ */
28 /* ------------------------------------------------------------------------ */
29 
30 #include "kmp.h"
31 #include "kmp_i18n.h"
32 #include "kmp_itt.h"
33 #include "kmp_str.h"
34 #include "kmp_error.h"
35 #if KMP_OS_WINDOWS && KMP_ARCH_X86
36     #include <float.h>
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
#if KMP_STATIC_STEAL_ENABLED
43 
44     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
45     template< typename T >
46     struct dispatch_private_infoXX_template {
47         typedef typename traits_t< T >::unsigned_t  UT;
48         typedef typename traits_t< T >::signed_t    ST;
49         UT count;                // unsigned
50         T  ub;
51         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
52         T  lb;
53         ST st;                   // signed
54         UT tc;                   // unsigned
55         T  static_steal_counter; // for static_steal only; maybe better to put after ub
56 
57         /* parm[1-4] are used in different ways by different scheduling algorithms */
58 
59         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
60         //    a) parm3 is properly aligned and
61         //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // if they are in the same cache line (not measured though).
64 
65         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
66             T  parm1;
67             T  parm2;
68             T  parm3;
69             T  parm4;
70         };
71 
72         UT ordered_lower; // unsigned
73         UT ordered_upper; // unsigned
74         #if KMP_OS_WINDOWS
75         T  last_upper;
76         #endif /* KMP_OS_WINDOWS */
77     };
78 
79 #else /* KMP_STATIC_STEAL_ENABLED */
80 
81     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
82     template< typename T >
83     struct dispatch_private_infoXX_template {
84         typedef typename traits_t< T >::unsigned_t  UT;
85         typedef typename traits_t< T >::signed_t    ST;
86         T  lb;
87         T  ub;
88         ST st;            // signed
89         UT tc;            // unsigned
90 
91         T  parm1;
92         T  parm2;
93         T  parm3;
94         T  parm4;
95 
96         UT count;         // unsigned
97 
98         UT ordered_lower; // unsigned
99         UT ordered_upper; // unsigned
100         #if KMP_OS_WINDOWS
        T  last_upper;
102         #endif /* KMP_OS_WINDOWS */
103     };
104 
105 #endif /* KMP_STATIC_STEAL_ENABLED */
106 
107 // replaces dispatch_private_info structure and dispatch_private_info_t type
108 template< typename T >
109 struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // the alignment must be duplicated here; otherwise our compiler computes the wrong structure size
111     union KMP_ALIGN_CACHE private_info_tmpl {
112         dispatch_private_infoXX_template< T > p;
113         dispatch_private_info64_t             p64;
114     } u;
115     enum sched_type schedule;  /* scheduling algorithm */
116     kmp_uint32      ordered;   /* ordered clause specified */
117     kmp_uint32      ordered_bumped;
    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
119     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
120     kmp_uint32      nomerge;   /* don't merge iters if serialized */
121     kmp_uint32      type_size;
122     enum cons_type  pushed_ws;
123 };
124 
125 
126 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
127 template< typename UT >
128 struct dispatch_shared_infoXX_template {
129     /* chunk index under dynamic, number of idle threads under static-steal;
130        iteration index otherwise */
131     volatile UT     iteration;
132     volatile UT     num_done;
133     volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
135 };
136 
137 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
138 template< typename UT >
139 struct dispatch_shared_info_template {
140     // we need union here to keep the structure size
141     union shared_info_tmpl {
142         dispatch_shared_infoXX_template< UT >  s;
143         dispatch_shared_info64_t               s64;
144     } u;
145     volatile kmp_uint32     buffer_index;
146 };
147 
148 /* ------------------------------------------------------------------------ */
149 /* ------------------------------------------------------------------------ */
150 
151 static void
152 __kmp_static_delay( int arg )
153 {
154     /* Work around weird code-gen bug that causes assert to trip */
155     #if KMP_ARCH_X86_64 && KMP_OS_LINUX
156     #else
157         KMP_ASSERT( arg >= 0 );
158     #endif
159 }
160 
161 static void
162 __kmp_static_yield( int arg )
163 {
164     __kmp_yield( arg );
165 }
166 
167 #undef USE_TEST_LOCKS
168 
169 // test_then_add template (general template should NOT be used)
170 template< typename T >
171 static __forceinline T
172 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
173 
174 template<>
175 __forceinline kmp_int32
176 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
177 {
178     kmp_int32 r;
179     r = KMP_TEST_THEN_ADD32( p, d );
180     return r;
181 }
182 
183 template<>
184 __forceinline kmp_int64
185 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
186 {
187     kmp_int64 r;
188     r = KMP_TEST_THEN_ADD64( p, d );
189     return r;
190 }
191 
192 // test_then_inc_acq template (general template should NOT be used)
193 template< typename T >
194 static __forceinline T
195 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
196 
197 template<>
198 __forceinline kmp_int32
199 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
200 {
201     kmp_int32 r;
202     r = KMP_TEST_THEN_INC_ACQ32( p );
203     return r;
204 }
205 
206 template<>
207 __forceinline kmp_int64
208 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
209 {
210     kmp_int64 r;
211     r = KMP_TEST_THEN_INC_ACQ64( p );
212     return r;
213 }
214 
215 // test_then_inc template (general template should NOT be used)
216 template< typename T >
217 static __forceinline T
218 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
219 
220 template<>
221 __forceinline kmp_int32
222 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
223 {
224     kmp_int32 r;
225     r = KMP_TEST_THEN_INC32( p );
226     return r;
227 }
228 
229 template<>
230 __forceinline kmp_int64
231 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
232 {
233     kmp_int64 r;
234     r = KMP_TEST_THEN_INC64( p );
235     return r;
236 }
237 
238 // compare_and_swap template (general template should NOT be used)
239 template< typename T >
240 static __forceinline kmp_int32
241 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
242 
243 template<>
244 __forceinline kmp_int32
245 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
246 {
247     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
248 }
249 
250 template<>
251 __forceinline kmp_int32
252 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
253 {
254     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
255 }
256 
257 /*
258     Spin wait loop that first does pause, then yield.
259     Waits until function returns non-zero when called with *spinner and check.
260     Does NOT put threads to sleep.
261 #if USE_ITT_BUILD
262     Arguments:
        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
            same address, not the address of the low-level spinner.
268 #endif // USE_ITT_BUILD
269 */
270 template< typename UT >
271 // ToDo: make inline function (move to header file for icl)
272 static UT  // unsigned 4- or 8-byte type
273 __kmp_wait_yield( volatile UT * spinner,
274                   UT            checker,
275                   kmp_uint32 (* pred)( UT, UT )
276                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
277                   )
278 {
279     // note: we may not belong to a team at this point
280     register volatile UT         * spin          = spinner;
281     register          UT           check         = checker;
282     register          kmp_uint32   spins;
283     register          kmp_uint32 (*f) ( UT, UT ) = pred;
284     register          UT           r;
285 
286     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
287     KMP_INIT_YIELD( spins );
288     // main wait spin loop
289     while(!f(r = *spin, check))
290     {
291         KMP_FSYNC_SPIN_PREPARE( obj );
292         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
293            It causes problems with infinite recursion because of exit lock */
294         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
295             __kmp_abort_thread(); */
296 
297         __kmp_static_delay(TRUE);
298 
        // If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput),
        // then yield.  The pause is in the following code.
302         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
303         KMP_YIELD_SPIN( spins );
304     }
305     KMP_FSYNC_SPIN_ACQUIRED( obj );
306     return r;
307 }
308 
309 template< typename UT >
310 static kmp_uint32 __kmp_eq( UT value, UT checker) {
311     return value == checker;
312 }
313 
314 template< typename UT >
315 static kmp_uint32 __kmp_neq( UT value, UT checker) {
316     return value != checker;
317 }
318 
319 template< typename UT >
320 static kmp_uint32 __kmp_lt( UT value, UT checker) {
321     return value < checker;
322 }
323 
324 template< typename UT >
325 static kmp_uint32 __kmp_ge( UT value, UT checker) {
326     return value >= checker;
327 }
328 
329 template< typename UT >
330 static kmp_uint32 __kmp_le( UT value, UT checker) {
331     return value <= checker;
332 }
333 
334 
335 /* ------------------------------------------------------------------------ */
336 /* ------------------------------------------------------------------------ */
337 
338 static void
339 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
340 {
341     kmp_info_t *th;
342 
343     KMP_DEBUG_ASSERT( gtid_ref );
344 
345     if ( __kmp_env_consistency_check ) {
346         th = __kmp_threads[*gtid_ref];
347         if ( th -> th.th_root -> r.r_active
348           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
349             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
350         }
351     }
352 }
353 
354 template< typename UT >
355 static void
356 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
357 {
358     typedef typename traits_t< UT >::signed_t    ST;
359     dispatch_private_info_template< UT > * pr;
360 
361     int gtid = *gtid_ref;
362 //    int  cid = *cid_ref;
363     kmp_info_t *th = __kmp_threads[ gtid ];
364     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
365 
366     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
367     if ( __kmp_env_consistency_check ) {
368         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
369             ( th -> th.th_dispatch -> th_dispatch_pr_current );
370         if ( pr -> pushed_ws != ct_none ) {
371             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
372         }
373     }
374 
375     if ( ! th -> th.th_team -> t.t_serialized ) {
376         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
377             ( th -> th.th_dispatch -> th_dispatch_sh_current );
378         UT  lower;
379 
380         if ( ! __kmp_env_consistency_check ) {
381                 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
382                     ( th -> th.th_dispatch -> th_dispatch_pr_current );
383         }
384         lower = pr->u.p.ordered_lower;
385 
386         #if ! defined( KMP_GOMP_COMPAT )
387             if ( __kmp_env_consistency_check ) {
388                 if ( pr->ordered_bumped ) {
389                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
390                     __kmp_error_construct2(
391                         kmp_i18n_msg_CnsMultipleNesting,
392                         ct_ordered_in_pdo, loc_ref,
393                         & p->stack_data[ p->w_top ]
394                     );
395                 }
396             }
397         #endif /* !defined(KMP_GOMP_COMPAT) */
398 
399         KMP_MB();
400         #ifdef KMP_DEBUG
401         {
402             const char * buff;
403             // create format specifiers before the debug output
404             buff = __kmp_str_format(
405                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
406                 traits_t< UT >::spec, traits_t< UT >::spec );
407             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
408             __kmp_str_free( &buff );
409         }
410         #endif
411 
412         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
413                                 USE_ITT_BUILD_ARG( NULL )
414                                 );
415         KMP_MB();  /* is this necessary? */
416         #ifdef KMP_DEBUG
417         {
418             const char * buff;
419             // create format specifiers before the debug output
420             buff = __kmp_str_format(
421                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
422                 traits_t< UT >::spec, traits_t< UT >::spec );
423             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
424             __kmp_str_free( &buff );
425         }
426         #endif
427     }
428     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
429 }
430 
431 static void
432 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
433 {
434     kmp_info_t *th;
435 
436     if ( __kmp_env_consistency_check ) {
437         th = __kmp_threads[*gtid_ref];
438         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
439             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
440         }
441     }
442 }
443 
444 template< typename UT >
445 static void
446 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
447 {
448     typedef typename traits_t< UT >::signed_t    ST;
449     dispatch_private_info_template< UT > * pr;
450 
451     int gtid = *gtid_ref;
452 //    int  cid = *cid_ref;
453     kmp_info_t *th = __kmp_threads[ gtid ];
454     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
455 
456     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
457     if ( __kmp_env_consistency_check ) {
458         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
459             ( th -> th.th_dispatch -> th_dispatch_pr_current );
460         if ( pr -> pushed_ws != ct_none ) {
461             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
462         }
463     }
464 
465     if ( ! th -> th.th_team -> t.t_serialized ) {
466         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
467             ( th -> th.th_dispatch -> th_dispatch_sh_current );
468 
469         if ( ! __kmp_env_consistency_check ) {
470             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
471                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
472         }
473 
474         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
475         #if ! defined( KMP_GOMP_COMPAT )
476             if ( __kmp_env_consistency_check ) {
477                 if ( pr->ordered_bumped != 0 ) {
478                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
479                     /* How to test it? - OM */
480                     __kmp_error_construct2(
481                         kmp_i18n_msg_CnsMultipleNesting,
482                         ct_ordered_in_pdo, loc_ref,
483                         & p->stack_data[ p->w_top ]
484                     );
485                 }
486             }
487         #endif /* !defined(KMP_GOMP_COMPAT) */
488 
489         KMP_MB();       /* Flush all pending memory write invalidates.  */
490 
491         pr->ordered_bumped += 1;
492 
493         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
494                         gtid, pr->ordered_bumped ) );
495 
496         KMP_MB();       /* Flush all pending memory write invalidates.  */
497 
498         /* TODO use general release procedure? */
499         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
500 
501         KMP_MB();       /* Flush all pending memory write invalidates.  */
502     }
503     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
504 }
505 
/* Computes and returns x to the power of y, where y must be a non-negative integer */
507 template< typename UT >
508 static __forceinline long double
509 __kmp_pow(long double x, UT y) {
510     long double s=1.0L;
511 
512     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
513     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
514     while(y) {
515         if ( y & 1 )
516             s *= x;
517         x *= x;
518         y >>= 1;
519     }
520     return s;
521 }
522 
/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken, so if we __forceinline this function the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
*/
528 template< typename T >
529 static __inline typename traits_t< T >::unsigned_t
530 __kmp_dispatch_guided_remaining(
531     T                                  tc,
532     typename traits_t< T >::floating_t base,
533     typename traits_t< T >::unsigned_t idx
534 ) {
535     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
536        least for ICL 8.1, long double arithmetic may not really have
537        long double precision, even with /Qlong_double.  Currently, we
538        workaround that in the caller code, by manipulating the FPCW for
539        Windows* OS on IA-32 architecture.  The lack of precision is not
540        expected to be a correctness issue, though.
541     */
542     typedef typename traits_t< T >::unsigned_t  UT;
543 
544     long double x = tc * __kmp_pow< UT >(base, idx);
545     UT r = (UT) x;
546     if ( x == r )
547         return r;
548     return r + 1;
549 }
550 
551 // Parameters of the guided-iterative algorithm:
552 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
553 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
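
// Illustrative example (values assumed): with the default n = 2, nproc = 4, and chunk = 7:
//   p2 = 2 * 4 * (7 + 1) = 64      // switch to dynamic once fewer than 64 iterations remain
//   p3 = 1.0 / (2 * 4)   = 0.125   // each chunk claims 12.5% of the remaining iterations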
558 
559 // UT - unsigned flavor of T, ST - signed flavor of T,
560 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
561 template< typename T >
562 static void
563 __kmp_dispatch_init(
564     ident_t                        * loc,
565     int                              gtid,
566     enum sched_type                  schedule,
567     T                                lb,
568     T                                ub,
569     typename traits_t< T >::signed_t st,
570     typename traits_t< T >::signed_t chunk,
571     int                              push_ws
572 ) {
573     typedef typename traits_t< T >::unsigned_t  UT;
574     typedef typename traits_t< T >::signed_t    ST;
575     typedef typename traits_t< T >::floating_t  DBL;
576     static const int ___kmp_size_type = sizeof( UT );
577 
578     int                                            active;
579     T                                              tc;
580     kmp_info_t *                                   th;
581     kmp_team_t *                                   team;
582     kmp_uint32                                     my_buffer_index;
583     dispatch_private_info_template< T >          * pr;
584     dispatch_shared_info_template< UT > volatile * sh;
585 
586     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
587     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
588 
589     if ( ! TCR_4( __kmp_init_parallel ) )
590         __kmp_parallel_initialize();
591 
592     #ifdef KMP_DEBUG
593     {
594         const char * buff;
595         // create format specifiers before the debug output
596         buff = __kmp_str_format(
597             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
598             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
599         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
600         __kmp_str_free( &buff );
601     }
602     #endif
603     /* setup data */
604     th     = __kmp_threads[ gtid ];
605     team   = th -> th.th_team;
606     active = ! team -> t.t_serialized;
607     th->th.th_ident = loc;
608 
609     if ( ! active ) {
610         pr = reinterpret_cast< dispatch_private_info_template< T >* >
611             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
612     } else {
613         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
614                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
615 
616         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
617 
618         /* What happens when number of threads changes, need to resize buffer? */
619         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
620             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
621         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
622             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
623     }
624 
625     /* Pick up the nomerge/ordered bits from the scheduling type */
626     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
627         pr->nomerge = TRUE;
628         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
629     } else {
630         pr->nomerge = FALSE;
631     }
632     pr->type_size = ___kmp_size_type; // remember the size of variables
633     if ( kmp_ord_lower & schedule ) {
634         pr->ordered = TRUE;
635         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
636     } else {
637         pr->ordered = FALSE;
638     }
639     if ( schedule == kmp_sch_static ) {
640         schedule = __kmp_static;
641     } else {
642         if ( schedule == kmp_sch_runtime ) {
643             #if OMP_30_ENABLED
644                 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
645                 schedule = team -> t.t_sched.r_sched_type;
646                 // Detail the schedule if needed (global controls are differentiated appropriately)
647                 if ( schedule == kmp_sch_guided_chunked ) {
648                     schedule = __kmp_guided;
649                 } else if ( schedule == kmp_sch_static ) {
650                     schedule = __kmp_static;
651                 }
652                 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
653                 chunk = team -> t.t_sched.chunk;
654             #else
655                 kmp_r_sched_t r_sched = __kmp_get_schedule_global();
656                 // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default
657                 schedule = r_sched.r_sched_type;
658                 chunk    = r_sched.chunk;
659             #endif
660 
661             #ifdef KMP_DEBUG
662             {
663                 const char * buff;
664                 // create format specifiers before the debug output
665                 buff = __kmp_str_format(
666                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
667                     traits_t< ST >::spec );
668                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
669                 __kmp_str_free( &buff );
670             }
671             #endif
672         } else {
673             if ( schedule == kmp_sch_guided_chunked ) {
674                 schedule = __kmp_guided;
675             }
676             if ( chunk <= 0 ) {
677                 chunk = KMP_DEFAULT_CHUNK;
678             }
679         }
680 
681         #if OMP_30_ENABLED
682         if ( schedule == kmp_sch_auto ) {
683             // mapping and differentiation: in the __kmp_do_serial_initialize()
684             schedule = __kmp_auto;
685             #ifdef KMP_DEBUG
686             {
687                 const char * buff;
688                 // create format specifiers before the debug output
689                 buff = __kmp_str_format(
690                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
691                     traits_t< ST >::spec );
692                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
693                 __kmp_str_free( &buff );
694             }
695             #endif
696         }
697         #endif // OMP_30_ENABLED
698 
699         /* guided analytical not safe for too many threads */
700         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
701             schedule = kmp_sch_guided_iterative_chunked;
702             KMP_WARNING( DispatchManyThreads );
703         }
704         pr->u.p.parm1 = chunk;
705     }
706     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
707                 "unknown scheduling type" );
708 
709     pr->u.p.count = 0;
710 
711     if ( __kmp_env_consistency_check ) {
712         if ( st == 0 ) {
713             __kmp_error_construct(
714                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
715                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
716             );
717         }
718     }
719 
720     tc = ( ub - lb + st );
721     if ( st != 1 ) {
722         if ( st < 0 ) {
723             if ( lb < ub ) {
724                 tc = 0;            // zero-trip
725             } else {   // lb >= ub
726                 tc = (ST)tc / st;  // convert to signed division
727             }
728         } else {       // st > 0
729             if ( ub < lb ) {
730                 tc = 0;            // zero-trip
            } else {   // ub >= lb
732                 tc /= st;
733             }
734         }
735     } else if ( ub < lb ) {        // st == 1
736         tc = 0;                    // zero-trip
737     }
738 
739     pr->u.p.lb = lb;
740     pr->u.p.ub = ub;
741     pr->u.p.st = st;
742     pr->u.p.tc = tc;
743 
744     #if KMP_OS_WINDOWS
745     pr->u.p.last_upper = ub + st;
746     #endif /* KMP_OS_WINDOWS */
747 
    /* NOTE: only the active parallel region(s) have active ordered sections */
749 
750     if ( active ) {
751         if ( pr->ordered == 0 ) {
752             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
753             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
754         } else {
755             pr->ordered_bumped = 0;
756 
757             pr->u.p.ordered_lower = 1;
758             pr->u.p.ordered_upper = 0;
759 
760             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
761             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
762         }
763     }
764 
765     if ( __kmp_env_consistency_check ) {
766         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
767         if ( push_ws ) {
768             __kmp_push_workshare( gtid, ws, loc );
769             pr->pushed_ws = ws;
770         } else {
771             __kmp_check_workshare( gtid, ws, loc );
772             pr->pushed_ws = ct_none;
773         }
774     }
775 
776     switch ( schedule ) {
777     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
778     case kmp_sch_static_steal:
779         {
780             T nproc = team->t.t_nproc;
781             T ntc, init;
782 
783             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
784 
785             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
786             if ( nproc > 1 && ntc >= nproc ) {
787                 T id = __kmp_tid_from_gtid(gtid);
788                 T small_chunk, extras;
789 
790                 small_chunk = ntc / nproc;
791                 extras = ntc % nproc;
792 
793                 init = id * small_chunk + ( id < extras ? id : extras );
794                 pr->u.p.count = init;
795                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
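                // Illustrative example (values assumed): ntc = 10 chunks, nproc = 4 gives
                // small_chunk = 2, extras = 2, so thread ids 0..3 start with chunk ranges
                // [0,3), [3,6), [6,8), [8,10) respectively, before any stealing occurs.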
796 
797                 pr->u.p.parm2 = lb;
798                 //pr->pfields.parm3 = 0; // it's not used in static_steal
799                 pr->u.p.parm4 = id;
800                 pr->u.p.st = st;
801                 break;
802             } else {
803                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
804                                gtid ) );
805                 schedule = kmp_sch_static_balanced;
806                 /* too few iterations: fall-through to kmp_sch_static_balanced */
807             } // if
808             /* FALL-THROUGH to static balanced */
809         } // case
810     #endif
811     case kmp_sch_static_balanced:
812         {
813             T nproc = team->t.t_nproc;
814             T init, limit;
815 
816             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
817                             gtid ) );
818 
819             if ( nproc > 1 ) {
820                 T id = __kmp_tid_from_gtid(gtid);
821 
822                 if ( tc < nproc ) {
823                     if ( id < tc ) {
824                         init = id;
825                         limit = id;
826                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
827                     } else {
828                         pr->u.p.count = 1;  /* means no more chunks to execute */
829                         pr->u.p.parm1 = FALSE;
830                         break;
831                     }
832                 } else {
833                     T small_chunk = tc / nproc;
834                     T extras = tc % nproc;
835                     init = id * small_chunk + (id < extras ? id : extras);
836                     limit = init + small_chunk - (id < extras ? 0 : 1);
837                     pr->u.p.parm1 = (id == nproc - 1);
838                 }
839             } else {
840                 if ( tc > 0 ) {
841                     init = 0;
842                     limit = tc - 1;
843                     pr->u.p.parm1 = TRUE;
844                 } else {
845                     // zero trip count
846                     pr->u.p.count = 1;  /* means no more chunks to execute */
847                     pr->u.p.parm1 = FALSE;
848                     break;
849                 }
850             }
851             if ( st == 1 ) {
852                 pr->u.p.lb = lb + init;
853                 pr->u.p.ub = lb + limit;
854             } else {
855                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
856                 pr->u.p.lb = lb + init * st;
857                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
858                 if ( st > 0 ) {
859                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
860                 } else {
861                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
862                 }
863             }
864             if ( pr->ordered ) {
865                 pr->u.p.ordered_lower = init;
866                 pr->u.p.ordered_upper = limit;
867             }
868             break;
869         } // case
870     case kmp_sch_guided_iterative_chunked :
871         {
872             T nproc = team->t.t_nproc;
873             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
874 
875             if ( nproc > 1 ) {
876                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
877                     /* chunk size too large, switch to dynamic */
878                     schedule = kmp_sch_dynamic_chunked;
879                 } else {
880                     // when remaining iters become less than parm2 - switch to dynamic
881                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
882                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
883                 }
884             } else {
885                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
886                 schedule = kmp_sch_static_greedy;
887                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
888                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
889                 pr->u.p.parm1 = tc;
890             } // if
891         } // case
892         break;
893     case kmp_sch_guided_analytical_chunked:
894         {
895             T nproc = team->t.t_nproc;
896             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
897 
898             if ( nproc > 1 ) {
899                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
900                     /* chunk size too large, switch to dynamic */
901                     schedule = kmp_sch_dynamic_chunked;
902                 } else {
903                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
904                     DBL x;
905 
906                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
917                     // save original FPCW and set precision to 64-bit, as
918                     // Windows* OS on IA-32 architecture defaults to 53-bit
919                     unsigned int oldFpcw = _control87(0,0);
920                     _control87(_PC_64,_MCW_PC); // 0,0x30000
921                     #endif
922                     /* value used for comparison in solver for cross-over point */
923                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
924 
                    /* crossover point--chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
927                     UT   cross;
928 
929                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
930                     x = (long double)1.0 - (long double)0.5 / nproc;
931 
932                     #ifdef KMP_DEBUG
933                     { // test natural alignment
934                         struct _test_a {
935                             char a;
936                             union {
937                                 char b;
938                                 DBL  d;
939                             };
940                         } t;
941                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
942                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
943                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
944                     }
945                     #endif // KMP_DEBUG
946 
947                     /* save the term in thread private dispatch structure */
948                     *(DBL*)&pr->u.p.parm3 = x;
949 
950                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
951                     {
952                         UT          left, right, mid;
953                         long double p;
954 
955                         /* estimate initial upper and lower bound */
956 
957                         /* doesn't matter what value right is as long as it is positive, but
958                            it affects performance of the solver
959                         */
960                         right = 229;
961                         p = __kmp_pow< UT >(x,right);
962                         if ( p > target ) {
                            do {
964                                 p *= p;
965                                 right <<= 1;
966                             } while(p>target && right < (1<<27));
967                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
968                         } else {
969                             left = 0;
970                         }
971 
972                         /* bisection root-finding method */
973                         while ( left + 1 < right ) {
974                             mid = (left + right) / 2;
975                             if ( __kmp_pow< UT >(x,mid) > target ) {
976                                 left = mid;
977                             } else {
978                                 right = mid;
979                             }
980                         } // while
981                         cross = right;
982                     }
983                     /* assert sanity of computed crossover point */
984                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
985 
986                     /* save the crossover point in thread private dispatch structure */
987                     pr->u.p.parm2 = cross;
988 
989                     // C75803
990                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
991                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
992                     #else
993                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
994                     #endif
995                     /* dynamic-style scheduling offset */
996                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
997                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
998                         // restore FPCW
999                         _control87(oldFpcw,_MCW_PC);
1000                     #endif
1001                 } // if
1002             } else {
1003                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1004                                gtid ) );
1005                 schedule = kmp_sch_static_greedy;
1006                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1007                 pr->u.p.parm1 = tc;
1008             } // if
1009         } // case
1010         break;
1011     case kmp_sch_static_greedy:
1012         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1013             pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1014                 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1015                 tc;
1016         break;
1017     case kmp_sch_static_chunked :
1018     case kmp_sch_dynamic_chunked :
1019         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1020         break;
1021     case kmp_sch_trapezoidal :
1022         {
1023             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1024 
1025             T parm1, parm2, parm3, parm4;
1026             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1027 
1028             parm1 = chunk;
1029 
1030             /* F : size of the first cycle */
1031             parm2 = ( tc / (2 * team->t.t_nproc) );
1032 
1033             if ( parm2 < 1 ) {
1034                 parm2 = 1;
1035             }
1036 
1037             /* L : size of the last cycle.  Make sure the last cycle
1038              *     is not larger than the first cycle.
1039              */
1040             if ( parm1 < 1 ) {
1041                 parm1 = 1;
1042             } else if ( parm1 > parm2 ) {
1043                 parm1 = parm2;
1044             }
1045 
1046             /* N : number of cycles */
1047             parm3 = ( parm2 + parm1 );
1048             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1049 
1050             if ( parm3 < 2 ) {
1051                 parm3 = 2;
1052             }
1053 
1054             /* sigma : decreasing incr of the trapezoid */
1055             parm4 = ( parm3 - 1 );
1056             parm4 = ( parm2 - parm1 ) / parm4;
1057 
1058             // pointless check, because parm4 >= 0 always
1059             //if ( parm4 < 0 ) {
1060             //    parm4 = 0;
1061             //}
1062 
1063             pr->u.p.parm1 = parm1;
1064             pr->u.p.parm2 = parm2;
1065             pr->u.p.parm3 = parm3;
1066             pr->u.p.parm4 = parm4;
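
            // Illustrative example (values assumed): tc = 100, nproc = 2, chunk = 1 gives
            // F = parm2 = 100/4 = 25, L = parm1 = 1, N = parm3 = (2*100 + 26 - 1)/26 = 8 cycles,
            // and sigma = parm4 = (25 - 1)/7 = 3, i.e. chunk sizes 25,22,19,16,13,10,7,4
            // (sum 116 >= 100, so the trapezoid covers all iterations).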
1067         } // case
1068         break;
1069 
1070     default:
1071         {
1072             __kmp_msg(
1073                 kmp_ms_fatal,                        // Severity
1074                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1075                 KMP_HNT( GetNewerLibrary ),          // Hint
1076                 __kmp_msg_null                       // Variadic argument list terminator
1077             );
1078         }
1079         break;
1080     } // switch
1081     pr->schedule = schedule;
1082     if ( active ) {
        /* Wait for the buffer to become free, i.e. for sh->buffer_index to reach my_buffer_index */
1084 
1085         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1086                         gtid, my_buffer_index, sh->buffer_index) );
1087         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1088                                         USE_ITT_BUILD_ARG( NULL )
1089                                         );
            // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and my_buffer_index are
            // *always* 32-bit integers.
1092         KMP_MB();  /* is this necessary? */
1093         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1094                         gtid, my_buffer_index, sh->buffer_index) );
1095 
1096         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1097         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1098 #if USE_ITT_BUILD
1099         if ( pr->ordered ) {
1100             __kmp_itt_ordered_init( gtid );
1101         }; // if
1102 #endif /* USE_ITT_BUILD */
1103     }; // if
1104     #ifdef KMP_DEBUG
1105     {
1106         const char * buff;
1107         // create format specifiers before the debug output
1108         buff = __kmp_str_format(
1109             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1110             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1111             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1112             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1113             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1114             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1115             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1116         KD_TRACE(10, ( buff,
1117             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1118             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1119             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1120             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1121         __kmp_str_free( &buff );
1122     }
1123     #endif
1124     #if ( KMP_STATIC_STEAL_ENABLED )
1125     if ( ___kmp_size_type < 8 ) {
      // It cannot be guaranteed that, after execution of a loop with some other schedule kind,
      // all the parm3 variables will contain the same value.
      // Even if all parm3 values were the same, a bad case could still exist, such as the values
      // alternating between 0 and 1 rather than incrementing over the program's lifetime.
      // So a dedicated variable is required; 'static_steal_counter' is used.
1131       if( schedule == kmp_sch_static_steal ) {
1132         // Other threads will inspect this variable when searching for a victim.
1133         // This is a flag showing that other threads may steal from this thread since then.
1134         volatile T * p = &pr->u.p.static_steal_counter;
1135         *p = *p + 1;
1136       }
1137     }
    #endif // ( KMP_STATIC_STEAL_ENABLED )
1139 }
1140 
1141 /*
1142  * For ordered loops, either __kmp_dispatch_finish() should be called after
1143  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1144  * every chunk of iterations.  If the ordered section(s) were not executed
1145  * for this iteration (or every iteration in this chunk), we need to set the
1146  * ordered iteration counters so that the next thread can proceed.
1147  */
1148 template< typename UT >
1149 static void
1150 __kmp_dispatch_finish( int gtid, ident_t *loc )
1151 {
1152     typedef typename traits_t< UT >::signed_t ST;
1153     kmp_info_t *th = __kmp_threads[ gtid ];
1154 
1155     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1156     if ( ! th -> th.th_team -> t.t_serialized ) {
1157 
1158         dispatch_private_info_template< UT > * pr =
1159             reinterpret_cast< dispatch_private_info_template< UT >* >
1160             ( th->th.th_dispatch->th_dispatch_pr_current );
1161         dispatch_shared_info_template< UT > volatile * sh =
1162             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1163             ( th->th.th_dispatch->th_dispatch_sh_current );
1164         KMP_DEBUG_ASSERT( pr );
1165         KMP_DEBUG_ASSERT( sh );
1166         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1167                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1168 
1169         if ( pr->ordered_bumped ) {
1170             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1171                             gtid ) );
1172             pr->ordered_bumped = 0;
1173         } else {
1174             UT lower = pr->u.p.ordered_lower;
1175 
1176             #ifdef KMP_DEBUG
1177             {
1178                 const char * buff;
1179                 // create format specifiers before the debug output
1180                 buff = __kmp_str_format(
1181                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1182                     traits_t< UT >::spec, traits_t< UT >::spec );
1183                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1184                 __kmp_str_free( &buff );
1185             }
1186             #endif
1187 
1188             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1189                                    USE_ITT_BUILD_ARG(NULL)
1190                                    );
1191             KMP_MB();  /* is this necessary? */
1192             #ifdef KMP_DEBUG
1193             {
1194                 const char * buff;
1195                 // create format specifiers before the debug output
1196                 buff = __kmp_str_format(
1197                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1198                     traits_t< UT >::spec, traits_t< UT >::spec );
1199                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1200                 __kmp_str_free( &buff );
1201             }
1202             #endif
1203 
1204             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1205         } // if
1206     } // if
1207     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1208 }
1209 
1210 #ifdef KMP_GOMP_COMPAT
1211 
1212 template< typename UT >
1213 static void
1214 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1215 {
1216     typedef typename traits_t< UT >::signed_t ST;
1217     kmp_info_t *th = __kmp_threads[ gtid ];
1218 
1219     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1220     if ( ! th -> th.th_team -> t.t_serialized ) {
1221 //        int cid;
1222         dispatch_private_info_template< UT > * pr =
1223             reinterpret_cast< dispatch_private_info_template< UT >* >
1224             ( th->th.th_dispatch->th_dispatch_pr_current );
1225         dispatch_shared_info_template< UT > volatile * sh =
1226             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1227             ( th->th.th_dispatch->th_dispatch_sh_current );
1228         KMP_DEBUG_ASSERT( pr );
1229         KMP_DEBUG_ASSERT( sh );
1230         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1231                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1232 
1233 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1234             UT lower = pr->u.p.ordered_lower;
1235             UT upper = pr->u.p.ordered_upper;
1236             UT inc = upper - lower + 1;
1237 
1238             if ( pr->ordered_bumped == inc ) {
1239                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1240                   gtid ) );
1241                 pr->ordered_bumped = 0;
1242             } else {
1243                 inc -= pr->ordered_bumped;
1244 
1245                 #ifdef KMP_DEBUG
1246                 {
1247                     const char * buff;
1248                     // create format specifiers before the debug output
1249                     buff = __kmp_str_format(
1250                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1251                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1252                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1253                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1254                     __kmp_str_free( &buff );
1255                 }
1256                 #endif
1257 
1258                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1259                                        USE_ITT_BUILD_ARG(NULL)
1260                                        );
1261 
1262                 KMP_MB();  /* is this necessary? */
1263                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1264                   gtid ) );
1265                 pr->ordered_bumped = 0;
// TODO: check whether inc should be unsigned or signed
1267                 #ifdef KMP_DEBUG
1268                 {
1269                     const char * buff;
1270                     // create format specifiers before the debug output
1271                     buff = __kmp_str_format(
1272                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1273                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1274                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1275                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1276                     __kmp_str_free( &buff );
1277                 }
1278                 #endif
1279 
1280                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1281             }
1282 //        }
1283     }
1284     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1285 }
1286 
1287 #endif /* KMP_GOMP_COMPAT */
1288 
1289 template< typename T >
1290 static int
1291 __kmp_dispatch_next(
1292     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1293 ) {
1294 
1295     typedef typename traits_t< T >::unsigned_t  UT;
1296     typedef typename traits_t< T >::signed_t    ST;
1297     typedef typename traits_t< T >::floating_t  DBL;
1298     static const int ___kmp_size_type = sizeof( UT );
1299 
1300     int                                   status;
1301     dispatch_private_info_template< T > * pr;
1302     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1303     kmp_team_t                          * team = th -> th.th_team;
1304 
1305     #ifdef KMP_DEBUG
1306     {
1307         const char * buff;
1308         // create format specifiers before the debug output
1309         buff = __kmp_str_format(
1310             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1311             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1312         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1313         __kmp_str_free( &buff );
1314     }
1315     #endif
1316 
1317     if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1319         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1320             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1321         KMP_DEBUG_ASSERT( pr );
1322 
1323         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1324             *p_lb = 0;
1325             *p_ub = 0;
1326             if ( p_st != 0 ) {
1327                 *p_st = 0;
1328             }
1329             if ( __kmp_env_consistency_check ) {
1330                 if ( pr->pushed_ws != ct_none ) {
1331                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1332                 }
1333             }
1334         } else if ( pr->nomerge ) {
1335             kmp_int32 last;
1336             T         start;
1337             UT        limit, trip, init;
1338             ST        incr;
1339             T         chunk = pr->u.p.parm1;
1340 
1341             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1342 
1343             init = chunk * pr->u.p.count++;
1344             trip = pr->u.p.tc - 1;
1345 
1346             if ( (status = (init <= trip)) == 0 ) {
1347                 *p_lb = 0;
1348                 *p_ub = 0;
1349                 if ( p_st != 0 ) *p_st = 0;
1350                 if ( __kmp_env_consistency_check ) {
1351                     if ( pr->pushed_ws != ct_none ) {
1352                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1353                     }
1354                 }
1355             } else {
1356                 start = pr->u.p.lb;
1357                 limit = chunk + init - 1;
1358                 incr  = pr->u.p.st;
1359 
1360                 if ( (last = (limit >= trip)) != 0 ) {
1361                     limit = trip;
1362                     #if KMP_OS_WINDOWS
1363                     pr->u.p.last_upper = pr->u.p.ub;
1364                     #endif /* KMP_OS_WINDOWS */
1365                 }
1366                 if ( p_last ) {
1367                     *p_last = last;
1368                 }
1369                 if ( p_st != 0 ) {
1370                     *p_st = incr;
1371                 }
1372                 if ( incr == 1 ) {
1373                     *p_lb = start + init;
1374                     *p_ub = start + limit;
1375                 } else {
1376                     *p_lb = start + init * incr;
1377                     *p_ub = start + limit * incr;
1378                 }
1379 
1380                 if ( pr->ordered ) {
1381                     pr->u.p.ordered_lower = init;
1382                     pr->u.p.ordered_upper = limit;
1383                     #ifdef KMP_DEBUG
1384                     {
1385                         const char * buff;
1386                         // create format specifiers before the debug output
1387                         buff = __kmp_str_format(
1388                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1389                             traits_t< UT >::spec, traits_t< UT >::spec );
1390                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1391                         __kmp_str_free( &buff );
1392                     }
1393                     #endif
1394                 } // if
1395             } // if
1396         } else {
1397             pr->u.p.tc = 0;
1398 
1399             *p_lb = pr->u.p.lb;
1400             *p_ub = pr->u.p.ub;
1401             #if KMP_OS_WINDOWS
1402             pr->u.p.last_upper = *p_ub;
1403             #endif /* KMP_OS_WINDOWS */
1404 
1405             if ( p_st != 0 ) {
1406                 *p_st = pr->u.p.st;
1407             }
1408             if ( p_last ) {
1409                 *p_last = TRUE;
1410             }
1411         } // if
1412         #ifdef KMP_DEBUG
1413         {
1414             const char * buff;
1415             // create format specifiers before the debug output
1416             buff = __kmp_str_format(
1417                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1418                 "p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
1419                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1420             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
1421             __kmp_str_free( &buff );
1422         }
1423         #endif
1424         return status;
1425     } else {
1426         kmp_int32 last = 0;
1427         dispatch_shared_info_template< UT > *sh;
1428         T         start;
1429         ST        incr;
1430         UT        limit, trip, init;
1431 
1432         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1433                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1434 
1435         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1436             ( th->th.th_dispatch->th_dispatch_pr_current );
1437         KMP_DEBUG_ASSERT( pr );
1438         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1439             ( th->th.th_dispatch->th_dispatch_sh_current );
1440         KMP_DEBUG_ASSERT( sh );
1441 
1442         if ( pr->u.p.tc == 0 ) {
1443             // zero trip count
1444             status = 0;
1445         } else {
1446             switch (pr->schedule) {
1447             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1448             case kmp_sch_static_steal:
1449                 {
1450                     T chunk = pr->u.p.parm1;
1451 
1452                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1453 
1454                     trip = pr->u.p.tc - 1;
1455 
1456                     if ( ___kmp_size_type > 4 ) {
1457                         // Other threads do not look into the data of this thread,
1458                         //  so it's not necessary to make volatile casting.
1459                         init   = ( pr->u.p.count )++;
1460                         status = ( init < (UT)pr->u.p.ub );
1461                     } else {
1462                         typedef union {
1463                             struct {
1464                                 UT count;
1465                                 T  ub;
1466                             } p;
1467                             kmp_int64 b;
1468                         } union_i4;
1469                         // All operations on 'count' or 'ub' must be combined atomically together.
1470                         // stealing implemented only for 4-byte indexes
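                        // 'count' (index of the next chunk to execute) and 'ub'
                        // (first chunk index beyond this thread's range) share one
                        // 64-bit word, so a single CAS updates both consistently:
                        // the owner claims a chunk by bumping count, a thief
                        // shrinks ub, and whichever update loses the race fails
                        // and is retried.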
1471                         {
1472                             union_i4 vold, vnew;
1473                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1474                             vnew = vold;
1475                             vnew.p.count++;
1476                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1477                                         ( volatile kmp_int64* )&pr->u.p.count,
1478                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1479                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1480                                 KMP_CPU_PAUSE();
1481                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1482                                 vnew = vold;
1483                                 vnew.p.count++;
1484                             }
1485                             vnew = vold;
1486                             init   = vnew.p.count;
1487                             status = ( init < (UT)vnew.p.ub ) ;
1488                         }
1489 
1490                         if( !status ) {
1491                             kmp_info_t   **other_threads = team->t.t_threads;
1492                             int          while_limit = 10;
1493                             int          while_index = 0;
1494 
1495                             // TODO: algorithm of searching for a victim
1496                             // should be cleaned up and measured
1497                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1498                                 union_i4  vold, vnew;
1499                                 kmp_int32 remaining; // kmp_int32 because stealing is implemented for 4-byte indexes only
1500                                 T         victimIdx    = pr->u.p.parm4;
1501                                 T         oldVictimIdx = victimIdx;
1502                                 dispatch_private_info_template< T > * victim;
1503 
1504                                 do {
1505                                     if( !victimIdx ) {
1506                                         victimIdx = team->t.t_nproc - 1;
1507                                     } else {
1508                                         --victimIdx;
1509                                     }
1510                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1511                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1512                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1513                                 // TODO: think about a proper place of this test
1514                                 if ( ( !victim ) ||
1515                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1516                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1517                                     // the victim is not ready yet to participate in stealing
1518                                     // because it is still inside __kmp_dispatch_init
1519                                     // TODO: a delay here would be nice
1520                                     continue;
1521                                 }
1522                                 if ( oldVictimIdx == victimIdx ) {
1523                                     break;
1524                                 }
1525                                 pr->u.p.parm4 = victimIdx;
1526 
1527                                 while( 1 ) {
1528                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1529                                     vnew = vold;
1530 
1531                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1532                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1533                                         break;
1534                                     }
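                                    // Steal roughly a quarter of the victim's
                                    // remaining chunks by lowering its ub; the CAS
                                    // below publishes the new ub together with the
                                    // unchanged count in one atomic 64-bit store.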
1535                                     vnew.p.ub -= (remaining >> 2);
1536                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1537                                     #pragma warning( push )
1538                                     // disable warning on pointless comparison of unsigned with 0
1539                                     #pragma warning( disable: 186 )
1540                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1541                                     #pragma warning( pop )
1542                                     // TODO: Should this be acquire or release?
1543                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1544                                             ( volatile kmp_int64 * )&victim->u.p.count,
1545                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1546                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1547                                         status = 1;
1548                                         while_index = 0;
1549                                         // now update own count and ub
1550                                         #if KMP_ARCH_X86
1551                                             // This branch is dead: the whole case is compiled only
1552                                             // for KMP_ARCH_X86_64 (see the guard above).  An atomic
1553                                             // 64-bit write on IA-32 is unavailable, so we would do
1554                                             // this in steps.  This code is not tested.
1555                                             init = vold.p.count;
1556                                             pr->u.p.ub = 0;
1557                                             pr->u.p.count = init + 1;
1558                                             pr->u.p.ub = vnew.p.count;
1559                                         #else
1560                                             init = vnew.p.ub;
1561                                             vold.p.count = init + 1;
1562                                             // TODO: is it safe and enough?
1563                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1564                                         #endif // KMP_ARCH_X86
1565                                         break;
1566                                     } // if
1567                                     KMP_CPU_PAUSE();
1568                                 } // while (1)
1569                             } // while
1570                         } // if
1571                     } // if
1572                     if ( !status ) {
1573                         *p_lb = 0;
1574                         *p_ub = 0;
1575                         if ( p_st != 0 ) *p_st = 0;
1576                     } else {
1577                         start = pr->u.p.parm2;
1578                         init *= chunk;
1579                         limit = chunk + init - 1;
1580                         incr  = pr->u.p.st;
1581 
1582                         KMP_DEBUG_ASSERT(init <= trip);
1583                         if ( (last = (limit >= trip)) != 0 )
1584                             limit = trip;
1585                         if ( p_last ) {
1586                             *p_last = last;
1587                         }
1588                         if ( p_st != 0 ) *p_st = incr;
1589 
1590                         if ( incr == 1 ) {
1591                             *p_lb = start + init;
1592                             *p_ub = start + limit;
1593                         } else {
1594                             *p_lb = start + init * incr;
1595                             *p_ub = start + limit * incr;
1596                         }
1597 
1598                         if ( pr->ordered ) {
1599                             pr->u.p.ordered_lower = init;
1600                             pr->u.p.ordered_upper = limit;
1601                             #ifdef KMP_DEBUG
1602                             {
1603                                 const char * buff;
1604                                 // create format specifiers before the debug output
1605                                 buff = __kmp_str_format(
1606                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1607                                     traits_t< UT >::spec, traits_t< UT >::spec );
1608                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1609                                 __kmp_str_free( &buff );
1610                             }
1611                             #endif
1612                         } // if
1613                     } // if
1614                     break;
1615                 } // case
1616             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1617             case kmp_sch_static_balanced:
1618                 {
1619                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1620                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1621                         pr->u.p.count = 1;
1622                         *p_lb = pr->u.p.lb;
1623                         *p_ub = pr->u.p.ub;
1624                         last = pr->u.p.parm1;
1625                         if ( p_last ) {
1626                             *p_last = last;
1627                         }
1628                         if ( p_st )
1629                             *p_st = pr->u.p.st;
1630                     } else {  /* no iterations to do */
1631                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1632                     }
1633                     if ( pr->ordered ) {
1634                         #ifdef KMP_DEBUG
1635                         {
1636                             const char * buff;
1637                             // create format specifiers before the debug output
1638                             buff = __kmp_str_format(
1639                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1640                                 traits_t< UT >::spec, traits_t< UT >::spec );
1641                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1642                             __kmp_str_free( &buff );
1643                         }
1644                         #endif
1645                     } // if
1646                 } // case
1647                 break;
1648             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1649             case kmp_sch_static_chunked:
1650                 {
1651                     T parm1;
1652 
1653                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
1654                                    gtid ) );
1655                     parm1 = pr->u.p.parm1;
1656 
1657                     trip  = pr->u.p.tc - 1;
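                    // Chunks of size parm1 are dealt round-robin: thread 'tid'
                    // executes chunks tid, tid+nproc, tid+2*nproc, ...
                    // pr->u.p.count advances by nproc per chunk taken (below), so
                    // the next chunk index is count + tid and its first iteration
                    // is parm1 * (count + tid).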
1658                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1659 
1660                     if ( (status = (init <= trip)) != 0 ) {
1661                         start = pr->u.p.lb;
1662                         incr  = pr->u.p.st;
1663                         limit = parm1 + init - 1;
1664 
1665                         if ( (last = (limit >= trip)) != 0 )
1666                             limit = trip;
1667 
1668                         if ( p_last ) {
1669                             *p_last = last;
1670                         }
1671                         if ( p_st != 0 ) *p_st = incr;
1672 
1673                         pr->u.p.count += team->t.t_nproc;
1674 
1675                         if ( incr == 1 ) {
1676                             *p_lb = start + init;
1677                             *p_ub = start + limit;
1678                         }
1679                         else {
1680                             *p_lb = start + init * incr;
1681                             *p_ub = start + limit * incr;
1682                         }
1683 
1684                         if ( pr->ordered ) {
1685                             pr->u.p.ordered_lower = init;
1686                             pr->u.p.ordered_upper = limit;
1687                             #ifdef KMP_DEBUG
1688                             {
1689                                 const char * buff;
1690                                 // create format specifiers before the debug output
1691                                 buff = __kmp_str_format(
1692                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1693                                     traits_t< UT >::spec, traits_t< UT >::spec );
1694                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1695                                 __kmp_str_free( &buff );
1696                             }
1697                             #endif
1698                         } // if
1699                     } // if
1700                 } // case
1701                 break;
1702 
1703             case kmp_sch_dynamic_chunked:
1704                 {
1705                     T chunk = pr->u.p.parm1;
1706 
1707                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1708                                    gtid ) );
1709 
1710                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1711                     trip = pr->u.p.tc - 1;
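                    // Each call atomically claims the next chunk index from the
                    // shared counter; e.g. with chunk=4 successive claims cover
                    // iterations [0..3], [4..7], ... of the canonical 0..tc-1
                    // space, scaled by lb/st below.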
1712 
1713                     if ( (status = (init <= trip)) == 0 ) {
1714                         *p_lb = 0;
1715                         *p_ub = 0;
1716                         if ( p_st != 0 ) *p_st = 0;
1717                     } else {
1718                         start = pr->u.p.lb;
1719                         limit = chunk + init - 1;
1720                         incr  = pr->u.p.st;
1721 
1722                         if ( (last = (limit >= trip)) != 0 )
1723                             limit = trip;
1724                         if ( p_last ) {
1725                             *p_last = last;
1726                         }
1727                         if ( p_st != 0 ) *p_st = incr;
1728 
1729                         if ( incr == 1 ) {
1730                             *p_lb = start + init;
1731                             *p_ub = start + limit;
1732                         } else {
1733                             *p_lb = start + init * incr;
1734                             *p_ub = start + limit * incr;
1735                         }
1736 
1737                         if ( pr->ordered ) {
1738                             pr->u.p.ordered_lower = init;
1739                             pr->u.p.ordered_upper = limit;
1740                             #ifdef KMP_DEBUG
1741                             {
1742                                 const char * buff;
1743                                 // create format specifiers before the debug output
1744                                 buff = __kmp_str_format(
1745                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1746                                     traits_t< UT >::spec, traits_t< UT >::spec );
1747                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1748                                 __kmp_str_free( &buff );
1749                             }
1750                             #endif
1751                         } // if
1752                     } // if
1753                 } // case
1754                 break;
1755 
1756             case kmp_sch_guided_iterative_chunked:
1757                 {
1758                     T  chunkspec = pr->u.p.parm1;
1759                     KD_TRACE(100,
1760                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1761                     trip  = pr->u.p.tc;
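                    // parm2 is the switch-over threshold (roughly K*nproc*(chunk+1),
                    // see below) and parm3 caches the guided shrink factor
                    // ~1/(K*nproc) as a double: each successful grab takes about
                    // remaining/(K*nproc) iterations until few enough remain to
                    // fall back to plain dynamic chunks of size chunkspec.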
1762                     // Start atomic part of calculations
1763                     while(1) {
1764                         ST  remaining;             // signed, because can be < 0
1765                         init = sh->u.s.iteration;  // shared value
1766                         remaining = trip - init;
1767                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1768                             // nothing to do, don't try atomic op
1769                             status = 0;
1770                             break;
1771                         }
1772                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1773                             // use dynamic-style schedule
1774                             // atomically increment iterations, get old value
1775                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1776                             remaining = trip - init;
1777                             if (remaining <= 0) {
1778                                 status = 0;    // all iterations got by other threads
1779                             } else {
1780                                 // got some iterations to work on
1781                                 status = 1;
1782                                 if ( (T)remaining > chunkspec ) {
1783                                     limit = init + chunkspec - 1;
1784                                 } else {
1785                                     last = 1;   // the last chunk
1786                                     limit = init + remaining - 1;
1787                                 } // if
1788                             } // if
1789                             break;
1790                         } // if
1791                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1792                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1793                             // CAS was successful, chunk obtained
1794                             status = 1;
1795                             --limit;
1796                             break;
1797                         } // if
1798                     } // while
1799                     if ( status != 0 ) {
1800                         start = pr->u.p.lb;
1801                         incr = pr->u.p.st;
1802                         if ( p_st != NULL )
1803                             *p_st = incr;
1804                         if ( p_last != NULL )
1805                             *p_last = last;
1806                         *p_lb = start + init * incr;
1807                         *p_ub = start + limit * incr;
1808                         if ( pr->ordered ) {
1809                             pr->u.p.ordered_lower = init;
1810                             pr->u.p.ordered_upper = limit;
1811                             #ifdef KMP_DEBUG
1812                             {
1813                                 const char * buff;
1814                                 // create format specifiers before the debug output
1815                                 buff = __kmp_str_format(
1816                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1817                                     traits_t< UT >::spec, traits_t< UT >::spec );
1818                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1819                                 __kmp_str_free( &buff );
1820                             }
1821                             #endif
1822                         } // if
1823                     } else {
1824                         *p_lb = 0;
1825                         *p_ub = 0;
1826                         if ( p_st != NULL )
1827                             *p_st = 0;
1828                     } // if
1829                 } // case
1830                 break;
1831 
1832             case kmp_sch_guided_analytical_chunked:
1833                 {
1834                     T   chunkspec = pr->u.p.parm1;
1835                     UT chunkIdx;
1836     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1837                     /* for storing the original FPCW value for Windows* OS on
1838                        IA-32 architecture (8-byte version) */
1839                     unsigned int oldFpcw;
1840                     unsigned int fpcwSet = 0;
1841     #endif
1842                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1843                                    gtid ) );
1844 
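                    /* Two regimes: while the chunk index is below parm2, chunk
                       boundaries follow the analytically derived guided
                       (exponentially shrinking) sequence computed via
                       __kmp_dispatch_guided_remaining(); from parm2 onward the
                       schedule degenerates to plain dynamic chunks of size
                       chunkspec. */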
1845                     trip  = pr->u.p.tc;
1846 
1847                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1848                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1849 
1850                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1851                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1852                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1853                             --trip;
1854                             /* use dynamic-style scheduling */
1855                             init = chunkIdx * chunkspec + pr->u.p.count;
1856                             /* need to verify init > 0 in case of overflow in the above calculation */
1857                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1858                                 limit = init + chunkspec -1;
1859 
1860                                 if ( (last = (limit >= trip)) != 0 )
1861                                     limit = trip;
1862                             }
1863                             break;
1864                         } else {
1865                             /* use exponential-style scheduling */
1866                             /* The following check is to work around the lack of long double precision on Windows* OS.
1867                                This check works around the possible effect that init != 0 for chunkIdx == 0.
1868                              */
1869     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1870                             /* If we haven't already done so, save the original
1871                                FPCW and set precision to 64-bit, as Windows* OS
1872                                on IA-32 architecture defaults to 53-bit */
1873                             if ( !fpcwSet ) {
1874                                 oldFpcw = _control87(0,0);
1875                                 _control87(_PC_64,_MCW_PC);
1876                                 fpcwSet = 0x30000;
1877                             }
1878     #endif
1879                             if ( chunkIdx ) {
1880                                 init = __kmp_dispatch_guided_remaining< T >(
1881                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1882                                 KMP_DEBUG_ASSERT(init);
1883                                 init = trip - init;
1884                             } else
1885                                 init = 0;
1886                             limit = trip - __kmp_dispatch_guided_remaining< T >(
1887                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1888                             KMP_ASSERT(init <= limit);
1889                             if ( init < limit ) {
1890                                 KMP_DEBUG_ASSERT(limit <= trip);
1891                                 --limit;
1892                                 status = 1;
1893                                 break;
1894                             } // if
1895                         } // if
1896                     } // while (1)
1897     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1898                     /* restore FPCW if necessary
1899                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1900                     */
1901                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1902                         _control87(oldFpcw,_MCW_PC);
1903     #endif
1904                     if ( status != 0 ) {
1905                         start = pr->u.p.lb;
1906                         incr = pr->u.p.st;
1907                         if ( p_st != NULL )
1908                             *p_st = incr;
1909                         if ( p_last != NULL )
1910                             *p_last = last;
1911                         *p_lb = start + init * incr;
1912                         *p_ub = start + limit * incr;
1913                         if ( pr->ordered ) {
1914                             pr->u.p.ordered_lower = init;
1915                             pr->u.p.ordered_upper = limit;
1916                             #ifdef KMP_DEBUG
1917                             {
1918                                 const char * buff;
1919                                 // create format specifiers before the debug output
1920                                 buff = __kmp_str_format(
1921                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1922                                     traits_t< UT >::spec, traits_t< UT >::spec );
1923                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1924                                 __kmp_str_free( &buff );
1925                             }
1926                             #endif
1927                         }
1928                     } else {
1929                         *p_lb = 0;
1930                         *p_ub = 0;
1931                         if ( p_st != NULL )
1932                             *p_st = 0;
1933                     }
1934                 } // case
1935                 break;
1936 
1937             case kmp_sch_trapezoidal:
1938                 {
1939                     UT   index;
1940                     T    parm2 = pr->u.p.parm2;
1941                     T    parm3 = pr->u.p.parm3;
1942                     T    parm4 = pr->u.p.parm4;
1943                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1944                                    gtid ) );
1945 
1946                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1947 
1948                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1949                     trip = pr->u.p.tc - 1;
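                    // Trapezoidal chunks shrink linearly: chunk k has size
                    // parm2 - k*parm4, and parm3 is the number of chunks, so the
                    // first iteration of chunk 'index' is the arithmetic-series
                    // sum index*(2*parm2 - (index-1)*parm4)/2 computed above.
                    // E.g. parm2=10, parm4=2 gives chunks of 10, 8, 6, ... with
                    // chunk 2 starting at iteration 18.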
1950 
1951                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1952                         *p_lb = 0;
1953                         *p_ub = 0;
1954                         if ( p_st != 0 ) *p_st = 0;
1955                     } else {
1956                         start = pr->u.p.lb;
1957                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1958                         incr  = pr->u.p.st;
1959 
1960                         if ( (last = (limit >= trip)) != 0 )
1961                             limit = trip;
1962 
1963                         if ( p_last != 0 ) {
1964                             *p_last = last;
1965                         }
1966                         if ( p_st != 0 ) *p_st = incr;
1967 
1968                         if ( incr == 1 ) {
1969                             *p_lb = start + init;
1970                             *p_ub = start + limit;
1971                         } else {
1972                             *p_lb = start + init * incr;
1973                             *p_ub = start + limit * incr;
1974                         }
1975 
1976                         if ( pr->ordered ) {
1977                             pr->u.p.ordered_lower = init;
1978                             pr->u.p.ordered_upper = limit;
1979                             #ifdef KMP_DEBUG
1980                             {
1981                                 const char * buff;
1982                                 // create format specifiers before the debug output
1983                                 buff = __kmp_str_format(
1984                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1985                                     traits_t< UT >::spec, traits_t< UT >::spec );
1986                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1987                                 __kmp_str_free( &buff );
1988                             }
1989                             #endif
1990                         } // if
1991                     } // if
1992                 } // case
1993                 break;
1994             } // switch
1995         } // if tc == 0;
1996 
1997         if ( status == 0 ) {
1998             UT   num_done;
1999 
2000             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2001             #ifdef KMP_DEBUG
2002             {
2003                 const char * buff;
2004                 // create format specifiers before the debug output
2005                 buff = __kmp_str_format(
2006                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2007                     traits_t< UT >::spec );
2008                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2009                 __kmp_str_free( &buff );
2010             }
2011             #endif
2012 
2013             if ( num_done == team->t.t_nproc-1 ) {
2014                 /* NOTE: release this buffer to be reused */
2015 
2016                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2017 
2018                 sh->u.s.num_done = 0;
2019                 sh->u.s.iteration = 0;
2020 
2021                 /* TODO replace with general release procedure? */
2022                 if ( pr->ordered ) {
2023                     sh->u.s.ordered_iteration = 0;
2024                 }
2025 
2026                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2027 
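                // Advance this buffer's index by the size of the buffer pool,
                // releasing it for reuse once the dispatch order wraps around
                // to it again.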
2028                 sh -> buffer_index += KMP_MAX_DISP_BUF;
2029                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2030                                 gtid, sh->buffer_index) );
2031 
2032                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2033 
2034             } // if
2035             if ( __kmp_env_consistency_check ) {
2036                 if ( pr->pushed_ws != ct_none ) {
2037                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2038                 }
2039             }
2040 
2041             th -> th.th_dispatch -> th_deo_fcn = NULL;
2042             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2043             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2044             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2045         } // if (status == 0)
2046 #if KMP_OS_WINDOWS
2047         else if ( last ) {
2048             pr->u.p.last_upper = pr->u.p.ub;
2049         }
2050 #endif /* KMP_OS_WINDOWS */
2051     } // if
2052 
2053     #ifdef KMP_DEBUG
2054     {
2055         const char * buff;
2056         // create format specifiers before the debug output
2057         buff = __kmp_str_format(
2058             "__kmp_dispatch_next: T#%%d normal case: " \
2059             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2060             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2061         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2062         __kmp_str_free( &buff );
2063     }
2064     #endif
2065     return status;
2066 }
2067 
2068 //-----------------------------------------------------------------------------------------
2069 // Dispatch routines
2070 //    Transfer call to template< type T >
2071 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2072 //                         T lb, T ub, ST st, ST chunk )
2073 extern "C" {
2074 
2075 /*!
2076 @ingroup WORK_SHARING
2077 @{
2078 @param loc Source location
2079 @param gtid Global thread id
2080 @param schedule Schedule type
2081 @param lb  Lower bound
2082 @param ub  Upper bound
2083 @param st  Step (or increment if you prefer)
2084 @param chunk The chunk size to block with
2085 
2086 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2087 These functions are all identical apart from the types of the arguments.
2088 */
2089 
2090 void
2091 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2092                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2093 {
2094     KMP_DEBUG_ASSERT( __kmp_init_serial );
2095     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2096 }
2097 /*!
2098 See @ref __kmpc_dispatch_init_4
2099 */
2100 void
2101 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2102                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2103 {
2104     KMP_DEBUG_ASSERT( __kmp_init_serial );
2105     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2106 }
2107 
2108 /*!
2109 See @ref __kmpc_dispatch_init_4
2110 */
2111 void
2112 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2113                         kmp_int64 lb, kmp_int64 ub,
2114                         kmp_int64 st, kmp_int64 chunk )
2115 {
2116     KMP_DEBUG_ASSERT( __kmp_init_serial );
2117     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2118 }
2119 
2120 /*!
2121 See @ref __kmpc_dispatch_init_4
2122 */
2123 void
2124 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2125                          kmp_uint64 lb, kmp_uint64 ub,
2126                          kmp_int64 st, kmp_int64 chunk )
2127 {
2128     KMP_DEBUG_ASSERT( __kmp_init_serial );
2129     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2130 }
2131 
2132 /*!
2133 @param loc Source code location
2134 @param gtid Global thread id
2135 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2136 @param p_lb   Pointer to the lower bound for the next chunk of work
2137 @param p_ub   Pointer to the upper bound for the next chunk of work
2138 @param p_st   Pointer to the stride for the next chunk of work
2139 @return one if there is work to be done, zero otherwise
2140 
2141 Get the next dynamically allocated chunk of work for this thread.
2142 If there is no more work, then the lb, ub, and stride need not be modified.
2143 */
2144 int
2145 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2146                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2147 {
2148     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2149 }
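
/*
   Illustrative sketch only (not part of the runtime): a compiler targeting
   this API would typically lower "#pragma omp for schedule(dynamic)" into an
   init call followed by a claim loop along these lines, where 'body' stands
   for the user's loop body:

       kmp_int32 lb, ub, st, last;
       __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, chunk );
       while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
           for ( kmp_int32 i = lb; i <= ub; i += st )
               body( i );
       }
*/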
2150 
2151 /*!
2152 See @ref __kmpc_dispatch_next_4
2153 */
2154 int
2155 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2156                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2157 {
2158     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2159 }
2160 
2161 /*!
2162 See @ref __kmpc_dispatch_next_4
2163 */
2164 int
2165 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2166                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2167 {
2168     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2169 }
2170 
2171 /*!
2172 See @ref __kmpc_dispatch_next_4
2173 */
2174 int
2175 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2176                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2177 {
2178     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2179 }
2180 
2181 /*!
2182 @param loc Source code location
2183 @param gtid Global thread id
2184 
2185 Mark the end of a dynamic loop.
2186 */
2187 void
2188 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2189 {
2190     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2191 }
2192 
2193 /*!
2194 See @ref __kmpc_dispatch_fini_4
2195 */
2196 void
2197 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2198 {
2199     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2200 }
2201 
2202 /*!
2203 See @ref __kmpc_dispatch_fini_4
2204 */
2205 void
2206 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2207 {
2208     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2209 }
2210 
2211 /*!
2212 See @ref __kmpc_dispatch_fini_4
2213 */
2214 void
2215 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2216 {
2217     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2218 }
2219 /*! @} */
2220 
2221 //-----------------------------------------------------------------------------------------
2222 // Non-template routines from kmp_dispatch.cpp used in other sources
2223 
2224 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2225     return value == checker;
2226 }
2227 
2228 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2229     return value != checker;
2230 }
2231 
2232 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2233     return value < checker;
2234 }
2235 
2236 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2237     return value >= checker;
2238 }
2239 
2240 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2241     return value <= checker;
2242 }
2243 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2244     return value == checker;
2245 }
2246 
2247 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2248     return value != checker;
2249 }
2250 
2251 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2252     return value < checker;
2253 }
2254 
2255 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2256     return value >= checker;
2257 }
2258 
2259 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2260     return value <= checker;
2261 }
2262 
2263 kmp_uint32
2264 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2265                    kmp_uint32            checker,
2266                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2267                    , void        * obj    // Higher-level synchronization object, or NULL.
2268                    )
2269 {
2270     // note: we may not belong to a team at this point
2271     register volatile kmp_uint32         * spin          = spinner;
2272     register          kmp_uint32           check         = checker;
2273     register          kmp_uint32   spins;
2274     register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2275     register          kmp_uint32           r;
2276 
2277     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2278     KMP_INIT_YIELD( spins );
2279     // main wait spin loop
2280     while(!f(r = TCR_4(*spin), check)) {
2281         KMP_FSYNC_SPIN_PREPARE( obj );
2282         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2283            It causes problems with infinite recursion because of exit lock */
2284         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2285             __kmp_abort_thread(); */
2286 
2287         __kmp_static_delay(TRUE);
2288 
2289         /* if we have waited a bit, or are oversubscribed, yield */
2290         /* pause is in the following code */
2291         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2292         KMP_YIELD_SPIN( spins );
2293     }
2294     KMP_FSYNC_SPIN_ACQUIRED( obj );
2295     return r;
2296 }
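
/*
   Illustrative only: a typical wait on a 4-byte location using one of the
   predicates above, spinning (with yields) until *spinner reaches 'target':

       __kmp_wait_yield_4( &flag, target, __kmp_ge_4, NULL );
*/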
2297 
2298 kmp_uint64
2299 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2300                     kmp_uint64            checker,
2301                     kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2302                     , void        * obj    // Higher-level synchronization object, or NULL.
2303                     )
2304 {
2305     // note: we may not belong to a team at this point
2306     register volatile kmp_uint64         * spin          = spinner;
2307     register          kmp_uint64           check         = checker;
2308     register          kmp_uint32   spins;
2309     register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2310     register          kmp_uint64           r;
2311 
2312     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2313     KMP_INIT_YIELD( spins );
2314     // main wait spin loop
2315     while(!f(r = *spin, check))
2316     {
2317         KMP_FSYNC_SPIN_PREPARE( obj );
2318         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2319            It causes problems with infinite recursion because of exit lock */
2320         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2321             __kmp_abort_thread(); */
2322 
2323         __kmp_static_delay(TRUE);
2324 
2325         // if we are oversubscribed,
2326         // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
2327         // the pause is in the following code
2328         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2329         KMP_YIELD_SPIN( spins );
2330     }
2331     KMP_FSYNC_SPIN_ACQUIRED( obj );
2332     return r;
2333 }
2334 
2335 } // extern "C"
2336 
2337 #ifdef KMP_GOMP_COMPAT
2338 
2339 void
2340 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2341                            kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2342                            kmp_int32 chunk, int push_ws )
2343 {
2344     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2345                                       push_ws );
2346 }
2347 
2348 void
2349 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2350                             kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2351                             kmp_int32 chunk, int push_ws )
2352 {
2353     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2354                                        push_ws );
2355 }
2356 
2357 void
2358 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2359                            kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2360                            kmp_int64 chunk, int push_ws )
2361 {
2362     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2363                                       push_ws );
2364 }
2365 
2366 void
2367 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2368                             kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2369                             kmp_int64 chunk, int push_ws )
2370 {
2371     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2372                                        push_ws );
2373 }
2374 
2375 void
2376 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2377 {
2378     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2379 }
2380 
2381 void
2382 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2383 {
2384     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2385 }
2386 
2387 void
2388 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2389 {
2390     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2391 }
2392 
2393 void
2394 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2395 {
2396     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2397 }
2398 
2399 #endif /* KMP_GOMP_COMPAT */
2400 
2401 /* ------------------------------------------------------------------------ */
2402 /* ------------------------------------------------------------------------ */
2403 
2404