1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  * $Revision: 42624 $
4  * $Date: 2013-08-27 10:53:11 -0500 (Tue, 27 Aug 2013) $
5  */
6 
7 
8 //===----------------------------------------------------------------------===//
9 //
10 //                     The LLVM Compiler Infrastructure
11 //
12 // This file is dual licensed under the MIT and the University of Illinois Open
13 // Source Licenses. See LICENSE.txt for details.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 
18 /*
19  * Dynamic scheduling initialization and dispatch.
20  *
 * NOTE: __kmp_nth is a constant within any dispatch loop; however,
 *       it may change between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
24  *
25  */
26 
27 /* ------------------------------------------------------------------------ */
28 /* ------------------------------------------------------------------------ */
29 
30 #include "kmp.h"
31 #include "kmp_i18n.h"
32 #include "kmp_itt.h"
33 #include "kmp_str.h"
34 #include "kmp_error.h"
35 #if KMP_OS_WINDOWS && KMP_ARCH_X86
36     #include <float.h>
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 #ifdef KMP_STATIC_STEAL_ENABLED
43 
44     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
45     template< typename T >
46     struct dispatch_private_infoXX_template {
47         typedef typename traits_t< T >::unsigned_t  UT;
48         typedef typename traits_t< T >::signed_t    ST;
49         UT count;                // unsigned
50         T  ub;
51         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
52         T  lb;
53         ST st;                   // signed
54         UT tc;                   // unsigned
        T  static_steal_counter; // for static_steal only; may be better placed after ub
56 
57         /* parm[1-4] are used in different ways by different scheduling algorithms */
58 
59         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
60         //    a) parm3 is properly aligned and
61         //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // if they are in the same cache line (not measured, though).
64 
65         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
66             T  parm1;
67             T  parm2;
68             T  parm3;
69             T  parm4;
70         };
71 
72         UT ordered_lower; // unsigned
73         UT ordered_upper; // unsigned
74         #if KMP_OS_WINDOWS
75         T  last_upper;
76         #endif /* KMP_OS_WINDOWS */
77     };
78 
79 #else /* KMP_STATIC_STEAL_ENABLED */
80 
81     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
82     template< typename T >
83     struct dispatch_private_infoXX_template {
84         typedef typename traits_t< T >::unsigned_t  UT;
85         typedef typename traits_t< T >::signed_t    ST;
86         T  lb;
87         T  ub;
88         ST st;            // signed
89         UT tc;            // unsigned
90 
91         T  parm1;
92         T  parm2;
93         T  parm3;
94         T  parm4;
95 
96         UT count;         // unsigned
97 
98         UT ordered_lower; // unsigned
99         UT ordered_upper; // unsigned
100         #if KMP_OS_WINDOWS
        T  last_upper;
102         #endif /* KMP_OS_WINDOWS */
103     };
104 
105 #endif /* KMP_STATIC_STEAL_ENABLED */
106 
107 // replaces dispatch_private_info structure and dispatch_private_info_t type
108 template< typename T >
109 struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate the alignment here, otherwise the structure size is not correct in our compiler
111     union KMP_ALIGN_CACHE private_info_tmpl {
112         dispatch_private_infoXX_template< T > p;
113         dispatch_private_info64_t             p64;
114     } u;
115     enum sched_type schedule;  /* scheduling algorithm */
116     kmp_uint32      ordered;   /* ordered clause specified */
117     kmp_uint32      ordered_bumped;
    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // padding to retain the structure size
119     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
120     kmp_uint32      nomerge;   /* don't merge iters if serialized */
121     kmp_uint32      type_size;
122     enum cons_type  pushed_ws;
123 };
124 
125 
126 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
127 template< typename UT >
128 struct dispatch_shared_infoXX_template {
129     /* chunk index under dynamic, number of idle threads under static-steal;
130        iteration index otherwise */
131     volatile UT     iteration;
132     volatile UT     num_done;
133     volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-1]; // padding to retain the structure size after making ordered_iteration scalar
135 };
136 
137 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
138 template< typename UT >
139 struct dispatch_shared_info_template {
140     // we need union here to keep the structure size
141     union shared_info_tmpl {
142         dispatch_shared_infoXX_template< UT >  s;
143         dispatch_shared_info64_t               s64;
144     } u;
145     volatile kmp_uint32     buffer_index;
146 };
147 
148 /* ------------------------------------------------------------------------ */
149 /* ------------------------------------------------------------------------ */
150 
151 static void
152 __kmp_static_delay( int arg )
153 {
154     /* Work around weird code-gen bug that causes assert to trip */
155     #if KMP_ARCH_X86_64 && KMP_OS_LINUX
156     #else
157         KMP_ASSERT( arg >= 0 );
158     #endif
159 }
160 
161 static void
162 __kmp_static_yield( int arg )
163 {
164     __kmp_yield( arg );
165 }
166 
167 #undef USE_TEST_LOCKS
168 
169 // test_then_add template (general template should NOT be used)
170 template< typename T >
171 static __forceinline T
test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }
173 
174 template<>
175 __forceinline kmp_int32
176 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
177 {
178     kmp_int32 r;
179     r = KMP_TEST_THEN_ADD32( p, d );
180     return r;
181 }
182 
183 template<>
184 __forceinline kmp_int64
185 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
186 {
187     kmp_int64 r;
188     r = KMP_TEST_THEN_ADD64( p, d );
189     return r;
190 }
191 
192 // test_then_inc_acq template (general template should NOT be used)
193 template< typename T >
194 static __forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }
196 
197 template<>
198 __forceinline kmp_int32
199 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
200 {
201     kmp_int32 r;
202     r = KMP_TEST_THEN_INC_ACQ32( p );
203     return r;
204 }
205 
206 template<>
207 __forceinline kmp_int64
208 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
209 {
210     kmp_int64 r;
211     r = KMP_TEST_THEN_INC_ACQ64( p );
212     return r;
213 }
214 
215 // test_then_inc template (general template should NOT be used)
216 template< typename T >
217 static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); }
219 
220 template<>
221 __forceinline kmp_int32
222 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
223 {
224     kmp_int32 r;
225     r = KMP_TEST_THEN_INC32( p );
226     return r;
227 }
228 
229 template<>
230 __forceinline kmp_int64
231 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
232 {
233     kmp_int64 r;
234     r = KMP_TEST_THEN_INC64( p );
235     return r;
236 }
237 
238 // compare_and_swap template (general template should NOT be used)
239 template< typename T >
240 static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }
242 
243 template<>
244 __forceinline kmp_int32
245 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
246 {
247     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
248 }
249 
250 template<>
251 __forceinline kmp_int32
252 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
253 {
254     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
255 }
256 
257 /*
258     Spin wait loop that first does pause, then yield.
259     Waits until function returns non-zero when called with *spinner and check.
260     Does NOT put threads to sleep.
261 #if USE_ITT_BUILD
262     Arguments:
        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
            address, not the address of the low-level spinner.
268 #endif // USE_ITT_BUILD
269 */
270 template< typename UT >
// TODO: make inline function (move to header file for icl)
272 static UT  // unsigned 4- or 8-byte type
273 __kmp_wait_yield( volatile UT * spinner,
274                   UT            checker,
275                   kmp_uint32 (* pred)( UT, UT )
276                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
277                   )
278 {
279     // note: we may not belong to a team at this point
280     register volatile UT         * spin          = spinner;
281     register          UT           check         = checker;
282     register          kmp_uint32   spins;
283     register          kmp_uint32 (*f) ( UT, UT ) = pred;
284     register          UT           r;
285 
286     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
287     KMP_INIT_YIELD( spins );
288     // main wait spin loop
289     while(!f(r = *spin, check))
290     {
291         KMP_FSYNC_SPIN_PREPARE( obj );
292         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
293            It causes problems with infinite recursion because of exit lock */
294         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
295             __kmp_abort_thread(); */
296 
297         __kmp_static_delay(TRUE);
298 
        // If we are oversubscribed, or have waited a bit
        // (and KMP_LIBRARY=throughput), then yield.
        // The pause is in the following code.
302         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
303         KMP_YIELD_SPIN( spins );
304     }
305     KMP_FSYNC_SPIN_ACQUIRED( obj );
306     return r;
307 }
308 
309 template< typename UT >
310 static kmp_uint32 __kmp_eq( UT value, UT checker) {
311     return value == checker;
312 }
313 
314 template< typename UT >
315 static kmp_uint32 __kmp_neq( UT value, UT checker) {
316     return value != checker;
317 }
318 
319 template< typename UT >
320 static kmp_uint32 __kmp_lt( UT value, UT checker) {
321     return value < checker;
322 }
323 
324 template< typename UT >
325 static kmp_uint32 __kmp_ge( UT value, UT checker) {
326     return value >= checker;
327 }
328 
329 template< typename UT >
330 static kmp_uint32 __kmp_le( UT value, UT checker) {
331     return value <= checker;
332 }
333 
334 
335 /* ------------------------------------------------------------------------ */
336 /* ------------------------------------------------------------------------ */
337 
338 static void
339 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
340 {
341     kmp_info_t *th;
342 
343     KMP_DEBUG_ASSERT( gtid_ref );
344 
345     if ( __kmp_env_consistency_check ) {
346         th = __kmp_threads[*gtid_ref];
347         if ( th -> th.th_root -> r.r_active
348           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
349             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
350         }
351     }
352 }
353 
354 template< typename UT >
355 static void
356 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
357 {
358     typedef typename traits_t< UT >::signed_t    ST;
359     dispatch_private_info_template< UT > * pr;
360 
361     int gtid = *gtid_ref;
362 //    int  cid = *cid_ref;
363     kmp_info_t *th = __kmp_threads[ gtid ];
364     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
365 
366     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
367     if ( __kmp_env_consistency_check ) {
368         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
369             ( th -> th.th_dispatch -> th_dispatch_pr_current );
370         if ( pr -> pushed_ws != ct_none ) {
371             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
372         }
373     }
374 
375     if ( ! th -> th.th_team -> t.t_serialized ) {
376         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
377             ( th -> th.th_dispatch -> th_dispatch_sh_current );
378         UT  lower;
379 
380         if ( ! __kmp_env_consistency_check ) {
381                 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
382                     ( th -> th.th_dispatch -> th_dispatch_pr_current );
383         }
384         lower = pr->u.p.ordered_lower;
385 
386         #if ! defined( KMP_GOMP_COMPAT )
387             if ( __kmp_env_consistency_check ) {
388                 if ( pr->ordered_bumped ) {
389                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
390                     __kmp_error_construct2(
391                         kmp_i18n_msg_CnsMultipleNesting,
392                         ct_ordered_in_pdo, loc_ref,
393                         & p->stack_data[ p->w_top ]
394                     );
395                 }
396             }
397         #endif /* !defined(KMP_GOMP_COMPAT) */
398 
399         KMP_MB();
400         #ifdef KMP_DEBUG
401         {
402             const char * buff;
403             // create format specifiers before the debug output
404             buff = __kmp_str_format(
405                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
406                 traits_t< UT >::spec, traits_t< UT >::spec );
407             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
408             __kmp_str_free( &buff );
409         }
410         #endif
411 
412         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
413                                 USE_ITT_BUILD_ARG( NULL )
414                                 );
415         KMP_MB();  /* is this necessary? */
416         #ifdef KMP_DEBUG
417         {
418             const char * buff;
419             // create format specifiers before the debug output
420             buff = __kmp_str_format(
421                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
422                 traits_t< UT >::spec, traits_t< UT >::spec );
423             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
424             __kmp_str_free( &buff );
425         }
426         #endif
427     }
428     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
429 }
430 
431 static void
432 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
433 {
434     kmp_info_t *th;
435 
436     if ( __kmp_env_consistency_check ) {
437         th = __kmp_threads[*gtid_ref];
438         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
439             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
440         }
441     }
442 }
443 
444 template< typename UT >
445 static void
446 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
447 {
448     typedef typename traits_t< UT >::signed_t    ST;
449     dispatch_private_info_template< UT > * pr;
450 
451     int gtid = *gtid_ref;
452 //    int  cid = *cid_ref;
453     kmp_info_t *th = __kmp_threads[ gtid ];
454     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
455 
456     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
457     if ( __kmp_env_consistency_check ) {
458         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
459             ( th -> th.th_dispatch -> th_dispatch_pr_current );
460         if ( pr -> pushed_ws != ct_none ) {
461             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
462         }
463     }
464 
465     if ( ! th -> th.th_team -> t.t_serialized ) {
466         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
467             ( th -> th.th_dispatch -> th_dispatch_sh_current );
468 
469         if ( ! __kmp_env_consistency_check ) {
470             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
471                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
472         }
473 
474         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
475         #if ! defined( KMP_GOMP_COMPAT )
476             if ( __kmp_env_consistency_check ) {
477                 if ( pr->ordered_bumped != 0 ) {
478                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
479                     /* How to test it? - OM */
480                     __kmp_error_construct2(
481                         kmp_i18n_msg_CnsMultipleNesting,
482                         ct_ordered_in_pdo, loc_ref,
483                         & p->stack_data[ p->w_top ]
484                     );
485                 }
486             }
487         #endif /* !defined(KMP_GOMP_COMPAT) */
488 
489         KMP_MB();       /* Flush all pending memory write invalidates.  */
490 
491         pr->ordered_bumped += 1;
492 
493         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
494                         gtid, pr->ordered_bumped ) );
495 
496         KMP_MB();       /* Flush all pending memory write invalidates.  */
497 
498         /* TODO use general release procedure? */
499         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
500 
501         KMP_MB();       /* Flush all pending memory write invalidates.  */
502     }
503     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
504 }
505 
/* Computes and returns x to the power of y, where y must be a non-negative integer */
507 template< typename UT >
508 static __forceinline long double
509 __kmp_pow(long double x, UT y) {
510     long double s=1.0L;
511 
512     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
513     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
514     while(y) {
515         if ( y & 1 )
516             s *= x;
517         x *= x;
518         y >>= 1;
519     }
520     return s;
521 }
522 
523 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
524    (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken here: if this function is __forceinline'd,
   the behavior is wrong (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
527 */
528 template< typename T >
529 static __inline typename traits_t< T >::unsigned_t
530 __kmp_dispatch_guided_remaining(
531     T                                  tc,
532     typename traits_t< T >::floating_t base,
533     typename traits_t< T >::unsigned_t idx
534 ) {
535     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
536        least for ICL 8.1, long double arithmetic may not really have
537        long double precision, even with /Qlong_double.  Currently, we
538        workaround that in the caller code, by manipulating the FPCW for
539        Windows* OS on IA-32 architecture.  The lack of precision is not
540        expected to be a correctness issue, though.
541     */
542     typedef typename traits_t< T >::unsigned_t  UT;
543 
544     long double x = tc * __kmp_pow< UT >(base, idx);
545     UT r = (UT) x;
546     if ( x == r )
547         return r;
548     return r + 1;
549 }
550 
551 // Parameters of the guided-iterative algorithm:
552 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
553 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2.  For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
556 static int guided_int_param = 2;
557 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
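
// Worked example (hypothetical values): with n = 2, nproc = 4, chunk = 7:
//   p2 = 2 * 4 * (7 + 1) = 64   // switch to dynamic once at most 64 iterations remain
//   p3 = 0.5 / 4 = 0.125        // each chunk is ~1/8 of the remaining iterations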
558 
559 // UT - unsigned flavor of T, ST - signed flavor of T,
560 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
561 template< typename T >
562 static void
563 __kmp_dispatch_init(
564     ident_t                        * loc,
565     int                              gtid,
566     enum sched_type                  schedule,
567     T                                lb,
568     T                                ub,
569     typename traits_t< T >::signed_t st,
570     typename traits_t< T >::signed_t chunk,
571     int                              push_ws
572 ) {
573     typedef typename traits_t< T >::unsigned_t  UT;
574     typedef typename traits_t< T >::signed_t    ST;
575     typedef typename traits_t< T >::floating_t  DBL;
576     static const int ___kmp_size_type = sizeof( UT );
577 
578     int                                            active;
579     T                                              tc;
580     kmp_info_t *                                   th;
581     kmp_team_t *                                   team;
582     kmp_uint32                                     my_buffer_index;
583     dispatch_private_info_template< T >          * pr;
584     dispatch_shared_info_template< UT > volatile * sh;
585 
586     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
587     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
588 
589     if ( ! TCR_4( __kmp_init_parallel ) )
590         __kmp_parallel_initialize();
591 
592     #ifdef KMP_DEBUG
593     {
594         const char * buff;
595         // create format specifiers before the debug output
596         buff = __kmp_str_format(
597             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
598             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
599         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
600         __kmp_str_free( &buff );
601     }
602     #endif
603     /* setup data */
604     th     = __kmp_threads[ gtid ];
605     team   = th -> th.th_team;
606     active = ! team -> t.t_serialized;
607     th->th.th_ident = loc;
608 
609     if ( ! active ) {
610         pr = reinterpret_cast< dispatch_private_info_template< T >* >
611             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
612     } else {
613         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
614                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
615 
616         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
617 
        /* What happens when the number of threads changes?  Do we need to resize the buffer? */
619         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
620             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
621         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
622             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
623     }
624 
625     /* Pick up the nomerge/ordered bits from the scheduling type */
626     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
627         pr->nomerge = TRUE;
628         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
629     } else {
630         pr->nomerge = FALSE;
631     }
632     pr->type_size = ___kmp_size_type; // remember the size of variables
633     if ( kmp_ord_lower & schedule ) {
634         pr->ordered = TRUE;
635         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
636     } else {
637         pr->ordered = FALSE;
638     }
639     if ( schedule == kmp_sch_static ) {
640         schedule = __kmp_static;
641     } else {
642         if ( schedule == kmp_sch_runtime ) {
643             #if OMP_30_ENABLED
644                 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
645                 schedule = team -> t.t_sched.r_sched_type;
646                 // Detail the schedule if needed (global controls are differentiated appropriately)
647                 if ( schedule == kmp_sch_guided_chunked ) {
648                     schedule = __kmp_guided;
649                 } else if ( schedule == kmp_sch_static ) {
650                     schedule = __kmp_static;
651                 }
652                 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
653                 chunk = team -> t.t_sched.chunk;
654             #else
655                 kmp_r_sched_t r_sched = __kmp_get_schedule_global();
656                 // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default
657                 schedule = r_sched.r_sched_type;
658                 chunk    = r_sched.chunk;
659             #endif
660 
661             #ifdef KMP_DEBUG
662             {
663                 const char * buff;
664                 // create format specifiers before the debug output
665                 buff = __kmp_str_format(
666                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
667                     traits_t< ST >::spec );
668                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
669                 __kmp_str_free( &buff );
670             }
671             #endif
672         } else {
673             if ( schedule == kmp_sch_guided_chunked ) {
674                 schedule = __kmp_guided;
675             }
676             if ( chunk <= 0 ) {
677                 chunk = KMP_DEFAULT_CHUNK;
678             }
679         }
680 
681         #if OMP_30_ENABLED
682         if ( schedule == kmp_sch_auto ) {
683             // mapping and differentiation: in the __kmp_do_serial_initialize()
684             schedule = __kmp_auto;
685             #ifdef KMP_DEBUG
686             {
687                 const char * buff;
688                 // create format specifiers before the debug output
689                 buff = __kmp_str_format(
690                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
691                     traits_t< ST >::spec );
692                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
693                 __kmp_str_free( &buff );
694             }
695             #endif
696         }
697         #endif // OMP_30_ENABLED
698 
699         /* guided analytical not safe for too many threads */
700         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
701             schedule = kmp_sch_guided_iterative_chunked;
702             KMP_WARNING( DispatchManyThreads );
703         }
704         pr->u.p.parm1 = chunk;
705     }
706     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
707                 "unknown scheduling type" );
708 
709     pr->u.p.count = 0;
710 
711     if ( __kmp_env_consistency_check ) {
712         if ( st == 0 ) {
713             __kmp_error_construct(
714                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
715                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
716             );
717         }
718     }
719 
720     tc = ( ub - lb + st );
721     if ( st != 1 ) {
722         if ( st < 0 ) {
723             if ( lb < ub ) {
724                 tc = 0;            // zero-trip
725             } else {   // lb >= ub
726                 tc = (ST)tc / st;  // convert to signed division
727             }
728         } else {       // st > 0
729             if ( ub < lb ) {
730                 tc = 0;            // zero-trip
            } else {   // ub >= lb
732                 tc /= st;
733             }
734         }
735     } else if ( ub < lb ) {        // st == 1
736         tc = 0;                    // zero-trip
737     }
738 
739     pr->u.p.lb = lb;
740     pr->u.p.ub = ub;
741     pr->u.p.st = st;
742     pr->u.p.tc = tc;
743 
744     #if KMP_OS_WINDOWS
745     pr->u.p.last_upper = ub + st;
746     #endif /* KMP_OS_WINDOWS */
747 
    /* NOTE: only the active parallel region(s) have active ordered sections */
749 
750     if ( active ) {
751         if ( pr->ordered == 0 ) {
752             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
753             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
754         } else {
755             pr->ordered_bumped = 0;
756 
757             pr->u.p.ordered_lower = 1;
758             pr->u.p.ordered_upper = 0;
759 
760             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
761             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
762         }
763     }
764 
765     if ( __kmp_env_consistency_check ) {
766         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
767         if ( push_ws ) {
768             __kmp_push_workshare( gtid, ws, loc );
769             pr->pushed_ws = ws;
770         } else {
771             __kmp_check_workshare( gtid, ws, loc );
772             pr->pushed_ws = ct_none;
773         }
774     }
775 
776     switch ( schedule ) {
777     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
778     case kmp_sch_static_steal:
779         {
780             T nproc = team->t.t_nproc;
781             T ntc, init;
782 
783             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
784 
785             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
786             if ( nproc > 1 && ntc >= nproc ) {
787                 T id = __kmp_tid_from_gtid(gtid);
788                 T small_chunk, extras;
789 
790                 small_chunk = ntc / nproc;
791                 extras = ntc % nproc;
792 
793                 init = id * small_chunk + ( id < extras ? id : extras );
794                 pr->u.p.count = init;
795                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
796 
797                 pr->u.p.parm2 = lb;
798                 //pr->pfields.parm3 = 0; // it's not used in static_steal
799                 pr->u.p.parm4 = id;
800                 pr->u.p.st = st;
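
                // Worked example (hypothetical values): ntc = 10 chunks and
                // nproc = 4 give small_chunk = 2, extras = 2, so the initial
                // [count, ub) chunk ranges per thread id are:
                //   id 0: [0,3)   id 1: [3,6)   id 2: [6,8)   id 3: [8,10)
                // Threads that finish early may then steal from these ranges.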
801                 break;
802             } else {
803                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
804                                gtid ) );
805                 schedule = kmp_sch_static_balanced;
806                 /* too few iterations: fall-through to kmp_sch_static_balanced */
807             } // if
808             /* FALL-THROUGH to static balanced */
809         } // case
810     #endif
811     case kmp_sch_static_balanced:
812         {
813             T nproc = team->t.t_nproc;
814             T init, limit;
815 
816             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
817                             gtid ) );
818 
819             if ( nproc > 1 ) {
820                 T id = __kmp_tid_from_gtid(gtid);
821 
822                 if ( tc < nproc ) {
823                     if ( id < tc ) {
824                         init = id;
825                         limit = id;
826                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
827                     } else {
828                         pr->u.p.count = 1;  /* means no more chunks to execute */
829                         pr->u.p.parm1 = FALSE;
830                         break;
831                     }
832                 } else {
833                     T small_chunk = tc / nproc;
834                     T extras = tc % nproc;
835                     init = id * small_chunk + (id < extras ? id : extras);
836                     limit = init + small_chunk - (id < extras ? 0 : 1);
837                     pr->u.p.parm1 = (id == nproc - 1);
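
                    // Worked example (hypothetical values): tc = 10 and nproc = 4
                    // give small_chunk = 2, extras = 2, i.e. inclusive ranges
                    //   id 0: [0,2]   id 1: [3,5]   id 2: [6,7]   id 3: [8,9]
                    // with parm1 (the lastprivate flag) set only for id 3.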
838                 }
839             } else {
840                 if ( tc > 0 ) {
841                     init = 0;
842                     limit = tc - 1;
843                     pr->u.p.parm1 = TRUE;
844                 } else {
845                     // zero trip count
846                     pr->u.p.count = 1;  /* means no more chunks to execute */
847                     pr->u.p.parm1 = FALSE;
848                     break;
849                 }
850             }
851             if ( st == 1 ) {
852                 pr->u.p.lb = lb + init;
853                 pr->u.p.ub = lb + limit;
854             } else {
855                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
856                 pr->u.p.lb = lb + init * st;
857                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
858                 if ( st > 0 ) {
859                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
860                 } else {
861                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
862                 }
863             }
864             if ( pr->ordered ) {
865                 pr->u.p.ordered_lower = init;
866                 pr->u.p.ordered_upper = limit;
867             }
868             break;
869         } // case
870     case kmp_sch_guided_iterative_chunked :
871         {
872             T nproc = team->t.t_nproc;
873             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
874 
875             if ( nproc > 1 ) {
876                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
877                     /* chunk size too large, switch to dynamic */
878                     schedule = kmp_sch_dynamic_chunked;
879                 } else {
                    // when the remaining iterations drop below parm2, switch to dynamic
881                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
882                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
883                 }
884             } else {
885                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
886                 schedule = kmp_sch_static_greedy;
887                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
888                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
889                 pr->u.p.parm1 = tc;
890             } // if
891         } // case
892         break;
893     case kmp_sch_guided_analytical_chunked:
894         {
895             T nproc = team->t.t_nproc;
896             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
897 
898             if ( nproc > 1 ) {
899                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
900                     /* chunk size too large, switch to dynamic */
901                     schedule = kmp_sch_dynamic_chunked;
902                 } else {
903                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
904                     DBL x;
905 
906                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
917                     // save original FPCW and set precision to 64-bit, as
918                     // Windows* OS on IA-32 architecture defaults to 53-bit
919                     unsigned int oldFpcw = _control87(0,0x30000);
920                     #endif
921                     /* value used for comparison in solver for cross-over point */
922                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
923 
                    /* crossover point--chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
926                     UT   cross;
927 
928                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
929                     x = (long double)1.0 - (long double)0.5 / nproc;
930 
931                     #ifdef KMP_DEBUG
932                     { // test natural alignment
933                         struct _test_a {
934                             char a;
935                             union {
936                                 char b;
937                                 DBL  d;
938                             };
939                         } t;
940                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
941                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
942                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
943                     }
944                     #endif // KMP_DEBUG
945 
946                     /* save the term in thread private dispatch structure */
947                     *(DBL*)&pr->u.p.parm3 = x;
948 
949                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
950                     {
951                         UT          left, right, mid;
952                         long double p;
953 
954                         /* estimate initial upper and lower bound */
955 
                        /* It doesn't matter what value 'right' starts at as long as
                           it is positive, but it affects performance of the solver.
                        */
959                         right = 229;
960                         p = __kmp_pow< UT >(x,right);
961                         if ( p > target ) {
962                             do{
963                                 p *= p;
964                                 right <<= 1;
965                             } while(p>target && right < (1<<27));
966                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
967                         } else {
968                             left = 0;
969                         }
970 
971                         /* bisection root-finding method */
972                         while ( left + 1 < right ) {
973                             mid = (left + right) / 2;
974                             if ( __kmp_pow< UT >(x,mid) > target ) {
975                                 left = mid;
976                             } else {
977                                 right = mid;
978                             }
979                         } // while
980                         cross = right;
981                     }
982                     /* assert sanity of computed crossover point */
983                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
984 
985                     /* save the crossover point in thread private dispatch structure */
986                     pr->u.p.parm2 = cross;
987 
988                     // C75803
989                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
990                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
991                     #else
992                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
993                     #endif
994                     /* dynamic-style scheduling offset */
995                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
996                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
997                         // restore FPCW
998                         _control87(oldFpcw,0x30000);
999                     #endif
1000                 } // if
1001             } else {
1002                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1003                                gtid ) );
1004                 schedule = kmp_sch_static_greedy;
1005                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1006                 pr->u.p.parm1 = tc;
1007             } // if
1008         } // case
1009         break;
1010     case kmp_sch_static_greedy:
1011         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1012             pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1013                 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1014                 tc;
1015         break;
1016     case kmp_sch_static_chunked :
1017     case kmp_sch_dynamic_chunked :
1018         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1019         break;
1020     case kmp_sch_trapezoidal :
1021         {
1022             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1023 
1024             T parm1, parm2, parm3, parm4;
1025             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1026 
1027             parm1 = chunk;
1028 
1029             /* F : size of the first cycle */
1030             parm2 = ( tc / (2 * team->t.t_nproc) );
1031 
1032             if ( parm2 < 1 ) {
1033                 parm2 = 1;
1034             }
1035 
1036             /* L : size of the last cycle.  Make sure the last cycle
1037              *     is not larger than the first cycle.
1038              */
1039             if ( parm1 < 1 ) {
1040                 parm1 = 1;
1041             } else if ( parm1 > parm2 ) {
1042                 parm1 = parm2;
1043             }
1044 
1045             /* N : number of cycles */
1046             parm3 = ( parm2 + parm1 );
1047             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1048 
1049             if ( parm3 < 2 ) {
1050                 parm3 = 2;
1051             }
1052 
1053             /* sigma : decreasing incr of the trapezoid */
1054             parm4 = ( parm3 - 1 );
1055             parm4 = ( parm2 - parm1 ) / parm4;
1056 
1057             // pointless check, because parm4 >= 0 always
1058             //if ( parm4 < 0 ) {
1059             //    parm4 = 0;
1060             //}
1061 
1062             pr->u.p.parm1 = parm1;
1063             pr->u.p.parm2 = parm2;
1064             pr->u.p.parm3 = parm3;
1065             pr->u.p.parm4 = parm4;
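
            // Worked trapezoid example (hypothetical values): tc = 1000, nproc = 4,
            // chunk = 1 give parm2 (first cycle) = 1000/8 = 125, parm1 (last cycle) = 1,
            // parm3 (number of cycles) = (2000 + 126 - 1)/126 = 16, and
            // parm4 (decrement) = (125 - 1)/15 = 8: chunk sizes 125, 117, 109, ...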
1066         } // case
1067         break;
1068 
1069     default:
1070         {
1071             __kmp_msg(
1072                 kmp_ms_fatal,                        // Severity
1073                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1074                 KMP_HNT( GetNewerLibrary ),          // Hint
1075                 __kmp_msg_null                       // Variadic argument list terminator
1076             );
1077         }
1078         break;
1079     } // switch
1080     pr->schedule = schedule;
1081     if ( active ) {
        /* This buffer becomes free to use once sh->buffer_index reaches my_buffer_index */
1083 
1084         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1085                         gtid, my_buffer_index, sh->buffer_index) );
1086         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1087                                         USE_ITT_BUILD_ARG( NULL )
1088                                         );
        // Note: KMP_WAIT_YIELD() cannot be used here: buffer index and my_buffer_index are
        // *always* 32-bit integers.
1091         KMP_MB();  /* is this necessary? */
1092         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1093                         gtid, my_buffer_index, sh->buffer_index) );
1094 
1095         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1096         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1097 #if USE_ITT_BUILD
1098         if ( pr->ordered ) {
1099             __kmp_itt_ordered_init( gtid );
1100         }; // if
1101 #endif /* USE_ITT_BUILD */
1102     }; // if
1103     #ifdef KMP_DEBUG
1104     {
1105         const char * buff;
1106         // create format specifiers before the debug output
1107         buff = __kmp_str_format(
1108             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1109             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1110             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1111             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1112             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1113             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1114             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1115         KD_TRACE(10, ( buff,
1116             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1117             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1118             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1119             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1120         __kmp_str_free( &buff );
1121     }
1122     #endif
1123     #if ( KMP_STATIC_STEAL_ENABLED )
1124     if ( ___kmp_size_type < 8 ) {
      // It cannot be guaranteed that after execution of a loop with some other schedule kind
      // all the parm3 variables will contain the same value.
      // Even if all parm3 did contain the same value, there would still be a bad case, such as
      // toggling between 0 and 1 rather than incrementing over the program's lifetime.
      // So a dedicated variable is required; 'static_steal_counter' is used.
1130       if( schedule == kmp_sch_static_steal ) {
1131         // Other threads will inspect this variable when searching for a victim.
1132         // This is a flag showing that other threads may steal from this thread since then.
1133         volatile T * p = &pr->u.p.static_steal_counter;
1134         *p = *p + 1;
1135       }
1136     }
    #endif // ( KMP_STATIC_STEAL_ENABLED )
1138 }
1139 
1140 /*
1141  * For ordered loops, either __kmp_dispatch_finish() should be called after
1142  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1143  * every chunk of iterations.  If the ordered section(s) were not executed
1144  * for this iteration (or every iteration in this chunk), we need to set the
1145  * ordered iteration counters so that the next thread can proceed.
1146  */
1147 template< typename UT >
1148 static void
1149 __kmp_dispatch_finish( int gtid, ident_t *loc )
1150 {
1151     typedef typename traits_t< UT >::signed_t ST;
1152     kmp_info_t *th = __kmp_threads[ gtid ];
1153 
1154     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1155     if ( ! th -> th.th_team -> t.t_serialized ) {
1156 
1157         dispatch_private_info_template< UT > * pr =
1158             reinterpret_cast< dispatch_private_info_template< UT >* >
1159             ( th->th.th_dispatch->th_dispatch_pr_current );
1160         dispatch_shared_info_template< UT > volatile * sh =
1161             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1162             ( th->th.th_dispatch->th_dispatch_sh_current );
1163         KMP_DEBUG_ASSERT( pr );
1164         KMP_DEBUG_ASSERT( sh );
1165         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1166                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1167 
1168         if ( pr->ordered_bumped ) {
1169             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1170                             gtid ) );
1171             pr->ordered_bumped = 0;
1172         } else {
1173             UT lower = pr->u.p.ordered_lower;
1174 
1175             #ifdef KMP_DEBUG
1176             {
1177                 const char * buff;
1178                 // create format specifiers before the debug output
1179                 buff = __kmp_str_format(
1180                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1181                     traits_t< UT >::spec, traits_t< UT >::spec );
1182                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1183                 __kmp_str_free( &buff );
1184             }
1185             #endif
1186 
1187             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1188                                    USE_ITT_BUILD_ARG(NULL)
1189                                    );
1190             KMP_MB();  /* is this necessary? */
1191             #ifdef KMP_DEBUG
1192             {
1193                 const char * buff;
1194                 // create format specifiers before the debug output
1195                 buff = __kmp_str_format(
1196                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1197                     traits_t< UT >::spec, traits_t< UT >::spec );
1198                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1199                 __kmp_str_free( &buff );
1200             }
1201             #endif
1202 
1203             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1204         } // if
1205     } // if
1206     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1207 }
1208 
1209 #ifdef KMP_GOMP_COMPAT
1210 
1211 template< typename UT >
1212 static void
1213 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1214 {
1215     typedef typename traits_t< UT >::signed_t ST;
1216     kmp_info_t *th = __kmp_threads[ gtid ];
1217 
1218     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1219     if ( ! th -> th.th_team -> t.t_serialized ) {
1220 //        int cid;
1221         dispatch_private_info_template< UT > * pr =
1222             reinterpret_cast< dispatch_private_info_template< UT >* >
1223             ( th->th.th_dispatch->th_dispatch_pr_current );
1224         dispatch_shared_info_template< UT > volatile * sh =
1225             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1226             ( th->th.th_dispatch->th_dispatch_sh_current );
1227         KMP_DEBUG_ASSERT( pr );
1228         KMP_DEBUG_ASSERT( sh );
1229         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1230                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1231 
1232 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1233             UT lower = pr->u.p.ordered_lower;
1234             UT upper = pr->u.p.ordered_upper;
1235             UT inc = upper - lower + 1;
1236 
1237             if ( pr->ordered_bumped == inc ) {
1238                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1239                   gtid ) );
1240                 pr->ordered_bumped = 0;
1241             } else {
1242                 inc -= pr->ordered_bumped;
1243 
1244                 #ifdef KMP_DEBUG
1245                 {
1246                     const char * buff;
1247                     // create format specifiers before the debug output
1248                     buff = __kmp_str_format(
1249                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1250                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1251                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1252                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1253                     __kmp_str_free( &buff );
1254                 }
1255                 #endif
1256 
1257                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1258                                        USE_ITT_BUILD_ARG(NULL)
1259                                        );
1260 
1261                 KMP_MB();  /* is this necessary? */
1262                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1263                   gtid ) );
1264                 pr->ordered_bumped = 0;
                // TODO: check whether inc should be unsigned or signed
1266                 #ifdef KMP_DEBUG
1267                 {
1268                     const char * buff;
1269                     // create format specifiers before the debug output
1270                     buff = __kmp_str_format(
1271                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1272                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1273                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1274                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1275                     __kmp_str_free( &buff );
1276                 }
1277                 #endif
1278 
1279                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1280             }
1281 //        }
1282     }
1283     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1284 }
1285 
1286 #endif /* KMP_GOMP_COMPAT */
1287 
1288 template< typename T >
1289 static int
1290 __kmp_dispatch_next(
1291     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1292 ) {
1293 
1294     typedef typename traits_t< T >::unsigned_t  UT;
1295     typedef typename traits_t< T >::signed_t    ST;
1296     typedef typename traits_t< T >::floating_t  DBL;
1297     static const int ___kmp_size_type = sizeof( UT );
1298 
1299     int                                   status;
1300     dispatch_private_info_template< T > * pr;
1301     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1302     kmp_team_t                          * team = th -> th.th_team;
1303 
1304     #ifdef KMP_DEBUG
1305     {
1306         const char * buff;
1307         // create format specifiers before the debug output
1308         buff = __kmp_str_format(
1309             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1310             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1311         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1312         __kmp_str_free( &buff );
1313     }
1314     #endif
1315 
1316     if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1318         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1319             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1320         KMP_DEBUG_ASSERT( pr );
1321 
1322         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1323             *p_lb = 0;
1324             *p_ub = 0;
1325             if ( p_st != 0 ) {
1326                 *p_st = 0;
1327             }
1328             if ( __kmp_env_consistency_check ) {
1329                 if ( pr->pushed_ws != ct_none ) {
1330                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1331                 }
1332             }
1333         } else if ( pr->nomerge ) {
1334             kmp_int32 last;
1335             T         start;
1336             UT        limit, trip, init;
1337             ST        incr;
1338             T         chunk = pr->u.p.parm1;
1339 
1340             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1341 
1342             init = chunk * pr->u.p.count++;
1343             trip = pr->u.p.tc - 1;
1344 
1345             if ( (status = (init <= trip)) == 0 ) {
1346                 *p_lb = 0;
1347                 *p_ub = 0;
1348                 if ( p_st != 0 ) *p_st = 0;
1349                 if ( __kmp_env_consistency_check ) {
1350                     if ( pr->pushed_ws != ct_none ) {
1351                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1352                     }
1353                 }
1354             } else {
1355                 start = pr->u.p.lb;
1356                 limit = chunk + init - 1;
1357                 incr  = pr->u.p.st;
1358 
1359                 if ( (last = (limit >= trip)) != 0 ) {
1360                     limit = trip;
1361                     #if KMP_OS_WINDOWS
1362                     pr->u.p.last_upper = pr->u.p.ub;
1363                     #endif /* KMP_OS_WINDOWS */
1364                 }
1365                 if ( p_last ) {
1366                     *p_last = last;
1367                 }
1368                 if ( p_st != 0 ) {
1369                     *p_st = incr;
1370                 }
1371                 if ( incr == 1 ) {
1372                     *p_lb = start + init;
1373                     *p_ub = start + limit;
1374                 } else {
1375                     *p_lb = start + init * incr;
1376                     *p_ub = start + limit * incr;
1377                 }
1378 
1379                 if ( pr->ordered ) {
1380                     pr->u.p.ordered_lower = init;
1381                     pr->u.p.ordered_upper = limit;
1382                     #ifdef KMP_DEBUG
1383                     {
1384                         const char * buff;
1385                         // create format specifiers before the debug output
1386                         buff = __kmp_str_format(
1387                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1388                             traits_t< UT >::spec, traits_t< UT >::spec );
1389                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1390                         __kmp_str_free( &buff );
1391                     }
1392                     #endif
1393                 } // if
1394             } // if
1395         } else {
1396             pr->u.p.tc = 0;
1397 
1398             *p_lb = pr->u.p.lb;
1399             *p_ub = pr->u.p.ub;
1400             #if KMP_OS_WINDOWS
1401             pr->u.p.last_upper = *p_ub;
1402             #endif /* KMP_OS_WINDOWS */
1403 
1404             if ( p_st != 0 ) {
1405                 *p_st = pr->u.p.st;
1406             }
1407             if ( p_last ) {
1408                 *p_last = TRUE;
1409             }
1410         } // if
1411         #ifdef KMP_DEBUG
1412         {
1413             const char * buff;
1414             // create format specifiers before the debug output
1415             buff = __kmp_str_format(
1416                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1417                 "p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
1418                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1419             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, status) );
1420             __kmp_str_free( &buff );
1421         }
1422         #endif
1423         return status;
1424     } else {
1425         kmp_int32 last = 0;
1426         dispatch_shared_info_template< UT > *sh;
1427         T         start;
1428         ST        incr;
1429         UT        limit, trip, init;
1430 
1431         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1432                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1433 
1434         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1435             ( th->th.th_dispatch->th_dispatch_pr_current );
1436         KMP_DEBUG_ASSERT( pr );
1437         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1438             ( th->th.th_dispatch->th_dispatch_sh_current );
1439         KMP_DEBUG_ASSERT( sh );
1440 
1441         if ( pr->u.p.tc == 0 ) {
1442             // zero trip count
1443             status = 0;
1444         } else {
1445             switch (pr->schedule) {
1446             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1447             case kmp_sch_static_steal:
1448                 {
1449                     T chunk = pr->u.p.parm1;
1450 
1451                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1452 
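                    // Each thread owns a private [count, ub) range of chunk
                    // indexes.  It first drains its own range; once that is
                    // empty it tries to steal the tail of another thread's
                    // range.  For 4-byte index types, {count, ub} are packed
                    // into one 64-bit word so both fields can be read and
                    // updated together with a single atomic compare-and-store.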
                    trip = pr->u.p.tc - 1;

                    if ( ___kmp_size_type > 4 ) {
                        // Other threads do not look into the data of this thread,
                        // so no volatile cast is needed.
                        init   = ( pr->u.p.count )++;
                        status = ( init < (UT)pr->u.p.ub );
                    } else {
                        typedef union {
                            struct {
                                UT count;
                                T  ub;
                            } p;
                            kmp_int64 b;
                        } union_i4;
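                        // With T = kmp_int32 the union is 8 bytes: p.count and
                        // p.ub alias the same storage as the kmp_int64 b, so
                        // vnew.b == vold.b exactly when neither field changed.
                        // That is what allows one KMP_COMPARE_AND_STORE_ACQ64
                        // to update both fields atomically.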
                        // All operations on 'count' or 'ub' must be combined atomically together.
                        // stealing implemented only for 4-byte indexes
                        {
                            union_i4 vold, vnew;
                            vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
                            vnew = vold;
                            vnew.p.count++;
                            while( ! KMP_COMPARE_AND_STORE_ACQ64(
                                        ( volatile kmp_int64* )&pr->u.p.count,
                                        *VOLATILE_CAST(kmp_int64 *)&vold.b,
                                        *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
                                KMP_CPU_PAUSE();
                                vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
                                vnew = vold;
                                vnew.p.count++;
                            }
                            vnew = vold;
                            init   = vnew.p.count;
                            status = ( init < (UT)vnew.p.ub ) ;
                        }

                        if( !status ) {
                            kmp_info_t   **other_threads = team->t.t_threads;
                            int          while_limit = 10;
                            int          while_index = 0;

                            // TODO: algorithm of searching for a victim
                            // should be cleaned up and measured
                            while ( ( !status ) && ( while_limit != ++while_index ) ) {
                                union_i4  vold, vnew;
                                kmp_int32 remaining; // kmp_int32 because KMP_I4 only
                                T         victimIdx    = pr->u.p.parm4;
                                T         oldVictimIdx = victimIdx;
                                dispatch_private_info_template< T > * victim;

                                do {
                                    if( !victimIdx ) {
                                        victimIdx = team->t.t_nproc - 1;
                                    } else {
                                        --victimIdx;
                                    }
                                    victim = reinterpret_cast< dispatch_private_info_template< T >* >
                                        ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
                                } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
                                // TODO: think about a proper place of this test
                                if ( ( !victim ) ||
                                   ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
                                     (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
                                    // the victim is not ready yet to participate in stealing
                                    // because the victim is still in kmp_init_dispatch
                                    // TODO: delay would be nice
                                    continue;
                                }
                                if ( oldVictimIdx == victimIdx ) {
                                    break;
                                }
                                pr->u.p.parm4 = victimIdx;

                                while( 1 ) {
                                    vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
                                    vnew = vold;

                                    KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
                                    if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
                                        break;
                                    }
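                                    // Steal a quarter of the victim's remaining
                                    // chunk indexes (rounded down) by lowering
                                    // its upper bound; the victim keeps the rest.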
                                    vnew.p.ub -= (remaining >> 2);
                                    KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
                                    #pragma warning( push )
                                    // disable warning on pointless comparison of unsigned with 0
                                    #pragma warning( disable: 186 )
                                        KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
                                    #pragma warning( pop )
                                    // TODO: Should this be acquire or release?
                                    if ( KMP_COMPARE_AND_STORE_ACQ64(
                                            ( volatile kmp_int64 * )&victim->u.p.count,
                                            *VOLATILE_CAST(kmp_int64 *)&vold.b,
                                            *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
                                        status = 1;
                                        while_index = 0;
                                        // now update own count and ub
                                        #if KMP_ARCH_X86
                                            // stealing executed on non-KMP_ARCH_X86 only
                                            // (this branch is compiled out by the
                                            // KMP_ARCH_X86_64 guard on this case).
                                            // Atomic 64-bit write on ia32 is
                                            // unavailable, so we do this in steps.
                                            //     This code is not tested.
                                            init = vold.p.count;
                                            pr->u.p.ub = 0;
                                            pr->u.p.count = init + 1;
                                            pr->u.p.ub = vnew.p.count;
                                        #else
                                            init = vnew.p.ub;
                                            vold.p.count = init + 1;
                                            // TODO: is it safe and enough?
                                            *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
                                        #endif // KMP_ARCH_X86
                                        break;
                                    } // if
                                    KMP_CPU_PAUSE();
                                } // while (1)
                            } // while
                        } // if
                    } // if
                    if ( !status ) {
                        *p_lb = 0;
                        *p_ub = 0;
                        if ( p_st != 0 ) *p_st = 0;
                    } else {
                        start = pr->u.p.parm2;
                        init *= chunk;
                        limit = chunk + init - 1;
                        incr  = pr->u.p.st;

                        KMP_DEBUG_ASSERT(init <= trip);
                        if ( (last = (limit >= trip)) != 0 )
                            limit = trip;
                        if ( p_last ) {
                            *p_last = last;
                        }
                        if ( p_st != 0 ) *p_st = incr;

                        if ( incr == 1 ) {
                            *p_lb = start + init;
                            *p_ub = start + limit;
                        } else {
                            *p_lb = start + init * incr;
                            *p_ub = start + limit * incr;
                        }

                        if ( pr->ordered ) {
                            pr->u.p.ordered_lower = init;
                            pr->u.p.ordered_upper = limit;
                            #ifdef KMP_DEBUG
                            {
                                const char * buff;
                                // create format specifiers before the debug output
                                buff = __kmp_str_format(
                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t< UT >::spec, traits_t< UT >::spec );
                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                                __kmp_str_free( &buff );
                            }
                            #endif
                        } // if
                    } // if
                    break;
                } // case
            #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
            case kmp_sch_static_balanced:
                {
                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
                    if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
                        pr->u.p.count = 1;
                        *p_lb = pr->u.p.lb;
                        *p_ub = pr->u.p.ub;
                        last = pr->u.p.parm1;
                        if ( p_last ) {
                            *p_last = last;
                        }
                        if ( p_st )
                            *p_st = pr->u.p.st;
                    } else {  /* no iterations to do */
                        pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
                    }
                    if ( pr->ordered ) {
                        #ifdef KMP_DEBUG
                        {
                            const char * buff;
                            // create format specifiers before the debug output
                            buff = __kmp_str_format(
                                "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                traits_t< UT >::spec, traits_t< UT >::spec );
                            KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                            __kmp_str_free( &buff );
                        }
                        #endif
                    } // if
                } // case
                break;
            case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
            case kmp_sch_static_chunked:
                {
                    T parm1;

                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
                                   gtid ) );
                    parm1 = pr->u.p.parm1;

                    trip  = pr->u.p.tc - 1;
                    init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));

                    if ( (status = (init <= trip)) != 0 ) {
                        start = pr->u.p.lb;
                        incr  = pr->u.p.st;
                        limit = parm1 + init - 1;

                        if ( (last = (limit >= trip)) != 0 )
                            limit = trip;

                        if ( p_last ) {
                            *p_last = last;
                        }
                        if ( p_st != 0 ) *p_st = incr;

                        pr->u.p.count += team->t.t_nproc;

                        if ( incr == 1 ) {
                            *p_lb = start + init;
                            *p_ub = start + limit;
                        }
                        else {
                            *p_lb = start + init * incr;
                            *p_ub = start + limit * incr;
                        }

                        if ( pr->ordered ) {
                            pr->u.p.ordered_lower = init;
                            pr->u.p.ordered_upper = limit;
                            #ifdef KMP_DEBUG
                            {
                                const char * buff;
                                // create format specifiers before the debug output
                                buff = __kmp_str_format(
                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t< UT >::spec, traits_t< UT >::spec );
                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                                __kmp_str_free( &buff );
                            }
                            #endif
                        } // if
                    } // if
                } // case
                break;

            case kmp_sch_dynamic_chunked:
                {
                    T chunk = pr->u.p.parm1;

                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                                   gtid ) );

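                    // Chunk indexes are handed out by an atomic
                    // fetch-and-increment of the team-shared iteration
                    // counter; e.g. with chunk == 4 the thread that draws
                    // index 3 gets iterations [12, 15] of the normalized
                    // (0-based, step-1) space.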
                    init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                    trip = pr->u.p.tc - 1;

                    if ( (status = (init <= trip)) == 0 ) {
                        *p_lb = 0;
                        *p_ub = 0;
                        if ( p_st != 0 ) *p_st = 0;
                    } else {
                        start = pr->u.p.lb;
                        limit = chunk + init - 1;
                        incr  = pr->u.p.st;

                        if ( (last = (limit >= trip)) != 0 )
                            limit = trip;
                        if ( p_last ) {
                            *p_last = last;
                        }
                        if ( p_st != 0 ) *p_st = incr;

                        if ( incr == 1 ) {
                            *p_lb = start + init;
                            *p_ub = start + limit;
                        } else {
                            *p_lb = start + init * incr;
                            *p_ub = start + limit * incr;
                        }

                        if ( pr->ordered ) {
                            pr->u.p.ordered_lower = init;
                            pr->u.p.ordered_upper = limit;
                            #ifdef KMP_DEBUG
                            {
                                const char * buff;
                                // create format specifiers before the debug output
                                buff = __kmp_str_format(
                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t< UT >::spec, traits_t< UT >::spec );
                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                                __kmp_str_free( &buff );
                            }
                            #endif
                        } // if
                    } // if
                } // case
                break;

            case kmp_sch_guided_iterative_chunked:
                {
                    T  chunkspec = pr->u.p.parm1;
                    KD_TRACE(100,
                        ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
                    trip  = pr->u.p.tc;
                    // Start atomic part of calculations
                    while(1) {
                        ST  remaining;             // signed, because can be < 0
                        init = sh->u.s.iteration;  // shared value
                        remaining = trip - init;
                        if ( remaining <= 0 ) {    // AC: need to compare with 0 first
                            // nothing to do, don't try atomic op
                            status = 0;
                            break;
                        }
                        if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                            // use dynamic-style schedule
                            // atomically increment iterations, get old value
                            init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
                            remaining = trip - init;
                            if (remaining <= 0) {
                                status = 0;    // all iterations got by other threads
                            } else {
                                // got some iterations to work on
                                status = 1;
                                if ( (T)remaining > chunkspec ) {
                                    limit = init + chunkspec - 1;
                                } else {
                                    last = 1;   // the last chunk
                                    limit = init + remaining - 1;
                                } // if
                            } // if
                            break;
                        } // if
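                        // Take a fixed fraction of the remaining iterations:
                        // parm3 holds a double of roughly 1/(K*nproc), with
                        // K = 2 by default, set up in __kmp_dispatch_init.
                        // E.g. with nproc = 4 and 1000 iterations remaining,
                        // the chunk claimed below is about 1000/8 = 125
                        // iterations.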
                        limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
                        if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
                            // CAS was successful, chunk obtained
                            status = 1;
                            --limit;
                            break;
                        } // if
                    } // while
                    if ( status != 0 ) {
                        start = pr->u.p.lb;
                        incr = pr->u.p.st;
                        if ( p_st != NULL )
                            *p_st = incr;
                        if ( p_last != NULL )
                            *p_last = last;
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                        if ( pr->ordered ) {
                            pr->u.p.ordered_lower = init;
                            pr->u.p.ordered_upper = limit;
                            #ifdef KMP_DEBUG
                            {
                                const char * buff;
                                // create format specifiers before the debug output
                                buff = __kmp_str_format(
                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t< UT >::spec, traits_t< UT >::spec );
                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                                __kmp_str_free( &buff );
                            }
                            #endif
                        } // if
                    } else {
                        *p_lb = 0;
                        *p_ub = 0;
                        if ( p_st != NULL )
                            *p_st = 0;
                    } // if
                } // case
                break;

            case kmp_sch_guided_analytical_chunked:
                {
                    T   chunkspec = pr->u.p.parm1;
                    UT chunkIdx;
    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* for storing original FPCW value for Windows* OS on
                       IA-32 architecture 8-byte version */
                    unsigned int oldFpcw;
                    int fpcwSet = 0;
    #endif
                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
                                   gtid ) );

                    trip  = pr->u.p.tc;

                    KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
                    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);

                    while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
                        chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
                        if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
                            --trip;
                            /* use dynamic-style scheduling */
                            init = chunkIdx * chunkspec + pr->u.p.count;
                            /* need to verify init > 0 in case of overflow in the above calculation */
                            if ( (status = (init > 0 && init <= trip)) != 0 ) {
                                limit = init + chunkspec - 1;

                                if ( (last = (limit >= trip)) != 0 )
                                    limit = trip;
                            }
                            break;
                        } else {
                            /* The following check is to work around the lack of long double precision on Windows* OS.
                               It works around the possible effect that init != 0 for chunkIdx == 0.
                             */
    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                            /* If we haven't already done so, save original
                               FPCW and set precision to 64-bit, as Windows* OS
                               on IA-32 architecture defaults to 53-bit */
                            if ( !fpcwSet ) {
                                oldFpcw = _control87(0,0x30000);
                                fpcwSet = 0x30000;
                            }
    #endif
                            /* use exponential-style scheduling */
                            if ( chunkIdx ) {
                                init = __kmp_dispatch_guided_remaining< T >(
                                           trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
                                KMP_DEBUG_ASSERT(init);
                                init = trip - init;
                            } else
                                init = 0;
                            limit = trip - __kmp_dispatch_guided_remaining< T >(
                                               trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
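                            // __kmp_dispatch_guided_remaining(trip, base, k)
                            // returns how many iterations remain before chunk
                            // index k, so chunk k covers the half-open range
                            // [trip - remaining(k), trip - remaining(k+1)) of
                            // the normalized iteration space.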
                            KMP_ASSERT(init <= limit);
                            if ( init < limit ) {
                                KMP_DEBUG_ASSERT(limit <= trip);
                                --limit;
                                status = 1;
                                break;
                            } // if
                        } // if
                    } // while (1)
    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* restore FPCW if necessary; test fpcwSet first since
                       oldFpcw is uninitialized when no save was done */
                    if ( fpcwSet && ( oldFpcw & fpcwSet ) != 0 )
                        _control87(oldFpcw,0x30000);
    #endif
                    if ( status != 0 ) {
                        start = pr->u.p.lb;
                        incr = pr->u.p.st;
                        if ( p_st != NULL )
                            *p_st = incr;
                        if ( p_last != NULL )
                            *p_last = last;
                        *p_lb = start + init * incr;
                        *p_ub = start + limit * incr;
                        if ( pr->ordered ) {
                            pr->u.p.ordered_lower = init;
                            pr->u.p.ordered_upper = limit;
                            #ifdef KMP_DEBUG
                            {
                                const char * buff;
                                // create format specifiers before the debug output
                                buff = __kmp_str_format(
                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t< UT >::spec, traits_t< UT >::spec );
                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                                __kmp_str_free( &buff );
                            }
                            #endif
                        }
                    } else {
                        *p_lb = 0;
                        *p_ub = 0;
                        if ( p_st != NULL )
                            *p_st = 0;
                    }
                } // case
                break;

            case kmp_sch_trapezoidal:
                {
                    UT   index;
                    T    parm2 = pr->u.p.parm2;
                    T    parm3 = pr->u.p.parm3;
                    T    parm4 = pr->u.p.parm4;
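                    // Chunk sizes decrease linearly: chunk k has
                    // parm2 - k*parm4 iterations, and parm3 is the total
                    // number of chunks (all precomputed in
                    // __kmp_dispatch_init).  The init formula below is the
                    // partial sum of the first 'index' chunk sizes:
                    //     sum_{k=0..index-1} (parm2 - k*parm4)
                    //       = index*(2*parm2 - (index-1)*parm4)/2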
                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
                                   gtid ) );

                    index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );

                    init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
                    trip = pr->u.p.tc - 1;

                    if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
                        *p_lb = 0;
                        *p_ub = 0;
                        if ( p_st != 0 ) *p_st = 0;
                    } else {
                        start = pr->u.p.lb;
                        limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
                        incr  = pr->u.p.st;

                        if ( (last = (limit >= trip)) != 0 )
                            limit = trip;

                        if ( p_last != 0 ) {
                            *p_last = last;
                        }
                        if ( p_st != 0 ) *p_st = incr;

                        if ( incr == 1 ) {
                            *p_lb = start + init;
                            *p_ub = start + limit;
                        } else {
                            *p_lb = start + init * incr;
                            *p_ub = start + limit * incr;
                        }

                        if ( pr->ordered ) {
                            pr->u.p.ordered_lower = init;
                            pr->u.p.ordered_upper = limit;
                            #ifdef KMP_DEBUG
                            {
                                const char * buff;
                                // create format specifiers before the debug output
                                buff = __kmp_str_format(
                                    "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t< UT >::spec, traits_t< UT >::spec );
                                KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                                __kmp_str_free( &buff );
                            }
                            #endif
                        } // if
                    } // if
                } // case
                break;
            } // switch
        } // if tc == 0;

        if ( status == 0 ) {
            UT   num_done;

            num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
                    traits_t< UT >::spec );
                KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
                __kmp_str_free( &buff );
            }
            #endif

            if ( num_done == team->t.t_nproc-1 ) {
                /* NOTE: release this buffer to be reused */

                KMP_MB();       /* Flush all pending memory write invalidates.  */

                sh->u.s.num_done = 0;
                sh->u.s.iteration = 0;

                /* TODO replace with general release procedure? */
                if ( pr->ordered ) {
                    sh->u.s.ordered_iteration = 0;
                }

                KMP_MB();       /* Flush all pending memory write invalidates.  */

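                // Advancing buffer_index by KMP_MAX_DISP_BUF marks this
                // shared buffer as free for reuse by the loop instance
                // KMP_MAX_DISP_BUF dispatches later: threads pick their
                // buffer modulo KMP_MAX_DISP_BUF and __kmp_dispatch_init
                // spins until buffer_index reaches the value they expect.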
                sh -> buffer_index += KMP_MAX_DISP_BUF;
                KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                                gtid, sh->buffer_index) );

                KMP_MB();       /* Flush all pending memory write invalidates.  */

            } // if
            if ( __kmp_env_consistency_check ) {
                if ( pr->pushed_ws != ct_none ) {
                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                }
            }

            th -> th.th_dispatch -> th_deo_fcn = NULL;
            th -> th.th_dispatch -> th_dxo_fcn = NULL;
            th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
            th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
        } // if (status == 0)
#if KMP_OS_WINDOWS
        else if ( last ) {
            pr->u.p.last_upper = pr->u.p.ub;
        }
#endif /* KMP_OS_WINDOWS */
    } // if

    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d normal case: " \
            "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
        __kmp_str_free( &buff );
    }
    #endif
    return status;
}

//-----------------------------------------------------------------------------------------
// Dispatch routines
//    Transfer call to template< typename T >
//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                         T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void
__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int64 lb, kmp_int64 ub,
                        kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint64 lb, kmp_uint64 ub,
                         kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then the lb, ub, and stride need not be modified.
*/
int
__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}
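
/*
 * Illustrative sketch of how a compiler might lower
 *     #pragma omp for schedule(dynamic, 4)
 * onto this interface (N and body() are placeholders; loc and gtid come from
 * the enclosing outlined function, and real compilers emit additional
 * bookkeeping):
 *
 *     kmp_int32 lb, ub, st, last;
 *     __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
 *                             0, N - 1, 1, 4 );
 *     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
 *         for ( kmp_int32 i = lb; i <= ub; i += st )
 *             body( i );
 *     }
 *
 * No explicit finalization is needed here: __kmp_dispatch_next releases the
 * dispatch buffer once all iterations are done.  __kmpc_dispatch_fini_4
 * (below) additionally marks the end of a chunk for ordered loops.
 */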

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void
__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}
/*! @} */

//-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
    return value <= checker;
}

kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
    return value <= checker;
}

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
                   kmp_uint32            checker,
                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
                   , void        * obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint32         * spin          = spinner;
    register          kmp_uint32           check         = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
    register          kmp_uint32           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = TCR_4(*spin), check)) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        __kmp_static_delay(TRUE);

        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
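
/*
 * Usage sketch: spin until a shared counter satisfies a predicate, using one
 * of the comparison routines above ('flag' and 'expected' are placeholders):
 *
 *     kmp_uint32 got = __kmp_wait_yield_4( &flag, expected, __kmp_eq_4, NULL );
 *
 * Other parts of the runtime use this pattern to wait on shared dispatch
 * state, e.g. for a buffer_index to reach the value a thread expects.
 */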

kmp_uint64
__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
                    kmp_uint64            checker,
                    kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
                    , void        * obj    // Higher-level synchronization object, or NULL.
                    )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint64         * spin          = spinner;
    register          kmp_uint64           check         = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
    register          kmp_uint64           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        __kmp_static_delay(TRUE);

        // if we are oversubscribed, or have waited a bit
        // (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */