1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
19  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
20  *       it may change values between parallel regions.  __kmp_max_nth
21  *       is the largest value __kmp_nth may take, 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 #include "kmp.h"
29 #include "kmp_i18n.h"
30 #include "kmp_itt.h"
31 #include "kmp_str.h"
32 #include "kmp_error.h"
33 #include "kmp_stats.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35     #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-internal.h"
40 #include "ompt-specific.h"
41 #endif
42 
43 /* ------------------------------------------------------------------------ */
44 /* ------------------------------------------------------------------------ */
45 
46 // template for type limits
47 template< typename T >
48 struct i_maxmin {
49     static const T mx;
50     static const T mn;
51 };
52 template<>
53 struct i_maxmin< int > {
54     static const int mx = 0x7fffffff;
55     static const int mn = 0x80000000;
56 };
57 template<>
58 struct i_maxmin< unsigned int > {
59     static const unsigned int mx = 0xffffffff;
60     static const unsigned int mn = 0x00000000;
61 };
62 template<>
63 struct i_maxmin< long long > {
64     static const long long mx = 0x7fffffffffffffffLL;
65     static const long long mn = 0x8000000000000000LL;
66 };
67 template<>
68 struct i_maxmin< unsigned long long > {
69     static const unsigned long long mx = 0xffffffffffffffffLL;
70     static const unsigned long long mn = 0x0000000000000000LL;
71 };
72 //-------------------------------------------------------------------------
73 
74 #ifdef KMP_STATIC_STEAL_ENABLED
75 
76     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77     template< typename T >
78     struct dispatch_private_infoXX_template {
79         typedef typename traits_t< T >::unsigned_t  UT;
80         typedef typename traits_t< T >::signed_t    ST;
81         UT count;                // unsigned
82         T  ub;
83         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84         T  lb;
85         ST st;                   // signed
86         UT tc;                   // unsigned
87         T  static_steal_counter; // for static_steal only; maybe better to put after ub
88 
89         /* parm[1-4] are used in different ways by different scheduling algorithms */
90 
91         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92         //    a) parm3 is properly aligned and
93         //    b) all parm1-4 are in the same cache line.
94         // Because of parm1-4 are used together, performance seems to be better
95         // if they are in the same line (not measured though).
96 
97         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98             T  parm1;
99             T  parm2;
100             T  parm3;
101             T  parm4;
102         };
103 
104         UT ordered_lower; // unsigned
105         UT ordered_upper; // unsigned
106         #if KMP_OS_WINDOWS
107         T  last_upper;
108         #endif /* KMP_OS_WINDOWS */
109     };
110 
111 #else /* KMP_STATIC_STEAL_ENABLED */
112 
113     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114     template< typename T >
115     struct dispatch_private_infoXX_template {
116         typedef typename traits_t< T >::unsigned_t  UT;
117         typedef typename traits_t< T >::signed_t    ST;
118         T  lb;
119         T  ub;
120         ST st;            // signed
121         UT tc;            // unsigned
122 
123         T  parm1;
124         T  parm2;
125         T  parm3;
126         T  parm4;
127 
128         UT count;         // unsigned
129 
130         UT ordered_lower; // unsigned
131         UT ordered_upper; // unsigned
132         #if KMP_OS_WINDOWS
133 	T  last_upper;
134         #endif /* KMP_OS_WINDOWS */
135     };
136 
137 #endif /* KMP_STATIC_STEAL_ENABLED */
138 
139 // replaces dispatch_private_info structure and dispatch_private_info_t type
140 template< typename T >
141 struct KMP_ALIGN_CACHE dispatch_private_info_template {
142     // duplicate alignment here, otherwise size of structure is not correct in our compiler
143     union KMP_ALIGN_CACHE private_info_tmpl {
144         dispatch_private_infoXX_template< T > p;
145         dispatch_private_info64_t             p64;
146     } u;
147     enum sched_type schedule;  /* scheduling algorithm */
148     kmp_uint32      ordered;   /* ordered clause specified */
149     kmp_uint32      ordered_bumped;
150     kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order
151     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152     kmp_uint32      nomerge;   /* don't merge iters if serialized */
153     kmp_uint32      type_size;
154     enum cons_type  pushed_ws;
155 };
156 
157 
158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159 template< typename UT >
160 struct dispatch_shared_infoXX_template {
161     /* chunk index under dynamic, number of idle threads under static-steal;
162        iteration index otherwise */
163     volatile UT     iteration;
164     volatile UT     num_done;
165     volatile UT     ordered_iteration;
166     UT   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
167 };
168 
169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
170 template< typename UT >
171 struct dispatch_shared_info_template {
172     // we need union here to keep the structure size
173     union shared_info_tmpl {
174         dispatch_shared_infoXX_template< UT >  s;
175         dispatch_shared_info64_t               s64;
176     } u;
177     volatile kmp_uint32     buffer_index;
178 #if OMP_41_ENABLED
179     volatile kmp_int32      doacross_buf_idx;  // teamwise index
180     kmp_uint32             *doacross_flags;    // array of iteration flags (0/1)
181     kmp_int32               doacross_num_done; // count finished threads
182 #endif
183 };
184 
185 /* ------------------------------------------------------------------------ */
186 /* ------------------------------------------------------------------------ */
187 
188 #undef USE_TEST_LOCKS
189 
190 // test_then_add template (general template should NOT be used)
191 template< typename T >
192 static __forceinline T
193 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
194 
195 template<>
196 __forceinline kmp_int32
197 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
198 {
199     kmp_int32 r;
200     r = KMP_TEST_THEN_ADD32( p, d );
201     return r;
202 }
203 
204 template<>
205 __forceinline kmp_int64
206 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
207 {
208     kmp_int64 r;
209     r = KMP_TEST_THEN_ADD64( p, d );
210     return r;
211 }
212 
213 // test_then_inc_acq template (general template should NOT be used)
214 template< typename T >
215 static __forceinline T
216 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
217 
218 template<>
219 __forceinline kmp_int32
220 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
221 {
222     kmp_int32 r;
223     r = KMP_TEST_THEN_INC_ACQ32( p );
224     return r;
225 }
226 
227 template<>
228 __forceinline kmp_int64
229 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
230 {
231     kmp_int64 r;
232     r = KMP_TEST_THEN_INC_ACQ64( p );
233     return r;
234 }
235 
236 // test_then_inc template (general template should NOT be used)
237 template< typename T >
238 static __forceinline T
239 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
240 
241 template<>
242 __forceinline kmp_int32
243 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
244 {
245     kmp_int32 r;
246     r = KMP_TEST_THEN_INC32( p );
247     return r;
248 }
249 
250 template<>
251 __forceinline kmp_int64
252 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
253 {
254     kmp_int64 r;
255     r = KMP_TEST_THEN_INC64( p );
256     return r;
257 }
258 
259 // compare_and_swap template (general template should NOT be used)
260 template< typename T >
261 static __forceinline kmp_int32
262 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
263 
264 template<>
265 __forceinline kmp_int32
266 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
267 {
268     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
269 }
270 
271 template<>
272 __forceinline kmp_int32
273 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
274 {
275     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
276 }
277 
278 /*
279     Spin wait loop that first does pause, then yield.
280     Waits until function returns non-zero when called with *spinner and check.
281     Does NOT put threads to sleep.
282 #if USE_ITT_BUILD
283     Arguments:
284         obj -- is higher-level synchronization object to report to ittnotify. It is used to report
285             locks consistently. For example, if lock is acquired immediately, its address is
286             reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired
287             immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same
288             address, not an address of low-level spinner.
289 #endif // USE_ITT_BUILD
290 */
291 template< typename UT >
292 // ToDo: make inline function (move to header file for icl)
293 static UT  // unsigned 4- or 8-byte type
294 __kmp_wait_yield( volatile UT * spinner,
295                   UT            checker,
296                   kmp_uint32 (* pred)( UT, UT )
297                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
298                   )
299 {
300     // note: we may not belong to a team at this point
301     register volatile UT         * spin          = spinner;
302     register          UT           check         = checker;
303     register          kmp_uint32   spins;
304     register          kmp_uint32 (*f) ( UT, UT ) = pred;
305     register          UT           r;
306 
307     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
308     KMP_INIT_YIELD( spins );
309     // main wait spin loop
310     while(!f(r = *spin, check))
311     {
312         KMP_FSYNC_SPIN_PREPARE( obj );
313         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
314            It causes problems with infinite recursion because of exit lock */
315         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
316             __kmp_abort_thread(); */
317 
318         // if we are oversubscribed,
319         // or have waited a bit (and KMP_LIBRARY=throughput, then yield
320         // pause is in the following code
321         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
322         KMP_YIELD_SPIN( spins );
323     }
324     KMP_FSYNC_SPIN_ACQUIRED( obj );
325     return r;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_eq( UT value, UT checker) {
330     return value == checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_neq( UT value, UT checker) {
335     return value != checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_lt( UT value, UT checker) {
340     return value < checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_ge( UT value, UT checker) {
345     return value >= checker;
346 }
347 
348 template< typename UT >
349 static kmp_uint32 __kmp_le( UT value, UT checker) {
350     return value <= checker;
351 }
352 
353 
354 /* ------------------------------------------------------------------------ */
355 /* ------------------------------------------------------------------------ */
356 
357 static void
358 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
359 {
360     kmp_info_t *th;
361 
362     KMP_DEBUG_ASSERT( gtid_ref );
363 
364     if ( __kmp_env_consistency_check ) {
365         th = __kmp_threads[*gtid_ref];
366         if ( th -> th.th_root -> r.r_active
367           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
368 #if KMP_USE_DYNAMIC_LOCK
369             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
370 #else
371             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
372 #endif
373         }
374     }
375 }
376 
377 template< typename UT >
378 static void
379 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
380 {
381     typedef typename traits_t< UT >::signed_t    ST;
382     dispatch_private_info_template< UT > * pr;
383 
384     int gtid = *gtid_ref;
385 //    int  cid = *cid_ref;
386     kmp_info_t *th = __kmp_threads[ gtid ];
387     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
388 
389     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
390     if ( __kmp_env_consistency_check ) {
391         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
392             ( th -> th.th_dispatch -> th_dispatch_pr_current );
393         if ( pr -> pushed_ws != ct_none ) {
394 #if KMP_USE_DYNAMIC_LOCK
395             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
396 #else
397             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
398 #endif
399         }
400     }
401 
402     if ( ! th -> th.th_team -> t.t_serialized ) {
403         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
404             ( th -> th.th_dispatch -> th_dispatch_sh_current );
405         UT  lower;
406 
407         if ( ! __kmp_env_consistency_check ) {
408                 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
409                     ( th -> th.th_dispatch -> th_dispatch_pr_current );
410         }
411         lower = pr->u.p.ordered_lower;
412 
413         #if ! defined( KMP_GOMP_COMPAT )
414             if ( __kmp_env_consistency_check ) {
415                 if ( pr->ordered_bumped ) {
416                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
417                     __kmp_error_construct2(
418                         kmp_i18n_msg_CnsMultipleNesting,
419                         ct_ordered_in_pdo, loc_ref,
420                         & p->stack_data[ p->w_top ]
421                     );
422                 }
423             }
424         #endif /* !defined(KMP_GOMP_COMPAT) */
425 
426         KMP_MB();
427         #ifdef KMP_DEBUG
428         {
429             const char * buff;
430             // create format specifiers before the debug output
431             buff = __kmp_str_format(
432                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
433                 traits_t< UT >::spec, traits_t< UT >::spec );
434             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
435             __kmp_str_free( &buff );
436         }
437         #endif
438 
439         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
440                                 USE_ITT_BUILD_ARG( NULL )
441                                 );
442         KMP_MB();  /* is this necessary? */
443         #ifdef KMP_DEBUG
444         {
445             const char * buff;
446             // create format specifiers before the debug output
447             buff = __kmp_str_format(
448                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
449                 traits_t< UT >::spec, traits_t< UT >::spec );
450             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
451             __kmp_str_free( &buff );
452         }
453         #endif
454     }
455     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
456 }
457 
458 static void
459 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
460 {
461     kmp_info_t *th;
462 
463     if ( __kmp_env_consistency_check ) {
464         th = __kmp_threads[*gtid_ref];
465         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
466             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
467         }
468     }
469 }
470 
471 template< typename UT >
472 static void
473 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
474 {
475     typedef typename traits_t< UT >::signed_t    ST;
476     dispatch_private_info_template< UT > * pr;
477 
478     int gtid = *gtid_ref;
479 //    int  cid = *cid_ref;
480     kmp_info_t *th = __kmp_threads[ gtid ];
481     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
482 
483     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
484     if ( __kmp_env_consistency_check ) {
485         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
486             ( th -> th.th_dispatch -> th_dispatch_pr_current );
487         if ( pr -> pushed_ws != ct_none ) {
488             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
489         }
490     }
491 
492     if ( ! th -> th.th_team -> t.t_serialized ) {
493         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
494             ( th -> th.th_dispatch -> th_dispatch_sh_current );
495 
496         if ( ! __kmp_env_consistency_check ) {
497             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
498                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
499         }
500 
501         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
502         #if ! defined( KMP_GOMP_COMPAT )
503             if ( __kmp_env_consistency_check ) {
504                 if ( pr->ordered_bumped != 0 ) {
505                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
506                     /* How to test it? - OM */
507                     __kmp_error_construct2(
508                         kmp_i18n_msg_CnsMultipleNesting,
509                         ct_ordered_in_pdo, loc_ref,
510                         & p->stack_data[ p->w_top ]
511                     );
512                 }
513             }
514         #endif /* !defined(KMP_GOMP_COMPAT) */
515 
516         KMP_MB();       /* Flush all pending memory write invalidates.  */
517 
518         pr->ordered_bumped += 1;
519 
520         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
521                         gtid, pr->ordered_bumped ) );
522 
523         KMP_MB();       /* Flush all pending memory write invalidates.  */
524 
525         /* TODO use general release procedure? */
526         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
527 
528         KMP_MB();       /* Flush all pending memory write invalidates.  */
529     }
530     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
531 }
532 
533 /* Computes and returns x to the power of y, where y must a non-negative integer */
534 template< typename UT >
535 static __forceinline long double
536 __kmp_pow(long double x, UT y) {
537     long double s=1.0L;
538 
539     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
540     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
541     while(y) {
542         if ( y & 1 )
543             s *= x;
544         x *= x;
545         y >>= 1;
546     }
547     return s;
548 }
549 
550 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
551    (the total number of unassigned iterations in chunks with index greater than or equal to idx).
552    __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
553    (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
554 */
555 template< typename T >
556 static __inline typename traits_t< T >::unsigned_t
557 __kmp_dispatch_guided_remaining(
558     T                                  tc,
559     typename traits_t< T >::floating_t base,
560     typename traits_t< T >::unsigned_t idx
561 ) {
562     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
563        least for ICL 8.1, long double arithmetic may not really have
564        long double precision, even with /Qlong_double.  Currently, we
565        workaround that in the caller code, by manipulating the FPCW for
566        Windows* OS on IA-32 architecture.  The lack of precision is not
567        expected to be a correctness issue, though.
568     */
569     typedef typename traits_t< T >::unsigned_t  UT;
570 
571     long double x = tc * __kmp_pow< UT >(base, idx);
572     UT r = (UT) x;
573     if ( x == r )
574         return r;
575     return r + 1;
576 }
577 
578 // Parameters of the guided-iterative algorithm:
579 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
580 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
581 // by default n = 2. For example with n = 3 the chunks distribution will be more flat.
582 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
583 static int guided_int_param = 2;
584 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
585 
586 // UT - unsigned flavor of T, ST - signed flavor of T,
587 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
588 template< typename T >
589 static void
590 __kmp_dispatch_init(
591     ident_t                        * loc,
592     int                              gtid,
593     enum sched_type                  schedule,
594     T                                lb,
595     T                                ub,
596     typename traits_t< T >::signed_t st,
597     typename traits_t< T >::signed_t chunk,
598     int                              push_ws
599 ) {
600     typedef typename traits_t< T >::unsigned_t  UT;
601     typedef typename traits_t< T >::signed_t    ST;
602     typedef typename traits_t< T >::floating_t  DBL;
603     static const int ___kmp_size_type = sizeof( UT );
604 
605     int                                            active;
606     T                                              tc;
607     kmp_info_t *                                   th;
608     kmp_team_t *                                   team;
609     kmp_uint32                                     my_buffer_index;
610     dispatch_private_info_template< T >          * pr;
611     dispatch_shared_info_template< UT > volatile * sh;
612 
613     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
614     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
615 
616     if ( ! TCR_4( __kmp_init_parallel ) )
617         __kmp_parallel_initialize();
618 
619 #if INCLUDE_SSC_MARKS
620     SSC_MARK_DISPATCH_INIT();
621 #endif
622     #ifdef KMP_DEBUG
623     {
624         const char * buff;
625         // create format specifiers before the debug output
626         buff = __kmp_str_format(
627             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
628             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
629         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
630         __kmp_str_free( &buff );
631     }
632     #endif
633     /* setup data */
634     th     = __kmp_threads[ gtid ];
635     team   = th -> th.th_team;
636     active = ! team -> t.t_serialized;
637     th->th.th_ident = loc;
638 
639 #if USE_ITT_BUILD
640     kmp_uint64 cur_chunk = chunk;
641     int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
642         KMP_MASTER_GTID(gtid) &&
643 #if OMP_40_ENABLED
644         th->th.th_teams_microtask == NULL &&
645 #endif
646         team->t.t_active_level == 1;
647 #endif
648     if ( ! active ) {
649         pr = reinterpret_cast< dispatch_private_info_template< T >* >
650             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
651     } else {
652         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
653                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
654 
655         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
656 
657         /* What happens when number of threads changes, need to resize buffer? */
658         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
659             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
660         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
661             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
662     }
663 
664     /* Currently just ignore the monotonic and non-monotonic modifiers (the compiler isn't producing them
665      * yet anyway).
666      * When it is we'll want to look at them somewhere here and use that information to add to our
667      * schedule choice. We shouldn't need to pass them on, they merely affect which schedule we can
668      * legally choose for various dynamic cases. (In paritcular, whether or not a stealing scheme is legal).
669      */
670     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
671 
672     /* Pick up the nomerge/ordered bits from the scheduling type */
673     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
674         pr->nomerge = TRUE;
675         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
676     } else {
677         pr->nomerge = FALSE;
678     }
679     pr->type_size = ___kmp_size_type; // remember the size of variables
680     if ( kmp_ord_lower & schedule ) {
681         pr->ordered = TRUE;
682         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
683     } else {
684         pr->ordered = FALSE;
685     }
686 
687     if ( schedule == kmp_sch_static ) {
688         schedule = __kmp_static;
689     } else {
690         if ( schedule == kmp_sch_runtime ) {
691             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
692             schedule = team -> t.t_sched.r_sched_type;
693             // Detail the schedule if needed (global controls are differentiated appropriately)
694             if ( schedule == kmp_sch_guided_chunked ) {
695                 schedule = __kmp_guided;
696             } else if ( schedule == kmp_sch_static ) {
697                 schedule = __kmp_static;
698             }
699             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
700             chunk = team -> t.t_sched.chunk;
701 #if USE_ITT_BUILD
702             cur_chunk = chunk;
703 #endif
704             #ifdef KMP_DEBUG
705             {
706                 const char * buff;
707                 // create format specifiers before the debug output
708                 buff = __kmp_str_format(
709                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
710                     traits_t< ST >::spec );
711                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
712                 __kmp_str_free( &buff );
713             }
714             #endif
715         } else {
716             if ( schedule == kmp_sch_guided_chunked ) {
717                 schedule = __kmp_guided;
718             }
719             if ( chunk <= 0 ) {
720                 chunk = KMP_DEFAULT_CHUNK;
721             }
722         }
723 
724         if ( schedule == kmp_sch_auto ) {
725             // mapping and differentiation: in the __kmp_do_serial_initialize()
726             schedule = __kmp_auto;
727             #ifdef KMP_DEBUG
728             {
729                 const char * buff;
730                 // create format specifiers before the debug output
731                 buff = __kmp_str_format(
732                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
733                     traits_t< ST >::spec );
734                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
735                 __kmp_str_free( &buff );
736             }
737             #endif
738         }
739 
740         /* guided analytical not safe for too many threads */
741         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
742             schedule = kmp_sch_guided_iterative_chunked;
743             KMP_WARNING( DispatchManyThreads );
744         }
745         pr->u.p.parm1 = chunk;
746     }
747     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
748                 "unknown scheduling type" );
749 
750     pr->u.p.count = 0;
751 
752     if ( __kmp_env_consistency_check ) {
753         if ( st == 0 ) {
754             __kmp_error_construct(
755                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
756                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
757             );
758         }
759     }
760 
761     tc = ( ub - lb + st );
762     if ( st != 1 ) {
763         if ( st < 0 ) {
764             if ( lb < ub ) {
765                 tc = 0;            // zero-trip
766             } else {   // lb >= ub
767                 tc = (ST)tc / st;  // convert to signed division
768             }
769         } else {       // st > 0
770             if ( ub < lb ) {
771                 tc = 0;            // zero-trip
772             } else {   // lb >= ub
773                 tc /= st;
774             }
775         }
776     } else if ( ub < lb ) {        // st == 1
777         tc = 0;                    // zero-trip
778     }
779 
780     // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
781     // when statistics are disabled.
782     if (schedule == __kmp_static)
783     {
784         KMP_COUNT_BLOCK(OMP_FOR_static);
785         KMP_COUNT_VALUE(FOR_static_iterations, tc);
786     }
787     else
788     {
789         KMP_COUNT_BLOCK(OMP_FOR_dynamic);
790         KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
791     }
792 
793     pr->u.p.lb = lb;
794     pr->u.p.ub = ub;
795     pr->u.p.st = st;
796     pr->u.p.tc = tc;
797 
798     #if KMP_OS_WINDOWS
799     pr->u.p.last_upper = ub + st;
800     #endif /* KMP_OS_WINDOWS */
801 
802     /* NOTE: only the active parallel region(s) has active ordered sections */
803 
804     if ( active ) {
805         if ( pr->ordered == 0 ) {
806             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
807             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
808         } else {
809             pr->ordered_bumped = 0;
810 
811             pr->u.p.ordered_lower = 1;
812             pr->u.p.ordered_upper = 0;
813 
814             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
815             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
816         }
817     }
818 
819     if ( __kmp_env_consistency_check ) {
820         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
821         if ( push_ws ) {
822             __kmp_push_workshare( gtid, ws, loc );
823             pr->pushed_ws = ws;
824         } else {
825             __kmp_check_workshare( gtid, ws, loc );
826             pr->pushed_ws = ct_none;
827         }
828     }
829 
830     switch ( schedule ) {
831     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
832     case kmp_sch_static_steal:
833         {
834             T nproc = team->t.t_nproc;
835             T ntc, init;
836 
837             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
838 
839             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
840             if ( nproc > 1 && ntc >= nproc ) {
841                 T id = __kmp_tid_from_gtid(gtid);
842                 T small_chunk, extras;
843 
844                 small_chunk = ntc / nproc;
845                 extras = ntc % nproc;
846 
847                 init = id * small_chunk + ( id < extras ? id : extras );
848                 pr->u.p.count = init;
849                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
850 
851                 pr->u.p.parm2 = lb;
852                 //pr->pfields.parm3 = 0; // it's not used in static_steal
853                 pr->u.p.parm4 = id;
854                 pr->u.p.st = st;
855                 break;
856             } else {
857                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
858                                gtid ) );
859                 schedule = kmp_sch_static_balanced;
860                 /* too few iterations: fall-through to kmp_sch_static_balanced */
861             } // if
862             /* FALL-THROUGH to static balanced */
863         } // case
864     #endif
865     case kmp_sch_static_balanced:
866         {
867             T nproc = team->t.t_nproc;
868             T init, limit;
869 
870             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
871                             gtid ) );
872 
873             if ( nproc > 1 ) {
874                 T id = __kmp_tid_from_gtid(gtid);
875 
876                 if ( tc < nproc ) {
877                     if ( id < tc ) {
878                         init = id;
879                         limit = id;
880                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
881                     } else {
882                         pr->u.p.count = 1;  /* means no more chunks to execute */
883                         pr->u.p.parm1 = FALSE;
884                         break;
885                     }
886                 } else {
887                     T small_chunk = tc / nproc;
888                     T extras = tc % nproc;
889                     init = id * small_chunk + (id < extras ? id : extras);
890                     limit = init + small_chunk - (id < extras ? 0 : 1);
891                     pr->u.p.parm1 = (id == nproc - 1);
892                 }
893             } else {
894                 if ( tc > 0 ) {
895                     init = 0;
896                     limit = tc - 1;
897                     pr->u.p.parm1 = TRUE;
898                 } else {
899                     // zero trip count
900                     pr->u.p.count = 1;  /* means no more chunks to execute */
901                     pr->u.p.parm1 = FALSE;
902                     break;
903                 }
904             }
905 #if USE_ITT_BUILD
906             // Calculate chunk for metadata report
907             if ( itt_need_metadata_reporting )
908                 cur_chunk = limit - init + 1;
909 #endif
910             if ( st == 1 ) {
911                 pr->u.p.lb = lb + init;
912                 pr->u.p.ub = lb + limit;
913             } else {
914                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
915                 pr->u.p.lb = lb + init * st;
916                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
917                 if ( st > 0 ) {
918                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
919                 } else {
920                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
921                 }
922             }
923             if ( pr->ordered ) {
924                 pr->u.p.ordered_lower = init;
925                 pr->u.p.ordered_upper = limit;
926             }
927             break;
928         } // case
929     case kmp_sch_guided_iterative_chunked :
930         {
931             T nproc = team->t.t_nproc;
932             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
933 
934             if ( nproc > 1 ) {
935                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
936                     /* chunk size too large, switch to dynamic */
937                     schedule = kmp_sch_dynamic_chunked;
938                 } else {
939                     // when remaining iters become less than parm2 - switch to dynamic
940                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
941                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
942                 }
943             } else {
944                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
945                 schedule = kmp_sch_static_greedy;
946                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
947                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
948                 pr->u.p.parm1 = tc;
949             } // if
950         } // case
951         break;
952     case kmp_sch_guided_analytical_chunked:
953         {
954             T nproc = team->t.t_nproc;
955             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
956 
957             if ( nproc > 1 ) {
958                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
959                     /* chunk size too large, switch to dynamic */
960                     schedule = kmp_sch_dynamic_chunked;
961                 } else {
962                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
963                     DBL x;
964 
965                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
966                     /* Linux* OS already has 64-bit computation by default for
967 		       long double, and on Windows* OS on Intel(R) 64,
968 		       /Qlong_double doesn't work.  On Windows* OS
969 		       on IA-32 architecture, we need to set precision to
970 		       64-bit instead of the default 53-bit. Even though long
971 		       double doesn't work on Windows* OS on Intel(R) 64, the
972 		       resulting lack of precision is not expected to impact
973 		       the correctness of the algorithm, but this has not been
974 		       mathematically proven.
975                     */
976                     // save original FPCW and set precision to 64-bit, as
977                     // Windows* OS on IA-32 architecture defaults to 53-bit
978                     unsigned int oldFpcw = _control87(0,0);
979                     _control87(_PC_64,_MCW_PC); // 0,0x30000
980                     #endif
981                     /* value used for comparison in solver for cross-over point */
982                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
983 
984                     /* crossover point--chunk indexes equal to or greater than
985 		       this point switch to dynamic-style scheduling */
986                     UT   cross;
987 
988                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
989                     x = (long double)1.0 - (long double)0.5 / nproc;
990 
991                     #ifdef KMP_DEBUG
992                     { // test natural alignment
993                         struct _test_a {
994                             char a;
995                             union {
996                                 char b;
997                                 DBL  d;
998                             };
999                         } t;
1000                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
1001                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
1002                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
1003                     }
1004                     #endif // KMP_DEBUG
1005 
1006                     /* save the term in thread private dispatch structure */
1007                     *(DBL*)&pr->u.p.parm3 = x;
1008 
1009                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
1010                     {
1011                         UT          left, right, mid;
1012                         long double p;
1013 
1014                         /* estimate initial upper and lower bound */
1015 
1016                         /* doesn't matter what value right is as long as it is positive, but
1017                            it affects performance of the solver
1018                         */
1019                         right = 229;
1020                         p = __kmp_pow< UT >(x,right);
1021                         if ( p > target ) {
1022                             do{
1023                                 p *= p;
1024                                 right <<= 1;
1025                             } while(p>target && right < (1<<27));
1026                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1027                         } else {
1028                             left = 0;
1029                         }
1030 
1031                         /* bisection root-finding method */
1032                         while ( left + 1 < right ) {
1033                             mid = (left + right) / 2;
1034                             if ( __kmp_pow< UT >(x,mid) > target ) {
1035                                 left = mid;
1036                             } else {
1037                                 right = mid;
1038                             }
1039                         } // while
1040                         cross = right;
1041                     }
1042                     /* assert sanity of computed crossover point */
1043                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1044 
1045                     /* save the crossover point in thread private dispatch structure */
1046                     pr->u.p.parm2 = cross;
1047 
1048                     // C75803
1049                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1050                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1051                     #else
1052                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1053                     #endif
1054                     /* dynamic-style scheduling offset */
1055                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1056                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1057                         // restore FPCW
1058                         _control87(oldFpcw,_MCW_PC);
1059                     #endif
1060                 } // if
1061             } else {
1062                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1063                                gtid ) );
1064                 schedule = kmp_sch_static_greedy;
1065                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1066                 pr->u.p.parm1 = tc;
1067             } // if
1068         } // case
1069         break;
1070     case kmp_sch_static_greedy:
1071         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1072             pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1073                 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1074                 tc;
1075         break;
1076     case kmp_sch_static_chunked :
1077     case kmp_sch_dynamic_chunked :
1078         if ( pr->u.p.parm1 <= 0 ) {
1079             pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1080         }
1081         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1082         break;
1083     case kmp_sch_trapezoidal :
1084         {
1085             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1086 
1087             T parm1, parm2, parm3, parm4;
1088             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1089 
1090             parm1 = chunk;
1091 
1092             /* F : size of the first cycle */
1093             parm2 = ( tc / (2 * team->t.t_nproc) );
1094 
1095             if ( parm2 < 1 ) {
1096                 parm2 = 1;
1097             }
1098 
1099             /* L : size of the last cycle.  Make sure the last cycle
1100              *     is not larger than the first cycle.
1101              */
1102             if ( parm1 < 1 ) {
1103                 parm1 = 1;
1104             } else if ( parm1 > parm2 ) {
1105                 parm1 = parm2;
1106             }
1107 
1108             /* N : number of cycles */
1109             parm3 = ( parm2 + parm1 );
1110             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1111 
1112             if ( parm3 < 2 ) {
1113                 parm3 = 2;
1114             }
1115 
1116             /* sigma : decreasing incr of the trapezoid */
1117             parm4 = ( parm3 - 1 );
1118             parm4 = ( parm2 - parm1 ) / parm4;
1119 
1120             // pointless check, because parm4 >= 0 always
1121             //if ( parm4 < 0 ) {
1122             //    parm4 = 0;
1123             //}
1124 
1125             pr->u.p.parm1 = parm1;
1126             pr->u.p.parm2 = parm2;
1127             pr->u.p.parm3 = parm3;
1128             pr->u.p.parm4 = parm4;
1129         } // case
1130         break;
1131 
1132     default:
1133         {
1134             __kmp_msg(
1135                 kmp_ms_fatal,                        // Severity
1136                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1137                 KMP_HNT( GetNewerLibrary ),          // Hint
1138                 __kmp_msg_null                       // Variadic argument list terminator
1139             );
1140         }
1141         break;
1142     } // switch
1143     pr->schedule = schedule;
1144     if ( active ) {
1145         /* The name of this buffer should be my_buffer_index when it's free to use it */
1146 
1147         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1148                         gtid, my_buffer_index, sh->buffer_index) );
1149         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1150                                         USE_ITT_BUILD_ARG( NULL )
1151                                         );
1152             // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1153             // *always* 32-bit integers.
1154         KMP_MB();  /* is this necessary? */
1155         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1156                         gtid, my_buffer_index, sh->buffer_index) );
1157 
1158         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1159         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1160 #if USE_ITT_BUILD
1161         if ( pr->ordered ) {
1162             __kmp_itt_ordered_init( gtid );
1163         }; // if
1164         // Report loop metadata
1165         if ( itt_need_metadata_reporting ) {
1166             // Only report metadata by master of active team at level 1
1167             kmp_uint64 schedtype = 0;
1168             switch ( schedule ) {
1169             case kmp_sch_static_chunked:
1170             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1171                 break;
1172             case kmp_sch_static_greedy:
1173                 cur_chunk = pr->u.p.parm1;
1174                 break;
1175             case kmp_sch_dynamic_chunked:
1176                 schedtype = 1;
1177                 break;
1178             case kmp_sch_guided_iterative_chunked:
1179             case kmp_sch_guided_analytical_chunked:
1180                 schedtype = 2;
1181                 break;
1182             default:
1183 //            Should we put this case under "static"?
1184 //            case kmp_sch_static_steal:
1185                 schedtype = 3;
1186                 break;
1187             }
1188             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1189         }
1190 #endif /* USE_ITT_BUILD */
1191     }; // if
1192 
1193     #ifdef KMP_DEBUG
1194     {
1195         const char * buff;
1196         // create format specifiers before the debug output
1197         buff = __kmp_str_format(
1198             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1199             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1200             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1201             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1202             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1203             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1204             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1205         KD_TRACE(10, ( buff,
1206             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1207             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1208             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1209             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1210         __kmp_str_free( &buff );
1211     }
1212     #endif
1213     #if ( KMP_STATIC_STEAL_ENABLED )
1214     if ( ___kmp_size_type < 8 ) {
1215       // It cannot be guaranteed that after execution of a loop with some other schedule kind
1216       // all the parm3 variables will contain the same value.
1217       // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1
1218       // rather than program life-time increment.
1219       // So the dedicated variable is required. The 'static_steal_counter' is used.
1220       if( schedule == kmp_sch_static_steal ) {
1221         // Other threads will inspect this variable when searching for a victim.
1222         // This is a flag showing that other threads may steal from this thread since then.
1223         volatile T * p = &pr->u.p.static_steal_counter;
1224         *p = *p + 1;
1225       }
1226     }
1227     #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
1228 
1229 #if OMPT_SUPPORT && OMPT_TRACE
1230     if (ompt_enabled &&
1231         ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1232         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1233         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1234         ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1235             team_info->parallel_id, task_info->task_id, team_info->microtask);
1236     }
1237 #endif
1238 }
1239 
1240 /*
1241  * For ordered loops, either __kmp_dispatch_finish() should be called after
1242  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1243  * every chunk of iterations.  If the ordered section(s) were not executed
1244  * for this iteration (or every iteration in this chunk), we need to set the
1245  * ordered iteration counters so that the next thread can proceed.
1246  */
1247 template< typename UT >
1248 static void
1249 __kmp_dispatch_finish( int gtid, ident_t *loc )
1250 {
1251     typedef typename traits_t< UT >::signed_t ST;
1252     kmp_info_t *th = __kmp_threads[ gtid ];
1253 
1254     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1255     if ( ! th -> th.th_team -> t.t_serialized ) {
1256 
1257         dispatch_private_info_template< UT > * pr =
1258             reinterpret_cast< dispatch_private_info_template< UT >* >
1259             ( th->th.th_dispatch->th_dispatch_pr_current );
1260         dispatch_shared_info_template< UT > volatile * sh =
1261             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1262             ( th->th.th_dispatch->th_dispatch_sh_current );
1263         KMP_DEBUG_ASSERT( pr );
1264         KMP_DEBUG_ASSERT( sh );
1265         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1266                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1267 
1268         if ( pr->ordered_bumped ) {
1269             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1270                             gtid ) );
1271             pr->ordered_bumped = 0;
1272         } else {
1273             UT lower = pr->u.p.ordered_lower;
1274 
1275             #ifdef KMP_DEBUG
1276             {
1277                 const char * buff;
1278                 // create format specifiers before the debug output
1279                 buff = __kmp_str_format(
1280                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1281                     traits_t< UT >::spec, traits_t< UT >::spec );
1282                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1283                 __kmp_str_free( &buff );
1284             }
1285             #endif
1286 
1287             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1288                                    USE_ITT_BUILD_ARG(NULL)
1289                                    );
1290             KMP_MB();  /* is this necessary? */
1291             #ifdef KMP_DEBUG
1292             {
1293                 const char * buff;
1294                 // create format specifiers before the debug output
1295                 buff = __kmp_str_format(
1296                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1297                     traits_t< UT >::spec, traits_t< UT >::spec );
1298                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1299                 __kmp_str_free( &buff );
1300             }
1301             #endif
1302 
1303             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1304         } // if
1305     } // if
1306     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1307 }
1308 
1309 #ifdef KMP_GOMP_COMPAT
1310 
1311 template< typename UT >
1312 static void
1313 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1314 {
1315     typedef typename traits_t< UT >::signed_t ST;
1316     kmp_info_t *th = __kmp_threads[ gtid ];
1317 
1318     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1319     if ( ! th -> th.th_team -> t.t_serialized ) {
1320 //        int cid;
1321         dispatch_private_info_template< UT > * pr =
1322             reinterpret_cast< dispatch_private_info_template< UT >* >
1323             ( th->th.th_dispatch->th_dispatch_pr_current );
1324         dispatch_shared_info_template< UT > volatile * sh =
1325             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1326             ( th->th.th_dispatch->th_dispatch_sh_current );
1327         KMP_DEBUG_ASSERT( pr );
1328         KMP_DEBUG_ASSERT( sh );
1329         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1330                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1331 
1332 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1333             UT lower = pr->u.p.ordered_lower;
1334             UT upper = pr->u.p.ordered_upper;
1335             UT inc = upper - lower + 1;
1336 
1337             if ( pr->ordered_bumped == inc ) {
1338                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1339                   gtid ) );
1340                 pr->ordered_bumped = 0;
1341             } else {
1342                 inc -= pr->ordered_bumped;
1343 
1344                 #ifdef KMP_DEBUG
1345                 {
1346                     const char * buff;
1347                     // create format specifiers before the debug output
1348                     buff = __kmp_str_format(
1349                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1350                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1351                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1352                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1353                     __kmp_str_free( &buff );
1354                 }
1355                 #endif
1356 
1357                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1358                                        USE_ITT_BUILD_ARG(NULL)
1359                                        );
1360 
1361                 KMP_MB();  /* is this necessary? */
1362                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1363                   gtid ) );
1364                 pr->ordered_bumped = 0;
1365 //!!!!! TODO check if the inc should be unsigned, or signed???
1366                 #ifdef KMP_DEBUG
1367                 {
1368                     const char * buff;
1369                     // create format specifiers before the debug output
1370                     buff = __kmp_str_format(
1371                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1372                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1373                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1374                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1375                     __kmp_str_free( &buff );
1376                 }
1377                 #endif
1378 
1379                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1380             }
1381 //        }
1382     }
1383     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1384 }
1385 
1386 #endif /* KMP_GOMP_COMPAT */
1387 
1388 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1389  * (no more work), then tell OMPT the loop is over. In some cases
1390  * kmp_dispatch_fini() is not called. */
1391 #if OMPT_SUPPORT && OMPT_TRACE
1392 #define OMPT_LOOP_END                                                          \
1393     if (status == 0) {                                                         \
1394         if (ompt_enabled &&                     \
1395             ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
1396             ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
1397             ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
1398             ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
1399                 team_info->parallel_id, task_info->task_id);                   \
1400         }                                                                      \
1401     }
1402 #else
1403 #define OMPT_LOOP_END // no-op
1404 #endif
1405 
1406 template< typename T >
1407 static int
1408 __kmp_dispatch_next(
1409     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1410 ) {
1411 
1412     typedef typename traits_t< T >::unsigned_t  UT;
1413     typedef typename traits_t< T >::signed_t    ST;
1414     typedef typename traits_t< T >::floating_t  DBL;
1415 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1416     static const int ___kmp_size_type = sizeof( UT );
1417 #endif
1418 
1419     // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtme schedule
1420     // is static. (Which points out a disadavantage of schedule(runtime): even when static scheduling is used it costs
1421     // more than a compile time choice to use static scheduling would.)
1422     KMP_TIME_BLOCK(FOR_dynamic_scheduling);
1423 
1424     int                                   status;
1425     dispatch_private_info_template< T > * pr;
1426     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1427     kmp_team_t                          * team = th -> th.th_team;
1428 
1429     KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1430     #ifdef KMP_DEBUG
1431     {
1432         const char * buff;
1433         // create format specifiers before the debug output
1434         buff = __kmp_str_format(
1435             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1436             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1437         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1438         __kmp_str_free( &buff );
1439     }
1440     #endif
1441 
1442     if ( team -> t.t_serialized ) {
1443         /* NOTE: serialize this dispatch becase we are not at the active level */
1444         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1445             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1446         KMP_DEBUG_ASSERT( pr );
1447 
1448         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1449             *p_lb = 0;
1450             *p_ub = 0;
1451 //            if ( p_last != NULL )
1452 //                *p_last = 0;
1453             if ( p_st != NULL )
1454                 *p_st = 0;
1455             if ( __kmp_env_consistency_check ) {
1456                 if ( pr->pushed_ws != ct_none ) {
1457                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1458                 }
1459             }
1460         } else if ( pr->nomerge ) {
1461             kmp_int32 last;
1462             T         start;
1463             UT        limit, trip, init;
1464             ST        incr;
1465             T         chunk = pr->u.p.parm1;
1466 
1467             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1468 
1469             init = chunk * pr->u.p.count++;
1470             trip = pr->u.p.tc - 1;
1471 
1472             if ( (status = (init <= trip)) == 0 ) {
1473                 *p_lb = 0;
1474                 *p_ub = 0;
1475 //                if ( p_last != NULL )
1476 //                    *p_last = 0;
1477                 if ( p_st != NULL )
1478                     *p_st = 0;
1479                 if ( __kmp_env_consistency_check ) {
1480                     if ( pr->pushed_ws != ct_none ) {
1481                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1482                     }
1483                 }
1484             } else {
1485                 start = pr->u.p.lb;
1486                 limit = chunk + init - 1;
1487                 incr  = pr->u.p.st;
1488 
1489                 if ( (last = (limit >= trip)) != 0 ) {
1490                     limit = trip;
1491                     #if KMP_OS_WINDOWS
1492                     pr->u.p.last_upper = pr->u.p.ub;
1493                     #endif /* KMP_OS_WINDOWS */
1494                 }
1495                 if ( p_last != NULL )
1496                     *p_last = last;
1497                 if ( p_st != NULL )
1498                     *p_st = incr;
1499                 if ( incr == 1 ) {
1500                     *p_lb = start + init;
1501                     *p_ub = start + limit;
1502                 } else {
1503                     *p_lb = start + init * incr;
1504                     *p_ub = start + limit * incr;
1505                 }
1506 
1507                 if ( pr->ordered ) {
1508                     pr->u.p.ordered_lower = init;
1509                     pr->u.p.ordered_upper = limit;
1510                     #ifdef KMP_DEBUG
1511                     {
1512                         const char * buff;
1513                         // create format specifiers before the debug output
1514                         buff = __kmp_str_format(
1515                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1516                             traits_t< UT >::spec, traits_t< UT >::spec );
1517                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1518                         __kmp_str_free( &buff );
1519                     }
1520                     #endif
1521                 } // if
1522             } // if
1523         } else {
1524             pr->u.p.tc = 0;
1525             *p_lb = pr->u.p.lb;
1526             *p_ub = pr->u.p.ub;
1527             #if KMP_OS_WINDOWS
1528             pr->u.p.last_upper = *p_ub;
1529             #endif /* KMP_OS_WINDOWS */
1530             if ( p_last != NULL )
1531                 *p_last = TRUE;
1532             if ( p_st != NULL )
1533                 *p_st = pr->u.p.st;
1534         } // if
1535         #ifdef KMP_DEBUG
1536         {
1537             const char * buff;
1538             // create format specifiers before the debug output
1539             buff = __kmp_str_format(
1540                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1541                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1542                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1543             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1544             __kmp_str_free( &buff );
1545         }
1546         #endif
1547 #if INCLUDE_SSC_MARKS
1548         SSC_MARK_DISPATCH_NEXT();
1549 #endif
1550         OMPT_LOOP_END;
1551         return status;
1552     } else {
1553         kmp_int32 last = 0;
1554         dispatch_shared_info_template< UT > *sh;
1555         T         start;
1556         ST        incr;
1557         UT        limit, trip, init;
1558 
1559         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1560                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1561 
1562         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1563             ( th->th.th_dispatch->th_dispatch_pr_current );
1564         KMP_DEBUG_ASSERT( pr );
1565         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1566             ( th->th.th_dispatch->th_dispatch_sh_current );
1567         KMP_DEBUG_ASSERT( sh );
1568 
1569         if ( pr->u.p.tc == 0 ) {
1570             // zero trip count
1571             status = 0;
1572         } else {
1573             switch (pr->schedule) {
1574             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1575             case kmp_sch_static_steal:
1576                 {
1577                     T chunk = pr->u.p.parm1;
1578 
1579                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1580 
1581                     trip = pr->u.p.tc - 1;
1582 
1583                     if ( ___kmp_size_type > 4 ) {
1584                         // Other threads do not look into the data of this thread,
1585                         //  so it's not necessary to make volatile casting.
1586                         init   = ( pr->u.p.count )++;
1587                         status = ( init < (UT)pr->u.p.ub );
1588                     } else {
1589                         typedef union {
1590                             struct {
1591                                 UT count;
1592                                 T  ub;
1593                             } p;
1594                             kmp_int64 b;
1595                         } union_i4;
1596                         // All operations on 'count' or 'ub' must be combined atomically together.
1597                         // stealing implemented only for 4-byte indexes
1598                         {
1599                             union_i4 vold, vnew;
1600                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1601                             vnew = vold;
1602                             vnew.p.count++;
1603                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1604                                         ( volatile kmp_int64* )&pr->u.p.count,
1605                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1606                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1607                                 KMP_CPU_PAUSE();
1608                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1609                                 vnew = vold;
1610                                 vnew.p.count++;
1611                             }
1612                             vnew = vold;
1613                             init   = vnew.p.count;
1614                             status = ( init < (UT)vnew.p.ub ) ;
1615                         }
1616 
1617                         if( !status ) {
1618                             kmp_info_t   **other_threads = team->t.t_threads;
1619                             int          while_limit = 10;
1620                             int          while_index = 0;
1621 
1622                             // TODO: algorithm of searching for a victim
1623                             // should be cleaned up and measured
1624                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1625                                 union_i4  vold, vnew;
1626                                 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1627                                 T         victimIdx    = pr->u.p.parm4;
1628                                 T         oldVictimIdx = victimIdx;
1629                                 dispatch_private_info_template< T > * victim;
1630 
1631                                 do {
1632                                     if( !victimIdx ) {
1633                                         victimIdx = team->t.t_nproc - 1;
1634                                     } else {
1635                                         --victimIdx;
1636                                     }
1637                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1638                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1639                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1640                                 // TODO: think about a proper place of this test
1641                                 if ( ( !victim ) ||
1642                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1643                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1644                                     // TODO: delay would be nice
1645                                     continue;
1646                                     // the victim is not ready yet to participate in stealing
1647                                     // because the victim is still in kmp_init_dispatch
1648                                 }
1649                                 if ( oldVictimIdx == victimIdx ) {
1650                                     break;
1651                                 }
1652                                 pr->u.p.parm4 = victimIdx;
1653 
1654                                 while( 1 ) {
1655                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1656                                     vnew = vold;
1657 
1658                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1659                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1660                                         break;
1661                                     }
1662                                     vnew.p.ub -= (remaining >> 2);
1663                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1664                                     #pragma warning( push )
1665                                     // disable warning on pointless comparison of unsigned with 0
1666                                     #pragma warning( disable: 186 )
1667                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1668                                     #pragma warning( pop )
1669                                     // TODO: Should this be acquire or release?
1670                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1671                                             ( volatile kmp_int64 * )&victim->u.p.count,
1672                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1673                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1674                                         status = 1;
1675                                         while_index = 0;
1676                                         // now update own count and ub
1677                                         #if KMP_ARCH_X86
1678                                         // stealing executed on non-KMP_ARCH_X86 only
1679                                             // Atomic 64-bit write on ia32 is
1680                                             // unavailable, so we do this in steps.
1681                                             //     This code is not tested.
1682                                             init = vold.p.count;
1683                                             pr->u.p.ub = 0;
1684                                             pr->u.p.count = init + 1;
1685                                             pr->u.p.ub = vnew.p.count;
1686                                         #else
1687                                             init = vnew.p.ub;
1688                                             vold.p.count = init + 1;
1689                                             // TODO: is it safe and enough?
1690                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1691                                         #endif // KMP_ARCH_X86
1692                                         break;
1693                                     } // if
1694                                 KMP_CPU_PAUSE();
1695                                 } // while (1)
1696                             } // while
1697                         } // if
1698                     } // if
1699                     if ( !status ) {
1700                         *p_lb = 0;
1701                         *p_ub = 0;
1702                         if ( p_st != NULL ) *p_st = 0;
1703                     } else {
1704                         start = pr->u.p.parm2;
1705                         init *= chunk;
1706                         limit = chunk + init - 1;
1707                         incr  = pr->u.p.st;
1708 
1709                         KMP_DEBUG_ASSERT(init <= trip);
1710                         if ( (last = (limit >= trip)) != 0 )
1711                             limit = trip;
1712                         if ( p_st != NULL ) *p_st = incr;
1713 
1714                         if ( incr == 1 ) {
1715                             *p_lb = start + init;
1716                             *p_ub = start + limit;
1717                         } else {
1718                             *p_lb = start + init * incr;
1719                             *p_ub = start + limit * incr;
1720                         }
1721 
1722                         if ( pr->ordered ) {
1723                             pr->u.p.ordered_lower = init;
1724                             pr->u.p.ordered_upper = limit;
1725                             #ifdef KMP_DEBUG
1726                             {
1727                                 const char * buff;
1728                                 // create format specifiers before the debug output
1729                                 buff = __kmp_str_format(
1730                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1731                                     traits_t< UT >::spec, traits_t< UT >::spec );
1732                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1733                                 __kmp_str_free( &buff );
1734                             }
1735                             #endif
1736                         } // if
1737                     } // if
1738                     break;
1739                 } // case
1740             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1741             case kmp_sch_static_balanced:
1742                 {
1743                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1744                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1745                         pr->u.p.count = 1;
1746                         *p_lb = pr->u.p.lb;
1747                         *p_ub = pr->u.p.ub;
1748                         last = pr->u.p.parm1;
1749                         if ( p_st != NULL )
1750                             *p_st = pr->u.p.st;
1751                     } else {  /* no iterations to do */
1752                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1753                     }
1754                     if ( pr->ordered ) {
1755                         #ifdef KMP_DEBUG
1756                         {
1757                             const char * buff;
1758                             // create format specifiers before the debug output
1759                             buff = __kmp_str_format(
1760                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1761                                 traits_t< UT >::spec, traits_t< UT >::spec );
1762                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1763                             __kmp_str_free( &buff );
1764                         }
1765                         #endif
1766                     } // if
1767                 } // case
1768                 break;
1769             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1770             case kmp_sch_static_chunked:
1771                 {
1772                     T parm1;
1773 
1774                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1775                                    gtid ) );
1776                     parm1 = pr->u.p.parm1;
1777 
1778                     trip  = pr->u.p.tc - 1;
1779                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1780 
1781                     if ( (status = (init <= trip)) != 0 ) {
1782                         start = pr->u.p.lb;
1783                         incr  = pr->u.p.st;
1784                         limit = parm1 + init - 1;
1785 
1786                         if ( (last = (limit >= trip)) != 0 )
1787                             limit = trip;
1788 
1789                         if ( p_st != NULL ) *p_st = incr;
1790 
1791                         pr->u.p.count += team->t.t_nproc;
1792 
1793                         if ( incr == 1 ) {
1794                             *p_lb = start + init;
1795                             *p_ub = start + limit;
1796                         }
1797                         else {
1798                             *p_lb = start + init * incr;
1799                             *p_ub = start + limit * incr;
1800                         }
1801 
1802                         if ( pr->ordered ) {
1803                             pr->u.p.ordered_lower = init;
1804                             pr->u.p.ordered_upper = limit;
1805                             #ifdef KMP_DEBUG
1806                             {
1807                                 const char * buff;
1808                                 // create format specifiers before the debug output
1809                                 buff = __kmp_str_format(
1810                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1811                                     traits_t< UT >::spec, traits_t< UT >::spec );
1812                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1813                                 __kmp_str_free( &buff );
1814                             }
1815                             #endif
1816                         } // if
1817                     } // if
1818                 } // case
1819                 break;
1820 
1821             case kmp_sch_dynamic_chunked:
1822                 {
1823                     T chunk = pr->u.p.parm1;
1824 
1825                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1826                                    gtid ) );
1827 
1828                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1829                     trip = pr->u.p.tc - 1;
1830 
1831                     if ( (status = (init <= trip)) == 0 ) {
1832                         *p_lb = 0;
1833                         *p_ub = 0;
1834                         if ( p_st != NULL ) *p_st = 0;
1835                     } else {
1836                         start = pr->u.p.lb;
1837                         limit = chunk + init - 1;
1838                         incr  = pr->u.p.st;
1839 
1840                         if ( (last = (limit >= trip)) != 0 )
1841                             limit = trip;
1842 
1843                         if ( p_st != NULL ) *p_st = incr;
1844 
1845                         if ( incr == 1 ) {
1846                             *p_lb = start + init;
1847                             *p_ub = start + limit;
1848                         } else {
1849                             *p_lb = start + init * incr;
1850                             *p_ub = start + limit * incr;
1851                         }
1852 
1853                         if ( pr->ordered ) {
1854                             pr->u.p.ordered_lower = init;
1855                             pr->u.p.ordered_upper = limit;
1856                             #ifdef KMP_DEBUG
1857                             {
1858                                 const char * buff;
1859                                 // create format specifiers before the debug output
1860                                 buff = __kmp_str_format(
1861                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1862                                     traits_t< UT >::spec, traits_t< UT >::spec );
1863                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1864                                 __kmp_str_free( &buff );
1865                             }
1866                             #endif
1867                         } // if
1868                     } // if
1869                 } // case
1870                 break;
1871 
1872             case kmp_sch_guided_iterative_chunked:
1873                 {
1874                     T  chunkspec = pr->u.p.parm1;
1875                     KD_TRACE(100,
1876                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1877                     trip  = pr->u.p.tc;
1878                     // Start atomic part of calculations
1879                     while(1) {
1880                         ST  remaining;             // signed, because can be < 0
1881                         init = sh->u.s.iteration;  // shared value
1882                         remaining = trip - init;
1883                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1884                             // nothing to do, don't try atomic op
1885                             status = 0;
1886                             break;
1887                         }
1888                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1889                             // use dynamic-style shcedule
1890                             // atomically inrement iterations, get old value
1891                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1892                             remaining = trip - init;
1893                             if (remaining <= 0) {
1894                                 status = 0;    // all iterations got by other threads
1895                             } else {
1896                                 // got some iterations to work on
1897                                 status = 1;
1898                                 if ( (T)remaining > chunkspec ) {
1899                                     limit = init + chunkspec - 1;
1900                                 } else {
1901                                     last = 1;   // the last chunk
1902                                     limit = init + remaining - 1;
1903                                 } // if
1904                             } // if
1905                             break;
1906                         } // if
1907                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1908                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1909                             // CAS was successful, chunk obtained
1910                             status = 1;
1911                             --limit;
1912                             break;
1913                         } // if
1914                     } // while
1915                     if ( status != 0 ) {
1916                         start = pr->u.p.lb;
1917                         incr = pr->u.p.st;
1918                         if ( p_st != NULL )
1919                             *p_st = incr;
1920                         *p_lb = start + init * incr;
1921                         *p_ub = start + limit * incr;
1922                         if ( pr->ordered ) {
1923                             pr->u.p.ordered_lower = init;
1924                             pr->u.p.ordered_upper = limit;
1925                             #ifdef KMP_DEBUG
1926                             {
1927                                 const char * buff;
1928                                 // create format specifiers before the debug output
1929                                 buff = __kmp_str_format(
1930                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1931                                     traits_t< UT >::spec, traits_t< UT >::spec );
1932                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1933                                 __kmp_str_free( &buff );
1934                             }
1935                             #endif
1936                         } // if
1937                     } else {
1938                         *p_lb = 0;
1939                         *p_ub = 0;
1940                         if ( p_st != NULL )
1941                             *p_st = 0;
1942                     } // if
1943                 } // case
1944                 break;
1945 
1946             case kmp_sch_guided_analytical_chunked:
1947                 {
1948                     T   chunkspec = pr->u.p.parm1;
1949                     UT chunkIdx;
1950     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1951                     /* for storing original FPCW value for Windows* OS on
1952 		       IA-32 architecture 8-byte version */
1953                     unsigned int oldFpcw;
1954                     unsigned int fpcwSet = 0;
1955     #endif
1956                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1957                                    gtid ) );
1958 
1959                     trip  = pr->u.p.tc;
1960 
1961                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1962                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1963 
1964                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1965                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1966                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1967                             --trip;
1968                             /* use dynamic-style scheduling */
1969                             init = chunkIdx * chunkspec + pr->u.p.count;
1970                             /* need to verify init > 0 in case of overflow in the above calculation */
1971                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1972                                 limit = init + chunkspec -1;
1973 
1974                                 if ( (last = (limit >= trip)) != 0 )
1975                                     limit = trip;
1976                             }
1977                             break;
1978                         } else {
1979                             /* use exponential-style scheduling */
1980                             /* The following check is to workaround the lack of long double precision on Windows* OS.
1981                                This check works around the possible effect that init != 0 for chunkIdx == 0.
1982                              */
1983     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1984                             /* If we haven't already done so, save original
1985 			       FPCW and set precision to 64-bit, as Windows* OS
1986 			       on IA-32 architecture defaults to 53-bit */
1987                             if ( !fpcwSet ) {
1988                                 oldFpcw = _control87(0,0);
1989                                 _control87(_PC_64,_MCW_PC);
1990                                 fpcwSet = 0x30000;
1991                             }
1992     #endif
1993                             if ( chunkIdx ) {
1994                                 init = __kmp_dispatch_guided_remaining< T >(
1995                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1996                                 KMP_DEBUG_ASSERT(init);
1997                                 init = trip - init;
1998                             } else
1999                                 init = 0;
2000                             limit = trip - __kmp_dispatch_guided_remaining< T >(
2001                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
2002                             KMP_ASSERT(init <= limit);
2003                             if ( init < limit ) {
2004                                 KMP_DEBUG_ASSERT(limit <= trip);
2005                                 --limit;
2006                                 status = 1;
2007                                 break;
2008                             } // if
2009                         } // if
2010                     } // while (1)
2011     #if KMP_OS_WINDOWS && KMP_ARCH_X86
2012                     /* restore FPCW if necessary
2013                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2014                     */
2015                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2016                         _control87(oldFpcw,_MCW_PC);
2017     #endif
2018                     if ( status != 0 ) {
2019                         start = pr->u.p.lb;
2020                         incr = pr->u.p.st;
2021                         if ( p_st != NULL )
2022                             *p_st = incr;
2023                         *p_lb = start + init * incr;
2024                         *p_ub = start + limit * incr;
2025                         if ( pr->ordered ) {
2026                             pr->u.p.ordered_lower = init;
2027                             pr->u.p.ordered_upper = limit;
2028                             #ifdef KMP_DEBUG
2029                             {
2030                                 const char * buff;
2031                                 // create format specifiers before the debug output
2032                                 buff = __kmp_str_format(
2033                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2034                                     traits_t< UT >::spec, traits_t< UT >::spec );
2035                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2036                                 __kmp_str_free( &buff );
2037                             }
2038                             #endif
2039                         }
2040                     } else {
2041                         *p_lb = 0;
2042                         *p_ub = 0;
2043                         if ( p_st != NULL )
2044                             *p_st = 0;
2045                     }
2046                 } // case
2047                 break;
2048 
2049             case kmp_sch_trapezoidal:
2050                 {
2051                     UT   index;
2052                     T    parm2 = pr->u.p.parm2;
2053                     T    parm3 = pr->u.p.parm3;
2054                     T    parm4 = pr->u.p.parm4;
2055                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2056                                    gtid ) );
2057 
2058                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2059 
2060                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2061                     trip = pr->u.p.tc - 1;
2062 
2063                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2064                         *p_lb = 0;
2065                         *p_ub = 0;
2066                         if ( p_st != NULL ) *p_st = 0;
2067                     } else {
2068                         start = pr->u.p.lb;
2069                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2070                         incr  = pr->u.p.st;
2071 
2072                         if ( (last = (limit >= trip)) != 0 )
2073                             limit = trip;
2074 
2075                         if ( p_st != NULL ) *p_st = incr;
2076 
2077                         if ( incr == 1 ) {
2078                             *p_lb = start + init;
2079                             *p_ub = start + limit;
2080                         } else {
2081                             *p_lb = start + init * incr;
2082                             *p_ub = start + limit * incr;
2083                         }
2084 
2085                         if ( pr->ordered ) {
2086                             pr->u.p.ordered_lower = init;
2087                             pr->u.p.ordered_upper = limit;
2088                             #ifdef KMP_DEBUG
2089                             {
2090                                 const char * buff;
2091                                 // create format specifiers before the debug output
2092                                 buff = __kmp_str_format(
2093                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2094                                     traits_t< UT >::spec, traits_t< UT >::spec );
2095                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2096                                 __kmp_str_free( &buff );
2097                             }
2098                             #endif
2099                         } // if
2100                     } // if
2101                 } // case
2102                 break;
2103             default:
2104                 {
2105                     status = 0; // to avoid complaints on uninitialized variable use
2106                     __kmp_msg(
2107                         kmp_ms_fatal,                        // Severity
2108                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2109                         KMP_HNT( GetNewerLibrary ),          // Hint
2110                         __kmp_msg_null                       // Variadic argument list terminator
2111                     );
2112                 }
2113                 break;
2114             } // switch
2115         } // if tc == 0;
2116 
2117         if ( status == 0 ) {
2118             UT   num_done;
2119 
2120             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2121             #ifdef KMP_DEBUG
2122             {
2123                 const char * buff;
2124                 // create format specifiers before the debug output
2125                 buff = __kmp_str_format(
2126                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2127                     traits_t< UT >::spec );
2128                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2129                 __kmp_str_free( &buff );
2130             }
2131             #endif
2132 
2133             if ( (ST)num_done == team->t.t_nproc-1 ) {
2134                 /* NOTE: release this buffer to be reused */
2135 
2136                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2137 
2138                 sh->u.s.num_done = 0;
2139                 sh->u.s.iteration = 0;
2140 
2141                 /* TODO replace with general release procedure? */
2142                 if ( pr->ordered ) {
2143                     sh->u.s.ordered_iteration = 0;
2144                 }
2145 
2146                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2147 
2148                 sh -> buffer_index += KMP_MAX_DISP_BUF;
2149                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2150                                 gtid, sh->buffer_index) );
2151 
2152                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2153 
2154             } // if
2155             if ( __kmp_env_consistency_check ) {
2156                 if ( pr->pushed_ws != ct_none ) {
2157                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2158                 }
2159             }
2160 
2161             th -> th.th_dispatch -> th_deo_fcn = NULL;
2162             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2163             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2164             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2165         } // if (status == 0)
2166 #if KMP_OS_WINDOWS
2167         else if ( last ) {
2168             pr->u.p.last_upper = pr->u.p.ub;
2169         }
2170 #endif /* KMP_OS_WINDOWS */
2171         if ( p_last != NULL && status != 0 )
2172             *p_last = last;
2173     } // if
2174 
2175     #ifdef KMP_DEBUG
2176     {
2177         const char * buff;
2178         // create format specifiers before the debug output
2179         buff = __kmp_str_format(
2180             "__kmp_dispatch_next: T#%%d normal case: " \
2181             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2182             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2183         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2184         __kmp_str_free( &buff );
2185     }
2186     #endif
2187 #if INCLUDE_SSC_MARKS
2188     SSC_MARK_DISPATCH_NEXT();
2189 #endif
2190     OMPT_LOOP_END;
2191     return status;
2192 }
2193 
2194 template< typename T >
2195 static void
2196 __kmp_dist_get_bounds(
2197     ident_t                          *loc,
2198     kmp_int32                         gtid,
2199     kmp_int32                        *plastiter,
2200     T                                *plower,
2201     T                                *pupper,
2202     typename traits_t< T >::signed_t  incr
2203 ) {
2204     typedef typename traits_t< T >::unsigned_t  UT;
2205     typedef typename traits_t< T >::signed_t    ST;
2206     register kmp_uint32  team_id;
2207     register kmp_uint32  nteams;
2208     register UT          trip_count;
2209     register kmp_team_t *team;
2210     kmp_info_t * th;
2211 
2212     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2213     KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2214     #ifdef KMP_DEBUG
2215     {
2216         const char * buff;
2217         // create format specifiers before the debug output
2218         buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2219             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2220             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2221             traits_t< T >::spec );
2222         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2223         __kmp_str_free( &buff );
2224     }
2225     #endif
2226 
2227     if( __kmp_env_consistency_check ) {
2228         if( incr == 0 ) {
2229             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2230         }
2231         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2232             // The loop is illegal.
2233             // Some zero-trip loops maintained by compiler, e.g.:
2234             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2235             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2236             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2237             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2238             // Compiler does not check the following illegal loops:
2239             //   for(i=0;i<10;i+=incr) // where incr<0
2240             //   for(i=10;i>0;i-=incr) // where incr<0
2241             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2242         }
2243     }
2244     th = __kmp_threads[gtid];
2245     team = th->th.th_team;
2246     #if OMP_40_ENABLED
2247     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2248     nteams = th->th.th_teams_size.nteams;
2249     #endif
2250     team_id = team->t.t_master_tid;
2251     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2252 
2253     // compute global trip count
2254     if( incr == 1 ) {
2255         trip_count = *pupper - *plower + 1;
2256     } else if(incr == -1) {
2257         trip_count = *plower - *pupper + 1;
2258     } else {
2259         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2260     }
2261 
2262     if( trip_count <= nteams ) {
2263         KMP_DEBUG_ASSERT(
2264             __kmp_static == kmp_sch_static_greedy || \
2265             __kmp_static == kmp_sch_static_balanced
2266         ); // Unknown static scheduling type.
2267         // only some teams get single iteration, others get nothing
2268         if( team_id < trip_count ) {
2269             *pupper = *plower = *plower + team_id * incr;
2270         } else {
2271             *plower = *pupper + incr; // zero-trip loop
2272         }
2273         if( plastiter != NULL )
2274             *plastiter = ( team_id == trip_count - 1 );
2275     } else {
2276         if( __kmp_static == kmp_sch_static_balanced ) {
2277             register UT chunk = trip_count / nteams;
2278             register UT extras = trip_count % nteams;
2279             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2280             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2281             if( plastiter != NULL )
2282                 *plastiter = ( team_id == nteams - 1 );
2283         } else {
2284             register T chunk_inc_count =
2285                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2286             register T upper = *pupper;
2287             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2288                 // Unknown static scheduling type.
2289             *plower += team_id * chunk_inc_count;
2290             *pupper = *plower + chunk_inc_count - incr;
2291             // Check/correct bounds if needed
2292             if( incr > 0 ) {
2293                 if( *pupper < *plower )
2294                     *pupper = i_maxmin< T >::mx;
2295                 if( plastiter != NULL )
2296                     *plastiter = *plower <= upper && *pupper > upper - incr;
2297                 if( *pupper > upper )
2298                     *pupper = upper; // tracker C73258
2299             } else {
2300                 if( *pupper > *plower )
2301                     *pupper = i_maxmin< T >::mn;
2302                 if( plastiter != NULL )
2303                     *plastiter = *plower >= upper && *pupper < upper - incr;
2304                 if( *pupper < upper )
2305                     *pupper = upper; // tracker C73258
2306             }
2307         }
2308     }
2309 }
2310 
2311 //-----------------------------------------------------------------------------------------
2312 // Dispatch routines
2313 //    Transfer call to template< type T >
2314 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2315 //                         T lb, T ub, ST st, ST chunk )
2316 extern "C" {
2317 
2318 /*!
2319 @ingroup WORK_SHARING
2320 @{
2321 @param loc Source location
2322 @param gtid Global thread id
2323 @param schedule Schedule type
2324 @param lb  Lower bound
2325 @param ub  Upper bound
2326 @param st  Step (or increment if you prefer)
2327 @param chunk The chunk size to block with
2328 
2329 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2330 These functions are all identical apart from the types of the arguments.
2331 */
2332 
2333 void
2334 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2335                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2336 {
2337     KMP_DEBUG_ASSERT( __kmp_init_serial );
2338     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2339 }
2340 /*!
2341 See @ref __kmpc_dispatch_init_4
2342 */
2343 void
2344 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2345                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2346 {
2347     KMP_DEBUG_ASSERT( __kmp_init_serial );
2348     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2349 }
2350 
2351 /*!
2352 See @ref __kmpc_dispatch_init_4
2353 */
2354 void
2355 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2356                         kmp_int64 lb, kmp_int64 ub,
2357                         kmp_int64 st, kmp_int64 chunk )
2358 {
2359     KMP_DEBUG_ASSERT( __kmp_init_serial );
2360     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2361 }
2362 
2363 /*!
2364 See @ref __kmpc_dispatch_init_4
2365 */
2366 void
2367 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2368                          kmp_uint64 lb, kmp_uint64 ub,
2369                          kmp_int64 st, kmp_int64 chunk )
2370 {
2371     KMP_DEBUG_ASSERT( __kmp_init_serial );
2372     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2373 }
2374 
2375 /*!
2376 See @ref __kmpc_dispatch_init_4
2377 
2378 Difference from __kmpc_dispatch_init set of functions is these functions
2379 are called for composite distribute parallel for construct. Thus before
2380 regular iterations dispatching we need to calc per-team iteration space.
2381 
2382 These functions are all identical apart from the types of the arguments.
2383 */
2384 void
2385 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2386     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2387 {
2388     KMP_DEBUG_ASSERT( __kmp_init_serial );
2389     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2390     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2391 }
2392 
2393 void
2394 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2395     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2396 {
2397     KMP_DEBUG_ASSERT( __kmp_init_serial );
2398     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2399     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2400 }
2401 
2402 void
2403 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2404     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2405 {
2406     KMP_DEBUG_ASSERT( __kmp_init_serial );
2407     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2408     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2409 }
2410 
2411 void
2412 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2413     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2414 {
2415     KMP_DEBUG_ASSERT( __kmp_init_serial );
2416     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2417     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2418 }
2419 
2420 /*!
2421 @param loc Source code location
2422 @param gtid Global thread id
2423 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2424 @param p_lb   Pointer to the lower bound for the next chunk of work
2425 @param p_ub   Pointer to the upper bound for the next chunk of work
2426 @param p_st   Pointer to the stride for the next chunk of work
2427 @return one if there is work to be done, zero otherwise
2428 
2429 Get the next dynamically allocated chunk of work for this thread.
2430 If there is no more work, then the lb,ub and stride need not be modified.
2431 */
2432 int
2433 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2434                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2435 {
2436     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2437 }
2438 
2439 /*!
2440 See @ref __kmpc_dispatch_next_4
2441 */
2442 int
2443 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2444                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2445 {
2446     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2447 }
2448 
2449 /*!
2450 See @ref __kmpc_dispatch_next_4
2451 */
2452 int
2453 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2454                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2455 {
2456     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2457 }
2458 
2459 /*!
2460 See @ref __kmpc_dispatch_next_4
2461 */
2462 int
2463 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2464                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2465 {
2466     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2467 }
2468 
2469 /*!
2470 @param loc Source code location
2471 @param gtid Global thread id
2472 
2473 Mark the end of a dynamic loop.
2474 */
2475 void
2476 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2477 {
2478     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2479 }
2480 
2481 /*!
2482 See @ref __kmpc_dispatch_fini_4
2483 */
2484 void
2485 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2486 {
2487     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2488 }
2489 
2490 /*!
2491 See @ref __kmpc_dispatch_fini_4
2492 */
2493 void
2494 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2495 {
2496     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2497 }
2498 
2499 /*!
2500 See @ref __kmpc_dispatch_fini_4
2501 */
2502 void
2503 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2504 {
2505     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2506 }
2507 /*! @} */
2508 
2509 //-----------------------------------------------------------------------------------------
2510 //Non-template routines from kmp_dispatch.c used in other sources
2511 
2512 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2513     return value == checker;
2514 }
2515 
2516 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2517     return value != checker;
2518 }
2519 
2520 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2521     return value < checker;
2522 }
2523 
2524 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2525     return value >= checker;
2526 }
2527 
2528 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2529     return value <= checker;
2530 }
2531 
2532 kmp_uint32
2533 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2534                    kmp_uint32            checker,
2535                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2536                    , void        * obj    // Higher-level synchronization object, or NULL.
2537                    )
2538 {
2539     // note: we may not belong to a team at this point
2540     register volatile kmp_uint32         * spin          = spinner;
2541     register          kmp_uint32           check         = checker;
2542     register          kmp_uint32   spins;
2543     register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2544     register          kmp_uint32           r;
2545 
2546     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2547     KMP_INIT_YIELD( spins );
2548     // main wait spin loop
2549     while(!f(r = TCR_4(*spin), check)) {
2550         KMP_FSYNC_SPIN_PREPARE( obj );
2551         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2552            It causes problems with infinite recursion because of exit lock */
2553         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2554             __kmp_abort_thread(); */
2555 
2556         /* if we have waited a bit, or are oversubscribed, yield */
2557         /* pause is in the following code */
2558         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2559         KMP_YIELD_SPIN( spins );
2560     }
2561     KMP_FSYNC_SPIN_ACQUIRED( obj );
2562     return r;
2563 }
2564 
2565 } // extern "C"
2566 
2567 #ifdef KMP_GOMP_COMPAT
2568 
2569 void
2570 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2571                            kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2572                            kmp_int32 chunk, int push_ws )
2573 {
2574     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2575                                       push_ws );
2576 }
2577 
2578 void
2579 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2580                             kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2581                             kmp_int32 chunk, int push_ws )
2582 {
2583     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2584                                        push_ws );
2585 }
2586 
2587 void
2588 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2589                            kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2590                            kmp_int64 chunk, int push_ws )
2591 {
2592     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2593                                       push_ws );
2594 }
2595 
2596 void
2597 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2598                             kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2599                             kmp_int64 chunk, int push_ws )
2600 {
2601     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2602                                        push_ws );
2603 }
2604 
2605 void
2606 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2607 {
2608     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2609 }
2610 
2611 void
2612 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2613 {
2614     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2615 }
2616 
2617 void
2618 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2619 {
2620     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2621 }
2622 
2623 void
2624 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2625 {
2626     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2627 }
2628 
2629 #endif /* KMP_GOMP_COMPAT */
2630 
2631 /* ------------------------------------------------------------------------ */
2632 /* ------------------------------------------------------------------------ */
2633 
2634