1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
 * NOTE: __kmp_nth is constant inside any dispatch loop; however,
 *       it may change between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 #include "kmp.h"
29 #include "kmp_i18n.h"
30 #include "kmp_itt.h"
31 #include "kmp_str.h"
32 #include "kmp_error.h"
33 #include "kmp_stats.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35     #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-internal.h"
40 #include "ompt-specific.h"
41 #endif
42 
43 /* ------------------------------------------------------------------------ */
44 /* ------------------------------------------------------------------------ */
45 
46 // template for type limits
47 template< typename T >
48 struct i_maxmin {
49     static const T mx;
50     static const T mn;
51 };
52 template<>
53 struct i_maxmin< int > {
54     static const int mx = 0x7fffffff;
    static const int mn = -0x7fffffff - 1; // avoids implementation-defined conversion of 0x80000000
56 };
57 template<>
58 struct i_maxmin< unsigned int > {
59     static const unsigned int mx = 0xffffffff;
60     static const unsigned int mn = 0x00000000;
61 };
62 template<>
63 struct i_maxmin< long long > {
64     static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = -0x7fffffffffffffffLL - 1; // avoids an out-of-range signed literal
66 };
67 template<>
68 struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffULL;
    static const unsigned long long mn = 0x0000000000000000ULL;
71 };
72 //-------------------------------------------------------------------------
73 
74 #ifdef KMP_STATIC_STEAL_ENABLED
75 
76     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77     template< typename T >
78     struct dispatch_private_infoXX_template {
79         typedef typename traits_t< T >::unsigned_t  UT;
80         typedef typename traits_t< T >::signed_t    ST;
81         UT count;                // unsigned
82         T  ub;
83         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84         T  lb;
85         ST st;                   // signed
86         UT tc;                   // unsigned
87         T  static_steal_counter; // for static_steal only; maybe better to put after ub
88 
89         /* parm[1-4] are used in different ways by different scheduling algorithms */
90 
91         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92         //    a) parm3 is properly aligned and
93         //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // if they are in the same cache line (not measured though).
96 
97         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98             T  parm1;
99             T  parm2;
100             T  parm3;
101             T  parm4;
102         };
103 
104         UT ordered_lower; // unsigned
105         UT ordered_upper; // unsigned
106         #if KMP_OS_WINDOWS
107         T  last_upper;
108         #endif /* KMP_OS_WINDOWS */
109     };
110 
111 #else /* KMP_STATIC_STEAL_ENABLED */
112 
113     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114     template< typename T >
115     struct dispatch_private_infoXX_template {
116         typedef typename traits_t< T >::unsigned_t  UT;
117         typedef typename traits_t< T >::signed_t    ST;
118         T  lb;
119         T  ub;
120         ST st;            // signed
121         UT tc;            // unsigned
122 
123         T  parm1;
124         T  parm2;
125         T  parm3;
126         T  parm4;
127 
128         UT count;         // unsigned
129 
130         UT ordered_lower; // unsigned
131         UT ordered_upper; // unsigned
132         #if KMP_OS_WINDOWS
        T  last_upper;
134         #endif /* KMP_OS_WINDOWS */
135     };
136 
137 #endif /* KMP_STATIC_STEAL_ENABLED */
138 
139 // replaces dispatch_private_info structure and dispatch_private_info_t type
140 template< typename T >
141 struct KMP_ALIGN_CACHE dispatch_private_info_template {
142     // duplicate alignment here, otherwise size of structure is not correct in our compiler
143     union KMP_ALIGN_CACHE private_info_tmpl {
144         dispatch_private_infoXX_template< T > p;
145         dispatch_private_info64_t             p64;
146     } u;
147     enum sched_type schedule;  /* scheduling algorithm */
148     kmp_uint32      ordered;   /* ordered clause specified */
149     kmp_uint32      ordered_bumped;
    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
151     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152     kmp_uint32      nomerge;   /* don't merge iters if serialized */
153     kmp_uint32      type_size;
154     enum cons_type  pushed_ws;
155 };
156 
157 
158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159 template< typename UT >
160 struct dispatch_shared_infoXX_template {
161     /* chunk index under dynamic, number of idle threads under static-steal;
162        iteration index otherwise */
163     volatile UT     iteration;
164     volatile UT     num_done;
165     volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
167 };
168 
169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
170 template< typename UT >
171 struct dispatch_shared_info_template {
172     // we need union here to keep the structure size
173     union shared_info_tmpl {
174         dispatch_shared_infoXX_template< UT >  s;
175         dispatch_shared_info64_t               s64;
176     } u;
177     volatile kmp_uint32     buffer_index;
178 #if OMP_41_ENABLED
179     volatile kmp_int32      doacross_buf_idx;  // teamwise index
180     kmp_uint32             *doacross_flags;    // array of iteration flags (0/1)
181     kmp_int32               doacross_num_done; // count finished threads
182 #endif
183 };
184 
185 /* ------------------------------------------------------------------------ */
186 /* ------------------------------------------------------------------------ */
187 
188 #undef USE_TEST_LOCKS
189 
190 // test_then_add template (general template should NOT be used)
191 template< typename T >
192 static __forceinline T
193 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
194 
195 template<>
196 __forceinline kmp_int32
197 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
198 {
199     kmp_int32 r;
200     r = KMP_TEST_THEN_ADD32( p, d );
201     return r;
202 }
203 
204 template<>
205 __forceinline kmp_int64
206 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
207 {
208     kmp_int64 r;
209     r = KMP_TEST_THEN_ADD64( p, d );
210     return r;
211 }
212 
213 // test_then_inc_acq template (general template should NOT be used)
214 template< typename T >
215 static __forceinline T
216 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
217 
218 template<>
219 __forceinline kmp_int32
220 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
221 {
222     kmp_int32 r;
223     r = KMP_TEST_THEN_INC_ACQ32( p );
224     return r;
225 }
226 
227 template<>
228 __forceinline kmp_int64
229 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
230 {
231     kmp_int64 r;
232     r = KMP_TEST_THEN_INC_ACQ64( p );
233     return r;
234 }
235 
236 // test_then_inc template (general template should NOT be used)
237 template< typename T >
238 static __forceinline T
239 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
240 
241 template<>
242 __forceinline kmp_int32
243 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
244 {
245     kmp_int32 r;
246     r = KMP_TEST_THEN_INC32( p );
247     return r;
248 }
249 
250 template<>
251 __forceinline kmp_int64
252 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
253 {
254     kmp_int64 r;
255     r = KMP_TEST_THEN_INC64( p );
256     return r;
257 }
258 
259 // compare_and_swap template (general template should NOT be used)
260 template< typename T >
261 static __forceinline kmp_int32
262 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
263 
264 template<>
265 __forceinline kmp_int32
266 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
267 {
268     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
269 }
270 
271 template<>
272 __forceinline kmp_int32
273 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
274 {
275     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
276 }
277 
278 /*
279     Spin wait loop that first does pause, then yield.
280     Waits until function returns non-zero when called with *spinner and check.
281     Does NOT put threads to sleep.
282 #if USE_ITT_BUILD
283     Arguments:
        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if the lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
            address, not the address of the low-level spinner.
289 #endif // USE_ITT_BUILD
290 */
291 template< typename UT >
292 // ToDo: make inline function (move to header file for icl)
293 static UT  // unsigned 4- or 8-byte type
294 __kmp_wait_yield( volatile UT * spinner,
295                   UT            checker,
296                   kmp_uint32 (* pred)( UT, UT )
297                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
298                   )
299 {
300     // note: we may not belong to a team at this point
301     register volatile UT         * spin          = spinner;
302     register          UT           check         = checker;
303     register          kmp_uint32   spins;
304     register          kmp_uint32 (*f) ( UT, UT ) = pred;
305     register          UT           r;
306 
307     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
308     KMP_INIT_YIELD( spins );
309     // main wait spin loop
310     while(!f(r = *spin, check))
311     {
312         KMP_FSYNC_SPIN_PREPARE( obj );
313         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
314            It causes problems with infinite recursion because of exit lock */
315         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
316             __kmp_abort_thread(); */
317 
        // If we are oversubscribed, or have waited a bit
        // (and KMP_LIBRARY=throughput), then yield.
        // The pause is in the following code.
321         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
322         KMP_YIELD_SPIN( spins );
323     }
324     KMP_FSYNC_SPIN_ACQUIRED( obj );
325     return r;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_eq( UT value, UT checker) {
330     return value == checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_neq( UT value, UT checker) {
335     return value != checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_lt( UT value, UT checker) {
340     return value < checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_ge( UT value, UT checker) {
345     return value >= checker;
346 }
347 
348 template< typename UT >
349 static kmp_uint32 __kmp_le( UT value, UT checker) {
350     return value <= checker;
351 }
352 
353 
354 /* ------------------------------------------------------------------------ */
355 /* ------------------------------------------------------------------------ */
356 
357 static void
358 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
359 {
360     kmp_info_t *th;
361 
362     KMP_DEBUG_ASSERT( gtid_ref );
363 
364     if ( __kmp_env_consistency_check ) {
365         th = __kmp_threads[*gtid_ref];
366         if ( th -> th.th_root -> r.r_active
367           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
368 #if KMP_USE_DYNAMIC_LOCK
369             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
370 #else
371             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
372 #endif
373         }
374     }
375 }
376 
377 template< typename UT >
378 static void
379 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
380 {
381     typedef typename traits_t< UT >::signed_t    ST;
382     dispatch_private_info_template< UT > * pr;
383 
384     int gtid = *gtid_ref;
385 //    int  cid = *cid_ref;
386     kmp_info_t *th = __kmp_threads[ gtid ];
387     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
388 
389     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
390     if ( __kmp_env_consistency_check ) {
391         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
392             ( th -> th.th_dispatch -> th_dispatch_pr_current );
393         if ( pr -> pushed_ws != ct_none ) {
394 #if KMP_USE_DYNAMIC_LOCK
395             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
396 #else
397             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
398 #endif
399         }
400     }
401 
402     if ( ! th -> th.th_team -> t.t_serialized ) {
403         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
404             ( th -> th.th_dispatch -> th_dispatch_sh_current );
405         UT  lower;
406 
407         if ( ! __kmp_env_consistency_check ) {
408                 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
409                     ( th -> th.th_dispatch -> th_dispatch_pr_current );
410         }
411         lower = pr->u.p.ordered_lower;
412 
413         #if ! defined( KMP_GOMP_COMPAT )
414             if ( __kmp_env_consistency_check ) {
415                 if ( pr->ordered_bumped ) {
416                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
417                     __kmp_error_construct2(
418                         kmp_i18n_msg_CnsMultipleNesting,
419                         ct_ordered_in_pdo, loc_ref,
420                         & p->stack_data[ p->w_top ]
421                     );
422                 }
423             }
424         #endif /* !defined(KMP_GOMP_COMPAT) */
425 
426         KMP_MB();
427         #ifdef KMP_DEBUG
428         {
429             const char * buff;
430             // create format specifiers before the debug output
431             buff = __kmp_str_format(
432                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
433                 traits_t< UT >::spec, traits_t< UT >::spec );
434             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
435             __kmp_str_free( &buff );
436         }
437         #endif
438 
439         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
440                                 USE_ITT_BUILD_ARG( NULL )
441                                 );
442         KMP_MB();  /* is this necessary? */
443         #ifdef KMP_DEBUG
444         {
445             const char * buff;
446             // create format specifiers before the debug output
447             buff = __kmp_str_format(
448                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
449                 traits_t< UT >::spec, traits_t< UT >::spec );
450             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
451             __kmp_str_free( &buff );
452         }
453         #endif
454     }
455     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
456 }
457 
458 static void
459 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
460 {
461     kmp_info_t *th;
462 
463     if ( __kmp_env_consistency_check ) {
464         th = __kmp_threads[*gtid_ref];
465         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
466             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
467         }
468     }
469 }
470 
471 template< typename UT >
472 static void
473 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
474 {
475     typedef typename traits_t< UT >::signed_t    ST;
476     dispatch_private_info_template< UT > * pr;
477 
478     int gtid = *gtid_ref;
479 //    int  cid = *cid_ref;
480     kmp_info_t *th = __kmp_threads[ gtid ];
481     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
482 
483     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
484     if ( __kmp_env_consistency_check ) {
485         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
486             ( th -> th.th_dispatch -> th_dispatch_pr_current );
487         if ( pr -> pushed_ws != ct_none ) {
488             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
489         }
490     }
491 
492     if ( ! th -> th.th_team -> t.t_serialized ) {
493         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
494             ( th -> th.th_dispatch -> th_dispatch_sh_current );
495 
496         if ( ! __kmp_env_consistency_check ) {
497             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
498                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
499         }
500 
501         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
502         #if ! defined( KMP_GOMP_COMPAT )
503             if ( __kmp_env_consistency_check ) {
504                 if ( pr->ordered_bumped != 0 ) {
505                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
506                     /* How to test it? - OM */
507                     __kmp_error_construct2(
508                         kmp_i18n_msg_CnsMultipleNesting,
509                         ct_ordered_in_pdo, loc_ref,
510                         & p->stack_data[ p->w_top ]
511                     );
512                 }
513             }
514         #endif /* !defined(KMP_GOMP_COMPAT) */
515 
516         KMP_MB();       /* Flush all pending memory write invalidates.  */
517 
518         pr->ordered_bumped += 1;
519 
520         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
521                         gtid, pr->ordered_bumped ) );
522 
523         KMP_MB();       /* Flush all pending memory write invalidates.  */
524 
525         /* TODO use general release procedure? */
526         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
527 
528         KMP_MB();       /* Flush all pending memory write invalidates.  */
529     }
530     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
531 }
532 
/* Computes and returns x to the power of y, where y must be a non-negative integer */
534 template< typename UT >
535 static __forceinline long double
536 __kmp_pow(long double x, UT y) {
537     long double s=1.0L;
538 
539     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
540     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
541     while(y) {
542         if ( y & 1 )
543             s *= x;
544         x *= x;
545         y >>= 1;
546     }
547     return s;
548 }
549 
550 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
551    (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken: if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
554 */
555 template< typename T >
556 static __inline typename traits_t< T >::unsigned_t
557 __kmp_dispatch_guided_remaining(
558     T                                  tc,
559     typename traits_t< T >::floating_t base,
560     typename traits_t< T >::unsigned_t idx
561 ) {
562     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
563        least for ICL 8.1, long double arithmetic may not really have
564        long double precision, even with /Qlong_double.  Currently, we
565        workaround that in the caller code, by manipulating the FPCW for
566        Windows* OS on IA-32 architecture.  The lack of precision is not
567        expected to be a correctness issue, though.
568     */
569     typedef typename traits_t< T >::unsigned_t  UT;
570 
571     long double x = tc * __kmp_pow< UT >(base, idx);
572     UT r = (UT) x;
573     if ( x == r )
574         return r;
575     return r + 1;
576 }
577 
578 // Parameters of the guided-iterative algorithm:
579 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
580 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
583 static int guided_int_param = 2;
584 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
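
// Illustrative example with assumed values: with the default n = 2, nproc = 4
// and chunk = 7, the switch point is p2 = 2 * 4 * (7 + 1) = 64 remaining
// iterations and the multiplier is p3 = 0.5 / 4 = 0.125 (see the
// kmp_sch_guided_iterative_chunked case of __kmp_dispatch_init below).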
585 
586 // UT - unsigned flavor of T, ST - signed flavor of T,
587 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
588 template< typename T >
589 static void
590 __kmp_dispatch_init(
591     ident_t                        * loc,
592     int                              gtid,
593     enum sched_type                  schedule,
594     T                                lb,
595     T                                ub,
596     typename traits_t< T >::signed_t st,
597     typename traits_t< T >::signed_t chunk,
598     int                              push_ws
599 ) {
600     typedef typename traits_t< T >::unsigned_t  UT;
601     typedef typename traits_t< T >::signed_t    ST;
602     typedef typename traits_t< T >::floating_t  DBL;
603     static const int ___kmp_size_type = sizeof( UT );
604 
605     int                                            active;
606     T                                              tc;
607     kmp_info_t *                                   th;
608     kmp_team_t *                                   team;
609     kmp_uint32                                     my_buffer_index;
610     dispatch_private_info_template< T >          * pr;
611     dispatch_shared_info_template< UT > volatile * sh;
612 
613     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
614     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
615 
616     if ( ! TCR_4( __kmp_init_parallel ) )
617         __kmp_parallel_initialize();
618 
619 #if INCLUDE_SSC_MARKS
620     SSC_MARK_DISPATCH_INIT();
621 #endif
622     #ifdef KMP_DEBUG
623     {
624         const char * buff;
625         // create format specifiers before the debug output
626         buff = __kmp_str_format(
627             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
628             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
629         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
630         __kmp_str_free( &buff );
631     }
632     #endif
633     /* setup data */
634     th     = __kmp_threads[ gtid ];
635     team   = th -> th.th_team;
636     active = ! team -> t.t_serialized;
637     th->th.th_ident = loc;
638 
639 #if USE_ITT_BUILD
640     kmp_uint64 cur_chunk = chunk;
641     int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
642         KMP_MASTER_GTID(gtid) &&
643 #if OMP_40_ENABLED
644         th->th.th_teams_microtask == NULL &&
645 #endif
646         team->t.t_active_level == 1;
647 #endif
648     if ( ! active ) {
649         pr = reinterpret_cast< dispatch_private_info_template< T >* >
650             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
651     } else {
652         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
653                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
654 
655         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
656 
657         /* What happens when number of threads changes, need to resize buffer? */
658         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
659             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
660         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
661             ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
662     }
663 
    /* Currently just ignore the monotonic and non-monotonic modifiers (the compiler isn't producing them
     * yet anyway).
     * When it does, we'll want to look at them somewhere here and use that information to add to our
     * schedule choice. We shouldn't need to pass them on; they merely affect which schedule we can
     * legally choose for various dynamic cases. (In particular, whether or not a stealing scheme is legal.)
     */
670     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
671 
672     /* Pick up the nomerge/ordered bits from the scheduling type */
673     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
674         pr->nomerge = TRUE;
675         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
676     } else {
677         pr->nomerge = FALSE;
678     }
679     pr->type_size = ___kmp_size_type; // remember the size of variables
680     if ( kmp_ord_lower & schedule ) {
681         pr->ordered = TRUE;
682         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
683     } else {
684         pr->ordered = FALSE;
685     }
686 
687     if ( schedule == kmp_sch_static ) {
688         schedule = __kmp_static;
689     } else {
690         if ( schedule == kmp_sch_runtime ) {
691             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
692             schedule = team -> t.t_sched.r_sched_type;
693             // Detail the schedule if needed (global controls are differentiated appropriately)
694             if ( schedule == kmp_sch_guided_chunked ) {
695                 schedule = __kmp_guided;
696             } else if ( schedule == kmp_sch_static ) {
697                 schedule = __kmp_static;
698             }
699             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
700             chunk = team -> t.t_sched.chunk;
701 #if USE_ITT_BUILD
702             cur_chunk = chunk;
703 #endif
704             #ifdef KMP_DEBUG
705             {
706                 const char * buff;
707                 // create format specifiers before the debug output
708                 buff = __kmp_str_format(
709                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
710                     traits_t< ST >::spec );
711                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
712                 __kmp_str_free( &buff );
713             }
714             #endif
715         } else {
716             if ( schedule == kmp_sch_guided_chunked ) {
717                 schedule = __kmp_guided;
718             }
719             if ( chunk <= 0 ) {
720                 chunk = KMP_DEFAULT_CHUNK;
721             }
722         }
723 
724         if ( schedule == kmp_sch_auto ) {
725             // mapping and differentiation: in the __kmp_do_serial_initialize()
726             schedule = __kmp_auto;
727             #ifdef KMP_DEBUG
728             {
729                 const char * buff;
730                 // create format specifiers before the debug output
731                 buff = __kmp_str_format(
732                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
733                     traits_t< ST >::spec );
734                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
735                 __kmp_str_free( &buff );
736             }
737             #endif
738         }
739 
        /* guided analytical is not safe for too many threads */
741         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
742             schedule = kmp_sch_guided_iterative_chunked;
743             KMP_WARNING( DispatchManyThreads );
744         }
745         pr->u.p.parm1 = chunk;
746     }
747     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
748                 "unknown scheduling type" );
749 
750     pr->u.p.count = 0;
751 
752     if ( __kmp_env_consistency_check ) {
753         if ( st == 0 ) {
754             __kmp_error_construct(
755                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
756                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
757             );
758         }
759     }
760     // compute trip count
761     if ( st == 1 ) {   // most common case
762         if ( ub >= lb ) {
763             tc = ub - lb + 1;
764         } else {   // ub < lb
765             tc = 0;            // zero-trip
766         }
767     } else if ( st < 0 ) {
768         if ( lb >= ub ) {
769             // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
770             //     where the division needs to be unsigned regardless of the result type
771             tc = (UT)(lb - ub) / (-st) + 1;
772         } else {   // lb < ub
773             tc = 0;            // zero-trip
774         }
775     } else {       // st > 0
776         if ( ub >= lb ) {
777             // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
778             //     where the division needs to be unsigned regardless of the result type
779             tc = (UT)(ub - lb) / st + 1;
780         } else {   // ub < lb
781             tc = 0;            // zero-trip
782         }
783     }
784 
785     // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
786     // when statistics are disabled.
787     if (schedule == __kmp_static)
788     {
789         KMP_COUNT_BLOCK(OMP_FOR_static);
790         KMP_COUNT_VALUE(FOR_static_iterations, tc);
791     }
792     else
793     {
794         KMP_COUNT_BLOCK(OMP_FOR_dynamic);
795         KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
796     }
797 
798     pr->u.p.lb = lb;
799     pr->u.p.ub = ub;
800     pr->u.p.st = st;
801     pr->u.p.tc = tc;
802 
803     #if KMP_OS_WINDOWS
804     pr->u.p.last_upper = ub + st;
805     #endif /* KMP_OS_WINDOWS */
806 
    /* NOTE: only the active parallel region(s) have active ordered sections */
808 
809     if ( active ) {
810         if ( pr->ordered == 0 ) {
811             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
812             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
813         } else {
814             pr->ordered_bumped = 0;
815 
816             pr->u.p.ordered_lower = 1;
817             pr->u.p.ordered_upper = 0;
818 
819             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
820             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
821         }
822     }
823 
824     if ( __kmp_env_consistency_check ) {
825         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
826         if ( push_ws ) {
827             __kmp_push_workshare( gtid, ws, loc );
828             pr->pushed_ws = ws;
829         } else {
830             __kmp_check_workshare( gtid, ws, loc );
831             pr->pushed_ws = ct_none;
832         }
833     }
834 
835     switch ( schedule ) {
836     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
837     case kmp_sch_static_steal:
838         {
839             T nproc = team->t.t_nproc;
840             T ntc, init;
841 
842             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
843 
844             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
845             if ( nproc > 1 && ntc >= nproc ) {
846                 T id = __kmp_tid_from_gtid(gtid);
847                 T small_chunk, extras;
848 
849                 small_chunk = ntc / nproc;
850                 extras = ntc % nproc;
851 
852                 init = id * small_chunk + ( id < extras ? id : extras );
853                 pr->u.p.count = init;
854                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
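
                // Illustrative example with assumed values: tc = 100, chunk = 10
                // gives ntc = 10 chunks (ntc is the ceiling of tc / chunk); with
                // nproc = 4, small_chunk = 2 and extras = 2, so the initial
                // [count, ub) chunk-index ranges per thread id are [0,3), [3,6),
                // [6,8), [8,10) -- the first 'extras' threads own one extra chunk.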
855 
856                 pr->u.p.parm2 = lb;
857                 //pr->pfields.parm3 = 0; // it's not used in static_steal
858                 pr->u.p.parm4 = id;
859                 pr->u.p.st = st;
860                 break;
861             } else {
862                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
863                                gtid ) );
864                 schedule = kmp_sch_static_balanced;
865                 /* too few iterations: fall-through to kmp_sch_static_balanced */
866             } // if
867             /* FALL-THROUGH to static balanced */
868         } // case
869     #endif
870     case kmp_sch_static_balanced:
871         {
872             T nproc = team->t.t_nproc;
873             T init, limit;
874 
875             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
876                             gtid ) );
877 
878             if ( nproc > 1 ) {
879                 T id = __kmp_tid_from_gtid(gtid);
880 
881                 if ( tc < nproc ) {
882                     if ( id < tc ) {
883                         init = id;
884                         limit = id;
885                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
886                     } else {
887                         pr->u.p.count = 1;  /* means no more chunks to execute */
888                         pr->u.p.parm1 = FALSE;
889                         break;
890                     }
891                 } else {
892                     T small_chunk = tc / nproc;
893                     T extras = tc % nproc;
894                     init = id * small_chunk + (id < extras ? id : extras);
895                     limit = init + small_chunk - (id < extras ? 0 : 1);
896                     pr->u.p.parm1 = (id == nproc - 1);
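
                    // Illustrative example with assumed values: tc = 10, nproc = 4
                    // gives small_chunk = 2, extras = 2; the per-thread [init, limit]
                    // iteration ranges are [0,2], [3,5], [6,7], [8,9], and only the
                    // last thread (id == 3) gets parm1 (the lastprivate flag) set.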
897                 }
898             } else {
899                 if ( tc > 0 ) {
900                     init = 0;
901                     limit = tc - 1;
902                     pr->u.p.parm1 = TRUE;
903                 } else {
904                     // zero trip count
905                     pr->u.p.count = 1;  /* means no more chunks to execute */
906                     pr->u.p.parm1 = FALSE;
907                     break;
908                 }
909             }
910 #if USE_ITT_BUILD
911             // Calculate chunk for metadata report
912             if ( itt_need_metadata_reporting )
913                 cur_chunk = limit - init + 1;
914 #endif
915             if ( st == 1 ) {
916                 pr->u.p.lb = lb + init;
917                 pr->u.p.ub = lb + limit;
918             } else {
919                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
920                 pr->u.p.lb = lb + init * st;
921                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
922                 if ( st > 0 ) {
923                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
924                 } else {
925                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
926                 }
927             }
928             if ( pr->ordered ) {
929                 pr->u.p.ordered_lower = init;
930                 pr->u.p.ordered_upper = limit;
931             }
932             break;
933         } // case
934     case kmp_sch_guided_iterative_chunked :
935         {
936             T nproc = team->t.t_nproc;
937             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
938 
939             if ( nproc > 1 ) {
940                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
941                     /* chunk size too large, switch to dynamic */
942                     schedule = kmp_sch_dynamic_chunked;
943                 } else {
944                     // when remaining iters become less than parm2 - switch to dynamic
945                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
946                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
947                 }
948             } else {
949                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
950                 schedule = kmp_sch_static_greedy;
951                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
952                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
953                 pr->u.p.parm1 = tc;
954             } // if
955         } // case
956         break;
957     case kmp_sch_guided_analytical_chunked:
958         {
959             T nproc = team->t.t_nproc;
960             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
961 
962             if ( nproc > 1 ) {
963                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
964                     /* chunk size too large, switch to dynamic */
965                     schedule = kmp_sch_dynamic_chunked;
966                 } else {
967                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
968                     DBL x;
969 
                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
981                     // save original FPCW and set precision to 64-bit, as
982                     // Windows* OS on IA-32 architecture defaults to 53-bit
983                     unsigned int oldFpcw = _control87(0,0);
984                     _control87(_PC_64,_MCW_PC); // 0,0x30000
985                     #endif
986                     /* value used for comparison in solver for cross-over point */
987                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
988 
                    /* crossover point--chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
991                     UT   cross;
992 
993                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
994                     x = (long double)1.0 - (long double)0.5 / nproc;
995 
996                     #ifdef KMP_DEBUG
997                     { // test natural alignment
998                         struct _test_a {
999                             char a;
1000                             union {
1001                                 char b;
1002                                 DBL  d;
1003                             };
1004                         } t;
1005                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
1006                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
1007                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
1008                     }
1009                     #endif // KMP_DEBUG
1010 
1011                     /* save the term in thread private dispatch structure */
1012                     *(DBL*)&pr->u.p.parm3 = x;
1013 
1014                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
1015                     {
1016                         UT          left, right, mid;
1017                         long double p;
1018 
1019                         /* estimate initial upper and lower bound */
1020 
1021                         /* doesn't matter what value right is as long as it is positive, but
1022                            it affects performance of the solver
1023                         */
1024                         right = 229;
1025                         p = __kmp_pow< UT >(x,right);
1026                         if ( p > target ) {
1027                             do{
1028                                 p *= p;
1029                                 right <<= 1;
1030                             } while(p>target && right < (1<<27));
1031                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1032                         } else {
1033                             left = 0;
1034                         }
1035 
1036                         /* bisection root-finding method */
1037                         while ( left + 1 < right ) {
1038                             mid = (left + right) / 2;
1039                             if ( __kmp_pow< UT >(x,mid) > target ) {
1040                                 left = mid;
1041                             } else {
1042                                 right = mid;
1043                             }
1044                         } // while
1045                         cross = right;
1046                     }
1047                     /* assert sanity of computed crossover point */
1048                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
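
                    // Illustrative example with assumed values: nproc = 4, chunk = 7,
                    // tc = 1000 gives target = (2*7 + 1) * 4 / 1000 = 0.06 and
                    // x = 1 - 0.5/4 = 0.875; since 0.875^21 ~ 0.0606 > 0.06 and
                    // 0.875^22 ~ 0.0530 <= 0.06, the bisection yields cross = 22.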
1049 
1050                     /* save the crossover point in thread private dispatch structure */
1051                     pr->u.p.parm2 = cross;
1052 
1053                     // C75803
1054                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1055                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1056                     #else
1057                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1058                     #endif
1059                     /* dynamic-style scheduling offset */
1060                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1061                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1062                         // restore FPCW
1063                         _control87(oldFpcw,_MCW_PC);
1064                     #endif
1065                 } // if
1066             } else {
1067                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1068                                gtid ) );
1069                 schedule = kmp_sch_static_greedy;
1070                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1071                 pr->u.p.parm1 = tc;
1072             } // if
1073         } // case
1074         break;
1075     case kmp_sch_static_greedy:
1076         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1077             pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1078                 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1079                 tc;
1080         break;
1081     case kmp_sch_static_chunked :
1082     case kmp_sch_dynamic_chunked :
1083         if ( pr->u.p.parm1 <= 0 ) {
1084             pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1085         }
1086         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1087         break;
1088     case kmp_sch_trapezoidal :
1089         {
1090             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1091 
1092             T parm1, parm2, parm3, parm4;
1093             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1094 
1095             parm1 = chunk;
1096 
1097             /* F : size of the first cycle */
1098             parm2 = ( tc / (2 * team->t.t_nproc) );
1099 
1100             if ( parm2 < 1 ) {
1101                 parm2 = 1;
1102             }
1103 
1104             /* L : size of the last cycle.  Make sure the last cycle
1105              *     is not larger than the first cycle.
1106              */
1107             if ( parm1 < 1 ) {
1108                 parm1 = 1;
1109             } else if ( parm1 > parm2 ) {
1110                 parm1 = parm2;
1111             }
1112 
1113             /* N : number of cycles */
1114             parm3 = ( parm2 + parm1 );
1115             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1116 
1117             if ( parm3 < 2 ) {
1118                 parm3 = 2;
1119             }
1120 
1121             /* sigma : decreasing incr of the trapezoid */
1122             parm4 = ( parm3 - 1 );
1123             parm4 = ( parm2 - parm1 ) / parm4;
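
            // Illustrative example with assumed values: tc = 100, nproc = 2,
            // chunk = 1 gives F = parm2 = 100/4 = 25, L = parm1 = 1,
            // N = parm3 = (2*100 + 26 - 1) / 26 = 8 cycles and sigma = parm4 =
            // (25 - 1) / 7 = 3, i.e. chunk sizes 25, 22, 19, ... down to 4.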
1124 
1125             // pointless check, because parm4 >= 0 always
1126             //if ( parm4 < 0 ) {
1127             //    parm4 = 0;
1128             //}
1129 
1130             pr->u.p.parm1 = parm1;
1131             pr->u.p.parm2 = parm2;
1132             pr->u.p.parm3 = parm3;
1133             pr->u.p.parm4 = parm4;
1134         } // case
1135         break;
1136 
1137     default:
1138         {
1139             __kmp_msg(
1140                 kmp_ms_fatal,                        // Severity
1141                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1142                 KMP_HNT( GetNewerLibrary ),          // Hint
1143                 __kmp_msg_null                       // Variadic argument list terminator
1144             );
1145         }
1146         break;
1147     } // switch
1148     pr->schedule = schedule;
1149     if ( active ) {
        /* The buffer indexed by my_buffer_index is free to use once sh->buffer_index reaches my_buffer_index */
1151 
1152         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1153                         gtid, my_buffer_index, sh->buffer_index) );
1154         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1155                                         USE_ITT_BUILD_ARG( NULL )
1156                                         );
        // Note: KMP_WAIT_YIELD() cannot be used here: buffer index and my_buffer_index are
        // *always* 32-bit integers.
1159         KMP_MB();  /* is this necessary? */
1160         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1161                         gtid, my_buffer_index, sh->buffer_index) );
1162 
1163         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1164         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1165 #if USE_ITT_BUILD
1166         if ( pr->ordered ) {
1167             __kmp_itt_ordered_init( gtid );
1168         }; // if
1169         // Report loop metadata
1170         if ( itt_need_metadata_reporting ) {
1171             // Only report metadata by master of active team at level 1
1172             kmp_uint64 schedtype = 0;
1173             switch ( schedule ) {
1174             case kmp_sch_static_chunked:
1175             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1176                 break;
1177             case kmp_sch_static_greedy:
1178                 cur_chunk = pr->u.p.parm1;
1179                 break;
1180             case kmp_sch_dynamic_chunked:
1181                 schedtype = 1;
1182                 break;
1183             case kmp_sch_guided_iterative_chunked:
1184             case kmp_sch_guided_analytical_chunked:
1185                 schedtype = 2;
1186                 break;
1187             default:
1188 //            Should we put this case under "static"?
1189 //            case kmp_sch_static_steal:
1190                 schedtype = 3;
1191                 break;
1192             }
1193             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1194         }
1195 #endif /* USE_ITT_BUILD */
1196     }; // if
1197 
1198     #ifdef KMP_DEBUG
1199     {
1200         const char * buff;
1201         // create format specifiers before the debug output
1202         buff = __kmp_str_format(
1203             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1204             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1205             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1206             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1207             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1208             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1209             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1210         KD_TRACE(10, ( buff,
1211             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1212             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1213             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1214             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1215         __kmp_str_free( &buff );
1216     }
1217     #endif
1218     #if ( KMP_STATIC_STEAL_ENABLED )
1219     if ( ___kmp_size_type < 8 ) {
      // It cannot be guaranteed that after execution of a loop with some other schedule kind
      // all the parm3 variables will contain the same value.
      // Even if all parm3 values were the same, there would still be a bad case, such as using 0 and 1
      // rather than a program life-time increment.
      // So a dedicated variable is required; the 'static_steal_counter' is used.
1225       if( schedule == kmp_sch_static_steal ) {
        // Other threads will inspect this variable when searching for a victim.
        // This is a flag showing that other threads may steal from this thread from now on.
1228         volatile T * p = &pr->u.p.static_steal_counter;
1229         *p = *p + 1;
1230       }
1231     }
    #endif // ( KMP_STATIC_STEAL_ENABLED )
1233 
1234 #if OMPT_SUPPORT && OMPT_TRACE
1235     if (ompt_enabled &&
1236         ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1237         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1238         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1239         ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1240             team_info->parallel_id, task_info->task_id, team_info->microtask);
1241     }
1242 #endif
1243 }
1244 
1245 /*
1246  * For ordered loops, either __kmp_dispatch_finish() should be called after
1247  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1248  * every chunk of iterations.  If the ordered section(s) were not executed
1249  * for this iteration (or every iteration in this chunk), we need to set the
1250  * ordered iteration counters so that the next thread can proceed.
1251  */
1252 template< typename UT >
1253 static void
1254 __kmp_dispatch_finish( int gtid, ident_t *loc )
1255 {
1256     typedef typename traits_t< UT >::signed_t ST;
1257     kmp_info_t *th = __kmp_threads[ gtid ];
1258 
1259     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1260     if ( ! th -> th.th_team -> t.t_serialized ) {
1261 
1262         dispatch_private_info_template< UT > * pr =
1263             reinterpret_cast< dispatch_private_info_template< UT >* >
1264             ( th->th.th_dispatch->th_dispatch_pr_current );
1265         dispatch_shared_info_template< UT > volatile * sh =
1266             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1267             ( th->th.th_dispatch->th_dispatch_sh_current );
1268         KMP_DEBUG_ASSERT( pr );
1269         KMP_DEBUG_ASSERT( sh );
1270         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1271                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1272 
1273         if ( pr->ordered_bumped ) {
1274             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1275                             gtid ) );
1276             pr->ordered_bumped = 0;
1277         } else {
1278             UT lower = pr->u.p.ordered_lower;
1279 
1280             #ifdef KMP_DEBUG
1281             {
1282                 const char * buff;
1283                 // create format specifiers before the debug output
1284                 buff = __kmp_str_format(
1285                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1286                     traits_t< UT >::spec, traits_t< UT >::spec );
1287                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1288                 __kmp_str_free( &buff );
1289             }
1290             #endif
1291 
1292             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1293                                    USE_ITT_BUILD_ARG(NULL)
1294                                    );
1295             KMP_MB();  /* is this necessary? */
1296             #ifdef KMP_DEBUG
1297             {
1298                 const char * buff;
1299                 // create format specifiers before the debug output
1300                 buff = __kmp_str_format(
1301                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1302                     traits_t< UT >::spec, traits_t< UT >::spec );
1303                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1304                 __kmp_str_free( &buff );
1305             }
1306             #endif
1307 
1308             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1309         } // if
1310     } // if
1311     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1312 }
1313 
1314 #ifdef KMP_GOMP_COMPAT
1315 
1316 template< typename UT >
1317 static void
1318 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1319 {
1320     typedef typename traits_t< UT >::signed_t ST;
1321     kmp_info_t *th = __kmp_threads[ gtid ];
1322 
1323     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1324     if ( ! th -> th.th_team -> t.t_serialized ) {
1325 //        int cid;
1326         dispatch_private_info_template< UT > * pr =
1327             reinterpret_cast< dispatch_private_info_template< UT >* >
1328             ( th->th.th_dispatch->th_dispatch_pr_current );
1329         dispatch_shared_info_template< UT > volatile * sh =
1330             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1331             ( th->th.th_dispatch->th_dispatch_sh_current );
1332         KMP_DEBUG_ASSERT( pr );
1333         KMP_DEBUG_ASSERT( sh );
1334         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1335                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1336 
1337 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1338             UT lower = pr->u.p.ordered_lower;
1339             UT upper = pr->u.p.ordered_upper;
1340             UT inc = upper - lower + 1;
1341 
1342             if ( pr->ordered_bumped == inc ) {
1343                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1344                   gtid ) );
1345                 pr->ordered_bumped = 0;
1346             } else {
1347                 inc -= pr->ordered_bumped;
1348 
1349                 #ifdef KMP_DEBUG
1350                 {
1351                     const char * buff;
1352                     // create format specifiers before the debug output
1353                     buff = __kmp_str_format(
1354                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1355                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1356                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1357                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1358                     __kmp_str_free( &buff );
1359                 }
1360                 #endif
1361 
1362                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1363                                        USE_ITT_BUILD_ARG(NULL)
1364                                        );
1365 
1366                 KMP_MB();  /* is this necessary? */
1367                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1368                   gtid ) );
1369                 pr->ordered_bumped = 0;
// TODO: check whether inc should be unsigned or signed
1371                 #ifdef KMP_DEBUG
1372                 {
1373                     const char * buff;
1374                     // create format specifiers before the debug output
1375                     buff = __kmp_str_format(
1376                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1377                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1378                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1379                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1380                     __kmp_str_free( &buff );
1381                 }
1382                 #endif
1383 
1384                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1385             }
1386 //        }
1387     }
1388     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1389 }
1390 
1391 #endif /* KMP_GOMP_COMPAT */
1392 
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
 * (no more work), then tell OMPT the loop is over. In some cases
 * __kmpc_dispatch_fini_*() is not called. */
1396 #if OMPT_SUPPORT && OMPT_TRACE
1397 #define OMPT_LOOP_END                                                          \
1398     if (status == 0) {                                                         \
1399         if (ompt_enabled &&                     \
1400             ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
1401             ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
1402             ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
1403             ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
1404                 team_info->parallel_id, task_info->task_id);                   \
1405         }                                                                      \
1406     }
1407 #else
1408 #define OMPT_LOOP_END // no-op
1409 #endif
1410 
1411 template< typename T >
1412 static int
1413 __kmp_dispatch_next(
1414     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1415 ) {
1416 
1417     typedef typename traits_t< T >::unsigned_t  UT;
1418     typedef typename traits_t< T >::signed_t    ST;
1419     typedef typename traits_t< T >::floating_t  DBL;
1420 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1421     static const int ___kmp_size_type = sizeof( UT );
1422 #endif
1423 
    // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
    // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
    // more than a compile time choice to use static scheduling would.)
1427     KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1428 
1429     int                                   status;
1430     dispatch_private_info_template< T > * pr;
1431     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1432     kmp_team_t                          * team = th -> th.th_team;
1433 
1434     KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1435     #ifdef KMP_DEBUG
1436     {
1437         const char * buff;
1438         // create format specifiers before the debug output
1439         buff = __kmp_str_format(
1440             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1441             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1442         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1443         __kmp_str_free( &buff );
1444     }
1445     #endif
1446 
1447     if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1449         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1450             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1451         KMP_DEBUG_ASSERT( pr );
1452 
1453         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1454             *p_lb = 0;
1455             *p_ub = 0;
1456 //            if ( p_last != NULL )
1457 //                *p_last = 0;
1458             if ( p_st != NULL )
1459                 *p_st = 0;
1460             if ( __kmp_env_consistency_check ) {
1461                 if ( pr->pushed_ws != ct_none ) {
1462                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1463                 }
1464             }
1465         } else if ( pr->nomerge ) {
1466             kmp_int32 last;
1467             T         start;
1468             UT        limit, trip, init;
1469             ST        incr;
1470             T         chunk = pr->u.p.parm1;
1471 
1472             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1473 
1474             init = chunk * pr->u.p.count++;
1475             trip = pr->u.p.tc - 1;
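            // Worked example: with chunk == 4, the first call (count == 0)
            // yields init == 0, the second init == 4, and so on; status stays
            // nonzero while init <= trip (tc - 1, the last iteration index).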
1476 
1477             if ( (status = (init <= trip)) == 0 ) {
1478                 *p_lb = 0;
1479                 *p_ub = 0;
1480 //                if ( p_last != NULL )
1481 //                    *p_last = 0;
1482                 if ( p_st != NULL )
1483                     *p_st = 0;
1484                 if ( __kmp_env_consistency_check ) {
1485                     if ( pr->pushed_ws != ct_none ) {
1486                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1487                     }
1488                 }
1489             } else {
1490                 start = pr->u.p.lb;
1491                 limit = chunk + init - 1;
1492                 incr  = pr->u.p.st;
1493 
1494                 if ( (last = (limit >= trip)) != 0 ) {
1495                     limit = trip;
1496                     #if KMP_OS_WINDOWS
1497                     pr->u.p.last_upper = pr->u.p.ub;
1498                     #endif /* KMP_OS_WINDOWS */
1499                 }
1500                 if ( p_last != NULL )
1501                     *p_last = last;
1502                 if ( p_st != NULL )
1503                     *p_st = incr;
1504                 if ( incr == 1 ) {
1505                     *p_lb = start + init;
1506                     *p_ub = start + limit;
1507                 } else {
1508                     *p_lb = start + init * incr;
1509                     *p_ub = start + limit * incr;
1510                 }
1511 
1512                 if ( pr->ordered ) {
1513                     pr->u.p.ordered_lower = init;
1514                     pr->u.p.ordered_upper = limit;
1515                     #ifdef KMP_DEBUG
1516                     {
1517                         const char * buff;
1518                         // create format specifiers before the debug output
1519                         buff = __kmp_str_format(
1520                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1521                             traits_t< UT >::spec, traits_t< UT >::spec );
1522                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1523                         __kmp_str_free( &buff );
1524                     }
1525                     #endif
1526                 } // if
1527             } // if
1528         } else {
1529             pr->u.p.tc = 0;
1530             *p_lb = pr->u.p.lb;
1531             *p_ub = pr->u.p.ub;
1532             #if KMP_OS_WINDOWS
1533             pr->u.p.last_upper = *p_ub;
1534             #endif /* KMP_OS_WINDOWS */
1535             if ( p_last != NULL )
1536                 *p_last = TRUE;
1537             if ( p_st != NULL )
1538                 *p_st = pr->u.p.st;
1539         } // if
1540         #ifdef KMP_DEBUG
1541         {
1542             const char * buff;
1543             // create format specifiers before the debug output
1544             buff = __kmp_str_format(
1545                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1546                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1547                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
            KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, p_last ? *p_last : 0, status) );
1549             __kmp_str_free( &buff );
1550         }
1551         #endif
1552 #if INCLUDE_SSC_MARKS
1553         SSC_MARK_DISPATCH_NEXT();
1554 #endif
1555         OMPT_LOOP_END;
1556         return status;
1557     } else {
1558         kmp_int32 last = 0;
1559         dispatch_shared_info_template< UT > *sh;
1560         T         start;
1561         ST        incr;
1562         UT        limit, trip, init;
1563 
1564         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1565                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1566 
1567         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1568             ( th->th.th_dispatch->th_dispatch_pr_current );
1569         KMP_DEBUG_ASSERT( pr );
1570         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1571             ( th->th.th_dispatch->th_dispatch_sh_current );
1572         KMP_DEBUG_ASSERT( sh );
1573 
1574         if ( pr->u.p.tc == 0 ) {
1575             // zero trip count
1576             status = 0;
1577         } else {
1578             switch (pr->schedule) {
1579             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1580             case kmp_sch_static_steal:
1581                 {
1582                     T chunk = pr->u.p.parm1;
1583 
1584                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1585 
1586                     trip = pr->u.p.tc - 1;
1587 
1588                     if ( ___kmp_size_type > 4 ) {
                        // Other threads do not look into the data of this thread,
                        // so no volatile cast is necessary.
1591                         init   = ( pr->u.p.count )++;
1592                         status = ( init < (UT)pr->u.p.ub );
1593                     } else {
1594                         typedef union {
1595                             struct {
1596                                 UT count;
1597                                 T  ub;
1598                             } p;
1599                             kmp_int64 b;
1600                         } union_i4;
                        // Any operation on 'count' or 'ub' must update the pair atomically.
                        // Stealing is implemented only for 4-byte indexes.
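                        // The count/ub pair is packed into one 64-bit word so
                        // that the owner can claim the next chunk (count++) and
                        // a thief can shrink the range (lowering ub) with a
                        // single 64-bit compare-and-swap, keeping the pair
                        // consistent without a lock.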
1603                         {
1604                             union_i4 vold, vnew;
1605                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1606                             vnew = vold;
1607                             vnew.p.count++;
1608                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1609                                         ( volatile kmp_int64* )&pr->u.p.count,
1610                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1611                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1612                                 KMP_CPU_PAUSE();
1613                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1614                                 vnew = vold;
1615                                 vnew.p.count++;
1616                             }
1617                             vnew = vold;
1618                             init   = vnew.p.count;
                            status = ( init < (UT)vnew.p.ub );
1620                         }
1621 
1622                         if( !status ) {
1623                             kmp_info_t   **other_threads = team->t.t_threads;
1624                             int          while_limit = 10;
1625                             int          while_index = 0;
1626 
1627                             // TODO: algorithm of searching for a victim
1628                             // should be cleaned up and measured
1629                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1630                                 union_i4  vold, vnew;
1631                                 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1632                                 T         victimIdx    = pr->u.p.parm4;
1633                                 T         oldVictimIdx = victimIdx;
1634                                 dispatch_private_info_template< T > * victim;
1635 
1636                                 do {
1637                                     if( !victimIdx ) {
1638                                         victimIdx = team->t.t_nproc - 1;
1639                                     } else {
1640                                         --victimIdx;
1641                                     }
1642                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1643                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1644                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
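                                // victimIdx has now walked downward (with
                                // wrap-around) from the previous victim kept in
                                // parm4, stopping at the first candidate with a
                                // live dispatch buffer other than our own, or
                                // after one full circle.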
1645                                 // TODO: think about a proper place of this test
1646                                 if ( ( !victim ) ||
1647                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1648                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1649                                     // TODO: delay would be nice
1650                                     continue;
1651                                     // the victim is not ready yet to participate in stealing
1652                                     // because the victim is still in kmp_init_dispatch
1653                                 }
1654                                 if ( oldVictimIdx == victimIdx ) {
1655                                     break;
1656                                 }
1657                                 pr->u.p.parm4 = victimIdx;
1658 
1659                                 while( 1 ) {
1660                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1661                                     vnew = vold;
1662 
1663                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1664                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1665                                         break;
1666                                     }
1667                                     vnew.p.ub -= (remaining >> 2);
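                                    // Steal roughly a quarter of the victim's
                                    // remaining chunks by lowering its ub;
                                    // e.g. remaining == 8 transfers 2 chunks.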
1668                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1669                                     #pragma warning( push )
1670                                     // disable warning on pointless comparison of unsigned with 0
1671                                     #pragma warning( disable: 186 )
1672                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1673                                     #pragma warning( pop )
1674                                     // TODO: Should this be acquire or release?
1675                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1676                                             ( volatile kmp_int64 * )&victim->u.p.count,
1677                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1678                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1679                                         status = 1;
1680                                         while_index = 0;
1681                                         // now update own count and ub
1682                                         #if KMP_ARCH_X86
1683                                         // stealing executed on non-KMP_ARCH_X86 only
1684                                             // Atomic 64-bit write on ia32 is
1685                                             // unavailable, so we do this in steps.
1686                                             //     This code is not tested.
1687                                             init = vold.p.count;
1688                                             pr->u.p.ub = 0;
1689                                             pr->u.p.count = init + 1;
1690                                             pr->u.p.ub = vnew.p.count;
1691                                         #else
1692                                             init = vnew.p.ub;
1693                                             vold.p.count = init + 1;
                                            // TODO: is this store safe and sufficient?
1695                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1696                                         #endif // KMP_ARCH_X86
1697                                         break;
1698                                     } // if
1699                                 KMP_CPU_PAUSE();
1700                                 } // while (1)
1701                             } // while
1702                         } // if
1703                     } // if
1704                     if ( !status ) {
1705                         *p_lb = 0;
1706                         *p_ub = 0;
1707                         if ( p_st != NULL ) *p_st = 0;
1708                     } else {
1709                         start = pr->u.p.parm2;
1710                         init *= chunk;
1711                         limit = chunk + init - 1;
1712                         incr  = pr->u.p.st;
1713 
1714                         KMP_DEBUG_ASSERT(init <= trip);
1715                         if ( (last = (limit >= trip)) != 0 )
1716                             limit = trip;
1717                         if ( p_st != NULL ) *p_st = incr;
1718 
1719                         if ( incr == 1 ) {
1720                             *p_lb = start + init;
1721                             *p_ub = start + limit;
1722                         } else {
1723                             *p_lb = start + init * incr;
1724                             *p_ub = start + limit * incr;
1725                         }
1726 
1727                         if ( pr->ordered ) {
1728                             pr->u.p.ordered_lower = init;
1729                             pr->u.p.ordered_upper = limit;
1730                             #ifdef KMP_DEBUG
1731                             {
1732                                 const char * buff;
1733                                 // create format specifiers before the debug output
1734                                 buff = __kmp_str_format(
1735                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1736                                     traits_t< UT >::spec, traits_t< UT >::spec );
1737                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1738                                 __kmp_str_free( &buff );
1739                             }
1740                             #endif
1741                         } // if
1742                     } // if
1743                     break;
1744                 } // case
1745             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1746             case kmp_sch_static_balanced:
1747                 {
1748                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1749                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1750                         pr->u.p.count = 1;
1751                         *p_lb = pr->u.p.lb;
1752                         *p_ub = pr->u.p.ub;
1753                         last = pr->u.p.parm1;
1754                         if ( p_st != NULL )
1755                             *p_st = pr->u.p.st;
1756                     } else {  /* no iterations to do */
1757                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1758                     }
1759                     if ( pr->ordered ) {
1760                         #ifdef KMP_DEBUG
1761                         {
1762                             const char * buff;
1763                             // create format specifiers before the debug output
1764                             buff = __kmp_str_format(
1765                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1766                                 traits_t< UT >::spec, traits_t< UT >::spec );
1767                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1768                             __kmp_str_free( &buff );
1769                         }
1770                         #endif
1771                     } // if
1772                 } // case
1773                 break;
1774             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1775             case kmp_sch_static_chunked:
1776                 {
1777                     T parm1;
1778 
                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
                                   gtid ) );
1781                     parm1 = pr->u.p.parm1;
1782 
1783                     trip  = pr->u.p.tc - 1;
1784                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
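                    // E.g. with chunk parm1 == 5, nproc == 4 and tid == 1:
                    // the first call computes init == 5 (count == 0); count is
                    // then advanced by nproc below, so this thread's next call
                    // starts at init == 5 * (4 + 1) == 25.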
1785 
1786                     if ( (status = (init <= trip)) != 0 ) {
1787                         start = pr->u.p.lb;
1788                         incr  = pr->u.p.st;
1789                         limit = parm1 + init - 1;
1790 
1791                         if ( (last = (limit >= trip)) != 0 )
1792                             limit = trip;
1793 
1794                         if ( p_st != NULL ) *p_st = incr;
1795 
1796                         pr->u.p.count += team->t.t_nproc;
1797 
1798                         if ( incr == 1 ) {
1799                             *p_lb = start + init;
1800                             *p_ub = start + limit;
1801                         }
1802                         else {
1803                             *p_lb = start + init * incr;
1804                             *p_ub = start + limit * incr;
1805                         }
1806 
1807                         if ( pr->ordered ) {
1808                             pr->u.p.ordered_lower = init;
1809                             pr->u.p.ordered_upper = limit;
1810                             #ifdef KMP_DEBUG
1811                             {
1812                                 const char * buff;
1813                                 // create format specifiers before the debug output
1814                                 buff = __kmp_str_format(
1815                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1816                                     traits_t< UT >::spec, traits_t< UT >::spec );
1817                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1818                                 __kmp_str_free( &buff );
1819                             }
1820                             #endif
1821                         } // if
1822                     } // if
1823                 } // case
1824                 break;
1825 
1826             case kmp_sch_dynamic_chunked:
1827                 {
1828                     T chunk = pr->u.p.parm1;
1829 
1830                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1831                                    gtid ) );
1832 
1833                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1834                     trip = pr->u.p.tc - 1;
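                    // Threads race on the shared iteration counter: the k-th
                    // successful increment claims iterations
                    // [k*chunk, k*chunk + chunk - 1], clipped to trip below.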
1835 
1836                     if ( (status = (init <= trip)) == 0 ) {
1837                         *p_lb = 0;
1838                         *p_ub = 0;
1839                         if ( p_st != NULL ) *p_st = 0;
1840                     } else {
1841                         start = pr->u.p.lb;
1842                         limit = chunk + init - 1;
1843                         incr  = pr->u.p.st;
1844 
1845                         if ( (last = (limit >= trip)) != 0 )
1846                             limit = trip;
1847 
1848                         if ( p_st != NULL ) *p_st = incr;
1849 
1850                         if ( incr == 1 ) {
1851                             *p_lb = start + init;
1852                             *p_ub = start + limit;
1853                         } else {
1854                             *p_lb = start + init * incr;
1855                             *p_ub = start + limit * incr;
1856                         }
1857 
1858                         if ( pr->ordered ) {
1859                             pr->u.p.ordered_lower = init;
1860                             pr->u.p.ordered_upper = limit;
1861                             #ifdef KMP_DEBUG
1862                             {
1863                                 const char * buff;
1864                                 // create format specifiers before the debug output
1865                                 buff = __kmp_str_format(
1866                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1867                                     traits_t< UT >::spec, traits_t< UT >::spec );
1868                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1869                                 __kmp_str_free( &buff );
1870                             }
1871                             #endif
1872                         } // if
1873                     } // if
1874                 } // case
1875                 break;
1876 
1877             case kmp_sch_guided_iterative_chunked:
1878                 {
1879                     T  chunkspec = pr->u.p.parm1;
1880                     KD_TRACE(100,
1881                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1882                     trip  = pr->u.p.tc;
1883                     // Start atomic part of calculations
1884                     while(1) {
1885                         ST  remaining;             // signed, because can be < 0
1886                         init = sh->u.s.iteration;  // shared value
1887                         remaining = trip - init;
1888                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1889                             // nothing to do, don't try atomic op
1890                             status = 0;
1891                             break;
1892                         }
1893                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                            // use dynamic-style schedule
                            // atomically increment iterations, get old value
1896                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1897                             remaining = trip - init;
1898                             if (remaining <= 0) {
1899                                 status = 0;    // all iterations got by other threads
1900                             } else {
1901                                 // got some iterations to work on
1902                                 status = 1;
1903                                 if ( (T)remaining > chunkspec ) {
1904                                     limit = init + chunkspec - 1;
1905                                 } else {
1906                                     last = 1;   // the last chunk
1907                                     limit = init + remaining - 1;
1908                                 } // if
1909                             } // if
1910                             break;
1911                         } // if
1912                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
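                        // parm3 caches the guided shrink factor as a double,
                        // roughly 1/(K*nproc); e.g. with 1000 iterations
                        // remaining and a factor of 0.125 this thread tries to
                        // claim 125 iterations via the CAS below.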
1913                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1914                             // CAS was successful, chunk obtained
1915                             status = 1;
1916                             --limit;
1917                             break;
1918                         } // if
1919                     } // while
1920                     if ( status != 0 ) {
1921                         start = pr->u.p.lb;
1922                         incr = pr->u.p.st;
1923                         if ( p_st != NULL )
1924                             *p_st = incr;
1925                         *p_lb = start + init * incr;
1926                         *p_ub = start + limit * incr;
1927                         if ( pr->ordered ) {
1928                             pr->u.p.ordered_lower = init;
1929                             pr->u.p.ordered_upper = limit;
1930                             #ifdef KMP_DEBUG
1931                             {
1932                                 const char * buff;
1933                                 // create format specifiers before the debug output
1934                                 buff = __kmp_str_format(
1935                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1936                                     traits_t< UT >::spec, traits_t< UT >::spec );
1937                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1938                                 __kmp_str_free( &buff );
1939                             }
1940                             #endif
1941                         } // if
1942                     } else {
1943                         *p_lb = 0;
1944                         *p_ub = 0;
1945                         if ( p_st != NULL )
1946                             *p_st = 0;
1947                     } // if
1948                 } // case
1949                 break;
1950 
1951             case kmp_sch_guided_analytical_chunked:
1952                 {
1953                     T   chunkspec = pr->u.p.parm1;
1954                     UT chunkIdx;
1955     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* for storing the original FPCW value for Windows* OS on
                       IA-32 architecture (8-byte version) */
1958                     unsigned int oldFpcw;
1959                     unsigned int fpcwSet = 0;
1960     #endif
1961                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1962                                    gtid ) );
1963 
1964                     trip  = pr->u.p.tc;
1965 
1966                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1967                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1968 
1969                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1970                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1971                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1972                             --trip;
1973                             /* use dynamic-style scheduling */
1974                             init = chunkIdx * chunkspec + pr->u.p.count;
1975                             /* need to verify init > 0 in case of overflow in the above calculation */
1976                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1977                                 limit = init + chunkspec -1;
1978 
1979                                 if ( (last = (limit >= trip)) != 0 )
1980                                     limit = trip;
1981                             }
1982                             break;
1983                         } else {
1984                             /* use exponential-style scheduling */
1985                             /* The following check is to workaround the lack of long double precision on Windows* OS.
1986                                This check works around the possible effect that init != 0 for chunkIdx == 0.
1987                              */
1988     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                            /* If we haven't already done so, save the original
                               FPCW and set precision to 64-bit, as Windows* OS
                               on IA-32 architecture defaults to 53-bit */
1992                             if ( !fpcwSet ) {
1993                                 oldFpcw = _control87(0,0);
1994                                 _control87(_PC_64,_MCW_PC);
1995                                 fpcwSet = 0x30000;
1996                             }
1997     #endif
1998                             if ( chunkIdx ) {
1999                                 init = __kmp_dispatch_guided_remaining< T >(
2000                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
2001                                 KMP_DEBUG_ASSERT(init);
2002                                 init = trip - init;
2003                             } else
2004                                 init = 0;
2005                             limit = trip - __kmp_dispatch_guided_remaining< T >(
2006                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
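                            // __kmp_dispatch_guided_remaining(trip, base, k)
                            // returns the iterations still left after k
                            // exponential chunks, so chunk chunkIdx spans
                            // [trip - remaining(chunkIdx),
                            //  trip - remaining(chunkIdx + 1) - 1].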
2007                             KMP_ASSERT(init <= limit);
2008                             if ( init < limit ) {
2009                                 KMP_DEBUG_ASSERT(limit <= trip);
2010                                 --limit;
2011                                 status = 1;
2012                                 break;
2013                             } // if
2014                         } // if
2015                     } // while (1)
2016     #if KMP_OS_WINDOWS && KMP_ARCH_X86
2017                     /* restore FPCW if necessary
2018                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2019                     */
2020                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2021                         _control87(oldFpcw,_MCW_PC);
2022     #endif
2023                     if ( status != 0 ) {
2024                         start = pr->u.p.lb;
2025                         incr = pr->u.p.st;
2026                         if ( p_st != NULL )
2027                             *p_st = incr;
2028                         *p_lb = start + init * incr;
2029                         *p_ub = start + limit * incr;
2030                         if ( pr->ordered ) {
2031                             pr->u.p.ordered_lower = init;
2032                             pr->u.p.ordered_upper = limit;
2033                             #ifdef KMP_DEBUG
2034                             {
2035                                 const char * buff;
2036                                 // create format specifiers before the debug output
2037                                 buff = __kmp_str_format(
2038                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2039                                     traits_t< UT >::spec, traits_t< UT >::spec );
2040                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2041                                 __kmp_str_free( &buff );
2042                             }
2043                             #endif
2044                         }
2045                     } else {
2046                         *p_lb = 0;
2047                         *p_ub = 0;
2048                         if ( p_st != NULL )
2049                             *p_st = 0;
2050                     }
2051                 } // case
2052                 break;
2053 
2054             case kmp_sch_trapezoidal:
2055                 {
2056                     UT   index;
2057                     T    parm2 = pr->u.p.parm2;
2058                     T    parm3 = pr->u.p.parm3;
2059                     T    parm4 = pr->u.p.parm4;
2060                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2061                                    gtid ) );
2062 
2063                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2064 
2065                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2066                     trip = pr->u.p.tc - 1;
2067 
2068                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2069                         *p_lb = 0;
2070                         *p_ub = 0;
2071                         if ( p_st != NULL ) *p_st = 0;
2072                     } else {
2073                         start = pr->u.p.lb;
2074                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2075                         incr  = pr->u.p.st;
2076 
2077                         if ( (last = (limit >= trip)) != 0 )
2078                             limit = trip;
2079 
2080                         if ( p_st != NULL ) *p_st = incr;
2081 
2082                         if ( incr == 1 ) {
2083                             *p_lb = start + init;
2084                             *p_ub = start + limit;
2085                         } else {
2086                             *p_lb = start + init * incr;
2087                             *p_ub = start + limit * incr;
2088                         }
2089 
2090                         if ( pr->ordered ) {
2091                             pr->u.p.ordered_lower = init;
2092                             pr->u.p.ordered_upper = limit;
2093                             #ifdef KMP_DEBUG
2094                             {
2095                                 const char * buff;
2096                                 // create format specifiers before the debug output
2097                                 buff = __kmp_str_format(
2098                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2099                                     traits_t< UT >::spec, traits_t< UT >::spec );
2100                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2101                                 __kmp_str_free( &buff );
2102                             }
2103                             #endif
2104                         } // if
2105                     } // if
2106                 } // case
2107                 break;
2108             default:
2109                 {
2110                     status = 0; // to avoid complaints on uninitialized variable use
2111                     __kmp_msg(
2112                         kmp_ms_fatal,                        // Severity
2113                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2114                         KMP_HNT( GetNewerLibrary ),          // Hint
2115                         __kmp_msg_null                       // Variadic argument list terminator
2116                     );
2117                 }
2118                 break;
2119             } // switch
2120         } // if tc == 0;
2121 
2122         if ( status == 0 ) {
2123             UT   num_done;
2124 
2125             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
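            // test_then_inc returns the value before the increment, so the
            // thread that observes t_nproc - 1 below is the last one out and
            // is responsible for resetting the shared buffer for reuse.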
2126             #ifdef KMP_DEBUG
2127             {
2128                 const char * buff;
2129                 // create format specifiers before the debug output
2130                 buff = __kmp_str_format(
2131                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2132                     traits_t< UT >::spec );
2133                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2134                 __kmp_str_free( &buff );
2135             }
2136             #endif
2137 
2138             if ( (ST)num_done == team->t.t_nproc-1 ) {
2139                 /* NOTE: release this buffer to be reused */
2140 
2141                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2142 
2143                 sh->u.s.num_done = 0;
2144                 sh->u.s.iteration = 0;
2145 
2146                 /* TODO replace with general release procedure? */
2147                 if ( pr->ordered ) {
2148                     sh->u.s.ordered_iteration = 0;
2149                 }
2150 
2151                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2152 
2153                 sh -> buffer_index += __kmp_dispatch_num_buffers;
2154                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2155                                 gtid, sh->buffer_index) );
2156 
2157                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2158 
2159             } // if
2160             if ( __kmp_env_consistency_check ) {
2161                 if ( pr->pushed_ws != ct_none ) {
2162                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2163                 }
2164             }
2165 
2166             th -> th.th_dispatch -> th_deo_fcn = NULL;
2167             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2168             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2169             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2170         } // if (status == 0)
2171 #if KMP_OS_WINDOWS
2172         else if ( last ) {
2173             pr->u.p.last_upper = pr->u.p.ub;
2174         }
2175 #endif /* KMP_OS_WINDOWS */
2176         if ( p_last != NULL && status != 0 )
2177             *p_last = last;
2178     } // if
2179 
2180     #ifdef KMP_DEBUG
2181     {
2182         const char * buff;
2183         // create format specifiers before the debug output
2184         buff = __kmp_str_format(
2185             "__kmp_dispatch_next: T#%%d normal case: " \
2186             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2187             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2188         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2189         __kmp_str_free( &buff );
2190     }
2191     #endif
2192 #if INCLUDE_SSC_MARKS
2193     SSC_MARK_DISPATCH_NEXT();
2194 #endif
2195     OMPT_LOOP_END;
2196     return status;
2197 }
2198 
2199 template< typename T >
2200 static void
2201 __kmp_dist_get_bounds(
2202     ident_t                          *loc,
2203     kmp_int32                         gtid,
2204     kmp_int32                        *plastiter,
2205     T                                *plower,
2206     T                                *pupper,
2207     typename traits_t< T >::signed_t  incr
2208 ) {
2209     typedef typename traits_t< T >::unsigned_t  UT;
2210     typedef typename traits_t< T >::signed_t    ST;
2211     register kmp_uint32  team_id;
2212     register kmp_uint32  nteams;
2213     register UT          trip_count;
2214     register kmp_team_t *team;
2215     kmp_info_t * th;
2216 
2217     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
    KE_TRACE( 10, ("__kmp_dist_get_bounds called (%d)\n", gtid));
2219     #ifdef KMP_DEBUG
2220     {
2221         const char * buff;
2222         // create format specifiers before the debug output
        buff = __kmp_str_format( "__kmp_dist_get_bounds: T#%%d liter=%%d "\
2224             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2225             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2226             traits_t< T >::spec );
2227         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2228         __kmp_str_free( &buff );
2229     }
2230     #endif
2231 
2232     if( __kmp_env_consistency_check ) {
2233         if( incr == 0 ) {
2234             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2235         }
2236         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2237             // The loop is illegal.
2238             // Some zero-trip loops maintained by compiler, e.g.:
2239             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2240             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2241             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2242             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2243             // Compiler does not check the following illegal loops:
2244             //   for(i=0;i<10;i+=incr) // where incr<0
2245             //   for(i=10;i>0;i-=incr) // where incr<0
2246             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2247         }
2248     }
2249     th = __kmp_threads[gtid];
2250     team = th->th.th_team;
2251     #if OMP_40_ENABLED
2252     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2253     nteams = th->th.th_teams_size.nteams;
2254     #endif
2255     team_id = team->t.t_master_tid;
2256     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2257 
2258     // compute global trip count
2259     if( incr == 1 ) {
2260         trip_count = *pupper - *plower + 1;
2261     } else if(incr == -1) {
2262         trip_count = *plower - *pupper + 1;
2263     } else if ( incr > 0 ) {
2264         // upper-lower can exceed the limit of signed type
2265         trip_count = (UT)(*pupper - *plower) / incr + 1;
2266     } else {
2267         trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1;
2268     }
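    // E.g. lower == 0, upper == 9, incr == 2 gives
    // trip_count == (9 - 0) / 2 + 1 == 5 iterations (0, 2, 4, 6, 8).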
2269 
2270     if( trip_count <= nteams ) {
2271         KMP_DEBUG_ASSERT(
2272             __kmp_static == kmp_sch_static_greedy || \
2273             __kmp_static == kmp_sch_static_balanced
2274         ); // Unknown static scheduling type.
2275         // only some teams get single iteration, others get nothing
2276         if( team_id < trip_count ) {
2277             *pupper = *plower = *plower + team_id * incr;
2278         } else {
2279             *plower = *pupper + incr; // zero-trip loop
2280         }
2281         if( plastiter != NULL )
2282             *plastiter = ( team_id == trip_count - 1 );
2283     } else {
2284         if( __kmp_static == kmp_sch_static_balanced ) {
2285             register UT chunk = trip_count / nteams;
2286             register UT extras = trip_count % nteams;
2287             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2288             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
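            // E.g. trip_count == 10, nteams == 3: chunk == 3, extras == 1, so
            // team 0 gets 4 iterations and teams 1 and 2 get 3 each.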
2289             if( plastiter != NULL )
2290                 *plastiter = ( team_id == nteams - 1 );
2291         } else {
2292             register T chunk_inc_count =
2293                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2294             register T upper = *pupper;
2295             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2296                 // Unknown static scheduling type.
2297             *plower += team_id * chunk_inc_count;
2298             *pupper = *plower + chunk_inc_count - incr;
2299             // Check/correct bounds if needed
2300             if( incr > 0 ) {
2301                 if( *pupper < *plower )
2302                     *pupper = i_maxmin< T >::mx;
2303                 if( plastiter != NULL )
2304                     *plastiter = *plower <= upper && *pupper > upper - incr;
2305                 if( *pupper > upper )
2306                     *pupper = upper; // tracker C73258
2307             } else {
2308                 if( *pupper > *plower )
2309                     *pupper = i_maxmin< T >::mn;
2310                 if( plastiter != NULL )
2311                     *plastiter = *plower >= upper && *pupper < upper - incr;
2312                 if( *pupper < upper )
2313                     *pupper = upper; // tracker C73258
2314             }
2315         }
2316     }
2317 }
2318 
2319 //-----------------------------------------------------------------------------------------
2320 // Dispatch routines
2321 //    Transfer call to template< type T >
2322 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2323 //                         T lb, T ub, ST st, ST chunk )
2324 extern "C" {
2325 
2326 /*!
2327 @ingroup WORK_SHARING
2328 @{
2329 @param loc Source location
2330 @param gtid Global thread id
2331 @param schedule Schedule type
2332 @param lb  Lower bound
2333 @param ub  Upper bound
2334 @param st  Step (or increment if you prefer)
2335 @param chunk The chunk size to block with
2336 
2337 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2338 These functions are all identical apart from the types of the arguments.
2339 */
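
/*
For reference, a compiler lowering

    #pragma omp for schedule(dynamic, 4)
    for (int i = 0; i <= 99; ++i) body(i);

would emit roughly the following calls (a sketch of the typical init/next
pattern, not literal compiler output; 'loc', 'gtid' and 'body' stand in for
the compiler-provided values):

    kmp_int32 last, lb, ub, st;
    __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked, 0, 99, 1, 4 );
    while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st )
            body( i );
    }
*/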
2340 
2341 void
2342 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2343                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2344 {
2345     KMP_DEBUG_ASSERT( __kmp_init_serial );
2346     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2347 }
2348 /*!
2349 See @ref __kmpc_dispatch_init_4
2350 */
2351 void
2352 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2353                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2354 {
2355     KMP_DEBUG_ASSERT( __kmp_init_serial );
2356     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2357 }
2358 
2359 /*!
2360 See @ref __kmpc_dispatch_init_4
2361 */
2362 void
2363 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2364                         kmp_int64 lb, kmp_int64 ub,
2365                         kmp_int64 st, kmp_int64 chunk )
2366 {
2367     KMP_DEBUG_ASSERT( __kmp_init_serial );
2368     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2369 }
2370 
2371 /*!
2372 See @ref __kmpc_dispatch_init_4
2373 */
2374 void
2375 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2376                          kmp_uint64 lb, kmp_uint64 ub,
2377                          kmp_int64 st, kmp_int64 chunk )
2378 {
2379     KMP_DEBUG_ASSERT( __kmp_init_serial );
2380     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2381 }
2382 
2383 /*!
2384 See @ref __kmpc_dispatch_init_4
2385 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite 'distribute parallel for' construct, so the
per-team iteration space must be computed before regular iteration
dispatching begins.
2389 
2390 These functions are all identical apart from the types of the arguments.
2391 */
2392 void
2393 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2394     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2395 {
2396     KMP_DEBUG_ASSERT( __kmp_init_serial );
2397     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2398     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2399 }
2400 
2401 void
2402 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2403     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2404 {
2405     KMP_DEBUG_ASSERT( __kmp_init_serial );
2406     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2407     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2408 }
2409 
2410 void
2411 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2412     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2413 {
2414     KMP_DEBUG_ASSERT( __kmp_init_serial );
2415     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2416     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2417 }
2418 
2419 void
2420 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2421     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2422 {
2423     KMP_DEBUG_ASSERT( __kmp_init_serial );
2424     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2425     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2426 }
2427 
2428 /*!
2429 @param loc Source code location
2430 @param gtid Global thread id
2431 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2432 @param p_lb   Pointer to the lower bound for the next chunk of work
2433 @param p_ub   Pointer to the upper bound for the next chunk of work
2434 @param p_st   Pointer to the stride for the next chunk of work
2435 @return one if there is work to be done, zero otherwise
2436 
Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2439 */
2440 int
2441 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2442                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2443 {
2444     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2445 }
2446 
2447 /*!
2448 See @ref __kmpc_dispatch_next_4
2449 */
2450 int
2451 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2452                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2453 {
2454     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2455 }
2456 
2457 /*!
2458 See @ref __kmpc_dispatch_next_4
2459 */
2460 int
2461 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2462                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2463 {
2464     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2465 }
2466 
2467 /*!
2468 See @ref __kmpc_dispatch_next_4
2469 */
2470 int
2471 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2472                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2473 {
2474     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2475 }
2476 
2477 /*!
2478 @param loc Source code location
2479 @param gtid Global thread id
2480 
2481 Mark the end of a dynamic loop.
2482 */
2483 void
2484 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2485 {
2486     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2487 }
2488 
2489 /*!
2490 See @ref __kmpc_dispatch_fini_4
2491 */
2492 void
2493 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2494 {
2495     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2496 }
2497 
2498 /*!
2499 See @ref __kmpc_dispatch_fini_4
2500 */
2501 void
2502 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2503 {
2504     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2505 }
2506 
2507 /*!
2508 See @ref __kmpc_dispatch_fini_4
2509 */
2510 void
2511 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2512 {
2513     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2514 }
2515 /*! @} */
2516 
2517 //-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources
2519 
kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker ) {
    return value == checker;
}

kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker ) {
    return value != checker;
}

kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker ) {
    return value < checker;
}

kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker ) {
    return value >= checker;
}

kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker ) {
    return value <= checker;
}

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
                   kmp_uint32            checker,
                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ),
                   void        * obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint32         * spin          = spinner;
    register          kmp_uint32           check         = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
    register          kmp_uint32           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while ( !f( r = TCR_4( *spin ), check ) ) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
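
/* Example (illustrative only): block until another thread publishes a value,
   using one of the predicates above.  "flag" is a hypothetical shared
   variable, not a runtime symbol.

       static volatile kmp_uint32 flag = 0;
       ...
       __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );  // spin until flag == 1
*/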

void
__kmp_wait_yield_4_ptr(void *spinner,
                   kmp_uint32 checker,
                   kmp_uint32 (*pred)( void *, kmp_uint32 ),
                   void        *obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register void                *spin          = spinner;
    register kmp_uint32           check         = checker;
    register kmp_uint32           spins;
    register kmp_uint32 (*f) ( void *, kmp_uint32 ) = pred;

    KMP_FSYNC_SPIN_INIT( obj, spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while ( !f( spin, check ) ) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

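// Thin wrappers used by the GOMP compatibility layer (kmp_gsupport); the
// extra push_ws argument tells the templated __kmp_dispatch_init whether to
// push a new workshare entry or merely check the existing one when
// consistency checking is enabled.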
void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
