1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
 *       it may change between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 #include "kmp.h"
29 #include "kmp_i18n.h"
30 #include "kmp_itt.h"
31 #include "kmp_str.h"
32 #include "kmp_error.h"
33 #include "kmp_stats.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35     #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-internal.h"
40 #include "ompt-specific.h"
41 #endif
42 
43 /* ------------------------------------------------------------------------ */
44 /* ------------------------------------------------------------------------ */
45 
46 // template for type limits
47 template< typename T >
48 struct i_maxmin {
49     static const T mx;
50     static const T mn;
51 };
52 template<>
53 struct i_maxmin< int > {
54     static const int mx = 0x7fffffff;
    static const int mn = -0x7fffffff - 1; // i.e. 0x80000000; avoids implementation-defined narrowing
56 };
57 template<>
58 struct i_maxmin< unsigned int > {
59     static const unsigned int mx = 0xffffffff;
60     static const unsigned int mn = 0x00000000;
61 };
62 template<>
63 struct i_maxmin< long long > {
64     static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = -0x7fffffffffffffffLL - 1LL; // i.e. 0x8000000000000000, which does not itself fit in long long
66 };
67 template<>
68 struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffULL;
    static const unsigned long long mn = 0x0000000000000000ULL;
71 };
72 //-------------------------------------------------------------------------
73 
74 #ifdef KMP_STATIC_STEAL_ENABLED
75 
76     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77     template< typename T >
78     struct dispatch_private_infoXX_template {
79         typedef typename traits_t< T >::unsigned_t  UT;
80         typedef typename traits_t< T >::signed_t    ST;
81         UT count;                // unsigned
82         T  ub;
83         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84         T  lb;
85         ST st;                   // signed
86         UT tc;                   // unsigned
87         T  static_steal_counter; // for static_steal only; maybe better to put after ub
88 
89         /* parm[1-4] are used in different ways by different scheduling algorithms */
90 
91         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92         //    a) parm3 is properly aligned and
93         //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems better when they
        // are in the same cache line (not measured, though).
96 
97         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98             T  parm1;
99             T  parm2;
100             T  parm3;
101             T  parm4;
102         };
103 
104         UT ordered_lower; // unsigned
105         UT ordered_upper; // unsigned
106         #if KMP_OS_WINDOWS
107         T  last_upper;
108         #endif /* KMP_OS_WINDOWS */
109     };
110 
111 #else /* KMP_STATIC_STEAL_ENABLED */
112 
113     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114     template< typename T >
115     struct dispatch_private_infoXX_template {
116         typedef typename traits_t< T >::unsigned_t  UT;
117         typedef typename traits_t< T >::signed_t    ST;
118         T  lb;
119         T  ub;
120         ST st;            // signed
121         UT tc;            // unsigned
122 
123         T  parm1;
124         T  parm2;
125         T  parm3;
126         T  parm4;
127 
128         UT count;         // unsigned
129 
130         UT ordered_lower; // unsigned
131         UT ordered_upper; // unsigned
132         #if KMP_OS_WINDOWS
        T  last_upper;
134         #endif /* KMP_OS_WINDOWS */
135     };
136 
137 #endif /* KMP_STATIC_STEAL_ENABLED */
138 
139 // replaces dispatch_private_info structure and dispatch_private_info_t type
140 template< typename T >
141 struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate the alignment here; otherwise the size of the structure is not correct in our compiler
143     union KMP_ALIGN_CACHE private_info_tmpl {
144         dispatch_private_infoXX_template< T > p;
145         dispatch_private_info64_t             p64;
146     } u;
147     enum sched_type schedule;  /* scheduling algorithm */
148     kmp_uint32      ordered;   /* ordered clause specified */
149     kmp_uint32      ordered_bumped;
    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // dummy padding to retain the structure size
151     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152     kmp_uint32      nomerge;   /* don't merge iters if serialized */
153     kmp_uint32      type_size;
154     enum cons_type  pushed_ws;
155 };
156 
157 
158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159 template< typename UT >
160 struct dispatch_shared_infoXX_template {
161     /* chunk index under dynamic, number of idle threads under static-steal;
162        iteration index otherwise */
163     volatile UT     iteration;
164     volatile UT     num_done;
165     volatile UT     ordered_iteration;
166     UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
167 };
168 
169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
170 template< typename UT >
171 struct dispatch_shared_info_template {
172     // we need union here to keep the structure size
173     union shared_info_tmpl {
174         dispatch_shared_infoXX_template< UT >  s;
175         dispatch_shared_info64_t               s64;
176     } u;
177     volatile kmp_uint32     buffer_index;
178 };
179 
180 /* ------------------------------------------------------------------------ */
181 /* ------------------------------------------------------------------------ */
182 
183 #undef USE_TEST_LOCKS
184 
185 // test_then_add template (general template should NOT be used)
186 template< typename T >
187 static __forceinline T
test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }
189 
190 template<>
191 __forceinline kmp_int32
192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193 {
194     kmp_int32 r;
195     r = KMP_TEST_THEN_ADD32( p, d );
196     return r;
197 }
198 
199 template<>
200 __forceinline kmp_int64
201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202 {
203     kmp_int64 r;
204     r = KMP_TEST_THEN_ADD64( p, d );
205     return r;
206 }
207 
208 // test_then_inc_acq template (general template should NOT be used)
209 template< typename T >
210 static __forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }
212 
213 template<>
214 __forceinline kmp_int32
215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216 {
217     kmp_int32 r;
218     r = KMP_TEST_THEN_INC_ACQ32( p );
219     return r;
220 }
221 
222 template<>
223 __forceinline kmp_int64
224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225 {
226     kmp_int64 r;
227     r = KMP_TEST_THEN_INC_ACQ64( p );
228     return r;
229 }
230 
231 // test_then_inc template (general template should NOT be used)
232 template< typename T >
233 static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); }
235 
236 template<>
237 __forceinline kmp_int32
238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239 {
240     kmp_int32 r;
241     r = KMP_TEST_THEN_INC32( p );
242     return r;
243 }
244 
245 template<>
246 __forceinline kmp_int64
247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248 {
249     kmp_int64 r;
250     r = KMP_TEST_THEN_INC64( p );
251     return r;
252 }
253 
254 // compare_and_swap template (general template should NOT be used)
255 template< typename T >
256 static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }
258 
259 template<>
260 __forceinline kmp_int32
261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262 {
263     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264 }
265 
266 template<>
267 __forceinline kmp_int32
268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269 {
270     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271 }
272 
273 /*
274     Spin wait loop that first does pause, then yield.
275     Waits until function returns non-zero when called with *spinner and check.
276     Does NOT put threads to sleep.
277 #if USE_ITT_BUILD
278     Arguments:
        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
            same address, not the address of the low-level spinner.
284 #endif // USE_ITT_BUILD
285 */
286 template< typename UT >
287 // ToDo: make inline function (move to header file for icl)
288 static UT  // unsigned 4- or 8-byte type
289 __kmp_wait_yield( volatile UT * spinner,
290                   UT            checker,
291                   kmp_uint32 (* pred)( UT, UT )
292                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
293                   )
294 {
295     // note: we may not belong to a team at this point
296     register volatile UT         * spin          = spinner;
297     register          UT           check         = checker;
298     register          kmp_uint32   spins;
299     register          kmp_uint32 (*f) ( UT, UT ) = pred;
300     register          UT           r;
301 
302     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303     KMP_INIT_YIELD( spins );
304     // main wait spin loop
305     while(!f(r = *spin, check))
306     {
307         KMP_FSYNC_SPIN_PREPARE( obj );
308         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309            It causes problems with infinite recursion because of exit lock */
310         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311             __kmp_abort_thread(); */
312 
        // If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput),
        // then yield. The pause is in the following code.
316         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317         KMP_YIELD_SPIN( spins );
318     }
319     KMP_FSYNC_SPIN_ACQUIRED( obj );
320     return r;
321 }
322 
323 template< typename UT >
324 static kmp_uint32 __kmp_eq( UT value, UT checker) {
325     return value == checker;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_neq( UT value, UT checker) {
330     return value != checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_lt( UT value, UT checker) {
335     return value < checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_ge( UT value, UT checker) {
340     return value >= checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_le( UT value, UT checker) {
345     return value <= checker;
346 }
347 
348 
349 /* ------------------------------------------------------------------------ */
350 /* ------------------------------------------------------------------------ */
351 
352 static void
353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354 {
355     kmp_info_t *th;
356 
357     KMP_DEBUG_ASSERT( gtid_ref );
358 
359     if ( __kmp_env_consistency_check ) {
360         th = __kmp_threads[*gtid_ref];
361         if ( th -> th.th_root -> r.r_active
362           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
363 #if KMP_USE_DYNAMIC_LOCK
364             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365 #else
366             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
367 #endif
368         }
369     }
370 }
371 
372 template< typename UT >
373 static void
374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375 {
376     typedef typename traits_t< UT >::signed_t    ST;
377     dispatch_private_info_template< UT > * pr;
378 
379     int gtid = *gtid_ref;
380 //    int  cid = *cid_ref;
381     kmp_info_t *th = __kmp_threads[ gtid ];
382     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383 
384     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385     if ( __kmp_env_consistency_check ) {
386         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387             ( th -> th.th_dispatch -> th_dispatch_pr_current );
388         if ( pr -> pushed_ws != ct_none ) {
389 #if KMP_USE_DYNAMIC_LOCK
390             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391 #else
392             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
393 #endif
394         }
395     }
396 
397     if ( ! th -> th.th_team -> t.t_serialized ) {
398         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399             ( th -> th.th_dispatch -> th_dispatch_sh_current );
400         UT  lower;
401 
        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }
406         lower = pr->u.p.ordered_lower;
407 
408         #if ! defined( KMP_GOMP_COMPAT )
409             if ( __kmp_env_consistency_check ) {
410                 if ( pr->ordered_bumped ) {
411                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412                     __kmp_error_construct2(
413                         kmp_i18n_msg_CnsMultipleNesting,
414                         ct_ordered_in_pdo, loc_ref,
415                         & p->stack_data[ p->w_top ]
416                     );
417                 }
418             }
419         #endif /* !defined(KMP_GOMP_COMPAT) */
420 
421         KMP_MB();
422         #ifdef KMP_DEBUG
423         {
424             const char * buff;
425             // create format specifiers before the debug output
426             buff = __kmp_str_format(
427                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428                 traits_t< UT >::spec, traits_t< UT >::spec );
429             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430             __kmp_str_free( &buff );
431         }
432         #endif
433 
434         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435                                 USE_ITT_BUILD_ARG( NULL )
436                                 );
437         KMP_MB();  /* is this necessary? */
438         #ifdef KMP_DEBUG
439         {
440             const char * buff;
441             // create format specifiers before the debug output
442             buff = __kmp_str_format(
443                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444                 traits_t< UT >::spec, traits_t< UT >::spec );
445             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446             __kmp_str_free( &buff );
447         }
448         #endif
449     }
450     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451 }
452 
453 static void
454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455 {
456     kmp_info_t *th;
457 
458     if ( __kmp_env_consistency_check ) {
459         th = __kmp_threads[*gtid_ref];
460         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462         }
463     }
464 }
465 
466 template< typename UT >
467 static void
468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469 {
470     typedef typename traits_t< UT >::signed_t    ST;
471     dispatch_private_info_template< UT > * pr;
472 
473     int gtid = *gtid_ref;
474 //    int  cid = *cid_ref;
475     kmp_info_t *th = __kmp_threads[ gtid ];
476     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477 
478     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479     if ( __kmp_env_consistency_check ) {
480         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481             ( th -> th.th_dispatch -> th_dispatch_pr_current );
482         if ( pr -> pushed_ws != ct_none ) {
483             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484         }
485     }
486 
487     if ( ! th -> th.th_team -> t.t_serialized ) {
488         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489             ( th -> th.th_dispatch -> th_dispatch_sh_current );
490 
491         if ( ! __kmp_env_consistency_check ) {
492             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
494         }
495 
496         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497         #if ! defined( KMP_GOMP_COMPAT )
498             if ( __kmp_env_consistency_check ) {
499                 if ( pr->ordered_bumped != 0 ) {
500                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501                     /* How to test it? - OM */
502                     __kmp_error_construct2(
503                         kmp_i18n_msg_CnsMultipleNesting,
504                         ct_ordered_in_pdo, loc_ref,
505                         & p->stack_data[ p->w_top ]
506                     );
507                 }
508             }
509         #endif /* !defined(KMP_GOMP_COMPAT) */
510 
511         KMP_MB();       /* Flush all pending memory write invalidates.  */
512 
513         pr->ordered_bumped += 1;
514 
515         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516                         gtid, pr->ordered_bumped ) );
517 
518         KMP_MB();       /* Flush all pending memory write invalidates.  */
519 
520         /* TODO use general release procedure? */
521         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522 
523         KMP_MB();       /* Flush all pending memory write invalidates.  */
524     }
525     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526 }
527 
/* Computes and returns x to the power of y, where y must be a non-negative integer */
529 template< typename UT >
530 static __forceinline long double
531 __kmp_pow(long double x, UT y) {
532     long double s=1.0L;
533 
534     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536     while(y) {
537         if ( y & 1 )
538             s *= x;
539         x *= x;
540         y >>= 1;
541     }
542     return s;
543 }
544 
/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken here: if this function is force-inlined, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
*/
550 template< typename T >
551 static __inline typename traits_t< T >::unsigned_t
552 __kmp_dispatch_guided_remaining(
553     T                                  tc,
554     typename traits_t< T >::floating_t base,
555     typename traits_t< T >::unsigned_t idx
556 ) {
557     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558        least for ICL 8.1, long double arithmetic may not really have
559        long double precision, even with /Qlong_double.  Currently, we
560        workaround that in the caller code, by manipulating the FPCW for
561        Windows* OS on IA-32 architecture.  The lack of precision is not
562        expected to be a correctness issue, though.
563     */
564     typedef typename traits_t< T >::unsigned_t  UT;
565 
566     long double x = tc * __kmp_pow< UT >(base, idx);
567     UT r = (UT) x;
568     if ( x == r )
569         return r;
570     return r + 1;
571 }
572 
573 // Parameters of the guided-iterative algorithm:
574 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
575 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution is flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e. trip_count / nproc.
578 static int guided_int_param = 2;
579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
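// Worked example (illustrative): nproc = 4, chunk = 7, n = 2 gives
//   p2 = 2 * 4 * (7 + 1) = 64    // switch to dynamic once <= 64 iterations remain
//   p3 = 0.5 / 4         = 0.125 // each chunk takes 1/8 of the remaining iterations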
580 
581 // UT - unsigned flavor of T, ST - signed flavor of T,
582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583 template< typename T >
584 static void
585 __kmp_dispatch_init(
586     ident_t                        * loc,
587     int                              gtid,
588     enum sched_type                  schedule,
589     T                                lb,
590     T                                ub,
591     typename traits_t< T >::signed_t st,
592     typename traits_t< T >::signed_t chunk,
593     int                              push_ws
594 ) {
595     typedef typename traits_t< T >::unsigned_t  UT;
596     typedef typename traits_t< T >::signed_t    ST;
597     typedef typename traits_t< T >::floating_t  DBL;
598     static const int ___kmp_size_type = sizeof( UT );
599 
600     int                                            active;
601     T                                              tc;
602     kmp_info_t *                                   th;
603     kmp_team_t *                                   team;
604     kmp_uint32                                     my_buffer_index;
605     dispatch_private_info_template< T >          * pr;
606     dispatch_shared_info_template< UT > volatile * sh;
607 
608     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610 
611     if ( ! TCR_4( __kmp_init_parallel ) )
612         __kmp_parallel_initialize();
613 
614 #if INCLUDE_SSC_MARKS
615     SSC_MARK_DISPATCH_INIT();
616 #endif
617     #ifdef KMP_DEBUG
618     {
619         const char * buff;
620         // create format specifiers before the debug output
621         buff = __kmp_str_format(
622             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625         __kmp_str_free( &buff );
626     }
627     #endif
628     /* setup data */
629     th     = __kmp_threads[ gtid ];
630     team   = th -> th.th_team;
631     active = ! team -> t.t_serialized;
632     th->th.th_ident = loc;
633 
634 #if USE_ITT_BUILD
635     kmp_uint64 cur_chunk = chunk;
636     int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
637         KMP_MASTER_GTID(gtid) &&
638 #if OMP_40_ENABLED
639         th->th.th_teams_microtask == NULL &&
640 #endif
641         team->t.t_active_level == 1;
642 #endif
643     if ( ! active ) {
644         pr = reinterpret_cast< dispatch_private_info_template< T >* >
645             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
646     } else {
647         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
648                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
649 
650         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
651 
        /* What happens when the number of threads changes? Do we need to resize the buffer? */
653         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
654             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
655         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
656             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
657     }
658 
659     /* Pick up the nomerge/ordered bits from the scheduling type */
660     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
661         pr->nomerge = TRUE;
662         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
663     } else {
664         pr->nomerge = FALSE;
665     }
666     pr->type_size = ___kmp_size_type; // remember the size of variables
667     if ( kmp_ord_lower & schedule ) {
668         pr->ordered = TRUE;
669         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
670     } else {
671         pr->ordered = FALSE;
672     }
673 
674     if ( schedule == kmp_sch_static ) {
675         schedule = __kmp_static;
676     } else {
677         if ( schedule == kmp_sch_runtime ) {
678             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
679             schedule = team -> t.t_sched.r_sched_type;
680             // Detail the schedule if needed (global controls are differentiated appropriately)
681             if ( schedule == kmp_sch_guided_chunked ) {
682                 schedule = __kmp_guided;
683             } else if ( schedule == kmp_sch_static ) {
684                 schedule = __kmp_static;
685             }
686             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
687             chunk = team -> t.t_sched.chunk;
688 #if USE_ITT_BUILD
689             cur_chunk = chunk;
690 #endif
691             #ifdef KMP_DEBUG
692             {
693                 const char * buff;
694                 // create format specifiers before the debug output
695                 buff = __kmp_str_format(
696                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
697                     traits_t< ST >::spec );
698                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
699                 __kmp_str_free( &buff );
700             }
701             #endif
702         } else {
703             if ( schedule == kmp_sch_guided_chunked ) {
704                 schedule = __kmp_guided;
705             }
706             if ( chunk <= 0 ) {
707                 chunk = KMP_DEFAULT_CHUNK;
708             }
709         }
710 
711         if ( schedule == kmp_sch_auto ) {
            // mapping and differentiation: done in __kmp_do_serial_initialize()
713             schedule = __kmp_auto;
714             #ifdef KMP_DEBUG
715             {
716                 const char * buff;
717                 // create format specifiers before the debug output
718                 buff = __kmp_str_format(
719                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
720                     traits_t< ST >::spec );
721                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
722                 __kmp_str_free( &buff );
723             }
724             #endif
725         }
726 
727         /* guided analytical not safe for too many threads */
728         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
729             schedule = kmp_sch_guided_iterative_chunked;
730             KMP_WARNING( DispatchManyThreads );
731         }
732         pr->u.p.parm1 = chunk;
733     }
734     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
735                 "unknown scheduling type" );
736 
737     pr->u.p.count = 0;
738 
739     if ( __kmp_env_consistency_check ) {
740         if ( st == 0 ) {
741             __kmp_error_construct(
742                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
743                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
744             );
745         }
746     }
747 
748     tc = ( ub - lb + st );
749     if ( st != 1 ) {
750         if ( st < 0 ) {
751             if ( lb < ub ) {
752                 tc = 0;            // zero-trip
753             } else {   // lb >= ub
754                 tc = (ST)tc / st;  // convert to signed division
755             }
756         } else {       // st > 0
757             if ( ub < lb ) {
758                 tc = 0;            // zero-trip
759             } else {   // lb >= ub
760                 tc /= st;
761             }
762         }
763     } else if ( ub < lb ) {        // st == 1
764         tc = 0;                    // zero-trip
765     }
766 
767     // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
768     // when statistics are disabled.
769     if (schedule == __kmp_static)
770     {
771         KMP_COUNT_BLOCK(OMP_FOR_static);
772         KMP_COUNT_VALUE(FOR_static_iterations, tc);
773     }
774     else
775     {
776         KMP_COUNT_BLOCK(OMP_FOR_dynamic);
777         KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
778     }
779 
780     pr->u.p.lb = lb;
781     pr->u.p.ub = ub;
782     pr->u.p.st = st;
783     pr->u.p.tc = tc;
784 
785     #if KMP_OS_WINDOWS
786     pr->u.p.last_upper = ub + st;
787     #endif /* KMP_OS_WINDOWS */
788 
    /* NOTE: only the active parallel region(s) have active ordered sections */
790 
791     if ( active ) {
792         if ( pr->ordered == 0 ) {
793             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
794             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
795         } else {
796             pr->ordered_bumped = 0;
797 
798             pr->u.p.ordered_lower = 1;
799             pr->u.p.ordered_upper = 0;
800 
801             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
802             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
803         }
804     }
805 
806     if ( __kmp_env_consistency_check ) {
807         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
808         if ( push_ws ) {
809             __kmp_push_workshare( gtid, ws, loc );
810             pr->pushed_ws = ws;
811         } else {
812             __kmp_check_workshare( gtid, ws, loc );
813             pr->pushed_ws = ct_none;
814         }
815     }
816 
817     switch ( schedule ) {
818     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
819     case kmp_sch_static_steal:
820         {
821             T nproc = team->t.t_nproc;
822             T ntc, init;
823 
824             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
825 
826             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
827             if ( nproc > 1 && ntc >= nproc ) {
828                 T id = __kmp_tid_from_gtid(gtid);
829                 T small_chunk, extras;
830 
831                 small_chunk = ntc / nproc;
832                 extras = ntc % nproc;
833 
834                 init = id * small_chunk + ( id < extras ? id : extras );
835                 pr->u.p.count = init;
836                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
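                // Worked example (illustrative): ntc = 10 chunks, nproc = 4 gives
                // small_chunk = 2, extras = 2, so thread ids 0..3 initially own the
                // chunk ranges [0,3), [3,6), [6,8), [8,10); the first 'extras'
                // threads get one extra chunk, and idle threads may steal later.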
837 
838                 pr->u.p.parm2 = lb;
839                 //pr->pfields.parm3 = 0; // it's not used in static_steal
840                 pr->u.p.parm4 = id;
841                 pr->u.p.st = st;
842                 break;
843             } else {
844                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
845                                gtid ) );
846                 schedule = kmp_sch_static_balanced;
847                 /* too few iterations: fall-through to kmp_sch_static_balanced */
848             } // if
849             /* FALL-THROUGH to static balanced */
850         } // case
851     #endif
852     case kmp_sch_static_balanced:
853         {
854             T nproc = team->t.t_nproc;
855             T init, limit;
856 
857             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
858                             gtid ) );
859 
860             if ( nproc > 1 ) {
861                 T id = __kmp_tid_from_gtid(gtid);
862 
863                 if ( tc < nproc ) {
864                     if ( id < tc ) {
865                         init = id;
866                         limit = id;
867                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
868                     } else {
869                         pr->u.p.count = 1;  /* means no more chunks to execute */
870                         pr->u.p.parm1 = FALSE;
871                         break;
872                     }
873                 } else {
874                     T small_chunk = tc / nproc;
875                     T extras = tc % nproc;
876                     init = id * small_chunk + (id < extras ? id : extras);
877                     limit = init + small_chunk - (id < extras ? 0 : 1);
878                     pr->u.p.parm1 = (id == nproc - 1);
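                    // Worked example (illustrative): tc = 10, nproc = 4 gives
                    // small_chunk = 2, extras = 2, so thread ids 0..3 receive the
                    // iteration ranges [0,2], [3,5], [6,7], [8,9]; only id 3
                    // (== nproc - 1) sets parm1, the lastprivate flag.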
879                 }
880             } else {
881                 if ( tc > 0 ) {
882                     init = 0;
883                     limit = tc - 1;
884                     pr->u.p.parm1 = TRUE;
885                 } else {
886                     // zero trip count
887                     pr->u.p.count = 1;  /* means no more chunks to execute */
888                     pr->u.p.parm1 = FALSE;
889                     break;
890                 }
891             }
892 #if USE_ITT_BUILD
893             // Calculate chunk for metadata report
894             if ( itt_need_metadata_reporting )
895                 cur_chunk = limit - init + 1;
896 #endif
897             if ( st == 1 ) {
898                 pr->u.p.lb = lb + init;
899                 pr->u.p.ub = lb + limit;
900             } else {
901                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
902                 pr->u.p.lb = lb + init * st;
903                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
904                 if ( st > 0 ) {
905                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
906                 } else {
907                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
908                 }
909             }
910             if ( pr->ordered ) {
911                 pr->u.p.ordered_lower = init;
912                 pr->u.p.ordered_upper = limit;
913             }
914             break;
915         } // case
916     case kmp_sch_guided_iterative_chunked :
917         {
918             T nproc = team->t.t_nproc;
919             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
920 
921             if ( nproc > 1 ) {
922                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
923                     /* chunk size too large, switch to dynamic */
924                     schedule = kmp_sch_dynamic_chunked;
925                 } else {
926                     // when remaining iters become less than parm2 - switch to dynamic
927                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
928                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
929                 }
930             } else {
931                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
932                 schedule = kmp_sch_static_greedy;
933                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
934                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
935                 pr->u.p.parm1 = tc;
936             } // if
937         } // case
938         break;
939     case kmp_sch_guided_analytical_chunked:
940         {
941             T nproc = team->t.t_nproc;
942             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
943 
944             if ( nproc > 1 ) {
945                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
946                     /* chunk size too large, switch to dynamic */
947                     schedule = kmp_sch_dynamic_chunked;
948                 } else {
949                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
950                     DBL x;
951 
952                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
963                     // save original FPCW and set precision to 64-bit, as
964                     // Windows* OS on IA-32 architecture defaults to 53-bit
965                     unsigned int oldFpcw = _control87(0,0);
966                     _control87(_PC_64,_MCW_PC); // 0,0x30000
967                     #endif
968                     /* value used for comparison in solver for cross-over point */
969                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
970 
                    /* crossover point--chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
973                     UT   cross;
974 
975                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
976                     x = (long double)1.0 - (long double)0.5 / nproc;
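                    // Illustrative: with nproc = 8, x = 1.0 - 0.5/8 = 0.9375, the
                    // modeled fraction of iterations still remaining after each
                    // chunk is handed out.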
977 
978                     #ifdef KMP_DEBUG
979                     { // test natural alignment
980                         struct _test_a {
981                             char a;
982                             union {
983                                 char b;
984                                 DBL  d;
985                             };
986                         } t;
987                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
988                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
989                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
990                     }
991                     #endif // KMP_DEBUG
992 
993                     /* save the term in thread private dispatch structure */
994                     *(DBL*)&pr->u.p.parm3 = x;
995 
996                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
997                     {
998                         UT          left, right, mid;
999                         long double p;
1000 
1001                         /* estimate initial upper and lower bound */
1002 
                        /* It doesn't matter what value 'right' starts at, as long as it
                           is positive; it only affects performance of the solver.
                        */
1006                         right = 229;
1007                         p = __kmp_pow< UT >(x,right);
1008                         if ( p > target ) {
1009                             do{
1010                                 p *= p;
1011                                 right <<= 1;
1012                             } while(p>target && right < (1<<27));
1013                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1014                         } else {
1015                             left = 0;
1016                         }
1017 
1018                         /* bisection root-finding method */
1019                         while ( left + 1 < right ) {
1020                             mid = (left + right) / 2;
1021                             if ( __kmp_pow< UT >(x,mid) > target ) {
1022                                 left = mid;
1023                             } else {
1024                                 right = mid;
1025                             }
1026                         } // while
1027                         cross = right;
1028                     }
1029                     /* assert sanity of computed crossover point */
1030                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1031 
1032                     /* save the crossover point in thread private dispatch structure */
1033                     pr->u.p.parm2 = cross;
1034 
1035                     // C75803
1036                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1037                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1038                     #else
1039                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1040                     #endif
1041                     /* dynamic-style scheduling offset */
1042                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1043                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1044                         // restore FPCW
1045                         _control87(oldFpcw,_MCW_PC);
1046                     #endif
1047                 } // if
1048             } else {
1049                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1050                                gtid ) );
1051                 schedule = kmp_sch_static_greedy;
1052                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1053                 pr->u.p.parm1 = tc;
1054             } // if
1055         } // case
1056         break;
1057     case kmp_sch_static_greedy:
1058         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
        pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
            ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
            tc;
1062         break;
1063     case kmp_sch_static_chunked :
1064     case kmp_sch_dynamic_chunked :
1065         if ( pr->u.p.parm1 <= 0 ) {
1066             pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1067         }
1068         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1069         break;
1070     case kmp_sch_trapezoidal :
1071         {
1072             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1073 
1074             T parm1, parm2, parm3, parm4;
1075             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1076 
1077             parm1 = chunk;
1078 
1079             /* F : size of the first cycle */
1080             parm2 = ( tc / (2 * team->t.t_nproc) );
1081 
1082             if ( parm2 < 1 ) {
1083                 parm2 = 1;
1084             }
1085 
1086             /* L : size of the last cycle.  Make sure the last cycle
1087              *     is not larger than the first cycle.
1088              */
1089             if ( parm1 < 1 ) {
1090                 parm1 = 1;
1091             } else if ( parm1 > parm2 ) {
1092                 parm1 = parm2;
1093             }
1094 
1095             /* N : number of cycles */
1096             parm3 = ( parm2 + parm1 );
1097             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1098 
1099             if ( parm3 < 2 ) {
1100                 parm3 = 2;
1101             }
1102 
1103             /* sigma : decreasing incr of the trapezoid */
1104             parm4 = ( parm3 - 1 );
1105             parm4 = ( parm2 - parm1 ) / parm4;
1106 
1107             // pointless check, because parm4 >= 0 always
1108             //if ( parm4 < 0 ) {
1109             //    parm4 = 0;
1110             //}
1111 
1112             pr->u.p.parm1 = parm1;
1113             pr->u.p.parm2 = parm2;
1114             pr->u.p.parm3 = parm3;
1115             pr->u.p.parm4 = parm4;
1116         } // case
1117         break;
1118 
1119     default:
1120         {
1121             __kmp_msg(
1122                 kmp_ms_fatal,                        // Severity
1123                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1124                 KMP_HNT( GetNewerLibrary ),          // Hint
1125                 __kmp_msg_null                       // Variadic argument list terminator
1126             );
1127         }
1128         break;
1129     } // switch
1130     pr->schedule = schedule;
1131     if ( active ) {
        /* This buffer becomes free to use when sh->buffer_index reaches my_buffer_index */
1133 
1134         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1135                         gtid, my_buffer_index, sh->buffer_index) );
1136         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1137                                         USE_ITT_BUILD_ARG( NULL )
1138                                         );
            // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and my_buffer_index are
            // *always* 32-bit integers.
1141         KMP_MB();  /* is this necessary? */
1142         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1143                         gtid, my_buffer_index, sh->buffer_index) );
1144 
1145         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1146         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1147 #if USE_ITT_BUILD
1148         if ( pr->ordered ) {
1149             __kmp_itt_ordered_init( gtid );
        } // if
1151         // Report loop metadata
1152         if ( itt_need_metadata_reporting ) {
1153             // Only report metadata by master of active team at level 1
1154             kmp_uint64 schedtype = 0;
1155             switch ( schedule ) {
1156             case kmp_sch_static_chunked:
1157             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1158                 break;
1159             case kmp_sch_static_greedy:
1160                 cur_chunk = pr->u.p.parm1;
1161                 break;
1162             case kmp_sch_dynamic_chunked:
1163                 schedtype = 1;
1164                 break;
1165             case kmp_sch_guided_iterative_chunked:
1166             case kmp_sch_guided_analytical_chunked:
1167                 schedtype = 2;
1168                 break;
1169             default:
1170 //            Should we put this case under "static"?
1171 //            case kmp_sch_static_steal:
1172                 schedtype = 3;
1173                 break;
1174             }
1175             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1176         }
1177 #endif /* USE_ITT_BUILD */
    } // if
1179 
1180     #ifdef KMP_DEBUG
1181     {
1182         const char * buff;
1183         // create format specifiers before the debug output
1184         buff = __kmp_str_format(
1185             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1186             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1187             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1188             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1189             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1190             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1191             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1192         KD_TRACE(10, ( buff,
1193             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1194             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1195             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1196             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1197         __kmp_str_free( &buff );
1198     }
1199     #endif
1200     #if ( KMP_STATIC_STEAL_ENABLED )
1201     if ( ___kmp_size_type < 8 ) {
      // It cannot be guaranteed that, after execution of a loop with some other schedule kind,
      // all the parm3 variables will contain the same value.
      // Even if they did, there would still be a bad case, e.g. values alternating between
      // 0 and 1 rather than being incremented over the program's lifetime.
      // So a dedicated variable is required; 'static_steal_counter' is used for this.
1207       if( schedule == kmp_sch_static_steal ) {
1208         // Other threads will inspect this variable when searching for a victim.
1209         // This is a flag showing that other threads may steal from this thread since then.
1210         volatile T * p = &pr->u.p.static_steal_counter;
1211         *p = *p + 1;
1212       }
1213     }
    #endif // ( KMP_STATIC_STEAL_ENABLED )
1215 
1216 #if OMPT_SUPPORT && OMPT_TRACE
1217     if (ompt_enabled &&
1218         ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1219         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1220         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1221         ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1222             team_info->parallel_id, task_info->task_id, team_info->microtask);
1223     }
1224 #endif
1225 }
1226 
1227 /*
1228  * For ordered loops, either __kmp_dispatch_finish() should be called after
1229  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1230  * every chunk of iterations.  If the ordered section(s) were not executed
1231  * for this iteration (or every iteration in this chunk), we need to set the
1232  * ordered iteration counters so that the next thread can proceed.
1233  */
1234 template< typename UT >
1235 static void
1236 __kmp_dispatch_finish( int gtid, ident_t *loc )
1237 {
1238     typedef typename traits_t< UT >::signed_t ST;
1239     kmp_info_t *th = __kmp_threads[ gtid ];
1240 
1241     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1242     if ( ! th -> th.th_team -> t.t_serialized ) {
1243 
1244         dispatch_private_info_template< UT > * pr =
1245             reinterpret_cast< dispatch_private_info_template< UT >* >
1246             ( th->th.th_dispatch->th_dispatch_pr_current );
1247         dispatch_shared_info_template< UT > volatile * sh =
1248             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1249             ( th->th.th_dispatch->th_dispatch_sh_current );
1250         KMP_DEBUG_ASSERT( pr );
1251         KMP_DEBUG_ASSERT( sh );
1252         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1253                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1254 
1255         if ( pr->ordered_bumped ) {
1256             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1257                             gtid ) );
1258             pr->ordered_bumped = 0;
1259         } else {
1260             UT lower = pr->u.p.ordered_lower;
1261 
1262             #ifdef KMP_DEBUG
1263             {
1264                 const char * buff;
1265                 // create format specifiers before the debug output
1266                 buff = __kmp_str_format(
1267                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1268                     traits_t< UT >::spec, traits_t< UT >::spec );
1269                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1270                 __kmp_str_free( &buff );
1271             }
1272             #endif
1273 
1274             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1275                                    USE_ITT_BUILD_ARG(NULL)
1276                                    );
1277             KMP_MB();  /* is this necessary? */
1278             #ifdef KMP_DEBUG
1279             {
1280                 const char * buff;
1281                 // create format specifiers before the debug output
1282                 buff = __kmp_str_format(
1283                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1284                     traits_t< UT >::spec, traits_t< UT >::spec );
1285                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1286                 __kmp_str_free( &buff );
1287             }
1288             #endif
1289 
1290             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1291         } // if
1292     } // if
1293     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1294 }
1295 
1296 #ifdef KMP_GOMP_COMPAT
1297 
1298 template< typename UT >
1299 static void
1300 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1301 {
1302     typedef typename traits_t< UT >::signed_t ST;
1303     kmp_info_t *th = __kmp_threads[ gtid ];
1304 
1305     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1306     if ( ! th -> th.th_team -> t.t_serialized ) {
1307 //        int cid;
1308         dispatch_private_info_template< UT > * pr =
1309             reinterpret_cast< dispatch_private_info_template< UT >* >
1310             ( th->th.th_dispatch->th_dispatch_pr_current );
1311         dispatch_shared_info_template< UT > volatile * sh =
1312             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1313             ( th->th.th_dispatch->th_dispatch_sh_current );
1314         KMP_DEBUG_ASSERT( pr );
1315         KMP_DEBUG_ASSERT( sh );
1316         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1317                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1318 
1319 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1320             UT lower = pr->u.p.ordered_lower;
1321             UT upper = pr->u.p.ordered_upper;
1322             UT inc = upper - lower + 1;
1323 
1324             if ( pr->ordered_bumped == inc ) {
1325                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1326                   gtid ) );
1327                 pr->ordered_bumped = 0;
1328             } else {
1329                 inc -= pr->ordered_bumped;
1330 
1331                 #ifdef KMP_DEBUG
1332                 {
1333                     const char * buff;
1334                     // create format specifiers before the debug output
1335                     buff = __kmp_str_format(
1336                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1337                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1338                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1339                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1340                     __kmp_str_free( &buff );
1341                 }
1342                 #endif
1343 
1344                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1345                                        USE_ITT_BUILD_ARG(NULL)
1346                                        );
1347 
1348                 KMP_MB();  /* is this necessary? */
1349                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1350                   gtid ) );
1351                 pr->ordered_bumped = 0;
// !!!!! TODO: check whether inc should be unsigned or signed
1353                 #ifdef KMP_DEBUG
1354                 {
1355                     const char * buff;
1356                     // create format specifiers before the debug output
1357                     buff = __kmp_str_format(
1358                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1359                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1360                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1361                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1362                     __kmp_str_free( &buff );
1363                 }
1364                 #endif
1365 
1366                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1367             }
1368 //        }
1369     }
1370     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1371 }
1372 
1373 #endif /* KMP_GOMP_COMPAT */
1374 
1375 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1376  * (no more work), then tell OMPT the loop is over. In some cases
1377  * kmp_dispatch_fini() is not called. */
1378 #if OMPT_SUPPORT && OMPT_TRACE
1379 #define OMPT_LOOP_END                                                          \
1380     if (status == 0) {                                                         \
1381         if (ompt_enabled &&                                                    \
1382             ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
1383             ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
1384             ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
1385             ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
1386                 team_info->parallel_id, task_info->task_id);                   \
1387         }                                                                      \
1388     }
1389 #else
1390 #define OMPT_LOOP_END // no-op
1391 #endif
1392 
1393 template< typename T >
1394 static int
1395 __kmp_dispatch_next(
1396     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1397 ) {
1398 
1399     typedef typename traits_t< T >::unsigned_t  UT;
1400     typedef typename traits_t< T >::signed_t    ST;
1401     typedef typename traits_t< T >::floating_t  DBL;
1402 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1403     static const int ___kmp_size_type = sizeof( UT );
1404 #endif
1405 
1406     // This is potentially slightly misleading: schedule(runtime) will appear here even if the actual runtime schedule
1407     // is static. (This points out a disadvantage of schedule(runtime): even when static scheduling is used, it costs
1408     // more than a compile-time choice of static scheduling would.)
1409     KMP_TIME_BLOCK(FOR_dynamic_scheduling);
1410 
1411     int                                   status;
1412     dispatch_private_info_template< T > * pr;
1413     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1414     kmp_team_t                          * team = th -> th.th_team;
1415 
1416     KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1417     #ifdef KMP_DEBUG
1418     {
1419         const char * buff;
1420         // create format specifiers before the debug output
1421         buff = __kmp_str_format(
1422             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1423             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1424         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1425         __kmp_str_free( &buff );
1426     }
1427     #endif
1428 
1429     if ( team -> t.t_serialized ) {
1430         /* NOTE: serialize this dispatch because we are not at the active level */
1431         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1432             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1433         KMP_DEBUG_ASSERT( pr );
1434 
1435         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1436             *p_lb = 0;
1437             *p_ub = 0;
1438 //            if ( p_last != NULL )
1439 //                *p_last = 0;
1440             if ( p_st != NULL )
1441                 *p_st = 0;
1442             if ( __kmp_env_consistency_check ) {
1443                 if ( pr->pushed_ws != ct_none ) {
1444                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1445                 }
1446             }
1447         } else if ( pr->nomerge ) {
1448             kmp_int32 last;
1449             T         start;
1450             UT        limit, trip, init;
1451             ST        incr;
1452             T         chunk = pr->u.p.parm1;
1453 
1454             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1455 
1456             init = chunk * pr->u.p.count++;
1457             trip = pr->u.p.tc - 1;
1458 
1459             if ( (status = (init <= trip)) == 0 ) {
1460                 *p_lb = 0;
1461                 *p_ub = 0;
1462 //                if ( p_last != NULL )
1463 //                    *p_last = 0;
1464                 if ( p_st != NULL )
1465                     *p_st = 0;
1466                 if ( __kmp_env_consistency_check ) {
1467                     if ( pr->pushed_ws != ct_none ) {
1468                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1469                     }
1470                 }
1471             } else {
1472                 start = pr->u.p.lb;
1473                 limit = chunk + init - 1;
1474                 incr  = pr->u.p.st;
1475 
1476                 if ( (last = (limit >= trip)) != 0 ) {
1477                     limit = trip;
1478                     #if KMP_OS_WINDOWS
1479                     pr->u.p.last_upper = pr->u.p.ub;
1480                     #endif /* KMP_OS_WINDOWS */
1481                 }
1482                 if ( p_last != NULL )
1483                     *p_last = last;
1484                 if ( p_st != NULL )
1485                     *p_st = incr;
1486                 if ( incr == 1 ) {
1487                     *p_lb = start + init;
1488                     *p_ub = start + limit;
1489                 } else {
1490                     *p_lb = start + init * incr;
1491                     *p_ub = start + limit * incr;
1492                 }
1493 
1494                 if ( pr->ordered ) {
1495                     pr->u.p.ordered_lower = init;
1496                     pr->u.p.ordered_upper = limit;
1497                     #ifdef KMP_DEBUG
1498                     {
1499                         const char * buff;
1500                         // create format specifiers before the debug output
1501                         buff = __kmp_str_format(
1502                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1503                             traits_t< UT >::spec, traits_t< UT >::spec );
1504                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1505                         __kmp_str_free( &buff );
1506                     }
1507                     #endif
1508                 } // if
1509             } // if
1510         } else {
1511             pr->u.p.tc = 0;
1512             *p_lb = pr->u.p.lb;
1513             *p_ub = pr->u.p.ub;
1514             #if KMP_OS_WINDOWS
1515             pr->u.p.last_upper = *p_ub;
1516             #endif /* KMP_OS_WINDOWS */
1517             if ( p_last != NULL )
1518                 *p_last = TRUE;
1519             if ( p_st != NULL )
1520                 *p_st = pr->u.p.st;
1521         } // if
1522         #ifdef KMP_DEBUG
1523         {
1524             const char * buff;
1525             // create format specifiers before the debug output
1526             buff = __kmp_str_format(
1527                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1528                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1529                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1530             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, p_last ? *p_last : 0, status) );
1531             __kmp_str_free( &buff );
1532         }
1533         #endif
1534 #if INCLUDE_SSC_MARKS
1535         SSC_MARK_DISPATCH_NEXT();
1536 #endif
1537         OMPT_LOOP_END;
1538         return status;
1539     } else {
1540         kmp_int32 last = 0;
1541         dispatch_shared_info_template< UT > *sh;
1542         T         start;
1543         ST        incr;
1544         UT        limit, trip, init;
1545 
1546         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1547                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1548 
1549         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1550             ( th->th.th_dispatch->th_dispatch_pr_current );
1551         KMP_DEBUG_ASSERT( pr );
1552         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1553             ( th->th.th_dispatch->th_dispatch_sh_current );
1554         KMP_DEBUG_ASSERT( sh );
1555 
1556         if ( pr->u.p.tc == 0 ) {
1557             // zero trip count
1558             status = 0;
1559         } else {
1560             switch (pr->schedule) {
1561             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1562             case kmp_sch_static_steal:
1563                 {
1564                     T chunk = pr->u.p.parm1;
1565 
1566                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1567 
1568                     trip = pr->u.p.tc - 1;
1569 
1570                     if ( ___kmp_size_type > 4 ) {
1571                         // Other threads do not look into the data of this thread,
1572                         //  so a volatile cast is not necessary.
1573                         init   = ( pr->u.p.count )++;
1574                         status = ( init < (UT)pr->u.p.ub );
1575                     } else {
1576                         typedef union {
1577                             struct {
1578                                 UT count;
1579                                 T  ub;
1580                             } p;
1581                             kmp_int64 b;
1582                         } union_i4;
1583                         // Operations on 'count' and 'ub' must be combined atomically.
1584                         // Stealing is implemented only for 4-byte index types.
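                        // Both fields live in a single 64-bit word: a thief can
                        // shrink 'ub' while the owner bumps 'count', so the CAS
                        // below retries whenever either half changed underneath it.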
1585                         {
1586                             union_i4 vold, vnew;
1587                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1588                             vnew = vold;
1589                             vnew.p.count++;
1590                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1591                                         ( volatile kmp_int64* )&pr->u.p.count,
1592                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1593                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1594                                 KMP_CPU_PAUSE();
1595                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1596                                 vnew = vold;
1597                                 vnew.p.count++;
1598                             }
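                            // On success the CAS leaves the pre-increment pair in
                            // 'vold'; its count is the chunk index this thread claimed.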
1599                             vnew = vold;
1600                             init   = vnew.p.count;
1601                             status = ( init < (UT)vnew.p.ub ) ;
1602                         }
1603 
1604                         if( !status ) {
1605                             kmp_info_t   **other_threads = team->t.t_threads;
1606                             int          while_limit = 10;
1607                             int          while_index = 0;
1608 
1609                             // TODO: algorithm of searching for a victim
1610                             // should be cleaned up and measured
1611                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1612                                 union_i4  vold, vnew;
1613                                 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1614                                 T         victimIdx    = pr->u.p.parm4;
1615                                 T         oldVictimIdx = victimIdx;
1616                                 dispatch_private_info_template< T > * victim;
1617 
1618                                 do {
1619                                     if( !victimIdx ) {
1620                                         victimIdx = team->t.t_nproc - 1;
1621                                     } else {
1622                                         --victimIdx;
1623                                     }
1624                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1625                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1626                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1627                                 // TODO: think about a proper place of this test
1628                                 if ( ( !victim ) ||
1629                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1630                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1631                                     // The victim is not yet ready to participate in stealing,
1632                                     // because it is still inside __kmp_dispatch_init.
1633                                     // TODO: a short delay here would be nice.
1634                                     continue;
1635                                 }
1636                                 if ( oldVictimIdx == victimIdx ) {
1637                                     break;
1638                                 }
1639                                 pr->u.p.parm4 = victimIdx;
1640 
1641                                 while( 1 ) {
1642                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1643                                     vnew = vold;
1644 
1645                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1646                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1647                                         break;
1648                                     }
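                                    // steal about a quarter of the victim's remaining chunks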
1649                                     vnew.p.ub -= (remaining >> 2);
1650                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1651                                     #pragma warning( push )
1652                                     // disable warning on pointless comparison of unsigned with 0
1653                                     #pragma warning( disable: 186 )
1654                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1655                                     #pragma warning( pop )
1656                                     // TODO: Should this be acquire or release?
1657                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1658                                             ( volatile kmp_int64 * )&victim->u.p.count,
1659                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1660                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1661                                         status = 1;
1662                                         while_index = 0;
1663                                         // now update own count and ub
1664                                         #if KMP_ARCH_X86
1665                                         // stealing is executed on non-KMP_ARCH_X86 builds only
1666                                             // Atomic 64-bit write on ia32 is
1667                                             // unavailable, so we do this in steps.
1668                                             //     This code is not tested.
1669                                             init = vold.p.count;
1670                                             pr->u.p.ub = 0;
1671                                             pr->u.p.count = init + 1;
1672                                             pr->u.p.ub = vnew.p.count;
1673                                         #else
1674                                             init = vnew.p.ub;
1675                                             vold.p.count = init + 1;
1676                                             // TODO: is it safe and enough?
1677                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1678                                         #endif // KMP_ARCH_X86
1679                                         break;
1680                                     } // if
1681                                 KMP_CPU_PAUSE();
1682                                 } // while (1)
1683                             } // while
1684                         } // if
1685                     } // if
1686                     if ( !status ) {
1687                         *p_lb = 0;
1688                         *p_ub = 0;
1689                         if ( p_st != NULL ) *p_st = 0;
1690                     } else {
1691                         start = pr->u.p.parm2;
1692                         init *= chunk;
1693                         limit = chunk + init - 1;
1694                         incr  = pr->u.p.st;
1695 
1696                         KMP_DEBUG_ASSERT(init <= trip);
1697                         if ( (last = (limit >= trip)) != 0 )
1698                             limit = trip;
1699                         if ( p_st != NULL ) *p_st = incr;
1700 
1701                         if ( incr == 1 ) {
1702                             *p_lb = start + init;
1703                             *p_ub = start + limit;
1704                         } else {
1705                             *p_lb = start + init * incr;
1706                             *p_ub = start + limit * incr;
1707                         }
1708 
1709                         if ( pr->ordered ) {
1710                             pr->u.p.ordered_lower = init;
1711                             pr->u.p.ordered_upper = limit;
1712                             #ifdef KMP_DEBUG
1713                             {
1714                                 const char * buff;
1715                                 // create format specifiers before the debug output
1716                                 buff = __kmp_str_format(
1717                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1718                                     traits_t< UT >::spec, traits_t< UT >::spec );
1719                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1720                                 __kmp_str_free( &buff );
1721                             }
1722                             #endif
1723                         } // if
1724                     } // if
1725                     break;
1726                 } // case
1727             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1728             case kmp_sch_static_balanced:
1729                 {
1730                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1731                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1732                         pr->u.p.count = 1;
1733                         *p_lb = pr->u.p.lb;
1734                         *p_ub = pr->u.p.ub;
1735                         last = pr->u.p.parm1;
1736                         if ( p_st != NULL )
1737                             *p_st = pr->u.p.st;
1738                     } else {  /* no iterations to do */
1739                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1740                     }
1741                     if ( pr->ordered ) {
1742                         #ifdef KMP_DEBUG
1743                         {
1744                             const char * buff;
1745                             // create format specifiers before the debug output
1746                             buff = __kmp_str_format(
1747                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1748                                 traits_t< UT >::spec, traits_t< UT >::spec );
1749                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1750                             __kmp_str_free( &buff );
1751                         }
1752                         #endif
1753                     } // if
1754                 } // case
1755                 break;
1756             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1757             case kmp_sch_static_chunked:
1758                 {
1759                     T parm1;
1760 
1761                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1762                                    gtid ) );
1763                     parm1 = pr->u.p.parm1;
1764 
1765                     trip  = pr->u.p.tc - 1;
1766                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1767 
1768                     if ( (status = (init <= trip)) != 0 ) {
1769                         start = pr->u.p.lb;
1770                         incr  = pr->u.p.st;
1771                         limit = parm1 + init - 1;
1772 
1773                         if ( (last = (limit >= trip)) != 0 )
1774                             limit = trip;
1775 
1776                         if ( p_st != NULL ) *p_st = incr;
1777 
1778                         pr->u.p.count += team->t.t_nproc;
1779 
1780                         if ( incr == 1 ) {
1781                             *p_lb = start + init;
1782                             *p_ub = start + limit;
1783                         }
1784                         else {
1785                             *p_lb = start + init * incr;
1786                             *p_ub = start + limit * incr;
1787                         }
1788 
1789                         if ( pr->ordered ) {
1790                             pr->u.p.ordered_lower = init;
1791                             pr->u.p.ordered_upper = limit;
1792                             #ifdef KMP_DEBUG
1793                             {
1794                                 const char * buff;
1795                                 // create format specifiers before the debug output
1796                                 buff = __kmp_str_format(
1797                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1798                                     traits_t< UT >::spec, traits_t< UT >::spec );
1799                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1800                                 __kmp_str_free( &buff );
1801                             }
1802                             #endif
1803                         } // if
1804                     } // if
1805                 } // case
1806                 break;
1807 
1808             case kmp_sch_dynamic_chunked:
1809                 {
1810                     T chunk = pr->u.p.parm1;
1811 
1812                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1813                                    gtid ) );
1814 
1815                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1816                     trip = pr->u.p.tc - 1;
1817 
1818                     if ( (status = (init <= trip)) == 0 ) {
1819                         *p_lb = 0;
1820                         *p_ub = 0;
1821                         if ( p_st != NULL ) *p_st = 0;
1822                     } else {
1823                         start = pr->u.p.lb;
1824                         limit = chunk + init - 1;
1825                         incr  = pr->u.p.st;
1826 
1827                         if ( (last = (limit >= trip)) != 0 )
1828                             limit = trip;
1829 
1830                         if ( p_st != NULL ) *p_st = incr;
1831 
1832                         if ( incr == 1 ) {
1833                             *p_lb = start + init;
1834                             *p_ub = start + limit;
1835                         } else {
1836                             *p_lb = start + init * incr;
1837                             *p_ub = start + limit * incr;
1838                         }
1839 
1840                         if ( pr->ordered ) {
1841                             pr->u.p.ordered_lower = init;
1842                             pr->u.p.ordered_upper = limit;
1843                             #ifdef KMP_DEBUG
1844                             {
1845                                 const char * buff;
1846                                 // create format specifiers before the debug output
1847                                 buff = __kmp_str_format(
1848                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1849                                     traits_t< UT >::spec, traits_t< UT >::spec );
1850                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1851                                 __kmp_str_free( &buff );
1852                             }
1853                             #endif
1854                         } // if
1855                     } // if
1856                 } // case
1857                 break;
1858 
1859             case kmp_sch_guided_iterative_chunked:
1860                 {
1861                     T  chunkspec = pr->u.p.parm1;
1862                     KD_TRACE(100,
1863                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1864                     trip  = pr->u.p.tc;
1865                     // Start atomic part of calculations
1866                     while(1) {
1867                         ST  remaining;             // signed, because can be < 0
1868                         init = sh->u.s.iteration;  // shared value
1869                         remaining = trip - init;
1870                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1871                             // nothing to do, don't try atomic op
1872                             status = 0;
1873                             break;
1874                         }
1875                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1876                             // use a dynamic-style schedule
1877                             // atomically increment iterations, get the old value
1878                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1879                             remaining = trip - init;
1880                             if (remaining <= 0) {
1881                                 status = 0;    // all iterations got by other threads
1882                             } else {
1883                                 // got some iterations to work on
1884                                 status = 1;
1885                                 if ( (T)remaining > chunkspec ) {
1886                                     limit = init + chunkspec - 1;
1887                                 } else {
1888                                     last = 1;   // the last chunk
1889                                     limit = init + remaining - 1;
1890                                 } // if
1891                             } // if
1892                             break;
1893                         } // if
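                        // Guided path: each grab takes a fixed fraction of the
                        // remaining iterations.  parm3 holds that fraction as a
                        // double, roughly 1/(K*nproc) with K=2 by default, so
                        // e.g. with nproc=4 and remaining=800 the next chunk is
                        // about 100 iterations.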
1894                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1895                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1896                             // CAS was successful, chunk obtained
1897                             status = 1;
1898                             --limit;
1899                             break;
1900                         } // if
1901                     } // while
1902                     if ( status != 0 ) {
1903                         start = pr->u.p.lb;
1904                         incr = pr->u.p.st;
1905                         if ( p_st != NULL )
1906                             *p_st = incr;
1907                         *p_lb = start + init * incr;
1908                         *p_ub = start + limit * incr;
1909                         if ( pr->ordered ) {
1910                             pr->u.p.ordered_lower = init;
1911                             pr->u.p.ordered_upper = limit;
1912                             #ifdef KMP_DEBUG
1913                             {
1914                                 const char * buff;
1915                                 // create format specifiers before the debug output
1916                                 buff = __kmp_str_format(
1917                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1918                                     traits_t< UT >::spec, traits_t< UT >::spec );
1919                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1920                                 __kmp_str_free( &buff );
1921                             }
1922                             #endif
1923                         } // if
1924                     } else {
1925                         *p_lb = 0;
1926                         *p_ub = 0;
1927                         if ( p_st != NULL )
1928                             *p_st = 0;
1929                     } // if
1930                 } // case
1931                 break;
1932 
1933             case kmp_sch_guided_analytical_chunked:
1934                 {
1935                     T   chunkspec = pr->u.p.parm1;
1936                     UT chunkIdx;
1937     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1938                     /* for storing original FPCW value for Windows* OS on
1939                        IA-32 architecture, 8-byte version */
1940                     unsigned int oldFpcw;
1941                     unsigned int fpcwSet = 0;
1942     #endif
1943                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1944                                    gtid ) );
1945 
1946                     trip  = pr->u.p.tc;
1947 
1948                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1949                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1950 
1951                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1952                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1953                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1954                             --trip;
1955                             /* use dynamic-style scheduling */
1956                             init = chunkIdx * chunkspec + pr->u.p.count;
1957                             /* need to verify init > 0 in case of overflow in the above calculation */
1958                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1959                                 limit = init + chunkspec -1;
1960 
1961                                 if ( (last = (limit >= trip)) != 0 )
1962                                     limit = trip;
1963                             }
1964                             break;
1965                         } else {
1966                             /* use exponential-style scheduling */
1967                             /* The following check is to workaround the lack of long double precision on Windows* OS.
1968                                This check works around the possible effect that init != 0 for chunkIdx == 0.
1969                              */
1970     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1971                             /* If we haven't already done so, save the original
1972                                FPCW and set precision to 64-bit, as Windows* OS
1973                                on IA-32 architecture defaults to 53-bit */
1974                             if ( !fpcwSet ) {
1975                                 oldFpcw = _control87(0,0);
1976                                 _control87(_PC_64,_MCW_PC);
1977                                 fpcwSet = 0x30000;
1978                             }
1979     #endif
1980                             if ( chunkIdx ) {
1981                                 init = __kmp_dispatch_guided_remaining< T >(
1982                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1983                                 KMP_DEBUG_ASSERT(init);
1984                                 init = trip - init;
1985                             } else
1986                                 init = 0;
1987                             limit = trip - __kmp_dispatch_guided_remaining< T >(
1988                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1989                             KMP_ASSERT(init <= limit);
1990                             if ( init < limit ) {
1991                                 KMP_DEBUG_ASSERT(limit <= trip);
1992                                 --limit;
1993                                 status = 1;
1994                                 break;
1995                             } // if
1996                         } // if
1997                     } // while (1)
1998     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1999                     /* restore FPCW if necessary
2000                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2001                     */
2002                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2003                         _control87(oldFpcw,_MCW_PC);
2004     #endif
2005                     if ( status != 0 ) {
2006                         start = pr->u.p.lb;
2007                         incr = pr->u.p.st;
2008                         if ( p_st != NULL )
2009                             *p_st = incr;
2010                         *p_lb = start + init * incr;
2011                         *p_ub = start + limit * incr;
2012                         if ( pr->ordered ) {
2013                             pr->u.p.ordered_lower = init;
2014                             pr->u.p.ordered_upper = limit;
2015                             #ifdef KMP_DEBUG
2016                             {
2017                                 const char * buff;
2018                                 // create format specifiers before the debug output
2019                                 buff = __kmp_str_format(
2020                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2021                                     traits_t< UT >::spec, traits_t< UT >::spec );
2022                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2023                                 __kmp_str_free( &buff );
2024                             }
2025                             #endif
2026                         }
2027                     } else {
2028                         *p_lb = 0;
2029                         *p_ub = 0;
2030                         if ( p_st != NULL )
2031                             *p_st = 0;
2032                     }
2033                 } // case
2034                 break;
2035 
2036             case kmp_sch_trapezoidal:
2037                 {
2038                     UT   index;
2039                     T    parm2 = pr->u.p.parm2;
2040                     T    parm3 = pr->u.p.parm3;
2041                     T    parm4 = pr->u.p.parm4;
2042                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2043                                    gtid ) );
2044 
2045                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2046 
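                    /* Chunk sizes form a decreasing arithmetic sequence: chunk k
                       has parm2 - k*parm4 iterations, so the first "index" chunks
                       cover index*(2*parm2 - (index-1)*parm4)/2 iterations, which
                       is the start offset computed below. */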
2047                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2048                     trip = pr->u.p.tc - 1;
2049 
2050                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2051                         *p_lb = 0;
2052                         *p_ub = 0;
2053                         if ( p_st != NULL ) *p_st = 0;
2054                     } else {
2055                         start = pr->u.p.lb;
2056                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2057                         incr  = pr->u.p.st;
2058 
2059                         if ( (last = (limit >= trip)) != 0 )
2060                             limit = trip;
2061 
2062                         if ( p_st != NULL ) *p_st = incr;
2063 
2064                         if ( incr == 1 ) {
2065                             *p_lb = start + init;
2066                             *p_ub = start + limit;
2067                         } else {
2068                             *p_lb = start + init * incr;
2069                             *p_ub = start + limit * incr;
2070                         }
2071 
2072                         if ( pr->ordered ) {
2073                             pr->u.p.ordered_lower = init;
2074                             pr->u.p.ordered_upper = limit;
2075                             #ifdef KMP_DEBUG
2076                             {
2077                                 const char * buff;
2078                                 // create format specifiers before the debug output
2079                                 buff = __kmp_str_format(
2080                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2081                                     traits_t< UT >::spec, traits_t< UT >::spec );
2082                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2083                                 __kmp_str_free( &buff );
2084                             }
2085                             #endif
2086                         } // if
2087                     } // if
2088                 } // case
2089                 break;
2090             default:
2091                 {
2092                     status = 0; // to avoid complaints on uninitialized variable use
2093                     __kmp_msg(
2094                         kmp_ms_fatal,                        // Severity
2095                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2096                         KMP_HNT( GetNewerLibrary ),          // Hint
2097                         __kmp_msg_null                       // Variadic argument list terminator
2098                     );
2099                 }
2100                 break;
2101             } // switch
2102         } // if tc == 0;
2103 
2104         if ( status == 0 ) {
2105             UT   num_done;
2106 
2107             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2108             #ifdef KMP_DEBUG
2109             {
2110                 const char * buff;
2111                 // create format specifiers before the debug output
2112                 buff = __kmp_str_format(
2113                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2114                     traits_t< UT >::spec );
2115                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2116                 __kmp_str_free( &buff );
2117             }
2118             #endif
2119 
2120             if ( (ST)num_done == team->t.t_nproc-1 ) {
2121                 /* NOTE: release this buffer to be reused */
2122 
2123                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2124 
2125                 sh->u.s.num_done = 0;
2126                 sh->u.s.iteration = 0;
2127 
2128                 /* TODO replace with general release procedure? */
2129                 if ( pr->ordered ) {
2130                     sh->u.s.ordered_iteration = 0;
2131                 }
2132 
2133                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2134 
2135                 sh -> buffer_index += KMP_MAX_DISP_BUF;
2136                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2137                                 gtid, sh->buffer_index) );
2138 
2139                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2140 
2141             } // if
2142             if ( __kmp_env_consistency_check ) {
2143                 if ( pr->pushed_ws != ct_none ) {
2144                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2145                 }
2146             }
2147 
2148             th -> th.th_dispatch -> th_deo_fcn = NULL;
2149             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2150             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2151             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2152         } // if (status == 0)
2153 #if KMP_OS_WINDOWS
2154         else if ( last ) {
2155             pr->u.p.last_upper = pr->u.p.ub;
2156         }
2157 #endif /* KMP_OS_WINDOWS */
2158         if ( p_last != NULL && status != 0 )
2159             *p_last = last;
2160     } // if
2161 
2162     #ifdef KMP_DEBUG
2163     {
2164         const char * buff;
2165         // create format specifiers before the debug output
2166         buff = __kmp_str_format(
2167             "__kmp_dispatch_next: T#%%d normal case: " \
2168             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2169             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2170         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2171         __kmp_str_free( &buff );
2172     }
2173     #endif
2174 #if INCLUDE_SSC_MARKS
2175     SSC_MARK_DISPATCH_NEXT();
2176 #endif
2177     OMPT_LOOP_END;
2178     return status;
2179 }
2180 
2181 template< typename T >
2182 static void
2183 __kmp_dist_get_bounds(
2184     ident_t                          *loc,
2185     kmp_int32                         gtid,
2186     kmp_int32                        *plastiter,
2187     T                                *plower,
2188     T                                *pupper,
2189     typename traits_t< T >::signed_t  incr
2190 ) {
2191     typedef typename traits_t< T >::unsigned_t  UT;
2192     typedef typename traits_t< T >::signed_t    ST;
2193     register kmp_uint32  team_id;
2194     register kmp_uint32  nteams;
2195     register UT          trip_count;
2196     register kmp_team_t *team;
2197     kmp_info_t * th;
2198 
2199     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2200     KE_TRACE( 10, ("__kmp_dist_get_bounds called (%d)\n", gtid));
2201     #ifdef KMP_DEBUG
2202     {
2203         const char * buff;
2204         // create format specifiers before the debug output
2205         buff = __kmp_str_format( "__kmp_dist_get_bounds: T#%%d liter=%%d "\
2206             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2207             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2208             traits_t< T >::spec );
2209         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2210         __kmp_str_free( &buff );
2211     }
2212     #endif
2213 
2214     if( __kmp_env_consistency_check ) {
2215         if( incr == 0 ) {
2216             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2217         }
2218         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2219             // The loop is illegal.
2220             // Some zero-trip loops maintained by compiler, e.g.:
2221             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2222             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2223             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2224             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2225             // Compiler does not check the following illegal loops:
2226             //   for(i=0;i<10;i+=incr) // where incr<0
2227             //   for(i=10;i>0;i-=incr) // where incr<0
2228             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2229         }
2230     }
2231     th = __kmp_threads[gtid];
2232     team = th->th.th_team;
2233     #if OMP_40_ENABLED
2234     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2235     nteams = th->th.th_teams_size.nteams;
2236     #endif
2237     team_id = team->t.t_master_tid;
2238     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2239 
2240     // compute global trip count
2241     if( incr == 1 ) {
2242         trip_count = *pupper - *plower + 1;
2243     } else if(incr == -1) {
2244         trip_count = *plower - *pupper + 1;
2245     } else {
2246         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2247     }
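    // e.g. *plower=0, *pupper=9, incr=2  ->  trip_count = 9/2 + 1 = 5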
2248 
2249     if( trip_count <= nteams ) {
2250         KMP_DEBUG_ASSERT(
2251             __kmp_static == kmp_sch_static_greedy || \
2252             __kmp_static == kmp_sch_static_balanced
2253         ); // Unknown static scheduling type.
2254         // only some teams get single iteration, others get nothing
2255         if( team_id < trip_count ) {
2256             *pupper = *plower = *plower + team_id * incr;
2257         } else {
2258             *plower = *pupper + incr; // zero-trip loop
2259         }
2260         if( plastiter != NULL )
2261             *plastiter = ( team_id == trip_count - 1 );
2262     } else {
2263         if( __kmp_static == kmp_sch_static_balanced ) {
2264             register UT chunk = trip_count / nteams;
2265             register UT extras = trip_count % nteams;
2266             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2267             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2268             if( plastiter != NULL )
2269                 *plastiter = ( team_id == nteams - 1 );
2270         } else {
2271             register T chunk_inc_count =
2272                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2273             register T upper = *pupper;
2274             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2275                 // Unknown static scheduling type.
2276             *plower += team_id * chunk_inc_count;
2277             *pupper = *plower + chunk_inc_count - incr;
2278             // Check/correct bounds if needed
2279             if( incr > 0 ) {
2280                 if( *pupper < *plower )
2281                     *pupper = i_maxmin< T >::mx;
2282                 if( plastiter != NULL )
2283                     *plastiter = *plower <= upper && *pupper > upper - incr;
2284                 if( *pupper > upper )
2285                     *pupper = upper; // tracker C73258
2286             } else {
2287                 if( *pupper > *plower )
2288                     *pupper = i_maxmin< T >::mn;
2289                 if( plastiter != NULL )
2290                     *plastiter = *plower >= upper && *pupper < upper - incr;
2291                 if( *pupper < upper )
2292                     *pupper = upper; // tracker C73258
2293             }
2294         }
2295     }
2296 }
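
/* Worked example of the balanced split above (illustrative only):
 * trip_count=10, nteams=4, incr=1 gives chunk=2, extras=2, so the teams
 * receive [0,2], [3,5], [6,7], [8,9] -- the first "extras" teams get one
 * extra iteration, and team_id==nteams-1 owns the last iteration. */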
2297 
2298 //-----------------------------------------------------------------------------------------
2299 // Dispatch routines
2300 //    Transfer calls to template< typename T >
2301 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2302 //                         T lb, T ub, ST st, ST chunk )
2303 extern "C" {
2304 
2305 /*!
2306 @ingroup WORK_SHARING
2307 @{
2308 @param loc Source location
2309 @param gtid Global thread id
2310 @param schedule Schedule type
2311 @param lb  Lower bound
2312 @param ub  Upper bound
2313 @param st  Step (or increment if you prefer)
2314 @param chunk The chunk size to block with
2315 
2316 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2317 These functions are all identical apart from the types of the arguments.
2318 */
2319 
2320 void
2321 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2322                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2323 {
2324     KMP_DEBUG_ASSERT( __kmp_init_serial );
2325     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2326 }
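
/*
 * Illustrative sketch only (not part of the runtime): roughly the driver
 * loop a compiler might emit around these entry points for
 *     #pragma omp for schedule(dynamic, 4)
 *     for (kmp_int32 i = 0; i < 100; ++i) body(i);
 * "loc", "gtid" and "body" stand in for compiler-provided values.
 *
 *     kmp_int32 last, lb, ub, st;
 *     __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
 *                             0, 99, 1, 4 );          // bounds are inclusive
 *     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
 *         for ( kmp_int32 i = lb; i <= ub; i += st )
 *             body( i );
 *     }
 *     // __kmpc_dispatch_fini_4 additionally marks the end of each chunk
 *     // when the loop is "ordered" (see __kmp_dispatch_finish above).
 */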
2327 /*!
2328 See @ref __kmpc_dispatch_init_4
2329 */
2330 void
2331 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2332                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2333 {
2334     KMP_DEBUG_ASSERT( __kmp_init_serial );
2335     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2336 }
2337 
2338 /*!
2339 See @ref __kmpc_dispatch_init_4
2340 */
2341 void
2342 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2343                         kmp_int64 lb, kmp_int64 ub,
2344                         kmp_int64 st, kmp_int64 chunk )
2345 {
2346     KMP_DEBUG_ASSERT( __kmp_init_serial );
2347     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2348 }
2349 
2350 /*!
2351 See @ref __kmpc_dispatch_init_4
2352 */
2353 void
2354 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2355                          kmp_uint64 lb, kmp_uint64 ub,
2356                          kmp_int64 st, kmp_int64 chunk )
2357 {
2358     KMP_DEBUG_ASSERT( __kmp_init_serial );
2359     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2360 }
2361 
2362 /*!
2363 See @ref __kmpc_dispatch_init_4
2364 
2365 These functions differ from the __kmpc_dispatch_init set in that they are
2366 called for the composite "distribute parallel for" construct. Thus, before
2367 dispatching the regular iterations, the per-team iteration space must be computed.
2368 
2369 These functions are all identical apart from the types of the arguments.
2370 */
2371 void
2372 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2373     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2374 {
2375     KMP_DEBUG_ASSERT( __kmp_init_serial );
2376     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2377     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2378 }
2379 
2380 void
2381 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2382     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2383 {
2384     KMP_DEBUG_ASSERT( __kmp_init_serial );
2385     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2386     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2387 }
2388 
2389 void
2390 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2391     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2392 {
2393     KMP_DEBUG_ASSERT( __kmp_init_serial );
2394     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2395     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2396 }
2397 
2398 void
2399 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2400     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2401 {
2402     KMP_DEBUG_ASSERT( __kmp_init_serial );
2403     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2404     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2405 }
2406 
2407 /*!
2408 @param loc Source code location
2409 @param gtid Global thread id
2410 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2411 @param p_lb   Pointer to the lower bound for the next chunk of work
2412 @param p_ub   Pointer to the upper bound for the next chunk of work
2413 @param p_st   Pointer to the stride for the next chunk of work
2414 @return one if there is work to be done, zero otherwise
2415 
2416 Get the next dynamically allocated chunk of work for this thread.
2417 If there is no more work, then lb, ub and stride need not be modified.
2418 */
2419 int
2420 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2421                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2422 {
2423     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2424 }
2425 
2426 /*!
2427 See @ref __kmpc_dispatch_next_4
2428 */
2429 int
2430 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2431                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2432 {
2433     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2434 }
2435 
2436 /*!
2437 See @ref __kmpc_dispatch_next_4
2438 */
2439 int
2440 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2441                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2442 {
2443     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2444 }
2445 
2446 /*!
2447 See @ref __kmpc_dispatch_next_4
2448 */
2449 int
2450 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2451                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2452 {
2453     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2454 }
2455 
2456 /*!
2457 @param loc Source code location
2458 @param gtid Global thread id
2459 
2460 Mark the end of a dynamic loop.
2461 */
2462 void
2463 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2464 {
2465     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2466 }
2467 
2468 /*!
2469 See @ref __kmpc_dispatch_fini_4
2470 */
2471 void
2472 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2473 {
2474     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2475 }
2476 
2477 /*!
2478 See @ref __kmpc_dispatch_fini_4
2479 */
2480 void
2481 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2482 {
2483     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2484 }
2485 
2486 /*!
2487 See @ref __kmpc_dispatch_fini_4
2488 */
2489 void
2490 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2491 {
2492     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2493 }
2494 /*! @} */
2495 
2496 //-----------------------------------------------------------------------------------------
2497 // Non-template routines from kmp_dispatch.cpp used in other sources
2498 
2499 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2500     return value == checker;
2501 }
2502 
2503 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2504     return value != checker;
2505 }
2506 
2507 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2508     return value < checker;
2509 }
2510 
2511 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2512     return value >= checker;
2513 }
2514 
2515 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2516     return value <= checker;
2517 }
2518 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2519     return value == checker;
2520 }
2521 
2522 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2523     return value != checker;
2524 }
2525 
2526 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2527     return value < checker;
2528 }
2529 
2530 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2531     return value >= checker;
2532 }
2533 
2534 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2535     return value <= checker;
2536 }
2537 
2538 kmp_uint32
2539 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2540                    kmp_uint32            checker,
2541                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2542                    , void        * obj    // Higher-level synchronization object, or NULL.
2543                    )
2544 {
2545     // note: we may not belong to a team at this point
2546     register volatile kmp_uint32         * spin          = spinner;
2547     register          kmp_uint32           check         = checker;
2548     register          kmp_uint32   spins;
2549     register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2550     register          kmp_uint32           r;
2551 
2552     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2553     KMP_INIT_YIELD( spins );
2554     // main wait spin loop
2555     while(!f(r = TCR_4(*spin), check)) {
2556         KMP_FSYNC_SPIN_PREPARE( obj );
2557         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2558            It causes problems with infinite recursion because of exit lock */
2559         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2560             __kmp_abort_thread(); */
2561 
2562         /* if we have waited a bit, or are oversubscribed, yield */
2563         /* pause is in the following code */
2564         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2565         KMP_YIELD_SPIN( spins );
2566     }
2567     KMP_FSYNC_SPIN_ACQUIRED( obj );
2568     return r;
2569 }
2570 
2571 kmp_uint64
2572 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2573                     kmp_uint64            checker,
2574                     kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2575                     , void        * obj    // Higher-level synchronization object, or NULL.
2576                     )
2577 {
2578     // note: we may not belong to a team at this point
2579     register volatile kmp_uint64         * spin          = spinner;
2580     register          kmp_uint64           check         = checker;
2581     register          kmp_uint32   spins;
2582     register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2583     register          kmp_uint64           r;
2584 
2585     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2586     KMP_INIT_YIELD( spins );
2587     // main wait spin loop
    while(!f(r = *spin, check)) {
2590         KMP_FSYNC_SPIN_PREPARE( obj );
2591         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2592            It causes problems with infinite recursion because of exit lock */
2593         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2594             __kmp_abort_thread(); */
2595 
        // If we are oversubscribed, or have waited a bit
        // (and KMP_LIBRARY=throughput), then yield.
        // Pause is in the following code.
2599         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2600         KMP_YIELD_SPIN( spins );
2601     }
2602     KMP_FSYNC_SPIN_ACQUIRED( obj );
2603     return r;
2604 }
2605 
2606 } // extern "C"
2607 
2608 #ifdef KMP_GOMP_COMPAT
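
// Entry points for the GOMP compatibility layer (see kmp_gsupport).  They
// forward to the templated __kmp_dispatch_init() / __kmp_dispatch_finish_chunk()
// with an explicit push_ws flag, which (an assumption worth verifying against
// kmp_error.cpp) controls whether the workshare is pushed for consistency
// checking, since GOMP-style entry does not go through the __kmpc_* wrappers
// above.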
2609 
2610 void
2611 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2612                            kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2613                            kmp_int32 chunk, int push_ws )
2614 {
2615     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2616                                       push_ws );
2617 }
2618 
2619 void
2620 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2621                             kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2622                             kmp_int32 chunk, int push_ws )
2623 {
2624     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2625                                        push_ws );
2626 }
2627 
2628 void
2629 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2630                            kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2631                            kmp_int64 chunk, int push_ws )
2632 {
2633     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2634                                       push_ws );
2635 }
2636 
2637 void
2638 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2639                             kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2640                             kmp_int64 chunk, int push_ws )
2641 {
2642     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2643                                        push_ws );
2644 }
2645 
2646 void
2647 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2648 {
2649     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2650 }
2651 
2652 void
2653 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2654 {
2655     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2656 }
2657 
2658 void
2659 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2660 {
2661     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2662 }
2663 
2664 void
2665 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2666 {
2667     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2668 }
2669 
2670 #endif /* KMP_GOMP_COMPAT */
2671 
2672 /* ------------------------------------------------------------------------ */
2673 /* ------------------------------------------------------------------------ */
2674 
2675