1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
19  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
20  *       it may change values between parallel regions.  __kmp_max_nth
21  *       is the largest value __kmp_nth may take, 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 #include "kmp.h"
29 #include "kmp_i18n.h"
30 #include "kmp_itt.h"
31 #include "kmp_str.h"
32 #include "kmp_error.h"
33 #include "kmp_stats.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35     #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-internal.h"
40 #include "ompt-specific.h"
41 #endif
42 
43 /* ------------------------------------------------------------------------ */
44 /* ------------------------------------------------------------------------ */
45 
46 // template for type limits
47 template< typename T >
48 struct i_maxmin {
49     static const T mx;
50     static const T mn;
51 };
52 template<>
53 struct i_maxmin< int > {
54     static const int mx = 0x7fffffff;
55     static const int mn = 0x80000000;
56 };
57 template<>
58 struct i_maxmin< unsigned int > {
59     static const unsigned int mx = 0xffffffff;
60     static const unsigned int mn = 0x00000000;
61 };
62 template<>
63 struct i_maxmin< long long > {
64     static const long long mx = 0x7fffffffffffffffLL;
65     static const long long mn = 0x8000000000000000LL;
66 };
67 template<>
68 struct i_maxmin< unsigned long long > {
69     static const unsigned long long mx = 0xffffffffffffffffLL;
70     static const unsigned long long mn = 0x0000000000000000LL;
71 };
72 //-------------------------------------------------------------------------
73 
74 #ifdef KMP_STATIC_STEAL_ENABLED
75 
76     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77     template< typename T >
78     struct dispatch_private_infoXX_template {
79         typedef typename traits_t< T >::unsigned_t  UT;
80         typedef typename traits_t< T >::signed_t    ST;
81         UT count;                // unsigned
82         T  ub;
83         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84         T  lb;
85         ST st;                   // signed
86         UT tc;                   // unsigned
87         T  static_steal_counter; // for static_steal only; maybe better to put after ub
88 
89         /* parm[1-4] are used in different ways by different scheduling algorithms */
90 
91         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92         //    a) parm3 is properly aligned and
93         //    b) all parm1-4 are in the same cache line.
94         // Because of parm1-4 are used together, performance seems to be better
95         // if they are in the same line (not measured though).
96 
97         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98             T  parm1;
99             T  parm2;
100             T  parm3;
101             T  parm4;
102         };
103 
104         UT ordered_lower; // unsigned
105         UT ordered_upper; // unsigned
106         #if KMP_OS_WINDOWS
107         T  last_upper;
108         #endif /* KMP_OS_WINDOWS */
109     };
110 
111 #else /* KMP_STATIC_STEAL_ENABLED */
112 
113     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114     template< typename T >
115     struct dispatch_private_infoXX_template {
116         typedef typename traits_t< T >::unsigned_t  UT;
117         typedef typename traits_t< T >::signed_t    ST;
118         T  lb;
119         T  ub;
120         ST st;            // signed
121         UT tc;            // unsigned
122 
123         T  parm1;
124         T  parm2;
125         T  parm3;
126         T  parm4;
127 
128         UT count;         // unsigned
129 
130         UT ordered_lower; // unsigned
131         UT ordered_upper; // unsigned
132         #if KMP_OS_WINDOWS
133 	T  last_upper;
134         #endif /* KMP_OS_WINDOWS */
135     };
136 
137 #endif /* KMP_STATIC_STEAL_ENABLED */
138 
139 // replaces dispatch_private_info structure and dispatch_private_info_t type
140 template< typename T >
141 struct KMP_ALIGN_CACHE dispatch_private_info_template {
142     // duplicate alignment here, otherwise size of structure is not correct in our compiler
143     union KMP_ALIGN_CACHE private_info_tmpl {
144         dispatch_private_infoXX_template< T > p;
145         dispatch_private_info64_t             p64;
146     } u;
147     enum sched_type schedule;  /* scheduling algorithm */
148     kmp_uint32      ordered;   /* ordered clause specified */
149     kmp_uint32      ordered_bumped;
150     kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order
151     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152     kmp_uint32      nomerge;   /* don't merge iters if serialized */
153     kmp_uint32      type_size;
154     enum cons_type  pushed_ws;
155 };
156 
157 
158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159 template< typename UT >
160 struct dispatch_shared_infoXX_template {
161     /* chunk index under dynamic, number of idle threads under static-steal;
162        iteration index otherwise */
163     volatile UT     iteration;
164     volatile UT     num_done;
165     volatile UT     ordered_iteration;
166     UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
167 };
168 
169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
170 template< typename UT >
171 struct dispatch_shared_info_template {
172     // we need union here to keep the structure size
173     union shared_info_tmpl {
174         dispatch_shared_infoXX_template< UT >  s;
175         dispatch_shared_info64_t               s64;
176     } u;
177     volatile kmp_uint32     buffer_index;
178 };
179 
180 /* ------------------------------------------------------------------------ */
181 /* ------------------------------------------------------------------------ */
182 
183 #undef USE_TEST_LOCKS
184 
185 // test_then_add template (general template should NOT be used)
186 template< typename T >
187 static __forceinline T
188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
189 
190 template<>
191 __forceinline kmp_int32
192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193 {
194     kmp_int32 r;
195     r = KMP_TEST_THEN_ADD32( p, d );
196     return r;
197 }
198 
199 template<>
200 __forceinline kmp_int64
201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202 {
203     kmp_int64 r;
204     r = KMP_TEST_THEN_ADD64( p, d );
205     return r;
206 }
207 
208 // test_then_inc_acq template (general template should NOT be used)
209 template< typename T >
210 static __forceinline T
211 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
212 
213 template<>
214 __forceinline kmp_int32
215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216 {
217     kmp_int32 r;
218     r = KMP_TEST_THEN_INC_ACQ32( p );
219     return r;
220 }
221 
222 template<>
223 __forceinline kmp_int64
224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225 {
226     kmp_int64 r;
227     r = KMP_TEST_THEN_INC_ACQ64( p );
228     return r;
229 }
230 
231 // test_then_inc template (general template should NOT be used)
232 template< typename T >
233 static __forceinline T
234 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
235 
236 template<>
237 __forceinline kmp_int32
238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239 {
240     kmp_int32 r;
241     r = KMP_TEST_THEN_INC32( p );
242     return r;
243 }
244 
245 template<>
246 __forceinline kmp_int64
247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248 {
249     kmp_int64 r;
250     r = KMP_TEST_THEN_INC64( p );
251     return r;
252 }
253 
254 // compare_and_swap template (general template should NOT be used)
255 template< typename T >
256 static __forceinline kmp_int32
257 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
258 
259 template<>
260 __forceinline kmp_int32
261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262 {
263     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264 }
265 
266 template<>
267 __forceinline kmp_int32
268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269 {
270     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271 }
272 
273 /*
274     Spin wait loop that first does pause, then yield.
275     Waits until function returns non-zero when called with *spinner and check.
276     Does NOT put threads to sleep.
277 #if USE_ITT_BUILD
278     Arguments:
279         obj -- is higher-level synchronization object to report to ittnotify. It is used to report
280             locks consistently. For example, if lock is acquired immediately, its address is
281             reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired
282             immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same
283             address, not an address of low-level spinner.
284 #endif // USE_ITT_BUILD
285 */
286 template< typename UT >
287 // ToDo: make inline function (move to header file for icl)
288 static UT  // unsigned 4- or 8-byte type
289 __kmp_wait_yield( volatile UT * spinner,
290                   UT            checker,
291                   kmp_uint32 (* pred)( UT, UT )
292                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
293                   )
294 {
295     // note: we may not belong to a team at this point
296     register volatile UT         * spin          = spinner;
297     register          UT           check         = checker;
298     register          kmp_uint32   spins;
299     register          kmp_uint32 (*f) ( UT, UT ) = pred;
300     register          UT           r;
301 
302     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303     KMP_INIT_YIELD( spins );
304     // main wait spin loop
305     while(!f(r = *spin, check))
306     {
307         KMP_FSYNC_SPIN_PREPARE( obj );
308         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309            It causes problems with infinite recursion because of exit lock */
310         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311             __kmp_abort_thread(); */
312 
313         // if we are oversubscribed,
314         // or have waited a bit (and KMP_LIBRARY=throughput, then yield
315         // pause is in the following code
316         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317         KMP_YIELD_SPIN( spins );
318     }
319     KMP_FSYNC_SPIN_ACQUIRED( obj );
320     return r;
321 }
322 
323 template< typename UT >
324 static kmp_uint32 __kmp_eq( UT value, UT checker) {
325     return value == checker;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_neq( UT value, UT checker) {
330     return value != checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_lt( UT value, UT checker) {
335     return value < checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_ge( UT value, UT checker) {
340     return value >= checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_le( UT value, UT checker) {
345     return value <= checker;
346 }
347 
348 
349 /* ------------------------------------------------------------------------ */
350 /* ------------------------------------------------------------------------ */
351 
352 static void
353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354 {
355     kmp_info_t *th;
356 
357     KMP_DEBUG_ASSERT( gtid_ref );
358 
359     if ( __kmp_env_consistency_check ) {
360         th = __kmp_threads[*gtid_ref];
361         if ( th -> th.th_root -> r.r_active
362           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
363 #if KMP_USE_DYNAMIC_LOCK
364             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365 #else
366             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
367 #endif
368         }
369     }
370 }
371 
372 template< typename UT >
373 static void
374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375 {
376     typedef typename traits_t< UT >::signed_t    ST;
377     dispatch_private_info_template< UT > * pr;
378 
379     int gtid = *gtid_ref;
380 //    int  cid = *cid_ref;
381     kmp_info_t *th = __kmp_threads[ gtid ];
382     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383 
384     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385     if ( __kmp_env_consistency_check ) {
386         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387             ( th -> th.th_dispatch -> th_dispatch_pr_current );
388         if ( pr -> pushed_ws != ct_none ) {
389 #if KMP_USE_DYNAMIC_LOCK
390             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391 #else
392             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
393 #endif
394         }
395     }
396 
397     if ( ! th -> th.th_team -> t.t_serialized ) {
398         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399             ( th -> th.th_dispatch -> th_dispatch_sh_current );
400         UT  lower;
401 
402         if ( ! __kmp_env_consistency_check ) {
403                 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404                     ( th -> th.th_dispatch -> th_dispatch_pr_current );
405         }
406         lower = pr->u.p.ordered_lower;
407 
408         #if ! defined( KMP_GOMP_COMPAT )
409             if ( __kmp_env_consistency_check ) {
410                 if ( pr->ordered_bumped ) {
411                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412                     __kmp_error_construct2(
413                         kmp_i18n_msg_CnsMultipleNesting,
414                         ct_ordered_in_pdo, loc_ref,
415                         & p->stack_data[ p->w_top ]
416                     );
417                 }
418             }
419         #endif /* !defined(KMP_GOMP_COMPAT) */
420 
421         KMP_MB();
422         #ifdef KMP_DEBUG
423         {
424             const char * buff;
425             // create format specifiers before the debug output
426             buff = __kmp_str_format(
427                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428                 traits_t< UT >::spec, traits_t< UT >::spec );
429             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430             __kmp_str_free( &buff );
431         }
432         #endif
433 
434         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435                                 USE_ITT_BUILD_ARG( NULL )
436                                 );
437         KMP_MB();  /* is this necessary? */
438         #ifdef KMP_DEBUG
439         {
440             const char * buff;
441             // create format specifiers before the debug output
442             buff = __kmp_str_format(
443                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444                 traits_t< UT >::spec, traits_t< UT >::spec );
445             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446             __kmp_str_free( &buff );
447         }
448         #endif
449     }
450     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451 }
452 
453 static void
454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455 {
456     kmp_info_t *th;
457 
458     if ( __kmp_env_consistency_check ) {
459         th = __kmp_threads[*gtid_ref];
460         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462         }
463     }
464 }
465 
466 template< typename UT >
467 static void
468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469 {
470     typedef typename traits_t< UT >::signed_t    ST;
471     dispatch_private_info_template< UT > * pr;
472 
473     int gtid = *gtid_ref;
474 //    int  cid = *cid_ref;
475     kmp_info_t *th = __kmp_threads[ gtid ];
476     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477 
478     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479     if ( __kmp_env_consistency_check ) {
480         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481             ( th -> th.th_dispatch -> th_dispatch_pr_current );
482         if ( pr -> pushed_ws != ct_none ) {
483             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484         }
485     }
486 
487     if ( ! th -> th.th_team -> t.t_serialized ) {
488         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489             ( th -> th.th_dispatch -> th_dispatch_sh_current );
490 
491         if ( ! __kmp_env_consistency_check ) {
492             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
494         }
495 
496         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497         #if ! defined( KMP_GOMP_COMPAT )
498             if ( __kmp_env_consistency_check ) {
499                 if ( pr->ordered_bumped != 0 ) {
500                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501                     /* How to test it? - OM */
502                     __kmp_error_construct2(
503                         kmp_i18n_msg_CnsMultipleNesting,
504                         ct_ordered_in_pdo, loc_ref,
505                         & p->stack_data[ p->w_top ]
506                     );
507                 }
508             }
509         #endif /* !defined(KMP_GOMP_COMPAT) */
510 
511         KMP_MB();       /* Flush all pending memory write invalidates.  */
512 
513         pr->ordered_bumped += 1;
514 
515         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516                         gtid, pr->ordered_bumped ) );
517 
518         KMP_MB();       /* Flush all pending memory write invalidates.  */
519 
520         /* TODO use general release procedure? */
521         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522 
523         KMP_MB();       /* Flush all pending memory write invalidates.  */
524     }
525     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526 }
527 
528 /* Computes and returns x to the power of y, where y must a non-negative integer */
529 template< typename UT >
530 static __forceinline long double
531 __kmp_pow(long double x, UT y) {
532     long double s=1.0L;
533 
534     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536     while(y) {
537         if ( y & 1 )
538             s *= x;
539         x *= x;
540         y >>= 1;
541     }
542     return s;
543 }
544 
545 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
546    (the total number of unassigned iterations in chunks with index greater than or equal to idx).
547    __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
548    (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
549 */
550 template< typename T >
551 static __inline typename traits_t< T >::unsigned_t
552 __kmp_dispatch_guided_remaining(
553     T                                  tc,
554     typename traits_t< T >::floating_t base,
555     typename traits_t< T >::unsigned_t idx
556 ) {
557     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558        least for ICL 8.1, long double arithmetic may not really have
559        long double precision, even with /Qlong_double.  Currently, we
560        workaround that in the caller code, by manipulating the FPCW for
561        Windows* OS on IA-32 architecture.  The lack of precision is not
562        expected to be a correctness issue, though.
563     */
564     typedef typename traits_t< T >::unsigned_t  UT;
565 
566     long double x = tc * __kmp_pow< UT >(base, idx);
567     UT r = (UT) x;
568     if ( x == r )
569         return r;
570     return r + 1;
571 }
572 
573 // Parameters of the guided-iterative algorithm:
574 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
575 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
576 // by default n = 2. For example with n = 3 the chunks distribution will be more flat.
577 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
578 static int guided_int_param = 2;
579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
580 
581 // UT - unsigned flavor of T, ST - signed flavor of T,
582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583 template< typename T >
584 static void
585 __kmp_dispatch_init(
586     ident_t                        * loc,
587     int                              gtid,
588     enum sched_type                  schedule,
589     T                                lb,
590     T                                ub,
591     typename traits_t< T >::signed_t st,
592     typename traits_t< T >::signed_t chunk,
593     int                              push_ws
594 ) {
595     typedef typename traits_t< T >::unsigned_t  UT;
596     typedef typename traits_t< T >::signed_t    ST;
597     typedef typename traits_t< T >::floating_t  DBL;
598     static const int ___kmp_size_type = sizeof( UT );
599 
600     int                                            active;
601     T                                              tc;
602     kmp_info_t *                                   th;
603     kmp_team_t *                                   team;
604     kmp_uint32                                     my_buffer_index;
605     dispatch_private_info_template< T >          * pr;
606     dispatch_shared_info_template< UT > volatile * sh;
607 
608     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610 
611     if ( ! TCR_4( __kmp_init_parallel ) )
612         __kmp_parallel_initialize();
613 
614 #if INCLUDE_SSC_MARKS
615     SSC_MARK_DISPATCH_INIT();
616 #endif
617     #ifdef KMP_DEBUG
618     {
619         const char * buff;
620         // create format specifiers before the debug output
621         buff = __kmp_str_format(
622             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625         __kmp_str_free( &buff );
626     }
627     #endif
628     /* setup data */
629     th     = __kmp_threads[ gtid ];
630     team   = th -> th.th_team;
631     active = ! team -> t.t_serialized;
632     th->th.th_ident = loc;
633 
634 #if USE_ITT_BUILD
635     kmp_uint64 cur_chunk = chunk;
636     int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
637         KMP_MASTER_GTID(gtid) &&
638 #if OMP_40_ENABLED
639         th->th.th_teams_microtask == NULL &&
640 #endif
641         team->t.t_active_level == 1;
642 #endif
643     if ( ! active ) {
644         pr = reinterpret_cast< dispatch_private_info_template< T >* >
645             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
646     } else {
647         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
648                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
649 
650         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
651 
652         /* What happens when number of threads changes, need to resize buffer? */
653         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
654             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
655         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
656             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
657     }
658 
659     /* Pick up the nomerge/ordered bits from the scheduling type */
660     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
661         pr->nomerge = TRUE;
662         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
663     } else {
664         pr->nomerge = FALSE;
665     }
666     pr->type_size = ___kmp_size_type; // remember the size of variables
667     if ( kmp_ord_lower & schedule ) {
668         pr->ordered = TRUE;
669         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
670     } else {
671         pr->ordered = FALSE;
672     }
673 
674     if ( schedule == kmp_sch_static ) {
675         schedule = __kmp_static;
676     } else {
677         if ( schedule == kmp_sch_runtime ) {
678             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
679             schedule = team -> t.t_sched.r_sched_type;
680             // Detail the schedule if needed (global controls are differentiated appropriately)
681             if ( schedule == kmp_sch_guided_chunked ) {
682                 schedule = __kmp_guided;
683             } else if ( schedule == kmp_sch_static ) {
684                 schedule = __kmp_static;
685             }
686             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
687             chunk = team -> t.t_sched.chunk;
688 
689             #ifdef KMP_DEBUG
690             {
691                 const char * buff;
692                 // create format specifiers before the debug output
693                 buff = __kmp_str_format(
694                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
695                     traits_t< ST >::spec );
696                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
697                 __kmp_str_free( &buff );
698             }
699             #endif
700         } else {
701             if ( schedule == kmp_sch_guided_chunked ) {
702                 schedule = __kmp_guided;
703             }
704             if ( chunk <= 0 ) {
705                 chunk = KMP_DEFAULT_CHUNK;
706             }
707         }
708 
709         if ( schedule == kmp_sch_auto ) {
710             // mapping and differentiation: in the __kmp_do_serial_initialize()
711             schedule = __kmp_auto;
712             #ifdef KMP_DEBUG
713             {
714                 const char * buff;
715                 // create format specifiers before the debug output
716                 buff = __kmp_str_format(
717                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
718                     traits_t< ST >::spec );
719                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
720                 __kmp_str_free( &buff );
721             }
722             #endif
723         }
724 
725         /* guided analytical not safe for too many threads */
726         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
727             schedule = kmp_sch_guided_iterative_chunked;
728             KMP_WARNING( DispatchManyThreads );
729         }
730         pr->u.p.parm1 = chunk;
731     }
732     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
733                 "unknown scheduling type" );
734 
735     pr->u.p.count = 0;
736 
737     if ( __kmp_env_consistency_check ) {
738         if ( st == 0 ) {
739             __kmp_error_construct(
740                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
741                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
742             );
743         }
744     }
745 
746     tc = ( ub - lb + st );
747     if ( st != 1 ) {
748         if ( st < 0 ) {
749             if ( lb < ub ) {
750                 tc = 0;            // zero-trip
751             } else {   // lb >= ub
752                 tc = (ST)tc / st;  // convert to signed division
753             }
754         } else {       // st > 0
755             if ( ub < lb ) {
756                 tc = 0;            // zero-trip
757             } else {   // lb >= ub
758                 tc /= st;
759             }
760         }
761     } else if ( ub < lb ) {        // st == 1
762         tc = 0;                    // zero-trip
763     }
764 
765     // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
766     // when statistics are disabled.
767     if (schedule == __kmp_static)
768     {
769         KMP_COUNT_BLOCK(OMP_FOR_static);
770         KMP_COUNT_VALUE(FOR_static_iterations, tc);
771     }
772     else
773     {
774         KMP_COUNT_BLOCK(OMP_FOR_dynamic);
775         KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
776     }
777 
778     pr->u.p.lb = lb;
779     pr->u.p.ub = ub;
780     pr->u.p.st = st;
781     pr->u.p.tc = tc;
782 
783     #if KMP_OS_WINDOWS
784     pr->u.p.last_upper = ub + st;
785     #endif /* KMP_OS_WINDOWS */
786 
787     /* NOTE: only the active parallel region(s) has active ordered sections */
788 
789     if ( active ) {
790         if ( pr->ordered == 0 ) {
791             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
792             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
793         } else {
794             pr->ordered_bumped = 0;
795 
796             pr->u.p.ordered_lower = 1;
797             pr->u.p.ordered_upper = 0;
798 
799             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
800             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
801         }
802     }
803 
804     if ( __kmp_env_consistency_check ) {
805         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
806         if ( push_ws ) {
807             __kmp_push_workshare( gtid, ws, loc );
808             pr->pushed_ws = ws;
809         } else {
810             __kmp_check_workshare( gtid, ws, loc );
811             pr->pushed_ws = ct_none;
812         }
813     }
814 
815     switch ( schedule ) {
816     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
817     case kmp_sch_static_steal:
818         {
819             T nproc = team->t.t_nproc;
820             T ntc, init;
821 
822             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
823 
824             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
825             if ( nproc > 1 && ntc >= nproc ) {
826                 T id = __kmp_tid_from_gtid(gtid);
827                 T small_chunk, extras;
828 
829                 small_chunk = ntc / nproc;
830                 extras = ntc % nproc;
831 
832                 init = id * small_chunk + ( id < extras ? id : extras );
833                 pr->u.p.count = init;
834                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
835 
836                 pr->u.p.parm2 = lb;
837                 //pr->pfields.parm3 = 0; // it's not used in static_steal
838                 pr->u.p.parm4 = id;
839                 pr->u.p.st = st;
840                 break;
841             } else {
842                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
843                                gtid ) );
844                 schedule = kmp_sch_static_balanced;
845                 /* too few iterations: fall-through to kmp_sch_static_balanced */
846             } // if
847             /* FALL-THROUGH to static balanced */
848         } // case
849     #endif
850     case kmp_sch_static_balanced:
851         {
852             T nproc = team->t.t_nproc;
853             T init, limit;
854 
855             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
856                             gtid ) );
857 
858             if ( nproc > 1 ) {
859                 T id = __kmp_tid_from_gtid(gtid);
860 
861                 if ( tc < nproc ) {
862                     if ( id < tc ) {
863                         init = id;
864                         limit = id;
865                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
866                     } else {
867                         pr->u.p.count = 1;  /* means no more chunks to execute */
868                         pr->u.p.parm1 = FALSE;
869                         break;
870                     }
871                 } else {
872                     T small_chunk = tc / nproc;
873                     T extras = tc % nproc;
874                     init = id * small_chunk + (id < extras ? id : extras);
875                     limit = init + small_chunk - (id < extras ? 0 : 1);
876                     pr->u.p.parm1 = (id == nproc - 1);
877                 }
878             } else {
879                 if ( tc > 0 ) {
880                     init = 0;
881                     limit = tc - 1;
882                     pr->u.p.parm1 = TRUE;
883                 } else {
884                     // zero trip count
885                     pr->u.p.count = 1;  /* means no more chunks to execute */
886                     pr->u.p.parm1 = FALSE;
887                     break;
888                 }
889             }
890 #if USE_ITT_BUILD
891             // Calculate chunk for metadata report
892             if ( itt_need_metadata_reporting )
893                 cur_chunk = limit - init + 1;
894 #endif
895             if ( st == 1 ) {
896                 pr->u.p.lb = lb + init;
897                 pr->u.p.ub = lb + limit;
898             } else {
899                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
900                 pr->u.p.lb = lb + init * st;
901                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
902                 if ( st > 0 ) {
903                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
904                 } else {
905                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
906                 }
907             }
908             if ( pr->ordered ) {
909                 pr->u.p.ordered_lower = init;
910                 pr->u.p.ordered_upper = limit;
911             }
912             break;
913         } // case
914     case kmp_sch_guided_iterative_chunked :
915         {
916             T nproc = team->t.t_nproc;
917             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
918 
919             if ( nproc > 1 ) {
920                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
921                     /* chunk size too large, switch to dynamic */
922                     schedule = kmp_sch_dynamic_chunked;
923                 } else {
924                     // when remaining iters become less than parm2 - switch to dynamic
925                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
926                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
927                 }
928             } else {
929                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
930                 schedule = kmp_sch_static_greedy;
931                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
932                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
933                 pr->u.p.parm1 = tc;
934             } // if
935         } // case
936         break;
937     case kmp_sch_guided_analytical_chunked:
938         {
939             T nproc = team->t.t_nproc;
940             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
941 
942             if ( nproc > 1 ) {
943                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
944                     /* chunk size too large, switch to dynamic */
945                     schedule = kmp_sch_dynamic_chunked;
946                 } else {
947                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
948                     DBL x;
949 
950                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
951                     /* Linux* OS already has 64-bit computation by default for
952 		       long double, and on Windows* OS on Intel(R) 64,
953 		       /Qlong_double doesn't work.  On Windows* OS
954 		       on IA-32 architecture, we need to set precision to
955 		       64-bit instead of the default 53-bit. Even though long
956 		       double doesn't work on Windows* OS on Intel(R) 64, the
957 		       resulting lack of precision is not expected to impact
958 		       the correctness of the algorithm, but this has not been
959 		       mathematically proven.
960                     */
961                     // save original FPCW and set precision to 64-bit, as
962                     // Windows* OS on IA-32 architecture defaults to 53-bit
963                     unsigned int oldFpcw = _control87(0,0);
964                     _control87(_PC_64,_MCW_PC); // 0,0x30000
965                     #endif
966                     /* value used for comparison in solver for cross-over point */
967                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
968 
969                     /* crossover point--chunk indexes equal to or greater than
970 		       this point switch to dynamic-style scheduling */
971                     UT   cross;
972 
973                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
974                     x = (long double)1.0 - (long double)0.5 / nproc;
975 
976                     #ifdef KMP_DEBUG
977                     { // test natural alignment
978                         struct _test_a {
979                             char a;
980                             union {
981                                 char b;
982                                 DBL  d;
983                             };
984                         } t;
985                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
986                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
987                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
988                     }
989                     #endif // KMP_DEBUG
990 
991                     /* save the term in thread private dispatch structure */
992                     *(DBL*)&pr->u.p.parm3 = x;
993 
994                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
995                     {
996                         UT          left, right, mid;
997                         long double p;
998 
999                         /* estimate initial upper and lower bound */
1000 
1001                         /* doesn't matter what value right is as long as it is positive, but
1002                            it affects performance of the solver
1003                         */
1004                         right = 229;
1005                         p = __kmp_pow< UT >(x,right);
1006                         if ( p > target ) {
1007                             do{
1008                                 p *= p;
1009                                 right <<= 1;
1010                             } while(p>target && right < (1<<27));
1011                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1012                         } else {
1013                             left = 0;
1014                         }
1015 
1016                         /* bisection root-finding method */
1017                         while ( left + 1 < right ) {
1018                             mid = (left + right) / 2;
1019                             if ( __kmp_pow< UT >(x,mid) > target ) {
1020                                 left = mid;
1021                             } else {
1022                                 right = mid;
1023                             }
1024                         } // while
1025                         cross = right;
1026                     }
1027                     /* assert sanity of computed crossover point */
1028                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1029 
1030                     /* save the crossover point in thread private dispatch structure */
1031                     pr->u.p.parm2 = cross;
1032 
1033                     // C75803
1034                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1035                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1036                     #else
1037                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1038                     #endif
1039                     /* dynamic-style scheduling offset */
1040                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1041                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1042                         // restore FPCW
1043                         _control87(oldFpcw,_MCW_PC);
1044                     #endif
1045                 } // if
1046             } else {
1047                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1048                                gtid ) );
1049                 schedule = kmp_sch_static_greedy;
1050                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1051                 pr->u.p.parm1 = tc;
1052             } // if
1053         } // case
1054         break;
1055     case kmp_sch_static_greedy:
1056         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1057             pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1058                 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1059                 tc;
1060         break;
1061     case kmp_sch_static_chunked :
1062     case kmp_sch_dynamic_chunked :
1063         if ( pr->u.p.parm1 <= 0 ) {
1064             pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1065         }
1066         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1067         break;
1068     case kmp_sch_trapezoidal :
1069         {
1070             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1071 
1072             T parm1, parm2, parm3, parm4;
1073             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1074 
1075             parm1 = chunk;
1076 
1077             /* F : size of the first cycle */
1078             parm2 = ( tc / (2 * team->t.t_nproc) );
1079 
1080             if ( parm2 < 1 ) {
1081                 parm2 = 1;
1082             }
1083 
1084             /* L : size of the last cycle.  Make sure the last cycle
1085              *     is not larger than the first cycle.
1086              */
1087             if ( parm1 < 1 ) {
1088                 parm1 = 1;
1089             } else if ( parm1 > parm2 ) {
1090                 parm1 = parm2;
1091             }
1092 
1093             /* N : number of cycles */
1094             parm3 = ( parm2 + parm1 );
1095             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1096 
1097             if ( parm3 < 2 ) {
1098                 parm3 = 2;
1099             }
1100 
1101             /* sigma : decreasing incr of the trapezoid */
1102             parm4 = ( parm3 - 1 );
1103             parm4 = ( parm2 - parm1 ) / parm4;
1104 
1105             // pointless check, because parm4 >= 0 always
1106             //if ( parm4 < 0 ) {
1107             //    parm4 = 0;
1108             //}
1109 
1110             pr->u.p.parm1 = parm1;
1111             pr->u.p.parm2 = parm2;
1112             pr->u.p.parm3 = parm3;
1113             pr->u.p.parm4 = parm4;
1114         } // case
1115         break;
1116 
1117     default:
1118         {
1119             __kmp_msg(
1120                 kmp_ms_fatal,                        // Severity
1121                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1122                 KMP_HNT( GetNewerLibrary ),          // Hint
1123                 __kmp_msg_null                       // Variadic argument list terminator
1124             );
1125         }
1126         break;
1127     } // switch
1128     pr->schedule = schedule;
1129     if ( active ) {
1130         /* The name of this buffer should be my_buffer_index when it's free to use it */
1131 
1132         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1133                         gtid, my_buffer_index, sh->buffer_index) );
1134         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1135                                         USE_ITT_BUILD_ARG( NULL )
1136                                         );
1137             // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1138             // *always* 32-bit integers.
1139         KMP_MB();  /* is this necessary? */
1140         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1141                         gtid, my_buffer_index, sh->buffer_index) );
1142 
1143         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1144         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1145 #if USE_ITT_BUILD
1146         if ( pr->ordered ) {
1147             __kmp_itt_ordered_init( gtid );
1148         }; // if
1149         // Report loop metadata
1150         if ( itt_need_metadata_reporting ) {
1151             // Only report metadata by master of active team at level 1
1152             kmp_uint64 schedtype = 0;
1153             switch ( schedule ) {
1154             case kmp_sch_static_chunked:
1155             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1156                 break;
1157             case kmp_sch_static_greedy:
1158                 cur_chunk = pr->u.p.parm1;
1159                 break;
1160             case kmp_sch_dynamic_chunked:
1161                 schedtype = 1;
1162                 break;
1163             case kmp_sch_guided_iterative_chunked:
1164             case kmp_sch_guided_analytical_chunked:
1165                 schedtype = 2;
1166                 break;
1167             default:
1168 //            Should we put this case under "static"?
1169 //            case kmp_sch_static_steal:
1170                 schedtype = 3;
1171                 break;
1172             }
1173             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1174         }
1175 #endif /* USE_ITT_BUILD */
1176     }; // if
1177 
1178     #ifdef KMP_DEBUG
1179     {
1180         const char * buff;
1181         // create format specifiers before the debug output
1182         buff = __kmp_str_format(
1183             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1184             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1185             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1186             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1187             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1188             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1189             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1190         KD_TRACE(10, ( buff,
1191             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1192             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1193             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1194             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1195         __kmp_str_free( &buff );
1196     }
1197     #endif
1198     #if ( KMP_STATIC_STEAL_ENABLED )
1199     if ( ___kmp_size_type < 8 ) {
1200       // It cannot be guaranteed that after execution of a loop with some other schedule kind
1201       // all the parm3 variables will contain the same value.
1202       // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1
1203       // rather than program life-time increment.
1204       // So the dedicated variable is required. The 'static_steal_counter' is used.
1205       if( schedule == kmp_sch_static_steal ) {
1206         // Other threads will inspect this variable when searching for a victim.
1207         // This is a flag showing that other threads may steal from this thread since then.
1208         volatile T * p = &pr->u.p.static_steal_counter;
1209         *p = *p + 1;
1210       }
1211     }
1212     #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
1213 
1214 #if OMPT_SUPPORT && OMPT_TRACE
1215     if (ompt_enabled &&
1216         ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1217         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1218         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1219         ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1220             team_info->parallel_id, task_info->task_id, team_info->microtask);
1221     }
1222 #endif
1223 }
1224 
1225 /*
1226  * For ordered loops, either __kmp_dispatch_finish() should be called after
1227  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1228  * every chunk of iterations.  If the ordered section(s) were not executed
1229  * for this iteration (or every iteration in this chunk), we need to set the
1230  * ordered iteration counters so that the next thread can proceed.
1231  */
1232 template< typename UT >
1233 static void
1234 __kmp_dispatch_finish( int gtid, ident_t *loc )
1235 {
1236     typedef typename traits_t< UT >::signed_t ST;
1237     kmp_info_t *th = __kmp_threads[ gtid ];
1238 
1239     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1240     if ( ! th -> th.th_team -> t.t_serialized ) {
1241 
1242         dispatch_private_info_template< UT > * pr =
1243             reinterpret_cast< dispatch_private_info_template< UT >* >
1244             ( th->th.th_dispatch->th_dispatch_pr_current );
1245         dispatch_shared_info_template< UT > volatile * sh =
1246             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1247             ( th->th.th_dispatch->th_dispatch_sh_current );
1248         KMP_DEBUG_ASSERT( pr );
1249         KMP_DEBUG_ASSERT( sh );
1250         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1251                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1252 
1253         if ( pr->ordered_bumped ) {
1254             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1255                             gtid ) );
1256             pr->ordered_bumped = 0;
1257         } else {
1258             UT lower = pr->u.p.ordered_lower;
1259 
1260             #ifdef KMP_DEBUG
1261             {
1262                 const char * buff;
1263                 // create format specifiers before the debug output
1264                 buff = __kmp_str_format(
1265                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1266                     traits_t< UT >::spec, traits_t< UT >::spec );
1267                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1268                 __kmp_str_free( &buff );
1269             }
1270             #endif
1271 
1272             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1273                                    USE_ITT_BUILD_ARG(NULL)
1274                                    );
1275             KMP_MB();  /* is this necessary? */
1276             #ifdef KMP_DEBUG
1277             {
1278                 const char * buff;
1279                 // create format specifiers before the debug output
1280                 buff = __kmp_str_format(
1281                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1282                     traits_t< UT >::spec, traits_t< UT >::spec );
1283                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1284                 __kmp_str_free( &buff );
1285             }
1286             #endif
1287 
1288             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1289         } // if
1290     } // if
1291     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1292 }
1293 
1294 #ifdef KMP_GOMP_COMPAT
1295 
1296 template< typename UT >
1297 static void
1298 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1299 {
1300     typedef typename traits_t< UT >::signed_t ST;
1301     kmp_info_t *th = __kmp_threads[ gtid ];
1302 
1303     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1304     if ( ! th -> th.th_team -> t.t_serialized ) {
1305 //        int cid;
1306         dispatch_private_info_template< UT > * pr =
1307             reinterpret_cast< dispatch_private_info_template< UT >* >
1308             ( th->th.th_dispatch->th_dispatch_pr_current );
1309         dispatch_shared_info_template< UT > volatile * sh =
1310             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1311             ( th->th.th_dispatch->th_dispatch_sh_current );
1312         KMP_DEBUG_ASSERT( pr );
1313         KMP_DEBUG_ASSERT( sh );
1314         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1315                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1316 
1317 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1318             UT lower = pr->u.p.ordered_lower;
1319             UT upper = pr->u.p.ordered_upper;
1320             UT inc = upper - lower + 1;
1321 
1322             if ( pr->ordered_bumped == inc ) {
1323                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1324                   gtid ) );
1325                 pr->ordered_bumped = 0;
1326             } else {
1327                 inc -= pr->ordered_bumped;
1328 
1329                 #ifdef KMP_DEBUG
1330                 {
1331                     const char * buff;
1332                     // create format specifiers before the debug output
1333                     buff = __kmp_str_format(
1334                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1335                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1336                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1337                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1338                     __kmp_str_free( &buff );
1339                 }
1340                 #endif
1341 
1342                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1343                                        USE_ITT_BUILD_ARG(NULL)
1344                                        );
1345 
1346                 KMP_MB();  /* is this necessary? */
1347                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1348                   gtid ) );
1349                 pr->ordered_bumped = 0;
1350 //!!!!! TODO check if the inc should be unsigned, or signed???
1351                 #ifdef KMP_DEBUG
1352                 {
1353                     const char * buff;
1354                     // create format specifiers before the debug output
1355                     buff = __kmp_str_format(
1356                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1357                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1358                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1359                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1360                     __kmp_str_free( &buff );
1361                 }
1362                 #endif
1363 
1364                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1365             }
1366 //        }
1367     }
1368     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1369 }
1370 
1371 #endif /* KMP_GOMP_COMPAT */
1372 
1373 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1374  * (no more work), then tell OMPT the loop is over. In some cases
1375  * kmp_dispatch_fini() is not called. */
1376 #if OMPT_SUPPORT && OMPT_TRACE
1377 #define OMPT_LOOP_END                                                          \
1378     if (status == 0) {                                                         \
1379         if (ompt_enabled &&                     \
1380             ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
1381             ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
1382             ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
1383             ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
1384                 team_info->parallel_id, task_info->task_id);                   \
1385         }                                                                      \
1386     }
1387 #else
1388 #define OMPT_LOOP_END // no-op
1389 #endif
1390 
1391 template< typename T >
1392 static int
1393 __kmp_dispatch_next(
1394     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1395 ) {
1396 
1397     typedef typename traits_t< T >::unsigned_t  UT;
1398     typedef typename traits_t< T >::signed_t    ST;
1399     typedef typename traits_t< T >::floating_t  DBL;
1400 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1401     static const int ___kmp_size_type = sizeof( UT );
1402 #endif
1403 
1404     // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtme schedule
1405     // is static. (Which points out a disadavantage of schedule(runtime): even when static scheduling is used it costs
1406     // more than a compile time choice to use static scheduling would.)
1407     KMP_TIME_BLOCK(FOR_dynamic_scheduling);
1408 
1409     int                                   status;
1410     dispatch_private_info_template< T > * pr;
1411     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1412     kmp_team_t                          * team = th -> th.th_team;
1413 
1414     KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1415     #ifdef KMP_DEBUG
1416     {
1417         const char * buff;
1418         // create format specifiers before the debug output
1419         buff = __kmp_str_format(
1420             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1421             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1422         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1423         __kmp_str_free( &buff );
1424     }
1425     #endif
1426 
1427     if ( team -> t.t_serialized ) {
1428         /* NOTE: serialize this dispatch becase we are not at the active level */
1429         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1430             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1431         KMP_DEBUG_ASSERT( pr );
1432 
1433         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1434             *p_lb = 0;
1435             *p_ub = 0;
1436 //            if ( p_last != NULL )
1437 //                *p_last = 0;
1438             if ( p_st != NULL )
1439                 *p_st = 0;
1440             if ( __kmp_env_consistency_check ) {
1441                 if ( pr->pushed_ws != ct_none ) {
1442                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1443                 }
1444             }
1445         } else if ( pr->nomerge ) {
1446             kmp_int32 last;
1447             T         start;
1448             UT        limit, trip, init;
1449             ST        incr;
1450             T         chunk = pr->u.p.parm1;
1451 
1452             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1453 
1454             init = chunk * pr->u.p.count++;
1455             trip = pr->u.p.tc - 1;
1456 
1457             if ( (status = (init <= trip)) == 0 ) {
1458                 *p_lb = 0;
1459                 *p_ub = 0;
1460 //                if ( p_last != NULL )
1461 //                    *p_last = 0;
1462                 if ( p_st != NULL )
1463                     *p_st = 0;
1464                 if ( __kmp_env_consistency_check ) {
1465                     if ( pr->pushed_ws != ct_none ) {
1466                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1467                     }
1468                 }
1469             } else {
1470                 start = pr->u.p.lb;
1471                 limit = chunk + init - 1;
1472                 incr  = pr->u.p.st;
1473 
1474                 if ( (last = (limit >= trip)) != 0 ) {
1475                     limit = trip;
1476                     #if KMP_OS_WINDOWS
1477                     pr->u.p.last_upper = pr->u.p.ub;
1478                     #endif /* KMP_OS_WINDOWS */
1479                 }
1480                 if ( p_last != NULL )
1481                     *p_last = last;
1482                 if ( p_st != NULL )
1483                     *p_st = incr;
1484                 if ( incr == 1 ) {
1485                     *p_lb = start + init;
1486                     *p_ub = start + limit;
1487                 } else {
1488                     *p_lb = start + init * incr;
1489                     *p_ub = start + limit * incr;
1490                 }
1491 
1492                 if ( pr->ordered ) {
1493                     pr->u.p.ordered_lower = init;
1494                     pr->u.p.ordered_upper = limit;
1495                     #ifdef KMP_DEBUG
1496                     {
1497                         const char * buff;
1498                         // create format specifiers before the debug output
1499                         buff = __kmp_str_format(
1500                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1501                             traits_t< UT >::spec, traits_t< UT >::spec );
1502                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1503                         __kmp_str_free( &buff );
1504                     }
1505                     #endif
1506                 } // if
1507             } // if
1508         } else {
1509             pr->u.p.tc = 0;
1510             *p_lb = pr->u.p.lb;
1511             *p_ub = pr->u.p.ub;
1512             #if KMP_OS_WINDOWS
1513             pr->u.p.last_upper = *p_ub;
1514             #endif /* KMP_OS_WINDOWS */
1515             if ( p_last != NULL )
1516                 *p_last = TRUE;
1517             if ( p_st != NULL )
1518                 *p_st = pr->u.p.st;
1519         } // if
1520         #ifdef KMP_DEBUG
1521         {
1522             const char * buff;
1523             // create format specifiers before the debug output
1524             buff = __kmp_str_format(
1525                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1526                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1527                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1528             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1529             __kmp_str_free( &buff );
1530         }
1531         #endif
1532 #if INCLUDE_SSC_MARKS
1533         SSC_MARK_DISPATCH_NEXT();
1534 #endif
1535         OMPT_LOOP_END;
1536         return status;
1537     } else {
1538         kmp_int32 last = 0;
1539         dispatch_shared_info_template< UT > *sh;
1540         T         start;
1541         ST        incr;
1542         UT        limit, trip, init;
1543 
1544         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1545                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1546 
1547         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1548             ( th->th.th_dispatch->th_dispatch_pr_current );
1549         KMP_DEBUG_ASSERT( pr );
1550         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1551             ( th->th.th_dispatch->th_dispatch_sh_current );
1552         KMP_DEBUG_ASSERT( sh );
1553 
1554         if ( pr->u.p.tc == 0 ) {
1555             // zero trip count
1556             status = 0;
1557         } else {
1558             switch (pr->schedule) {
1559             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1560             case kmp_sch_static_steal:
1561                 {
1562                     T chunk = pr->u.p.parm1;
1563 
1564                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1565 
1566                     trip = pr->u.p.tc - 1;
1567 
1568                     if ( ___kmp_size_type > 4 ) {
1569                         // Other threads do not look into the data of this thread,
1570                         //  so it's not necessary to make volatile casting.
1571                         init   = ( pr->u.p.count )++;
1572                         status = ( init < (UT)pr->u.p.ub );
1573                     } else {
1574                         typedef union {
1575                             struct {
1576                                 UT count;
1577                                 T  ub;
1578                             } p;
1579                             kmp_int64 b;
1580                         } union_i4;
1581                         // All operations on 'count' or 'ub' must be combined atomically together.
1582                         // stealing implemented only for 4-byte indexes
1583                         {
1584                             union_i4 vold, vnew;
1585                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1586                             vnew = vold;
1587                             vnew.p.count++;
1588                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1589                                         ( volatile kmp_int64* )&pr->u.p.count,
1590                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1591                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1592                                 KMP_CPU_PAUSE();
1593                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1594                                 vnew = vold;
1595                                 vnew.p.count++;
1596                             }
1597                             vnew = vold;
1598                             init   = vnew.p.count;
1599                             status = ( init < (UT)vnew.p.ub ) ;
1600                         }
1601 
1602                         if( !status ) {
1603                             kmp_info_t   **other_threads = team->t.t_threads;
1604                             int          while_limit = 10;
1605                             int          while_index = 0;
1606 
1607                             // TODO: algorithm of searching for a victim
1608                             // should be cleaned up and measured
1609                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1610                                 union_i4  vold, vnew;
1611                                 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1612                                 T         victimIdx    = pr->u.p.parm4;
1613                                 T         oldVictimIdx = victimIdx;
1614                                 dispatch_private_info_template< T > * victim;
1615 
1616                                 do {
1617                                     if( !victimIdx ) {
1618                                         victimIdx = team->t.t_nproc - 1;
1619                                     } else {
1620                                         --victimIdx;
1621                                     }
1622                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1623                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1624                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1625                                 // TODO: think about a proper place of this test
1626                                 if ( ( !victim ) ||
1627                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1628                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1629                                     // TODO: delay would be nice
1630                                     continue;
1631                                     // the victim is not ready yet to participate in stealing
1632                                     // because the victim is still in kmp_init_dispatch
1633                                 }
1634                                 if ( oldVictimIdx == victimIdx ) {
1635                                     break;
1636                                 }
1637                                 pr->u.p.parm4 = victimIdx;
1638 
1639                                 while( 1 ) {
1640                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1641                                     vnew = vold;
1642 
1643                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1644                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1645                                         break;
1646                                     }
1647                                     vnew.p.ub -= (remaining >> 2);
1648                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1649                                     #pragma warning( push )
1650                                     // disable warning on pointless comparison of unsigned with 0
1651                                     #pragma warning( disable: 186 )
1652                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1653                                     #pragma warning( pop )
1654                                     // TODO: Should this be acquire or release?
1655                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1656                                             ( volatile kmp_int64 * )&victim->u.p.count,
1657                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1658                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1659                                         status = 1;
1660                                         while_index = 0;
1661                                         // now update own count and ub
1662                                         #if KMP_ARCH_X86
1663                                         // stealing executed on non-KMP_ARCH_X86 only
1664                                             // Atomic 64-bit write on ia32 is
1665                                             // unavailable, so we do this in steps.
1666                                             //     This code is not tested.
1667                                             init = vold.p.count;
1668                                             pr->u.p.ub = 0;
1669                                             pr->u.p.count = init + 1;
1670                                             pr->u.p.ub = vnew.p.count;
1671                                         #else
1672                                             init = vnew.p.ub;
1673                                             vold.p.count = init + 1;
1674                                             // TODO: is it safe and enough?
1675                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1676                                         #endif // KMP_ARCH_X86
1677                                         break;
1678                                     } // if
1679                                 KMP_CPU_PAUSE();
1680                                 } // while (1)
1681                             } // while
1682                         } // if
1683                     } // if
1684                     if ( !status ) {
1685                         *p_lb = 0;
1686                         *p_ub = 0;
1687                         if ( p_st != NULL ) *p_st = 0;
1688                     } else {
1689                         start = pr->u.p.parm2;
1690                         init *= chunk;
1691                         limit = chunk + init - 1;
1692                         incr  = pr->u.p.st;
1693 
1694                         KMP_DEBUG_ASSERT(init <= trip);
1695                         if ( (last = (limit >= trip)) != 0 )
1696                             limit = trip;
1697                         if ( p_st != NULL ) *p_st = incr;
1698 
1699                         if ( incr == 1 ) {
1700                             *p_lb = start + init;
1701                             *p_ub = start + limit;
1702                         } else {
1703                             *p_lb = start + init * incr;
1704                             *p_ub = start + limit * incr;
1705                         }
1706 
1707                         if ( pr->ordered ) {
1708                             pr->u.p.ordered_lower = init;
1709                             pr->u.p.ordered_upper = limit;
1710                             #ifdef KMP_DEBUG
1711                             {
1712                                 const char * buff;
1713                                 // create format specifiers before the debug output
1714                                 buff = __kmp_str_format(
1715                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1716                                     traits_t< UT >::spec, traits_t< UT >::spec );
1717                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1718                                 __kmp_str_free( &buff );
1719                             }
1720                             #endif
1721                         } // if
1722                     } // if
1723                     break;
1724                 } // case
1725             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1726             case kmp_sch_static_balanced:
1727                 {
1728                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1729                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1730                         pr->u.p.count = 1;
1731                         *p_lb = pr->u.p.lb;
1732                         *p_ub = pr->u.p.ub;
1733                         last = pr->u.p.parm1;
1734                         if ( p_st != NULL )
1735                             *p_st = pr->u.p.st;
1736                     } else {  /* no iterations to do */
1737                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1738                     }
1739                     if ( pr->ordered ) {
1740                         #ifdef KMP_DEBUG
1741                         {
1742                             const char * buff;
1743                             // create format specifiers before the debug output
1744                             buff = __kmp_str_format(
1745                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1746                                 traits_t< UT >::spec, traits_t< UT >::spec );
1747                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1748                             __kmp_str_free( &buff );
1749                         }
1750                         #endif
1751                     } // if
1752                 } // case
1753                 break;
1754             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1755             case kmp_sch_static_chunked:
1756                 {
1757                     T parm1;
1758 
1759                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1760                                    gtid ) );
1761                     parm1 = pr->u.p.parm1;
1762 
1763                     trip  = pr->u.p.tc - 1;
1764                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1765 
1766                     if ( (status = (init <= trip)) != 0 ) {
1767                         start = pr->u.p.lb;
1768                         incr  = pr->u.p.st;
1769                         limit = parm1 + init - 1;
1770 
1771                         if ( (last = (limit >= trip)) != 0 )
1772                             limit = trip;
1773 
1774                         if ( p_st != NULL ) *p_st = incr;
1775 
1776                         pr->u.p.count += team->t.t_nproc;
1777 
1778                         if ( incr == 1 ) {
1779                             *p_lb = start + init;
1780                             *p_ub = start + limit;
1781                         }
1782                         else {
1783                             *p_lb = start + init * incr;
1784                             *p_ub = start + limit * incr;
1785                         }
1786 
1787                         if ( pr->ordered ) {
1788                             pr->u.p.ordered_lower = init;
1789                             pr->u.p.ordered_upper = limit;
1790                             #ifdef KMP_DEBUG
1791                             {
1792                                 const char * buff;
1793                                 // create format specifiers before the debug output
1794                                 buff = __kmp_str_format(
1795                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1796                                     traits_t< UT >::spec, traits_t< UT >::spec );
1797                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1798                                 __kmp_str_free( &buff );
1799                             }
1800                             #endif
1801                         } // if
1802                     } // if
1803                 } // case
1804                 break;
1805 
1806             case kmp_sch_dynamic_chunked:
1807                 {
1808                     T chunk = pr->u.p.parm1;
1809 
1810                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1811                                    gtid ) );
1812 
1813                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1814                     trip = pr->u.p.tc - 1;
1815 
1816                     if ( (status = (init <= trip)) == 0 ) {
1817                         *p_lb = 0;
1818                         *p_ub = 0;
1819                         if ( p_st != NULL ) *p_st = 0;
1820                     } else {
1821                         start = pr->u.p.lb;
1822                         limit = chunk + init - 1;
1823                         incr  = pr->u.p.st;
1824 
1825                         if ( (last = (limit >= trip)) != 0 )
1826                             limit = trip;
1827 
1828                         if ( p_st != NULL ) *p_st = incr;
1829 
1830                         if ( incr == 1 ) {
1831                             *p_lb = start + init;
1832                             *p_ub = start + limit;
1833                         } else {
1834                             *p_lb = start + init * incr;
1835                             *p_ub = start + limit * incr;
1836                         }
1837 
1838                         if ( pr->ordered ) {
1839                             pr->u.p.ordered_lower = init;
1840                             pr->u.p.ordered_upper = limit;
1841                             #ifdef KMP_DEBUG
1842                             {
1843                                 const char * buff;
1844                                 // create format specifiers before the debug output
1845                                 buff = __kmp_str_format(
1846                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1847                                     traits_t< UT >::spec, traits_t< UT >::spec );
1848                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1849                                 __kmp_str_free( &buff );
1850                             }
1851                             #endif
1852                         } // if
1853                     } // if
1854                 } // case
1855                 break;
1856 
1857             case kmp_sch_guided_iterative_chunked:
1858                 {
1859                     T  chunkspec = pr->u.p.parm1;
1860                     KD_TRACE(100,
1861                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1862                     trip  = pr->u.p.tc;
1863                     // Start atomic part of calculations
1864                     while(1) {
1865                         ST  remaining;             // signed, because can be < 0
1866                         init = sh->u.s.iteration;  // shared value
1867                         remaining = trip - init;
1868                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1869                             // nothing to do, don't try atomic op
1870                             status = 0;
1871                             break;
1872                         }
1873                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1874                             // use dynamic-style shcedule
1875                             // atomically inrement iterations, get old value
1876                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1877                             remaining = trip - init;
1878                             if (remaining <= 0) {
1879                                 status = 0;    // all iterations got by other threads
1880                             } else {
1881                                 // got some iterations to work on
1882                                 status = 1;
1883                                 if ( (T)remaining > chunkspec ) {
1884                                     limit = init + chunkspec - 1;
1885                                 } else {
1886                                     last = 1;   // the last chunk
1887                                     limit = init + remaining - 1;
1888                                 } // if
1889                             } // if
1890                             break;
1891                         } // if
1892                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1893                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1894                             // CAS was successful, chunk obtained
1895                             status = 1;
1896                             --limit;
1897                             break;
1898                         } // if
1899                     } // while
1900                     if ( status != 0 ) {
1901                         start = pr->u.p.lb;
1902                         incr = pr->u.p.st;
1903                         if ( p_st != NULL )
1904                             *p_st = incr;
1905                         *p_lb = start + init * incr;
1906                         *p_ub = start + limit * incr;
1907                         if ( pr->ordered ) {
1908                             pr->u.p.ordered_lower = init;
1909                             pr->u.p.ordered_upper = limit;
1910                             #ifdef KMP_DEBUG
1911                             {
1912                                 const char * buff;
1913                                 // create format specifiers before the debug output
1914                                 buff = __kmp_str_format(
1915                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1916                                     traits_t< UT >::spec, traits_t< UT >::spec );
1917                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1918                                 __kmp_str_free( &buff );
1919                             }
1920                             #endif
1921                         } // if
1922                     } else {
1923                         *p_lb = 0;
1924                         *p_ub = 0;
1925                         if ( p_st != NULL )
1926                             *p_st = 0;
1927                     } // if
1928                 } // case
1929                 break;
1930 
1931             case kmp_sch_guided_analytical_chunked:
1932                 {
1933                     T   chunkspec = pr->u.p.parm1;
1934                     UT chunkIdx;
1935     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1936                     /* for storing original FPCW value for Windows* OS on
1937 		       IA-32 architecture 8-byte version */
1938                     unsigned int oldFpcw;
1939                     unsigned int fpcwSet = 0;
1940     #endif
1941                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1942                                    gtid ) );
1943 
1944                     trip  = pr->u.p.tc;
1945 
1946                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1947                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1948 
1949                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1950                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1951                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1952                             --trip;
1953                             /* use dynamic-style scheduling */
1954                             init = chunkIdx * chunkspec + pr->u.p.count;
1955                             /* need to verify init > 0 in case of overflow in the above calculation */
1956                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1957                                 limit = init + chunkspec -1;
1958 
1959                                 if ( (last = (limit >= trip)) != 0 )
1960                                     limit = trip;
1961                             }
1962                             break;
1963                         } else {
1964                             /* use exponential-style scheduling */
1965                             /* The following check is to workaround the lack of long double precision on Windows* OS.
1966                                This check works around the possible effect that init != 0 for chunkIdx == 0.
1967                              */
1968     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1969                             /* If we haven't already done so, save original
1970 			       FPCW and set precision to 64-bit, as Windows* OS
1971 			       on IA-32 architecture defaults to 53-bit */
1972                             if ( !fpcwSet ) {
1973                                 oldFpcw = _control87(0,0);
1974                                 _control87(_PC_64,_MCW_PC);
1975                                 fpcwSet = 0x30000;
1976                             }
1977     #endif
1978                             if ( chunkIdx ) {
1979                                 init = __kmp_dispatch_guided_remaining< T >(
1980                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1981                                 KMP_DEBUG_ASSERT(init);
1982                                 init = trip - init;
1983                             } else
1984                                 init = 0;
1985                             limit = trip - __kmp_dispatch_guided_remaining< T >(
1986                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1987                             KMP_ASSERT(init <= limit);
1988                             if ( init < limit ) {
1989                                 KMP_DEBUG_ASSERT(limit <= trip);
1990                                 --limit;
1991                                 status = 1;
1992                                 break;
1993                             } // if
1994                         } // if
1995                     } // while (1)
1996     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1997                     /* restore FPCW if necessary
1998                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1999                     */
2000                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2001                         _control87(oldFpcw,_MCW_PC);
2002     #endif
2003                     if ( status != 0 ) {
2004                         start = pr->u.p.lb;
2005                         incr = pr->u.p.st;
2006                         if ( p_st != NULL )
2007                             *p_st = incr;
2008                         *p_lb = start + init * incr;
2009                         *p_ub = start + limit * incr;
2010                         if ( pr->ordered ) {
2011                             pr->u.p.ordered_lower = init;
2012                             pr->u.p.ordered_upper = limit;
2013                             #ifdef KMP_DEBUG
2014                             {
2015                                 const char * buff;
2016                                 // create format specifiers before the debug output
2017                                 buff = __kmp_str_format(
2018                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2019                                     traits_t< UT >::spec, traits_t< UT >::spec );
2020                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2021                                 __kmp_str_free( &buff );
2022                             }
2023                             #endif
2024                         }
2025                     } else {
2026                         *p_lb = 0;
2027                         *p_ub = 0;
2028                         if ( p_st != NULL )
2029                             *p_st = 0;
2030                     }
2031                 } // case
2032                 break;
2033 
2034             case kmp_sch_trapezoidal:
2035                 {
2036                     UT   index;
2037                     T    parm2 = pr->u.p.parm2;
2038                     T    parm3 = pr->u.p.parm3;
2039                     T    parm4 = pr->u.p.parm4;
2040                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2041                                    gtid ) );
2042 
2043                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2044 
2045                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2046                     trip = pr->u.p.tc - 1;
2047 
2048                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2049                         *p_lb = 0;
2050                         *p_ub = 0;
2051                         if ( p_st != NULL ) *p_st = 0;
2052                     } else {
2053                         start = pr->u.p.lb;
2054                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2055                         incr  = pr->u.p.st;
2056 
2057                         if ( (last = (limit >= trip)) != 0 )
2058                             limit = trip;
2059 
2060                         if ( p_st != NULL ) *p_st = incr;
2061 
2062                         if ( incr == 1 ) {
2063                             *p_lb = start + init;
2064                             *p_ub = start + limit;
2065                         } else {
2066                             *p_lb = start + init * incr;
2067                             *p_ub = start + limit * incr;
2068                         }
2069 
2070                         if ( pr->ordered ) {
2071                             pr->u.p.ordered_lower = init;
2072                             pr->u.p.ordered_upper = limit;
2073                             #ifdef KMP_DEBUG
2074                             {
2075                                 const char * buff;
2076                                 // create format specifiers before the debug output
2077                                 buff = __kmp_str_format(
2078                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2079                                     traits_t< UT >::spec, traits_t< UT >::spec );
2080                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2081                                 __kmp_str_free( &buff );
2082                             }
2083                             #endif
2084                         } // if
2085                     } // if
2086                 } // case
2087                 break;
2088             default:
2089                 {
2090                     status = 0; // to avoid complaints on uninitialized variable use
2091                     __kmp_msg(
2092                         kmp_ms_fatal,                        // Severity
2093                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2094                         KMP_HNT( GetNewerLibrary ),          // Hint
2095                         __kmp_msg_null                       // Variadic argument list terminator
2096                     );
2097                 }
2098                 break;
2099             } // switch
2100         } // if tc == 0;
2101 
2102         if ( status == 0 ) {
2103             UT   num_done;
2104 
2105             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2106             #ifdef KMP_DEBUG
2107             {
2108                 const char * buff;
2109                 // create format specifiers before the debug output
2110                 buff = __kmp_str_format(
2111                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2112                     traits_t< UT >::spec );
2113                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2114                 __kmp_str_free( &buff );
2115             }
2116             #endif
2117 
2118             if ( (ST)num_done == team->t.t_nproc-1 ) {
2119                 /* NOTE: release this buffer to be reused */
2120 
2121                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2122 
2123                 sh->u.s.num_done = 0;
2124                 sh->u.s.iteration = 0;
2125 
2126                 /* TODO replace with general release procedure? */
2127                 if ( pr->ordered ) {
2128                     sh->u.s.ordered_iteration = 0;
2129                 }
2130 
2131                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2132 
2133                 sh -> buffer_index += KMP_MAX_DISP_BUF;
2134                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2135                                 gtid, sh->buffer_index) );
2136 
2137                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2138 
2139             } // if
2140             if ( __kmp_env_consistency_check ) {
2141                 if ( pr->pushed_ws != ct_none ) {
2142                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2143                 }
2144             }
2145 
2146             th -> th.th_dispatch -> th_deo_fcn = NULL;
2147             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2148             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2149             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2150         } // if (status == 0)
2151 #if KMP_OS_WINDOWS
2152         else if ( last ) {
2153             pr->u.p.last_upper = pr->u.p.ub;
2154         }
2155 #endif /* KMP_OS_WINDOWS */
2156         if ( p_last != NULL && status != 0 )
2157             *p_last = last;
2158     } // if
2159 
2160     #ifdef KMP_DEBUG
2161     {
2162         const char * buff;
2163         // create format specifiers before the debug output
2164         buff = __kmp_str_format(
2165             "__kmp_dispatch_next: T#%%d normal case: " \
2166             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2167             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2168         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2169         __kmp_str_free( &buff );
2170     }
2171     #endif
2172 #if INCLUDE_SSC_MARKS
2173     SSC_MARK_DISPATCH_NEXT();
2174 #endif
2175     OMPT_LOOP_END;
2176     return status;
2177 }
2178 
2179 template< typename T >
2180 static void
2181 __kmp_dist_get_bounds(
2182     ident_t                          *loc,
2183     kmp_int32                         gtid,
2184     kmp_int32                        *plastiter,
2185     T                                *plower,
2186     T                                *pupper,
2187     typename traits_t< T >::signed_t  incr
2188 ) {
2189     typedef typename traits_t< T >::unsigned_t  UT;
2190     typedef typename traits_t< T >::signed_t    ST;
2191     register kmp_uint32  team_id;
2192     register kmp_uint32  nteams;
2193     register UT          trip_count;
2194     register kmp_team_t *team;
2195     kmp_info_t * th;
2196 
2197     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2198     KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2199     #ifdef KMP_DEBUG
2200     {
2201         const char * buff;
2202         // create format specifiers before the debug output
2203         buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2204             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2205             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2206             traits_t< T >::spec );
2207         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2208         __kmp_str_free( &buff );
2209     }
2210     #endif
2211 
2212     if( __kmp_env_consistency_check ) {
2213         if( incr == 0 ) {
2214             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2215         }
2216         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2217             // The loop is illegal.
2218             // Some zero-trip loops maintained by compiler, e.g.:
2219             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2220             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2221             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2222             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2223             // Compiler does not check the following illegal loops:
2224             //   for(i=0;i<10;i+=incr) // where incr<0
2225             //   for(i=10;i>0;i-=incr) // where incr<0
2226             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2227         }
2228     }
2229     th = __kmp_threads[gtid];
2230     team = th->th.th_team;
2231     #if OMP_40_ENABLED
2232     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2233     nteams = th->th.th_teams_size.nteams;
2234     #endif
2235     team_id = team->t.t_master_tid;
2236     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2237 
2238     // compute global trip count
2239     if( incr == 1 ) {
2240         trip_count = *pupper - *plower + 1;
2241     } else if(incr == -1) {
2242         trip_count = *plower - *pupper + 1;
2243     } else {
2244         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2245     }
2246 
2247     if( trip_count <= nteams ) {
2248         KMP_DEBUG_ASSERT(
2249             __kmp_static == kmp_sch_static_greedy || \
2250             __kmp_static == kmp_sch_static_balanced
2251         ); // Unknown static scheduling type.
2252         // only some teams get single iteration, others get nothing
2253         if( team_id < trip_count ) {
2254             *pupper = *plower = *plower + team_id * incr;
2255         } else {
2256             *plower = *pupper + incr; // zero-trip loop
2257         }
2258         if( plastiter != NULL )
2259             *plastiter = ( team_id == trip_count - 1 );
2260     } else {
2261         if( __kmp_static == kmp_sch_static_balanced ) {
2262             register UT chunk = trip_count / nteams;
2263             register UT extras = trip_count % nteams;
2264             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2265             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2266             if( plastiter != NULL )
2267                 *plastiter = ( team_id == nteams - 1 );
2268         } else {
2269             register T chunk_inc_count =
2270                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2271             register T upper = *pupper;
2272             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2273                 // Unknown static scheduling type.
2274             *plower += team_id * chunk_inc_count;
2275             *pupper = *plower + chunk_inc_count - incr;
2276             // Check/correct bounds if needed
2277             if( incr > 0 ) {
2278                 if( *pupper < *plower )
2279                     *pupper = i_maxmin< T >::mx;
2280                 if( plastiter != NULL )
2281                     *plastiter = *plower <= upper && *pupper > upper - incr;
2282                 if( *pupper > upper )
2283                     *pupper = upper; // tracker C73258
2284             } else {
2285                 if( *pupper > *plower )
2286                     *pupper = i_maxmin< T >::mn;
2287                 if( plastiter != NULL )
2288                     *plastiter = *plower >= upper && *pupper < upper - incr;
2289                 if( *pupper < upper )
2290                     *pupper = upper; // tracker C73258
2291             }
2292         }
2293     }
2294 }
2295 
2296 //-----------------------------------------------------------------------------------------
2297 // Dispatch routines
2298 //    Transfer call to template< type T >
2299 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2300 //                         T lb, T ub, ST st, ST chunk )
2301 extern "C" {
2302 
2303 /*!
2304 @ingroup WORK_SHARING
2305 @{
2306 @param loc Source location
2307 @param gtid Global thread id
2308 @param schedule Schedule type
2309 @param lb  Lower bound
2310 @param ub  Upper bound
2311 @param st  Step (or increment if you prefer)
2312 @param chunk The chunk size to block with
2313 
2314 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2315 These functions are all identical apart from the types of the arguments.
2316 */
2317 
2318 void
2319 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2320                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2321 {
2322     KMP_DEBUG_ASSERT( __kmp_init_serial );
2323     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2324 }
2325 /*!
2326 See @ref __kmpc_dispatch_init_4
2327 */
2328 void
2329 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2330                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2331 {
2332     KMP_DEBUG_ASSERT( __kmp_init_serial );
2333     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2334 }
2335 
2336 /*!
2337 See @ref __kmpc_dispatch_init_4
2338 */
2339 void
2340 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2341                         kmp_int64 lb, kmp_int64 ub,
2342                         kmp_int64 st, kmp_int64 chunk )
2343 {
2344     KMP_DEBUG_ASSERT( __kmp_init_serial );
2345     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2346 }
2347 
2348 /*!
2349 See @ref __kmpc_dispatch_init_4
2350 */
2351 void
2352 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2353                          kmp_uint64 lb, kmp_uint64 ub,
2354                          kmp_int64 st, kmp_int64 chunk )
2355 {
2356     KMP_DEBUG_ASSERT( __kmp_init_serial );
2357     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2358 }
2359 
2360 /*!
2361 See @ref __kmpc_dispatch_init_4
2362 
2363 Difference from __kmpc_dispatch_init set of functions is these functions
2364 are called for composite distribute parallel for construct. Thus before
2365 regular iterations dispatching we need to calc per-team iteration space.
2366 
2367 These functions are all identical apart from the types of the arguments.
2368 */
2369 void
2370 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2371     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2372 {
2373     KMP_DEBUG_ASSERT( __kmp_init_serial );
2374     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2375     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2376 }
2377 
2378 void
2379 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2380     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2381 {
2382     KMP_DEBUG_ASSERT( __kmp_init_serial );
2383     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2384     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2385 }
2386 
2387 void
2388 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2389     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2390 {
2391     KMP_DEBUG_ASSERT( __kmp_init_serial );
2392     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2393     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2394 }
2395 
2396 void
2397 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2398     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2399 {
2400     KMP_DEBUG_ASSERT( __kmp_init_serial );
2401     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2402     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2403 }
2404 
2405 /*!
2406 @param loc Source code location
2407 @param gtid Global thread id
2408 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2409 @param p_lb   Pointer to the lower bound for the next chunk of work
2410 @param p_ub   Pointer to the upper bound for the next chunk of work
2411 @param p_st   Pointer to the stride for the next chunk of work
2412 @return one if there is work to be done, zero otherwise
2413 
2414 Get the next dynamically allocated chunk of work for this thread.
2415 If there is no more work, then the lb,ub and stride need not be modified.
2416 */
2417 int
2418 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2419                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2420 {
2421     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2422 }
2423 
2424 /*!
2425 See @ref __kmpc_dispatch_next_4
2426 */
2427 int
2428 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2429                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2430 {
2431     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2432 }
2433 
2434 /*!
2435 See @ref __kmpc_dispatch_next_4
2436 */
2437 int
2438 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2439                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2440 {
2441     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2442 }
2443 
2444 /*!
2445 See @ref __kmpc_dispatch_next_4
2446 */
2447 int
2448 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2449                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2450 {
2451     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2452 }
2453 
2454 /*!
2455 @param loc Source code location
2456 @param gtid Global thread id
2457 
2458 Mark the end of a dynamic loop.
2459 */
2460 void
2461 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2462 {
2463     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2464 }
2465 
2466 /*!
2467 See @ref __kmpc_dispatch_fini_4
2468 */
2469 void
2470 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2471 {
2472     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2473 }
2474 
2475 /*!
2476 See @ref __kmpc_dispatch_fini_4
2477 */
2478 void
2479 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2480 {
2481     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2482 }
2483 
2484 /*!
2485 See @ref __kmpc_dispatch_fini_4
2486 */
2487 void
2488 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2489 {
2490     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2491 }
2492 /*! @} */
2493 
2494 //-----------------------------------------------------------------------------------------
2495 //Non-template routines from kmp_dispatch.c used in other sources
2496 
2497 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2498     return value == checker;
2499 }
2500 
2501 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2502     return value != checker;
2503 }
2504 
2505 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2506     return value < checker;
2507 }
2508 
2509 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2510     return value >= checker;
2511 }
2512 
2513 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2514     return value <= checker;
2515 }
2516 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2517     return value == checker;
2518 }
2519 
2520 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2521     return value != checker;
2522 }
2523 
2524 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2525     return value < checker;
2526 }
2527 
2528 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2529     return value >= checker;
2530 }
2531 
2532 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2533     return value <= checker;
2534 }
2535 
2536 kmp_uint32
2537 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2538                    kmp_uint32            checker,
2539                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2540                    , void        * obj    // Higher-level synchronization object, or NULL.
2541                    )
2542 {
2543     // note: we may not belong to a team at this point
2544     register volatile kmp_uint32         * spin          = spinner;
2545     register          kmp_uint32           check         = checker;
2546     register          kmp_uint32   spins;
2547     register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2548     register          kmp_uint32           r;
2549 
2550     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2551     KMP_INIT_YIELD( spins );
2552     // main wait spin loop
2553     while(!f(r = TCR_4(*spin), check)) {
2554         KMP_FSYNC_SPIN_PREPARE( obj );
2555         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2556            It causes problems with infinite recursion because of exit lock */
2557         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2558             __kmp_abort_thread(); */
2559 
2560         /* if we have waited a bit, or are oversubscribed, yield */
2561         /* pause is in the following code */
2562         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2563         KMP_YIELD_SPIN( spins );
2564     }
2565     KMP_FSYNC_SPIN_ACQUIRED( obj );
2566     return r;
2567 }
2568 
2569 kmp_uint64
2570 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2571                     kmp_uint64            checker,
2572                     kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2573                     , void        * obj    // Higher-level synchronization object, or NULL.
2574                     )
2575 {
2576     // note: we may not belong to a team at this point
2577     register volatile kmp_uint64         * spin          = spinner;
2578     register          kmp_uint64           check         = checker;
2579     register          kmp_uint32   spins;
2580     register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2581     register          kmp_uint64           r;
2582 
2583     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2584     KMP_INIT_YIELD( spins );
2585     // main wait spin loop
2586     while(!f(r = *spin, check))
2587     {
2588         KMP_FSYNC_SPIN_PREPARE( obj );
2589         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2590            It causes problems with infinite recursion because of exit lock */
2591         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2592             __kmp_abort_thread(); */
2593 
2594         // if we are oversubscribed,
2595         // or have waited a bit (and KMP_LIBARRY=throughput, then yield
2596         // pause is in the following code
2597         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2598         KMP_YIELD_SPIN( spins );
2599     }
2600     KMP_FSYNC_SPIN_ACQUIRED( obj );
2601     return r;
2602 }
2603 
2604 } // extern "C"
2605 
2606 #ifdef KMP_GOMP_COMPAT
2607 
2608 void
2609 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2610                            kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2611                            kmp_int32 chunk, int push_ws )
2612 {
2613     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2614                                       push_ws );
2615 }
2616 
2617 void
2618 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2619                             kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2620                             kmp_int32 chunk, int push_ws )
2621 {
2622     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2623                                        push_ws );
2624 }
2625 
2626 void
2627 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2628                            kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2629                            kmp_int64 chunk, int push_ws )
2630 {
2631     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2632                                       push_ws );
2633 }
2634 
2635 void
2636 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2637                             kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2638                             kmp_int64 chunk, int push_ws )
2639 {
2640     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2641                                        push_ws );
2642 }
2643 
2644 void
2645 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2646 {
2647     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2648 }
2649 
2650 void
2651 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2652 {
2653     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2654 }
2655 
2656 void
2657 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2658 {
2659     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2660 }
2661 
2662 void
2663 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2664 {
2665     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2666 }
2667 
2668 #endif /* KMP_GOMP_COMPAT */
2669 
2670 /* ------------------------------------------------------------------------ */
2671 /* ------------------------------------------------------------------------ */
2672 
2673