1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
 *       it may change between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take; 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 #include "kmp.h"
29 #include "kmp_i18n.h"
30 #include "kmp_itt.h"
31 #include "kmp_str.h"
32 #include "kmp_error.h"
33 #include "kmp_stats.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35     #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-internal.h"
40 #include "ompt-specific.h"
41 #endif
42 
43 /* ------------------------------------------------------------------------ */
44 /* ------------------------------------------------------------------------ */
45 
46 // template for type limits
47 template< typename T >
48 struct i_maxmin {
49     static const T mx;
50     static const T mn;
51 };
52 template<>
53 struct i_maxmin< int > {
54     static const int mx = 0x7fffffff;
55     static const int mn = 0x80000000;
56 };
57 template<>
58 struct i_maxmin< unsigned int > {
59     static const unsigned int mx = 0xffffffff;
60     static const unsigned int mn = 0x00000000;
61 };
62 template<>
63 struct i_maxmin< long long > {
64     static const long long mx = 0x7fffffffffffffffLL;
65     static const long long mn = 0x8000000000000000LL;
66 };
67 template<>
68 struct i_maxmin< unsigned long long > {
69     static const unsigned long long mx = 0xffffffffffffffffLL;
70     static const unsigned long long mn = 0x0000000000000000LL;
71 };
72 //-------------------------------------------------------------------------
73 
74 #ifdef KMP_STATIC_STEAL_ENABLED
75 
76     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77     template< typename T >
78     struct dispatch_private_infoXX_template {
79         typedef typename traits_t< T >::unsigned_t  UT;
80         typedef typename traits_t< T >::signed_t    ST;
81         UT count;                // unsigned
82         T  ub;
83         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84         T  lb;
85         ST st;                   // signed
86         UT tc;                   // unsigned
        T  static_steal_counter; // for static_steal only; may be better placed after ub
88 
89         /* parm[1-4] are used in different ways by different scheduling algorithms */
90 
91         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92         //    a) parm3 is properly aligned and
93         //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        //    if they are in the same cache line (not measured, though).
96 
97         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98             T  parm1;
99             T  parm2;
100             T  parm3;
101             T  parm4;
102         };
103 
104         UT ordered_lower; // unsigned
105         UT ordered_upper; // unsigned
106         #if KMP_OS_WINDOWS
107         T  last_upper;
108         #endif /* KMP_OS_WINDOWS */
109     };
110 
111 #else /* KMP_STATIC_STEAL_ENABLED */
112 
113     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114     template< typename T >
115     struct dispatch_private_infoXX_template {
116         typedef typename traits_t< T >::unsigned_t  UT;
117         typedef typename traits_t< T >::signed_t    ST;
118         T  lb;
119         T  ub;
120         ST st;            // signed
121         UT tc;            // unsigned
122 
123         T  parm1;
124         T  parm2;
125         T  parm3;
126         T  parm4;
127 
128         UT count;         // unsigned
129 
130         UT ordered_lower; // unsigned
131         UT ordered_upper; // unsigned
132         #if KMP_OS_WINDOWS
        T  last_upper;
134         #endif /* KMP_OS_WINDOWS */
135     };
136 
137 #endif /* KMP_STATIC_STEAL_ENABLED */
138 
139 // replaces dispatch_private_info structure and dispatch_private_info_t type
140 template< typename T >
141 struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate the alignment here, otherwise the size of the structure is not correct in our compiler
143     union KMP_ALIGN_CACHE private_info_tmpl {
144         dispatch_private_infoXX_template< T > p;
145         dispatch_private_info64_t             p64;
146     } u;
147     enum sched_type schedule;  /* scheduling algorithm */
148     kmp_uint32      ordered;   /* ordered clause specified */
149     kmp_uint32      ordered_bumped;
    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // padding to retain the structure size
151     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152     kmp_uint32      nomerge;   /* don't merge iters if serialized */
153     kmp_uint32      type_size;
154     enum cons_type  pushed_ws;
155 };
156 
157 
158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159 template< typename UT >
160 struct dispatch_shared_infoXX_template {
161     /* chunk index under dynamic, number of idle threads under static-steal;
162        iteration index otherwise */
163     volatile UT     iteration;
164     volatile UT     num_done;
165     volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-1]; // padding to retain the structure size after making ordered_iteration scalar
167 };
168 
169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
170 template< typename UT >
171 struct dispatch_shared_info_template {
    // we need a union here to keep the structure size
173     union shared_info_tmpl {
174         dispatch_shared_infoXX_template< UT >  s;
175         dispatch_shared_info64_t               s64;
176     } u;
177     volatile kmp_uint32     buffer_index;
178 };
179 
180 /* ------------------------------------------------------------------------ */
181 /* ------------------------------------------------------------------------ */
182 
183 #undef USE_TEST_LOCKS
184 
185 // test_then_add template (general template should NOT be used)
186 template< typename T >
187 static __forceinline T
test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }
189 
190 template<>
191 __forceinline kmp_int32
192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193 {
194     kmp_int32 r;
195     r = KMP_TEST_THEN_ADD32( p, d );
196     return r;
197 }
198 
199 template<>
200 __forceinline kmp_int64
201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202 {
203     kmp_int64 r;
204     r = KMP_TEST_THEN_ADD64( p, d );
205     return r;
206 }
207 
208 // test_then_inc_acq template (general template should NOT be used)
209 template< typename T >
210 static __forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }
212 
213 template<>
214 __forceinline kmp_int32
215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216 {
217     kmp_int32 r;
218     r = KMP_TEST_THEN_INC_ACQ32( p );
219     return r;
220 }
221 
222 template<>
223 __forceinline kmp_int64
224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225 {
226     kmp_int64 r;
227     r = KMP_TEST_THEN_INC_ACQ64( p );
228     return r;
229 }
230 
231 // test_then_inc template (general template should NOT be used)
232 template< typename T >
233 static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); }
235 
236 template<>
237 __forceinline kmp_int32
238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239 {
240     kmp_int32 r;
241     r = KMP_TEST_THEN_INC32( p );
242     return r;
243 }
244 
245 template<>
246 __forceinline kmp_int64
247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248 {
249     kmp_int64 r;
250     r = KMP_TEST_THEN_INC64( p );
251     return r;
252 }
253 
254 // compare_and_swap template (general template should NOT be used)
255 template< typename T >
256 static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }
258 
259 template<>
260 __forceinline kmp_int32
261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262 {
263     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264 }
265 
266 template<>
267 __forceinline kmp_int32
268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269 {
270     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271 }
272 
273 /*
274     Spin wait loop that first does pause, then yield.
275     Waits until function returns non-zero when called with *spinner and check.
276     Does NOT put threads to sleep.
277 #if USE_ITT_BUILD
278     Arguments:
        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
            address, not the address of the low-level spinner.
284 #endif // USE_ITT_BUILD
285 */
286 template< typename UT >
287 // ToDo: make inline function (move to header file for icl)
288 static UT  // unsigned 4- or 8-byte type
289 __kmp_wait_yield( volatile UT * spinner,
290                   UT            checker,
291                   kmp_uint32 (* pred)( UT, UT )
292                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
293                   )
294 {
295     // note: we may not belong to a team at this point
296     register volatile UT         * spin          = spinner;
297     register          UT           check         = checker;
298     register          kmp_uint32   spins;
299     register          kmp_uint32 (*f) ( UT, UT ) = pred;
300     register          UT           r;
301 
302     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303     KMP_INIT_YIELD( spins );
304     // main wait spin loop
305     while(!f(r = *spin, check))
306     {
307         KMP_FSYNC_SPIN_PREPARE( obj );
308         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309            It causes problems with infinite recursion because of exit lock */
310         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311             __kmp_abort_thread(); */
312 
        // If we are oversubscribed, or have waited a bit
        // (and KMP_LIBRARY=throughput), then yield.
        // The pause is in the following code.
316         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317         KMP_YIELD_SPIN( spins );
318     }
319     KMP_FSYNC_SPIN_ACQUIRED( obj );
320     return r;
321 }
322 
323 template< typename UT >
324 static kmp_uint32 __kmp_eq( UT value, UT checker) {
325     return value == checker;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_neq( UT value, UT checker) {
330     return value != checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_lt( UT value, UT checker) {
335     return value < checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_ge( UT value, UT checker) {
340     return value >= checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_le( UT value, UT checker) {
345     return value <= checker;
346 }
347 
348 
349 /* ------------------------------------------------------------------------ */
350 /* ------------------------------------------------------------------------ */
351 
352 static void
353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354 {
355     kmp_info_t *th;
356 
357     KMP_DEBUG_ASSERT( gtid_ref );
358 
359     if ( __kmp_env_consistency_check ) {
360         th = __kmp_threads[*gtid_ref];
361         if ( th -> th.th_root -> r.r_active
362           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
363 #if KMP_USE_DYNAMIC_LOCK
364             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365 #else
366             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
367 #endif
368         }
369     }
370 }
371 
372 template< typename UT >
373 static void
374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375 {
376     typedef typename traits_t< UT >::signed_t    ST;
377     dispatch_private_info_template< UT > * pr;
378 
379     int gtid = *gtid_ref;
380 //    int  cid = *cid_ref;
381     kmp_info_t *th = __kmp_threads[ gtid ];
382     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383 
384     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385     if ( __kmp_env_consistency_check ) {
386         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387             ( th -> th.th_dispatch -> th_dispatch_pr_current );
388         if ( pr -> pushed_ws != ct_none ) {
389 #if KMP_USE_DYNAMIC_LOCK
390             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391 #else
392             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
393 #endif
394         }
395     }
396 
397     if ( ! th -> th.th_team -> t.t_serialized ) {
398         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399             ( th -> th.th_dispatch -> th_dispatch_sh_current );
400         UT  lower;
401 
402         if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
405         }
406         lower = pr->u.p.ordered_lower;
407 
408         #if ! defined( KMP_GOMP_COMPAT )
409             if ( __kmp_env_consistency_check ) {
410                 if ( pr->ordered_bumped ) {
411                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412                     __kmp_error_construct2(
413                         kmp_i18n_msg_CnsMultipleNesting,
414                         ct_ordered_in_pdo, loc_ref,
415                         & p->stack_data[ p->w_top ]
416                     );
417                 }
418             }
419         #endif /* !defined(KMP_GOMP_COMPAT) */
420 
421         KMP_MB();
422         #ifdef KMP_DEBUG
423         {
424             const char * buff;
425             // create format specifiers before the debug output
426             buff = __kmp_str_format(
427                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428                 traits_t< UT >::spec, traits_t< UT >::spec );
429             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430             __kmp_str_free( &buff );
431         }
432         #endif
433 
434         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435                                 USE_ITT_BUILD_ARG( NULL )
436                                 );
437         KMP_MB();  /* is this necessary? */
438         #ifdef KMP_DEBUG
439         {
440             const char * buff;
441             // create format specifiers before the debug output
442             buff = __kmp_str_format(
443                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444                 traits_t< UT >::spec, traits_t< UT >::spec );
445             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446             __kmp_str_free( &buff );
447         }
448         #endif
449     }
450     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451 }
452 
453 static void
454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455 {
456     kmp_info_t *th;
457 
458     if ( __kmp_env_consistency_check ) {
459         th = __kmp_threads[*gtid_ref];
460         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462         }
463     }
464 }
465 
466 template< typename UT >
467 static void
468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469 {
470     typedef typename traits_t< UT >::signed_t    ST;
471     dispatch_private_info_template< UT > * pr;
472 
473     int gtid = *gtid_ref;
474 //    int  cid = *cid_ref;
475     kmp_info_t *th = __kmp_threads[ gtid ];
476     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477 
478     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479     if ( __kmp_env_consistency_check ) {
480         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481             ( th -> th.th_dispatch -> th_dispatch_pr_current );
482         if ( pr -> pushed_ws != ct_none ) {
483             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484         }
485     }
486 
487     if ( ! th -> th.th_team -> t.t_serialized ) {
488         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489             ( th -> th.th_dispatch -> th_dispatch_sh_current );
490 
491         if ( ! __kmp_env_consistency_check ) {
492             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
494         }
495 
496         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497         #if ! defined( KMP_GOMP_COMPAT )
498             if ( __kmp_env_consistency_check ) {
499                 if ( pr->ordered_bumped != 0 ) {
500                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501                     /* How to test it? - OM */
502                     __kmp_error_construct2(
503                         kmp_i18n_msg_CnsMultipleNesting,
504                         ct_ordered_in_pdo, loc_ref,
505                         & p->stack_data[ p->w_top ]
506                     );
507                 }
508             }
509         #endif /* !defined(KMP_GOMP_COMPAT) */
510 
511         KMP_MB();       /* Flush all pending memory write invalidates.  */
512 
513         pr->ordered_bumped += 1;
514 
515         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516                         gtid, pr->ordered_bumped ) );
517 
518         KMP_MB();       /* Flush all pending memory write invalidates.  */
519 
520         /* TODO use general release procedure? */
521         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522 
523         KMP_MB();       /* Flush all pending memory write invalidates.  */
524     }
525     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526 }
527 
/* Computes and returns x to the power of y, where y must be a non-negative integer */
529 template< typename UT >
530 static __forceinline long double
531 __kmp_pow(long double x, UT y) {
532     long double s=1.0L;
533 
534     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536     while(y) {
537         if ( y & 1 )
538             s *= x;
539         x *= x;
540         y >>= 1;
541     }
542     return s;
543 }
544 
545 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
546    (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken here: if this function is __forceinline'd, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
549 */
550 template< typename T >
551 static __inline typename traits_t< T >::unsigned_t
552 __kmp_dispatch_guided_remaining(
553     T                                  tc,
554     typename traits_t< T >::floating_t base,
555     typename traits_t< T >::unsigned_t idx
556 ) {
557     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558        least for ICL 8.1, long double arithmetic may not really have
559        long double precision, even with /Qlong_double.  Currently, we
560        workaround that in the caller code, by manipulating the FPCW for
561        Windows* OS on IA-32 architecture.  The lack of precision is not
562        expected to be a correctness issue, though.
563     */
564     typedef typename traits_t< T >::unsigned_t  UT;
565 
566     long double x = tc * __kmp_pow< UT >(base, idx);
567     UT r = (UT) x;
568     if ( x == r )
569         return r;
570     return r + 1;
571 }
572 
573 // Parameters of the guided-iterative algorithm:
574 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
575 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
578 static int guided_int_param = 2;
579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
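// For example, with the default n = 2, nproc = 4 and chunk = 7: p2 = 2 * 4 * (7 + 1) = 64,
// so the guided-iterative algorithm switches to dynamic once fewer than 64 iterations remain,
// and p3 = 1 / (2 * 4) = 0.125, so each chunk takes 1/8 of the remaining iterations
// (see the kmp_sch_guided_iterative_chunked case in __kmp_dispatch_init).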
580 
581 // UT - unsigned flavor of T, ST - signed flavor of T,
582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583 template< typename T >
584 static void
585 __kmp_dispatch_init(
586     ident_t                        * loc,
587     int                              gtid,
588     enum sched_type                  schedule,
589     T                                lb,
590     T                                ub,
591     typename traits_t< T >::signed_t st,
592     typename traits_t< T >::signed_t chunk,
593     int                              push_ws
594 ) {
595     typedef typename traits_t< T >::unsigned_t  UT;
596     typedef typename traits_t< T >::signed_t    ST;
597     typedef typename traits_t< T >::floating_t  DBL;
598     static const int ___kmp_size_type = sizeof( UT );
599 
600     int                                            active;
601     T                                              tc;
602     kmp_info_t *                                   th;
603     kmp_team_t *                                   team;
604     kmp_uint32                                     my_buffer_index;
605     dispatch_private_info_template< T >          * pr;
606     dispatch_shared_info_template< UT > volatile * sh;
607 
608     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610 
611     if ( ! TCR_4( __kmp_init_parallel ) )
612         __kmp_parallel_initialize();
613 
614 #if INCLUDE_SSC_MARKS
615     SSC_MARK_DISPATCH_INIT();
616 #endif
617     #ifdef KMP_DEBUG
618     {
619         const char * buff;
620         // create format specifiers before the debug output
621         buff = __kmp_str_format(
622             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625         __kmp_str_free( &buff );
626     }
627     #endif
628     /* setup data */
629     th     = __kmp_threads[ gtid ];
630     team   = th -> th.th_team;
631     active = ! team -> t.t_serialized;
632     th->th.th_ident = loc;
633 
634 #if USE_ITT_BUILD
635     kmp_uint64 cur_chunk = chunk;
636     int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
637         KMP_MASTER_GTID(gtid) &&
638 #if OMP_40_ENABLED
639         th->th.th_teams_microtask == NULL &&
640 #endif
641         team->t.t_active_level == 1;
642 #endif
643     if ( ! active ) {
644         pr = reinterpret_cast< dispatch_private_info_template< T >* >
645             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
646     } else {
647         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
648                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
649 
650         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
651 
        /* What happens when the number of threads changes? Do we need to resize the buffer? */
653         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
654             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
655         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
656             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
657     }
658 
659     /* Pick up the nomerge/ordered bits from the scheduling type */
660     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
661         pr->nomerge = TRUE;
662         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
663     } else {
664         pr->nomerge = FALSE;
665     }
666     pr->type_size = ___kmp_size_type; // remember the size of variables
667     if ( kmp_ord_lower & schedule ) {
668         pr->ordered = TRUE;
669         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
670     } else {
671         pr->ordered = FALSE;
672     }
673 
674     if ( schedule == kmp_sch_static ) {
675         schedule = __kmp_static;
676     } else {
677         if ( schedule == kmp_sch_runtime ) {
678             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
679             schedule = team -> t.t_sched.r_sched_type;
680             // Detail the schedule if needed (global controls are differentiated appropriately)
681             if ( schedule == kmp_sch_guided_chunked ) {
682                 schedule = __kmp_guided;
683             } else if ( schedule == kmp_sch_static ) {
684                 schedule = __kmp_static;
685             }
686             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
687             chunk = team -> t.t_sched.chunk;
688 
689             #ifdef KMP_DEBUG
690             {
691                 const char * buff;
692                 // create format specifiers before the debug output
693                 buff = __kmp_str_format(
694                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
695                     traits_t< ST >::spec );
696                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
697                 __kmp_str_free( &buff );
698             }
699             #endif
700         } else {
701             if ( schedule == kmp_sch_guided_chunked ) {
702                 schedule = __kmp_guided;
703             }
704             if ( chunk <= 0 ) {
705                 chunk = KMP_DEFAULT_CHUNK;
706             }
707         }
708 
709         if ( schedule == kmp_sch_auto ) {
710             // mapping and differentiation: in the __kmp_do_serial_initialize()
711             schedule = __kmp_auto;
712             #ifdef KMP_DEBUG
713             {
714                 const char * buff;
715                 // create format specifiers before the debug output
716                 buff = __kmp_str_format(
717                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
718                     traits_t< ST >::spec );
719                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
720                 __kmp_str_free( &buff );
721             }
722             #endif
723         }
724 
725         /* guided analytical not safe for too many threads */
726         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
727             schedule = kmp_sch_guided_iterative_chunked;
728             KMP_WARNING( DispatchManyThreads );
729         }
730         pr->u.p.parm1 = chunk;
731     }
732     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
733                 "unknown scheduling type" );
734 
735     pr->u.p.count = 0;
736 
737     if ( __kmp_env_consistency_check ) {
738         if ( st == 0 ) {
739             __kmp_error_construct(
740                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
741                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
742             );
743         }
744     }
745 
746     tc = ( ub - lb + st );
747     if ( st != 1 ) {
748         if ( st < 0 ) {
749             if ( lb < ub ) {
750                 tc = 0;            // zero-trip
751             } else {   // lb >= ub
752                 tc = (ST)tc / st;  // convert to signed division
753             }
754         } else {       // st > 0
755             if ( ub < lb ) {
756                 tc = 0;            // zero-trip
757             } else {   // lb >= ub
758                 tc /= st;
759             }
760         }
761     } else if ( ub < lb ) {        // st == 1
762         tc = 0;                    // zero-trip
763     }
764 
765     // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
766     // when statistics are disabled.
767     if (schedule == __kmp_static)
768     {
769         KMP_COUNT_BLOCK(OMP_FOR_static);
770         KMP_COUNT_VALUE(FOR_static_iterations, tc);
771     }
772     else
773     {
774         KMP_COUNT_BLOCK(OMP_FOR_dynamic);
775         KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
776     }
777 
778     pr->u.p.lb = lb;
779     pr->u.p.ub = ub;
780     pr->u.p.st = st;
781     pr->u.p.tc = tc;
782 
783     #if KMP_OS_WINDOWS
784     pr->u.p.last_upper = ub + st;
785     #endif /* KMP_OS_WINDOWS */
786 
    /* NOTE: only the active parallel region(s) have active ordered sections */
788 
789     if ( active ) {
790         if ( pr->ordered == 0 ) {
791             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
792             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
793         } else {
794             pr->ordered_bumped = 0;
795 
796             pr->u.p.ordered_lower = 1;
797             pr->u.p.ordered_upper = 0;
798 
799             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
800             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
801         }
802     }
803 
804     if ( __kmp_env_consistency_check ) {
805         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
806         if ( push_ws ) {
807             __kmp_push_workshare( gtid, ws, loc );
808             pr->pushed_ws = ws;
809         } else {
810             __kmp_check_workshare( gtid, ws, loc );
811             pr->pushed_ws = ct_none;
812         }
813     }
814 
815     switch ( schedule ) {
816     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
817     case kmp_sch_static_steal:
818         {
819             T nproc = team->t.t_nproc;
820             T ntc, init;
821 
822             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
823 
824             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
825             if ( nproc > 1 && ntc >= nproc ) {
826                 T id = __kmp_tid_from_gtid(gtid);
827                 T small_chunk, extras;
828 
829                 small_chunk = ntc / nproc;
830                 extras = ntc % nproc;
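                // For example, ntc = 10 chunks over nproc = 4 threads gives small_chunk = 2,
                // extras = 2: threads 0 and 1 start with 3 chunks each, threads 2 and 3 with 2.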
831 
832                 init = id * small_chunk + ( id < extras ? id : extras );
833                 pr->u.p.count = init;
834                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
835 
836                 pr->u.p.parm2 = lb;
837                 //pr->pfields.parm3 = 0; // it's not used in static_steal
838                 pr->u.p.parm4 = id;
839                 pr->u.p.st = st;
840                 break;
841             } else {
842                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
843                                gtid ) );
844                 schedule = kmp_sch_static_balanced;
845                 /* too few iterations: fall-through to kmp_sch_static_balanced */
846             } // if
847             /* FALL-THROUGH to static balanced */
848         } // case
849     #endif
850     case kmp_sch_static_balanced:
851         {
852             T nproc = team->t.t_nproc;
853             T init, limit;
854 
855             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
856                             gtid ) );
857 
858             if ( nproc > 1 ) {
859                 T id = __kmp_tid_from_gtid(gtid);
860 
861                 if ( tc < nproc ) {
862                     if ( id < tc ) {
863                         init = id;
864                         limit = id;
865                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
866                     } else {
867                         pr->u.p.count = 1;  /* means no more chunks to execute */
868                         pr->u.p.parm1 = FALSE;
869                         break;
870                     }
871                 } else {
872                     T small_chunk = tc / nproc;
873                     T extras = tc % nproc;
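                    // For example, tc = 10 iterations over nproc = 4 threads gives
                    // small_chunk = 2, extras = 2: index ranges [0,2], [3,5], [6,7], [8,9].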
874                     init = id * small_chunk + (id < extras ? id : extras);
875                     limit = init + small_chunk - (id < extras ? 0 : 1);
876                     pr->u.p.parm1 = (id == nproc - 1);
877                 }
878             } else {
879                 if ( tc > 0 ) {
880                     init = 0;
881                     limit = tc - 1;
882                     pr->u.p.parm1 = TRUE;
883                 } else {
884                     // zero trip count
885                     pr->u.p.count = 1;  /* means no more chunks to execute */
886                     pr->u.p.parm1 = FALSE;
887                     break;
888                 }
889             }
890 #if USE_ITT_BUILD
891             // Calculate chunk for metadata report
892             if ( itt_need_metadata_reporting )
893                 cur_chunk = limit - init + 1;
894 #endif
895             if ( st == 1 ) {
896                 pr->u.p.lb = lb + init;
897                 pr->u.p.ub = lb + limit;
898             } else {
899                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
900                 pr->u.p.lb = lb + init * st;
901                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
902                 if ( st > 0 ) {
903                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
904                 } else {
905                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
906                 }
907             }
908             if ( pr->ordered ) {
909                 pr->u.p.ordered_lower = init;
910                 pr->u.p.ordered_upper = limit;
911             }
912             break;
913         } // case
914     case kmp_sch_guided_iterative_chunked :
915         {
916             T nproc = team->t.t_nproc;
917             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
918 
919             if ( nproc > 1 ) {
920                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
921                     /* chunk size too large, switch to dynamic */
922                     schedule = kmp_sch_dynamic_chunked;
923                 } else {
924                     // when remaining iters become less than parm2 - switch to dynamic
925                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
926                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
927                 }
928             } else {
929                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
930                 schedule = kmp_sch_static_greedy;
931                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
932                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
933                 pr->u.p.parm1 = tc;
934             } // if
935         } // case
936         break;
937     case kmp_sch_guided_analytical_chunked:
938         {
939             T nproc = team->t.t_nproc;
940             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
941 
942             if ( nproc > 1 ) {
943                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
944                     /* chunk size too large, switch to dynamic */
945                     schedule = kmp_sch_dynamic_chunked;
946                 } else {
947                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
948                     DBL x;
949 
950                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
961                     // save original FPCW and set precision to 64-bit, as
962                     // Windows* OS on IA-32 architecture defaults to 53-bit
963                     unsigned int oldFpcw = _control87(0,0);
964                     _control87(_PC_64,_MCW_PC); // 0,0x30000
965                     #endif
966                     /* value used for comparison in solver for cross-over point */
967                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
968 
                    /* crossover point--chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
971                     UT   cross;
972 
973                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
974                     x = (long double)1.0 - (long double)0.5 / nproc;
975 
976                     #ifdef KMP_DEBUG
977                     { // test natural alignment
978                         struct _test_a {
979                             char a;
980                             union {
981                                 char b;
982                                 DBL  d;
983                             };
984                         } t;
985                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
986                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
987                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
988                     }
989                     #endif // KMP_DEBUG
990 
991                     /* save the term in thread private dispatch structure */
992                     *(DBL*)&pr->u.p.parm3 = x;
993 
994                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
995                     {
996                         UT          left, right, mid;
997                         long double p;
998 
999                         /* estimate initial upper and lower bound */
1000 
                        /* It doesn't matter what value right starts at as long as it is
                           positive, but it affects the performance of the solver.
                        */
1004                         right = 229;
1005                         p = __kmp_pow< UT >(x,right);
1006                         if ( p > target ) {
1007                             do{
1008                                 p *= p;
1009                                 right <<= 1;
1010                             } while(p>target && right < (1<<27));
1011                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1012                         } else {
1013                             left = 0;
1014                         }
1015 
1016                         /* bisection root-finding method */
1017                         while ( left + 1 < right ) {
1018                             mid = (left + right) / 2;
1019                             if ( __kmp_pow< UT >(x,mid) > target ) {
1020                                 left = mid;
1021                             } else {
1022                                 right = mid;
1023                             }
1024                         } // while
1025                         cross = right;
1026                     }
1027                     /* assert sanity of computed crossover point */
1028                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1029 
1030                     /* save the crossover point in thread private dispatch structure */
1031                     pr->u.p.parm2 = cross;
1032 
1033                     // C75803
1034                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1035                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1036                     #else
1037                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1038                     #endif
1039                     /* dynamic-style scheduling offset */
1040                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1041                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1042                         // restore FPCW
1043                         _control87(oldFpcw,_MCW_PC);
1044                     #endif
1045                 } // if
1046             } else {
1047                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1048                                gtid ) );
1049                 schedule = kmp_sch_static_greedy;
1050                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1051                 pr->u.p.parm1 = tc;
1052             } // if
1053         } // case
1054         break;
1055     case kmp_sch_static_greedy:
1056         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1057             pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1058                 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1059                 tc;
1060         break;
1061     case kmp_sch_static_chunked :
1062     case kmp_sch_dynamic_chunked :
1063         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1064         break;
1065     case kmp_sch_trapezoidal :
1066         {
1067             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1068 
1069             T parm1, parm2, parm3, parm4;
1070             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1071 
1072             parm1 = chunk;
1073 
1074             /* F : size of the first cycle */
1075             parm2 = ( tc / (2 * team->t.t_nproc) );
1076 
1077             if ( parm2 < 1 ) {
1078                 parm2 = 1;
1079             }
1080 
1081             /* L : size of the last cycle.  Make sure the last cycle
1082              *     is not larger than the first cycle.
1083              */
1084             if ( parm1 < 1 ) {
1085                 parm1 = 1;
1086             } else if ( parm1 > parm2 ) {
1087                 parm1 = parm2;
1088             }
1089 
1090             /* N : number of cycles */
1091             parm3 = ( parm2 + parm1 );
1092             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1093 
1094             if ( parm3 < 2 ) {
1095                 parm3 = 2;
1096             }
1097 
1098             /* sigma : decreasing incr of the trapezoid */
1099             parm4 = ( parm3 - 1 );
1100             parm4 = ( parm2 - parm1 ) / parm4;
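
            // For example, tc = 1000, nproc = 4, chunk = 10: F = parm2 = 1000/8 = 125,
            // L = parm1 = 10, N = parm3 = (2000 + 134)/135 = 15 cycles, and
            // sigma = parm4 = (125 - 10)/14 = 8, so chunk sizes decrease 125, 117, 109, ... toward 10.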
1101 
1102             // pointless check, because parm4 >= 0 always
1103             //if ( parm4 < 0 ) {
1104             //    parm4 = 0;
1105             //}
1106 
1107             pr->u.p.parm1 = parm1;
1108             pr->u.p.parm2 = parm2;
1109             pr->u.p.parm3 = parm3;
1110             pr->u.p.parm4 = parm4;
1111         } // case
1112         break;
1113 
1114     default:
1115         {
1116             __kmp_msg(
1117                 kmp_ms_fatal,                        // Severity
1118                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1119                 KMP_HNT( GetNewerLibrary ),          // Hint
1120                 __kmp_msg_null                       // Variadic argument list terminator
1121             );
1122         }
1123         break;
1124     } // switch
1125     pr->schedule = schedule;
1126     if ( active ) {
        /* This buffer becomes free to use when sh->buffer_index equals my_buffer_index */
1128 
1129         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1130                         gtid, my_buffer_index, sh->buffer_index) );
1131         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1132                                         USE_ITT_BUILD_ARG( NULL )
1133                                         );
        // Note: KMP_WAIT_YIELD() cannot be used here: buffer index and my_buffer_index are
        // *always* 32-bit integers.
1136         KMP_MB();  /* is this necessary? */
1137         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1138                         gtid, my_buffer_index, sh->buffer_index) );
1139 
1140         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1141         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1142 #if USE_ITT_BUILD
1143         if ( pr->ordered ) {
1144             __kmp_itt_ordered_init( gtid );
        } // if
1146         // Report loop metadata
1147         if ( itt_need_metadata_reporting ) {
1148             // Only report metadata by master of active team at level 1
1149             kmp_uint64 schedtype = 0;
1150             switch ( schedule ) {
1151             case kmp_sch_static_chunked:
1152             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1153                 break;
1154             case kmp_sch_static_greedy:
1155                 cur_chunk = pr->u.p.parm1;
1156                 break;
1157             case kmp_sch_dynamic_chunked:
1158                 schedtype = 1;
1159                 break;
1160             case kmp_sch_guided_iterative_chunked:
1161             case kmp_sch_guided_analytical_chunked:
1162                 schedtype = 2;
1163                 break;
1164             default:
1165 //            Should we put this case under "static"?
1166 //            case kmp_sch_static_steal:
1167                 schedtype = 3;
1168                 break;
1169             }
1170             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1171         }
1172 #endif /* USE_ITT_BUILD */
    } // if
1174 
1175     #ifdef KMP_DEBUG
1176     {
1177         const char * buff;
1178         // create format specifiers before the debug output
1179         buff = __kmp_str_format(
1180             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1181             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1182             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1183             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1184             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1185             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1186             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1187         KD_TRACE(10, ( buff,
1188             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1189             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1190             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1191             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1192         __kmp_str_free( &buff );
1193     }
1194     #endif
1195     #if ( KMP_STATIC_STEAL_ENABLED )
1196     if ( ___kmp_size_type < 8 ) {
      // It cannot be guaranteed that after execution of a loop with some other schedule kind
      // all the parm3 variables will contain the same value. Even if they did, a bad case
      // remains: toggling between values like 0 and 1 instead of a program-lifetime increment.
      // So a dedicated variable is required; 'static_steal_counter' is used.
1202       if( schedule == kmp_sch_static_steal ) {
1203         // Other threads will inspect this variable when searching for a victim.
1204         // This is a flag showing that other threads may steal from this thread since then.
1205         volatile T * p = &pr->u.p.static_steal_counter;
1206         *p = *p + 1;
1207       }
1208     }
    #endif // ( KMP_STATIC_STEAL_ENABLED )
1210 
1211 #if OMPT_SUPPORT && OMPT_TRACE
1212     if (ompt_enabled &&
1213         ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1214         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1215         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1216         ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1217             team_info->parallel_id, task_info->task_id, team_info->microtask);
1218     }
1219 #endif
1220 }
1221 
1222 /*
1223  * For ordered loops, either __kmp_dispatch_finish() should be called after
1224  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1225  * every chunk of iterations.  If the ordered section(s) were not executed
1226  * for this iteration (or every iteration in this chunk), we need to set the
1227  * ordered iteration counters so that the next thread can proceed.
1228  */
1229 template< typename UT >
1230 static void
1231 __kmp_dispatch_finish( int gtid, ident_t *loc )
1232 {
1233     typedef typename traits_t< UT >::signed_t ST;
1234     kmp_info_t *th = __kmp_threads[ gtid ];
1235 
1236     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1237     if ( ! th -> th.th_team -> t.t_serialized ) {
1238 
1239         dispatch_private_info_template< UT > * pr =
1240             reinterpret_cast< dispatch_private_info_template< UT >* >
1241             ( th->th.th_dispatch->th_dispatch_pr_current );
1242         dispatch_shared_info_template< UT > volatile * sh =
1243             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1244             ( th->th.th_dispatch->th_dispatch_sh_current );
1245         KMP_DEBUG_ASSERT( pr );
1246         KMP_DEBUG_ASSERT( sh );
1247         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1248                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1249 
1250         if ( pr->ordered_bumped ) {
1251             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1252                             gtid ) );
1253             pr->ordered_bumped = 0;
1254         } else {
1255             UT lower = pr->u.p.ordered_lower;
1256 
1257             #ifdef KMP_DEBUG
1258             {
1259                 const char * buff;
1260                 // create format specifiers before the debug output
1261                 buff = __kmp_str_format(
1262                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1263                     traits_t< UT >::spec, traits_t< UT >::spec );
1264                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1265                 __kmp_str_free( &buff );
1266             }
1267             #endif
1268 
1269             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1270                                    USE_ITT_BUILD_ARG(NULL)
1271                                    );
1272             KMP_MB();  /* is this necessary? */
1273             #ifdef KMP_DEBUG
1274             {
1275                 const char * buff;
1276                 // create format specifiers before the debug output
1277                 buff = __kmp_str_format(
1278                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1279                     traits_t< UT >::spec, traits_t< UT >::spec );
1280                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1281                 __kmp_str_free( &buff );
1282             }
1283             #endif
1284 
1285             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1286         } // if
1287     } // if
1288     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1289 }
1290 
1291 #ifdef KMP_GOMP_COMPAT
1292 
1293 template< typename UT >
1294 static void
1295 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1296 {
1297     typedef typename traits_t< UT >::signed_t ST;
1298     kmp_info_t *th = __kmp_threads[ gtid ];
1299 
1300     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1301     if ( ! th -> th.th_team -> t.t_serialized ) {
1302 //        int cid;
1303         dispatch_private_info_template< UT > * pr =
1304             reinterpret_cast< dispatch_private_info_template< UT >* >
1305             ( th->th.th_dispatch->th_dispatch_pr_current );
1306         dispatch_shared_info_template< UT > volatile * sh =
1307             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1308             ( th->th.th_dispatch->th_dispatch_sh_current );
1309         KMP_DEBUG_ASSERT( pr );
1310         KMP_DEBUG_ASSERT( sh );
1311         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1312                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1313 
1314 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1315             UT lower = pr->u.p.ordered_lower;
1316             UT upper = pr->u.p.ordered_upper;
1317             UT inc = upper - lower + 1;
1318 
1319             if ( pr->ordered_bumped == inc ) {
1320                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1321                   gtid ) );
1322                 pr->ordered_bumped = 0;
1323             } else {
1324                 inc -= pr->ordered_bumped;
1325 
1326                 #ifdef KMP_DEBUG
1327                 {
1328                     const char * buff;
1329                     // create format specifiers before the debug output
1330                     buff = __kmp_str_format(
1331                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1332                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1333                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1334                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1335                     __kmp_str_free( &buff );
1336                 }
1337                 #endif
1338 
1339                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1340                                        USE_ITT_BUILD_ARG(NULL)
1341                                        );
1342 
1343                 KMP_MB();  /* is this necessary? */
1344                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1345                   gtid ) );
1346                 pr->ordered_bumped = 0;
1347 //!!!!! TODO check if the inc should be unsigned, or signed???
1348                 #ifdef KMP_DEBUG
1349                 {
1350                     const char * buff;
1351                     // create format specifiers before the debug output
1352                     buff = __kmp_str_format(
1353                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1354                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1355                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1356                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1357                     __kmp_str_free( &buff );
1358                 }
1359                 #endif
1360 
1361                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1362             }
1363 //        }
1364     }
1365     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1366 }
1367 
1368 #endif /* KMP_GOMP_COMPAT */
1369 
1370 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1371  * (no more work), then tell OMPT the loop is over. In some cases
1372  * kmp_dispatch_fini() is not called. */
1373 #if OMPT_SUPPORT && OMPT_TRACE
1374 #define OMPT_LOOP_END                                                          \
1375     if (status == 0) {                                                         \
1376         if (ompt_enabled &&                     \
1377             ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
1378             ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
1379             ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
1380             ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
1381                 team_info->parallel_id, task_info->task_id);                   \
1382         }                                                                      \
1383     }
1384 #else
1385 #define OMPT_LOOP_END // no-op
1386 #endif
1387 
1388 template< typename T >
1389 static int
1390 __kmp_dispatch_next(
1391     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1392 ) {
1393 
1394     typedef typename traits_t< T >::unsigned_t  UT;
1395     typedef typename traits_t< T >::signed_t    ST;
1396     typedef typename traits_t< T >::floating_t  DBL;
1397 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1398     static const int ___kmp_size_type = sizeof( UT );
1399 #endif
1400 
    // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime
    // schedule is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling
    // is used it costs more than a compile-time choice to use static scheduling would.)
1404     KMP_TIME_BLOCK(FOR_dynamic_scheduling);
1405 
1406     int                                   status;
1407     dispatch_private_info_template< T > * pr;
1408     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1409     kmp_team_t                          * team = th -> th.th_team;
1410 
1411     KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1412     #ifdef KMP_DEBUG
1413     {
1414         const char * buff;
1415         // create format specifiers before the debug output
1416         buff = __kmp_str_format(
1417             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1418             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1419         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1420         __kmp_str_free( &buff );
1421     }
1422     #endif
1423 
1424     if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1426         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1427             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1428         KMP_DEBUG_ASSERT( pr );
1429 
1430         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1431             *p_lb = 0;
1432             *p_ub = 0;
1433 //            if ( p_last != NULL )
1434 //                *p_last = 0;
1435             if ( p_st != NULL )
1436                 *p_st = 0;
1437             if ( __kmp_env_consistency_check ) {
1438                 if ( pr->pushed_ws != ct_none ) {
1439                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1440                 }
1441             }
1442         } else if ( pr->nomerge ) {
1443             kmp_int32 last;
1444             T         start;
1445             UT        limit, trip, init;
1446             ST        incr;
1447             T         chunk = pr->u.p.parm1;
1448 
1449             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1450 
1451             init = chunk * pr->u.p.count++;
1452             trip = pr->u.p.tc - 1;
1453 
1454             if ( (status = (init <= trip)) == 0 ) {
1455                 *p_lb = 0;
1456                 *p_ub = 0;
1457 //                if ( p_last != NULL )
1458 //                    *p_last = 0;
1459                 if ( p_st != NULL )
1460                     *p_st = 0;
1461                 if ( __kmp_env_consistency_check ) {
1462                     if ( pr->pushed_ws != ct_none ) {
1463                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1464                     }
1465                 }
1466             } else {
1467                 start = pr->u.p.lb;
1468                 limit = chunk + init - 1;
1469                 incr  = pr->u.p.st;
1470 
1471                 if ( (last = (limit >= trip)) != 0 ) {
1472                     limit = trip;
1473                     #if KMP_OS_WINDOWS
1474                     pr->u.p.last_upper = pr->u.p.ub;
1475                     #endif /* KMP_OS_WINDOWS */
1476                 }
1477                 if ( p_last != NULL )
1478                     *p_last = last;
1479                 if ( p_st != NULL )
1480                     *p_st = incr;
1481                 if ( incr == 1 ) {
1482                     *p_lb = start + init;
1483                     *p_ub = start + limit;
1484                 } else {
1485                     *p_lb = start + init * incr;
1486                     *p_ub = start + limit * incr;
1487                 }
1488 
1489                 if ( pr->ordered ) {
1490                     pr->u.p.ordered_lower = init;
1491                     pr->u.p.ordered_upper = limit;
1492                     #ifdef KMP_DEBUG
1493                     {
1494                         const char * buff;
1495                         // create format specifiers before the debug output
1496                         buff = __kmp_str_format(
1497                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1498                             traits_t< UT >::spec, traits_t< UT >::spec );
1499                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1500                         __kmp_str_free( &buff );
1501                     }
1502                     #endif
1503                 } // if
1504             } // if
1505         } else {
1506             pr->u.p.tc = 0;
1507             *p_lb = pr->u.p.lb;
1508             *p_ub = pr->u.p.ub;
1509             #if KMP_OS_WINDOWS
1510             pr->u.p.last_upper = *p_ub;
1511             #endif /* KMP_OS_WINDOWS */
1512             if ( p_last != NULL )
1513                 *p_last = TRUE;
1514             if ( p_st != NULL )
1515                 *p_st = pr->u.p.st;
1516         } // if
1517         #ifdef KMP_DEBUG
1518         {
1519             const char * buff;
1520             // create format specifiers before the debug output
1521             buff = __kmp_str_format(
1522                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1523                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1524                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1525             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1526             __kmp_str_free( &buff );
1527         }
1528         #endif
1529 #if INCLUDE_SSC_MARKS
1530         SSC_MARK_DISPATCH_NEXT();
1531 #endif
1532         OMPT_LOOP_END;
1533         return status;
1534     } else {
1535         kmp_int32 last = 0;
1536         dispatch_shared_info_template< UT > *sh;
1537         T         start;
1538         ST        incr;
1539         UT        limit, trip, init;
1540 
1541         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1542                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1543 
1544         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1545             ( th->th.th_dispatch->th_dispatch_pr_current );
1546         KMP_DEBUG_ASSERT( pr );
1547         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1548             ( th->th.th_dispatch->th_dispatch_sh_current );
1549         KMP_DEBUG_ASSERT( sh );
1550 
1551         if ( pr->u.p.tc == 0 ) {
1552             // zero trip count
1553             status = 0;
1554         } else {
1555             switch (pr->schedule) {
1556             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1557             case kmp_sch_static_steal:
1558                 {
1559                     T chunk = pr->u.p.parm1;
1560 
1561                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1562 
1563                     trip = pr->u.p.tc - 1;
1564 
1565                     if ( ___kmp_size_type > 4 ) {
                        // Other threads do not look into the data of this thread,
                        //  so a volatile cast is not necessary.
1568                         init   = ( pr->u.p.count )++;
1569                         status = ( init < (UT)pr->u.p.ub );
1570                     } else {
1571                         typedef union {
1572                             struct {
1573                                 UT count;
1574                                 T  ub;
1575                             } p;
1576                             kmp_int64 b;
1577                         } union_i4;
                        // All operations on 'count' and 'ub' must be performed atomically as a pair;
                        // stealing is implemented only for 4-byte indexes.
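                        // Illustrative note (not executed here): packing 'count' (the next
                        // chunk the owner will claim, growing upward) and 'ub' (one past the
                        // last chunk the owner may claim, lowered by thieves) into a single
                        // 64-bit word lets one CAS either advance count or lower ub without
                        // the two fields ever being observed out of sync.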
1580                         {
1581                             union_i4 vold, vnew;
1582                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1583                             vnew = vold;
1584                             vnew.p.count++;
1585                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1586                                         ( volatile kmp_int64* )&pr->u.p.count,
1587                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1588                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1589                                 KMP_CPU_PAUSE();
1590                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1591                                 vnew = vold;
1592                                 vnew.p.count++;
1593                             }
1594                             vnew = vold;
1595                             init   = vnew.p.count;
1596                             status = ( init < (UT)vnew.p.ub ) ;
1597                         }
1598 
1599                         if( !status ) {
1600                             kmp_info_t   **other_threads = team->t.t_threads;
1601                             int          while_limit = 10;
1602                             int          while_index = 0;
1603 
1604                             // TODO: algorithm of searching for a victim
1605                             // should be cleaned up and measured
1606                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1607                                 union_i4  vold, vnew;
                                kmp_int32 remaining; // kmp_int32 because stealing is for 4-byte (KMP_I4) indexes only
1609                                 T         victimIdx    = pr->u.p.parm4;
1610                                 T         oldVictimIdx = victimIdx;
1611                                 dispatch_private_info_template< T > * victim;
1612 
1613                                 do {
1614                                     if( !victimIdx ) {
1615                                         victimIdx = team->t.t_nproc - 1;
1616                                     } else {
1617                                         --victimIdx;
1618                                     }
1619                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1620                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1621                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1622                                 // TODO: think about a proper place of this test
                                if ( ( !victim ) ||
                                   ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
                                     (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
                                    // The victim is not yet ready to participate in stealing
                                    // because it is still in __kmp_dispatch_init.
                                    // TODO: delay would be nice
                                    continue;
                                }
1631                                 if ( oldVictimIdx == victimIdx ) {
1632                                     break;
1633                                 }
1634                                 pr->u.p.parm4 = victimIdx;
1635 
1636                                 while( 1 ) {
1637                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1638                                     vnew = vold;
1639 
1640                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1641                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1642                                         break;
1643                                     }
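                                    // Steal a quarter of the victim's remaining chunks by
                                    // lowering its 'ub': the victim keeps [count, new ub),
                                    // the thief takes [new ub, old ub).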
1644                                     vnew.p.ub -= (remaining >> 2);
1645                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1646                                     #pragma warning( push )
1647                                     // disable warning on pointless comparison of unsigned with 0
1648                                     #pragma warning( disable: 186 )
1649                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1650                                     #pragma warning( pop )
1651                                     // TODO: Should this be acquire or release?
1652                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1653                                             ( volatile kmp_int64 * )&victim->u.p.count,
1654                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1655                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1656                                         status = 1;
1657                                         while_index = 0;
1658                                         // now update own count and ub
                                        #if KMP_ARCH_X86
                                            // Stealing is normally executed on non-KMP_ARCH_X86
                                            // platforms only. An atomic 64-bit write is unavailable
                                            // on IA-32, so we do this in steps. This code is not tested.
1664                                             init = vold.p.count;
1665                                             pr->u.p.ub = 0;
1666                                             pr->u.p.count = init + 1;
1667                                             pr->u.p.ub = vnew.p.count;
1668                                         #else
1669                                             init = vnew.p.ub;
1670                                             vold.p.count = init + 1;
                                            // TODO: is this safe and sufficient?
1672                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1673                                         #endif // KMP_ARCH_X86
1674                                         break;
1675                                     } // if
1676                                 KMP_CPU_PAUSE();
1677                                 } // while (1)
1678                             } // while
1679                         } // if
1680                     } // if
1681                     if ( !status ) {
1682                         *p_lb = 0;
1683                         *p_ub = 0;
1684                         if ( p_st != NULL ) *p_st = 0;
1685                     } else {
1686                         start = pr->u.p.parm2;
1687                         init *= chunk;
1688                         limit = chunk + init - 1;
1689                         incr  = pr->u.p.st;
1690 
1691                         KMP_DEBUG_ASSERT(init <= trip);
1692                         if ( (last = (limit >= trip)) != 0 )
1693                             limit = trip;
1694                         if ( p_st != NULL ) *p_st = incr;
1695 
1696                         if ( incr == 1 ) {
1697                             *p_lb = start + init;
1698                             *p_ub = start + limit;
1699                         } else {
1700                             *p_lb = start + init * incr;
1701                             *p_ub = start + limit * incr;
1702                         }
1703 
1704                         if ( pr->ordered ) {
1705                             pr->u.p.ordered_lower = init;
1706                             pr->u.p.ordered_upper = limit;
1707                             #ifdef KMP_DEBUG
1708                             {
1709                                 const char * buff;
1710                                 // create format specifiers before the debug output
1711                                 buff = __kmp_str_format(
1712                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1713                                     traits_t< UT >::spec, traits_t< UT >::spec );
1714                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1715                                 __kmp_str_free( &buff );
1716                             }
1717                             #endif
1718                         } // if
1719                     } // if
1720                     break;
1721                 } // case
1722             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1723             case kmp_sch_static_balanced:
1724                 {
1725                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1726                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1727                         pr->u.p.count = 1;
1728                         *p_lb = pr->u.p.lb;
1729                         *p_ub = pr->u.p.ub;
1730                         last = pr->u.p.parm1;
1731                         if ( p_st != NULL )
1732                             *p_st = pr->u.p.st;
1733                     } else {  /* no iterations to do */
1734                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1735                     }
1736                     if ( pr->ordered ) {
1737                         #ifdef KMP_DEBUG
1738                         {
1739                             const char * buff;
1740                             // create format specifiers before the debug output
1741                             buff = __kmp_str_format(
1742                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1743                                 traits_t< UT >::spec, traits_t< UT >::spec );
1744                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1745                             __kmp_str_free( &buff );
1746                         }
1747                         #endif
1748                     } // if
1749                 } // case
1750                 break;
1751             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1752             case kmp_sch_static_chunked:
1753                 {
1754                     T parm1;
1755 
                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
                                   gtid ) );
1758                     parm1 = pr->u.p.parm1;
1759 
1760                     trip  = pr->u.p.tc - 1;
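                    // Chunks of size parm1 are assigned to threads cyclically: thread tid
                    // takes chunk numbers tid, tid + nproc, tid + 2*nproc, ... (count is
                    // advanced by t_nproc below after each claimed chunk).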
1761                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1762 
1763                     if ( (status = (init <= trip)) != 0 ) {
1764                         start = pr->u.p.lb;
1765                         incr  = pr->u.p.st;
1766                         limit = parm1 + init - 1;
1767 
1768                         if ( (last = (limit >= trip)) != 0 )
1769                             limit = trip;
1770 
1771                         if ( p_st != NULL ) *p_st = incr;
1772 
1773                         pr->u.p.count += team->t.t_nproc;
1774 
1775                         if ( incr == 1 ) {
1776                             *p_lb = start + init;
1777                             *p_ub = start + limit;
1778                         }
1779                         else {
1780                             *p_lb = start + init * incr;
1781                             *p_ub = start + limit * incr;
1782                         }
1783 
1784                         if ( pr->ordered ) {
1785                             pr->u.p.ordered_lower = init;
1786                             pr->u.p.ordered_upper = limit;
1787                             #ifdef KMP_DEBUG
1788                             {
1789                                 const char * buff;
1790                                 // create format specifiers before the debug output
1791                                 buff = __kmp_str_format(
1792                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1793                                     traits_t< UT >::spec, traits_t< UT >::spec );
1794                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1795                                 __kmp_str_free( &buff );
1796                             }
1797                             #endif
1798                         } // if
1799                     } // if
1800                 } // case
1801                 break;
1802 
1803             case kmp_sch_dynamic_chunked:
1804                 {
1805                     T chunk = pr->u.p.parm1;
1806 
1807                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1808                                    gtid ) );
1809 
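                    // Each thread atomically claims the next chunk index from the shared
                    // iteration counter; multiplying by the chunk size gives the first
                    // iteration of the claimed chunk.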
1810                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1811                     trip = pr->u.p.tc - 1;
1812 
1813                     if ( (status = (init <= trip)) == 0 ) {
1814                         *p_lb = 0;
1815                         *p_ub = 0;
1816                         if ( p_st != NULL ) *p_st = 0;
1817                     } else {
1818                         start = pr->u.p.lb;
1819                         limit = chunk + init - 1;
1820                         incr  = pr->u.p.st;
1821 
1822                         if ( (last = (limit >= trip)) != 0 )
1823                             limit = trip;
1824 
1825                         if ( p_st != NULL ) *p_st = incr;
1826 
1827                         if ( incr == 1 ) {
1828                             *p_lb = start + init;
1829                             *p_ub = start + limit;
1830                         } else {
1831                             *p_lb = start + init * incr;
1832                             *p_ub = start + limit * incr;
1833                         }
1834 
1835                         if ( pr->ordered ) {
1836                             pr->u.p.ordered_lower = init;
1837                             pr->u.p.ordered_upper = limit;
1838                             #ifdef KMP_DEBUG
1839                             {
1840                                 const char * buff;
1841                                 // create format specifiers before the debug output
1842                                 buff = __kmp_str_format(
1843                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1844                                     traits_t< UT >::spec, traits_t< UT >::spec );
1845                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1846                                 __kmp_str_free( &buff );
1847                             }
1848                             #endif
1849                         } // if
1850                     } // if
1851                 } // case
1852                 break;
1853 
1854             case kmp_sch_guided_iterative_chunked:
1855                 {
1856                     T  chunkspec = pr->u.p.parm1;
1857                     KD_TRACE(100,
1858                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1859                     trip  = pr->u.p.tc;
1860                     // Start atomic part of calculations
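                    // Scheme (see also the parm2/parm3 comments below): each pass tries to
                    // CAS-claim roughly remaining/(K*nproc) iterations from the shared
                    // counter; parm3 caches the factor 1/(K*nproc) as a double. Once few
                    // iterations remain, fall back to plain chunks of size chunkspec
                    // claimed with an atomic add.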
1861                     while(1) {
                        ST  remaining;             // signed, because it can be < 0
1863                         init = sh->u.s.iteration;  // shared value
1864                         remaining = trip - init;
1865                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1866                             // nothing to do, don't try atomic op
1867                             status = 0;
1868                             break;
1869                         }
                        if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                            // use dynamic-style schedule
                            // atomically increment iterations, get old value
1873                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1874                             remaining = trip - init;
1875                             if (remaining <= 0) {
1876                                 status = 0;    // all iterations got by other threads
1877                             } else {
1878                                 // got some iterations to work on
1879                                 status = 1;
1880                                 if ( (T)remaining > chunkspec ) {
1881                                     limit = init + chunkspec - 1;
1882                                 } else {
1883                                     last = 1;   // the last chunk
1884                                     limit = init + remaining - 1;
1885                                 } // if
1886                             } // if
1887                             break;
1888                         } // if
1889                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1890                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1891                             // CAS was successful, chunk obtained
1892                             status = 1;
1893                             --limit;
1894                             break;
1895                         } // if
1896                     } // while
1897                     if ( status != 0 ) {
1898                         start = pr->u.p.lb;
1899                         incr = pr->u.p.st;
1900                         if ( p_st != NULL )
1901                             *p_st = incr;
1902                         *p_lb = start + init * incr;
1903                         *p_ub = start + limit * incr;
1904                         if ( pr->ordered ) {
1905                             pr->u.p.ordered_lower = init;
1906                             pr->u.p.ordered_upper = limit;
1907                             #ifdef KMP_DEBUG
1908                             {
1909                                 const char * buff;
1910                                 // create format specifiers before the debug output
1911                                 buff = __kmp_str_format(
1912                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1913                                     traits_t< UT >::spec, traits_t< UT >::spec );
1914                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1915                                 __kmp_str_free( &buff );
1916                             }
1917                             #endif
1918                         } // if
1919                     } else {
1920                         *p_lb = 0;
1921                         *p_ub = 0;
1922                         if ( p_st != NULL )
1923                             *p_st = 0;
1924                     } // if
1925                 } // case
1926                 break;
1927 
1928             case kmp_sch_guided_analytical_chunked:
1929                 {
1930                     T   chunkspec = pr->u.p.parm1;
1931                     UT chunkIdx;
1932     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* for storing the original FPCW value for Windows* OS on
                       IA-32 architecture, 8-byte version */
1935                     unsigned int oldFpcw;
1936                     unsigned int fpcwSet = 0;
1937     #endif
1938                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1939                                    gtid ) );
1940 
1941                     trip  = pr->u.p.tc;
1942 
1943                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1944                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1945 
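                    // In this analytical variant, parm2 is the chunk index at which the
                    // schedule crosses over from the exponentially shrinking (guided)
                    // phase to plain dynamic chunks, and parm3 caches the guided
                    // coefficient consumed by __kmp_dispatch_guided_remaining() below.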
1946                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1947                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1948                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1949                             --trip;
1950                             /* use dynamic-style scheduling */
1951                             init = chunkIdx * chunkspec + pr->u.p.count;
1952                             /* need to verify init > 0 in case of overflow in the above calculation */
1953                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1954                                 limit = init + chunkspec -1;
1955 
1956                                 if ( (last = (limit >= trip)) != 0 )
1957                                     limit = trip;
1958                             }
1959                             break;
1960                         } else {
1961                             /* use exponential-style scheduling */
                            /* The following check is to work around the lack of long double precision on Windows* OS.
                               This check works around the possible effect that init != 0 for chunkIdx == 0.
                             */
1965     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                            /* If we haven't already done so, save the original
                               FPCW and set precision to 64-bit, as Windows* OS
                               on IA-32 architecture defaults to 53-bit */
1969                             if ( !fpcwSet ) {
1970                                 oldFpcw = _control87(0,0);
1971                                 _control87(_PC_64,_MCW_PC);
1972                                 fpcwSet = 0x30000;
1973                             }
1974     #endif
1975                             if ( chunkIdx ) {
1976                                 init = __kmp_dispatch_guided_remaining< T >(
1977                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1978                                 KMP_DEBUG_ASSERT(init);
1979                                 init = trip - init;
1980                             } else
1981                                 init = 0;
1982                             limit = trip - __kmp_dispatch_guided_remaining< T >(
1983                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1984                             KMP_ASSERT(init <= limit);
1985                             if ( init < limit ) {
1986                                 KMP_DEBUG_ASSERT(limit <= trip);
1987                                 --limit;
1988                                 status = 1;
1989                                 break;
1990                             } // if
1991                         } // if
1992                     } // while (1)
1993     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1994                     /* restore FPCW if necessary
1995                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1996                     */
1997                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1998                         _control87(oldFpcw,_MCW_PC);
1999     #endif
2000                     if ( status != 0 ) {
2001                         start = pr->u.p.lb;
2002                         incr = pr->u.p.st;
2003                         if ( p_st != NULL )
2004                             *p_st = incr;
2005                         *p_lb = start + init * incr;
2006                         *p_ub = start + limit * incr;
2007                         if ( pr->ordered ) {
2008                             pr->u.p.ordered_lower = init;
2009                             pr->u.p.ordered_upper = limit;
2010                             #ifdef KMP_DEBUG
2011                             {
2012                                 const char * buff;
2013                                 // create format specifiers before the debug output
2014                                 buff = __kmp_str_format(
2015                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2016                                     traits_t< UT >::spec, traits_t< UT >::spec );
2017                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2018                                 __kmp_str_free( &buff );
2019                             }
2020                             #endif
2021                         }
2022                     } else {
2023                         *p_lb = 0;
2024                         *p_ub = 0;
2025                         if ( p_st != NULL )
2026                             *p_st = 0;
2027                     }
2028                 } // case
2029                 break;
2030 
2031             case kmp_sch_trapezoidal:
2032                 {
2033                     UT   index;
2034                     T    parm2 = pr->u.p.parm2;
2035                     T    parm3 = pr->u.p.parm3;
2036                     T    parm4 = pr->u.p.parm4;
2037                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2038                                    gtid ) );
2039 
2040                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2041 
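                    // Chunk sizes decrease linearly: chunk j has (parm2 - j*parm4)
                    // iterations, so the first iteration of chunk 'index' is the
                    // arithmetic-series prefix sum
                    //   sum_{j=0}^{index-1} (parm2 - j*parm4)
                    //     = index*(2*parm2 - (index-1)*parm4) / 2,
                    // which is exactly the expression computed below.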
2042                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2043                     trip = pr->u.p.tc - 1;
2044 
2045                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2046                         *p_lb = 0;
2047                         *p_ub = 0;
2048                         if ( p_st != NULL ) *p_st = 0;
2049                     } else {
2050                         start = pr->u.p.lb;
2051                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2052                         incr  = pr->u.p.st;
2053 
2054                         if ( (last = (limit >= trip)) != 0 )
2055                             limit = trip;
2056 
2057                         if ( p_st != NULL ) *p_st = incr;
2058 
2059                         if ( incr == 1 ) {
2060                             *p_lb = start + init;
2061                             *p_ub = start + limit;
2062                         } else {
2063                             *p_lb = start + init * incr;
2064                             *p_ub = start + limit * incr;
2065                         }
2066 
2067                         if ( pr->ordered ) {
2068                             pr->u.p.ordered_lower = init;
2069                             pr->u.p.ordered_upper = limit;
2070                             #ifdef KMP_DEBUG
2071                             {
2072                                 const char * buff;
2073                                 // create format specifiers before the debug output
2074                                 buff = __kmp_str_format(
2075                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2076                                     traits_t< UT >::spec, traits_t< UT >::spec );
2077                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2078                                 __kmp_str_free( &buff );
2079                             }
2080                             #endif
2081                         } // if
2082                     } // if
2083                 } // case
2084                 break;
2085             default:
2086                 {
2087                     status = 0; // to avoid complaints on uninitialized variable use
2088                     __kmp_msg(
2089                         kmp_ms_fatal,                        // Severity
2090                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2091                         KMP_HNT( GetNewerLibrary ),          // Hint
2092                         __kmp_msg_null                       // Variadic argument list terminator
2093                     );
2094                 }
2095                 break;
2096             } // switch
        } // if (tc == 0)
2098 
2099         if ( status == 0 ) {
2100             UT   num_done;
2101 
2102             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2103             #ifdef KMP_DEBUG
2104             {
2105                 const char * buff;
2106                 // create format specifiers before the debug output
2107                 buff = __kmp_str_format(
2108                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2109                     traits_t< UT >::spec );
2110                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2111                 __kmp_str_free( &buff );
2112             }
2113             #endif
2114 
2115             if ( (ST)num_done == team->t.t_nproc-1 ) {
2116                 /* NOTE: release this buffer to be reused */
2117 
2118                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2119 
2120                 sh->u.s.num_done = 0;
2121                 sh->u.s.iteration = 0;
2122 
2123                 /* TODO replace with general release procedure? */
2124                 if ( pr->ordered ) {
2125                     sh->u.s.ordered_iteration = 0;
2126                 }
2127 
2128                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2129 
2130                 sh -> buffer_index += KMP_MAX_DISP_BUF;
2131                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2132                                 gtid, sh->buffer_index) );
2133 
2134                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2135 
2136             } // if
2137             if ( __kmp_env_consistency_check ) {
2138                 if ( pr->pushed_ws != ct_none ) {
2139                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2140                 }
2141             }
2142 
2143             th -> th.th_dispatch -> th_deo_fcn = NULL;
2144             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2145             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2146             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2147         } // if (status == 0)
2148 #if KMP_OS_WINDOWS
2149         else if ( last ) {
2150             pr->u.p.last_upper = pr->u.p.ub;
2151         }
2152 #endif /* KMP_OS_WINDOWS */
2153         if ( p_last != NULL && status != 0 )
2154             *p_last = last;
2155     } // if
2156 
2157     #ifdef KMP_DEBUG
2158     {
2159         const char * buff;
2160         // create format specifiers before the debug output
2161         buff = __kmp_str_format(
2162             "__kmp_dispatch_next: T#%%d normal case: " \
2163             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2164             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2165         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2166         __kmp_str_free( &buff );
2167     }
2168     #endif
2169 #if INCLUDE_SSC_MARKS
2170     SSC_MARK_DISPATCH_NEXT();
2171 #endif
2172     OMPT_LOOP_END;
2173     return status;
2174 }
2175 
2176 template< typename T >
2177 static void
2178 __kmp_dist_get_bounds(
2179     ident_t                          *loc,
2180     kmp_int32                         gtid,
2181     kmp_int32                        *plastiter,
2182     T                                *plower,
2183     T                                *pupper,
2184     typename traits_t< T >::signed_t  incr
2185 ) {
2186     typedef typename traits_t< T >::unsigned_t  UT;
2187     typedef typename traits_t< T >::signed_t    ST;
2188     register kmp_uint32  team_id;
2189     register kmp_uint32  nteams;
2190     register UT          trip_count;
2191     register kmp_team_t *team;
2192     kmp_info_t * th;
2193 
2194     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
    KE_TRACE( 10, ("__kmp_dist_get_bounds called (%d)\n", gtid));
2196     #ifdef KMP_DEBUG
2197     {
2198         const char * buff;
2199         // create format specifiers before the debug output
        buff = __kmp_str_format( "__kmp_dist_get_bounds: T#%%d liter=%%d "\
2201             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2202             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2203             traits_t< T >::spec );
2204         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2205         __kmp_str_free( &buff );
2206     }
2207     #endif
2208 
2209     if( __kmp_env_consistency_check ) {
2210         if( incr == 0 ) {
2211             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2212         }
2213         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
            // The loop is illegal.
            // Some zero-trip loops are maintained by the compiler, e.g.:
            //   for(i=10;i<0;++i) // lower >= upper - run-time check
            //   for(i=0;i>10;--i) // lower <= upper - run-time check
            //   for(i=0;i>10;++i) // incr > 0       - compile-time check
            //   for(i=10;i<0;--i) // incr < 0       - compile-time check
            // The compiler does not check the following illegal loops:
            //   for(i=0;i<10;i+=incr) // where incr<0
            //   for(i=10;i>0;i-=incr) // where incr<0
2223             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2224         }
2225     }
2226     th = __kmp_threads[gtid];
2227     team = th->th.th_team;
2228     #if OMP_40_ENABLED
2229     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2230     nteams = th->th.th_teams_size.nteams;
2231     #endif
2232     team_id = team->t.t_master_tid;
2233     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2234 
2235     // compute global trip count
2236     if( incr == 1 ) {
2237         trip_count = *pupper - *plower + 1;
2238     } else if(incr == -1) {
2239         trip_count = *plower - *pupper + 1;
2240     } else {
2241         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2242     }
2243 
2244     if( trip_count <= nteams ) {
2245         KMP_DEBUG_ASSERT(
2246             __kmp_static == kmp_sch_static_greedy || \
2247             __kmp_static == kmp_sch_static_balanced
2248         ); // Unknown static scheduling type.
        // only some teams get a single iteration, the others get nothing
2250         if( team_id < trip_count ) {
2251             *pupper = *plower = *plower + team_id * incr;
2252         } else {
2253             *plower = *pupper + incr; // zero-trip loop
2254         }
2255         if( plastiter != NULL )
2256             *plastiter = ( team_id == trip_count - 1 );
2257     } else {
2258         if( __kmp_static == kmp_sch_static_balanced ) {
2259             register UT chunk = trip_count / nteams;
2260             register UT extras = trip_count % nteams;
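            // Worked example (illustrative): trip_count=10, nteams=4 gives
            // chunk=2, extras=2, so teams 0-1 get 3 iterations each and
            // teams 2-3 get 2 each.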
2261             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2262             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2263             if( plastiter != NULL )
2264                 *plastiter = ( team_id == nteams - 1 );
2265         } else {
2266             register T chunk_inc_count =
2267                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2268             register T upper = *pupper;
2269             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2270                 // Unknown static scheduling type.
2271             *plower += team_id * chunk_inc_count;
2272             *pupper = *plower + chunk_inc_count - incr;
2273             // Check/correct bounds if needed
2274             if( incr > 0 ) {
2275                 if( *pupper < *plower )
2276                     *pupper = i_maxmin< T >::mx;
2277                 if( plastiter != NULL )
2278                     *plastiter = *plower <= upper && *pupper > upper - incr;
2279                 if( *pupper > upper )
2280                     *pupper = upper; // tracker C73258
2281             } else {
2282                 if( *pupper > *plower )
2283                     *pupper = i_maxmin< T >::mn;
2284                 if( plastiter != NULL )
2285                     *plastiter = *plower >= upper && *pupper < upper - incr;
2286                 if( *pupper < upper )
2287                     *pupper = upper; // tracker C73258
2288             }
2289         }
2290     }
2291 }
2292 
2293 //-----------------------------------------------------------------------------------------
2294 // Dispatch routines
2295 //    Transfer call to template< type T >
2296 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2297 //                         T lb, T ub, ST st, ST chunk )
2298 extern "C" {
2299 
2300 /*!
2301 @ingroup WORK_SHARING
2302 @{
2303 @param loc Source location
2304 @param gtid Global thread id
2305 @param schedule Schedule type
2306 @param lb  Lower bound
2307 @param ub  Upper bound
2308 @param st  Step (or increment if you prefer)
2309 @param chunk The chunk size to block with
2310 
2311 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2312 These functions are all identical apart from the types of the arguments.
2313 */
2314 
2315 void
2316 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2317                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2318 {
2319     KMP_DEBUG_ASSERT( __kmp_init_serial );
2320     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2321 }
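
/* Illustrative only (not part of the library): a compiler would typically
   lower
       #pragma omp for schedule(dynamic, 4)
       for ( i = 0; i < n; ++i ) body( i );
   into roughly
       __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4 );
       while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
           for ( i = lb; i <= ub; i += st )
               body( i );
       }
   with matching __kmpc_dispatch_fini_4() calls emitted for ordered loops. */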
2322 /*!
2323 See @ref __kmpc_dispatch_init_4
2324 */
2325 void
2326 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2327                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2328 {
2329     KMP_DEBUG_ASSERT( __kmp_init_serial );
2330     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2331 }
2332 
2333 /*!
2334 See @ref __kmpc_dispatch_init_4
2335 */
2336 void
2337 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2338                         kmp_int64 lb, kmp_int64 ub,
2339                         kmp_int64 st, kmp_int64 chunk )
2340 {
2341     KMP_DEBUG_ASSERT( __kmp_init_serial );
2342     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2343 }
2344 
2345 /*!
2346 See @ref __kmpc_dispatch_init_4
2347 */
2348 void
2349 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2350                          kmp_uint64 lb, kmp_uint64 ub,
2351                          kmp_int64 st, kmp_int64 chunk )
2352 {
2353     KMP_DEBUG_ASSERT( __kmp_init_serial );
2354     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2355 }
2356 
2357 /*!
2358 See @ref __kmpc_dispatch_init_4
2359 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, we need to compute the per-team iteration space.
2363 
2364 These functions are all identical apart from the types of the arguments.
2365 */
2366 void
2367 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2368     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2369 {
2370     KMP_DEBUG_ASSERT( __kmp_init_serial );
2371     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2372     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2373 }
2374 
2375 void
2376 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2377     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2378 {
2379     KMP_DEBUG_ASSERT( __kmp_init_serial );
2380     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2381     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2382 }
2383 
2384 void
2385 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2386     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2387 {
2388     KMP_DEBUG_ASSERT( __kmp_init_serial );
2389     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2390     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2391 }
2392 
2393 void
2394 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2395     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2396 {
2397     KMP_DEBUG_ASSERT( __kmp_init_serial );
2398     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2399     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2400 }
2401 
2402 /*!
2403 @param loc Source code location
2404 @param gtid Global thread id
2405 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2406 @param p_lb   Pointer to the lower bound for the next chunk of work
2407 @param p_ub   Pointer to the upper bound for the next chunk of work
2408 @param p_st   Pointer to the stride for the next chunk of work
2409 @return one if there is work to be done, zero otherwise
2410 
2411 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2413 */
2414 int
2415 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2416                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2417 {
2418     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2419 }
2420 
2421 /*!
2422 See @ref __kmpc_dispatch_next_4
2423 */
2424 int
2425 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2426                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2427 {
2428     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2429 }
2430 
2431 /*!
2432 See @ref __kmpc_dispatch_next_4
2433 */
2434 int
2435 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2436                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2437 {
2438     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2439 }
2440 
2441 /*!
2442 See @ref __kmpc_dispatch_next_4
2443 */
2444 int
2445 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2446                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2447 {
2448     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2449 }
2450 
2451 /*!
2452 @param loc Source code location
2453 @param gtid Global thread id
2454 
2455 Mark the end of a dynamic loop.
2456 */
2457 void
2458 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2459 {
2460     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2461 }
2462 
2463 /*!
2464 See @ref __kmpc_dispatch_fini_4
2465 */
2466 void
2467 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2468 {
2469     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2470 }
2471 
2472 /*!
2473 See @ref __kmpc_dispatch_fini_4
2474 */
2475 void
2476 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2477 {
2478     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2479 }
2480 
2481 /*!
2482 See @ref __kmpc_dispatch_fini_4
2483 */
2484 void
2485 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2486 {
2487     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2488 }
2489 /*! @} */
2490 
2491 //-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources
2493 
2494 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2495     return value == checker;
2496 }
2497 
2498 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2499     return value != checker;
2500 }
2501 
2502 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2503     return value < checker;
2504 }
2505 
2506 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2507     return value >= checker;
2508 }
2509 
2510 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2511     return value <= checker;
2512 }
2513 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2514     return value == checker;
2515 }
2516 
2517 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2518     return value != checker;
2519 }
2520 
2521 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2522     return value < checker;
2523 }
2524 
2525 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2526     return value >= checker;
2527 }
2528 
2529 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2530     return value <= checker;
2531 }
2532 
2533 kmp_uint32
2534 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2535                    kmp_uint32            checker,
2536                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2537                    , void        * obj    // Higher-level synchronization object, or NULL.
2538                    )
2539 {
2540     // note: we may not belong to a team at this point
2541     register volatile kmp_uint32         * spin          = spinner;
2542     register          kmp_uint32           check         = checker;
2543     register          kmp_uint32   spins;
2544     register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2545     register          kmp_uint32           r;
2546 
2547     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2548     KMP_INIT_YIELD( spins );
2549     // main wait spin loop
2550     while(!f(r = TCR_4(*spin), check)) {
2551         KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of the exit lock. */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */
2556 
        /* If we have waited a bit, or are oversubscribed, yield. */
        /* The pause is in the following code. */
2559         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2560         KMP_YIELD_SPIN( spins );
2561     }
2562     KMP_FSYNC_SPIN_ACQUIRED( obj );
2563     return r;
2564 }
2565 
kmp_uint64
__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
                    kmp_uint64            checker,
                    kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 ),
                    void                * obj    // Higher-level synchronization object, or NULL.
                    )
2572 {
2573     // note: we may not belong to a team at this point
2574     register volatile kmp_uint64         * spin          = spinner;
2575     register          kmp_uint64           check         = checker;
2576     register          kmp_uint32   spins;
2577     register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2578     register          kmp_uint64           r;
2579 
2580     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2581     KMP_INIT_YIELD( spins );
2582     // main wait spin loop
    while(!f(r = *spin, check)) {
2585         KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of the exit lock. */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */
2590 
        // If we are oversubscribed, or have waited a bit
        // (and KMP_LIBRARY=throughput), then yield.
        // The pause is in the following code.
2594         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2595         KMP_YIELD_SPIN( spins );
2596     }
2597     KMP_FSYNC_SPIN_ACQUIRED( obj );
2598     return r;
2599 }
2600 
2601 } // extern "C"
2602 
2603 #ifdef KMP_GOMP_COMPAT
2604 
2605 void
2606 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2607                            kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2608                            kmp_int32 chunk, int push_ws )
2609 {
2610     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2611                                       push_ws );
2612 }
2613 
2614 void
2615 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2616                             kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2617                             kmp_int32 chunk, int push_ws )
2618 {
2619     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2620                                        push_ws );
2621 }
2622 
2623 void
2624 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2625                            kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2626                            kmp_int64 chunk, int push_ws )
2627 {
2628     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2629                                       push_ws );
2630 }
2631 
2632 void
2633 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2634                             kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2635                             kmp_int64 chunk, int push_ws )
2636 {
2637     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2638                                        push_ws );
2639 }
2640 
2641 void
2642 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2643 {
2644     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2645 }
2646 
2647 void
2648 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2649 {
2650     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2651 }
2652 
2653 void
2654 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2655 {
2656     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2657 }
2658 
2659 void
2660 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2661 {
2662     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2663 }
2664 
2665 #endif /* KMP_GOMP_COMPAT */
2666 
2667 /* ------------------------------------------------------------------------ */
2668 /* ------------------------------------------------------------------------ */
2669 
2670