/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 *
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
    #include <float.h>
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// template for type limits
template< typename T >
struct i_maxmin {
    static const T mx;
    static const T mn;
};
template<>
struct i_maxmin< int > {
    static const int mx = 0x7fffffff;
    static const int mn = 0x80000000;
};
template<>
struct i_maxmin< unsigned int > {
    static const unsigned int mx = 0xffffffff;
    static const unsigned int mn = 0x00000000;
};
template<>
struct i_maxmin< long long > {
    static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = 0x8000000000000000LL;
};
template<>
struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffLL;
    static const unsigned long long mn = 0x0000000000000000LL;
};
//-------------------------------------------------------------------------
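// Illustrative usage (a sketch, not called anywhere in this file): the
// specializations above let templated dispatch code query type limits
// without <limits>, e.g.
//     T upper = i_maxmin< T >::mx;   // largest value representable in T
//     T lower = i_maxmin< T >::mn;   // smallest value representable in T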

#ifdef KMP_STATIC_STEAL_ENABLED

    // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
    template< typename T >
    struct dispatch_private_infoXX_template {
        typedef typename traits_t< T >::unsigned_t  UT;
        typedef typename traits_t< T >::signed_t    ST;
        UT count;                // unsigned
        T  ub;
        /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
        T  lb;
        ST st;                   // signed
        UT tc;                   // unsigned
        T  static_steal_counter; // for static_steal only; maybe better to put after ub

        /* parm[1-4] are used in different ways by different scheduling algorithms */

        // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
        //    a) parm3 is properly aligned and
        //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // if they are in the same cache line (not measured though).

        struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
            T  parm1;
            T  parm2;
            T  parm3;
            T  parm4;
        };

        UT ordered_lower; // unsigned
        UT ordered_upper; // unsigned
        #if KMP_OS_WINDOWS
        T  last_upper;
        #endif /* KMP_OS_WINDOWS */
    };

#else /* KMP_STATIC_STEAL_ENABLED */

    // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
    template< typename T >
    struct dispatch_private_infoXX_template {
        typedef typename traits_t< T >::unsigned_t  UT;
        typedef typename traits_t< T >::signed_t    ST;
        T  lb;
        T  ub;
        ST st;            // signed
        UT tc;            // unsigned

        T  parm1;
        T  parm2;
        T  parm3;
        T  parm4;

        UT count;         // unsigned

        UT ordered_lower; // unsigned
        UT ordered_upper; // unsigned
        #if KMP_OS_WINDOWS
        T  last_upper;
        #endif /* KMP_OS_WINDOWS */
    };

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template< typename T >
struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate alignment here, otherwise size of structure is not correct in our compiler
    union KMP_ALIGN_CACHE private_info_tmpl {
        dispatch_private_infoXX_template< T > p;
        dispatch_private_info64_t             p64;
    } u;
    enum sched_type schedule;  /* scheduling algorithm */
    kmp_uint32      ordered;   /* ordered clause specified */
    kmp_uint32      ordered_bumped;
    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
    dispatch_private_info * next; /* stack of buffers for nest of serial regions */
    kmp_uint32      nomerge;   /* don't merge iters if serialized */
    kmp_uint32      type_size;
    enum cons_type  pushed_ws;
};


// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
template< typename UT >
struct dispatch_shared_infoXX_template {
    /* chunk index under dynamic, number of idle threads under static-steal;
       iteration index otherwise */
    volatile UT     iteration;
    volatile UT     num_done;
    volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template< typename UT >
struct dispatch_shared_info_template {
    // we need union here to keep the structure size
    union shared_info_tmpl {
        dispatch_shared_infoXX_template< UT >  s;
        dispatch_shared_info64_t               s64;
    } u;
    volatile kmp_uint32     buffer_index;
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_ADD32( p, d );
    return r;
}

template<>
__forceinline kmp_int64
test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_ADD64( p, d );
    return r;
}

// test_then_inc_acq template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (general template should NOT be used)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}
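
// Illustrative sketch of a typical compare_and_swap retry loop (the variable
// names here are hypothetical, not code from this file): keep re-reading the
// shared counter until this thread atomically claims the next value.
//     T count;
//     do {
//         count = pr->u.p.count;                // read current value
//     } while ( ! compare_and_swap< T >( (volatile T *) &pr->u.p.count,
//                                        count, count + 1 ) );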

/*
    Spin wait loop that first does pause, then yield.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- is a higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
            same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
template< typename UT >
// ToDo: make inline function (move to header file for icl)
static UT  // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
                  )
{
    // note: we may not belong to a team at this point
    register volatile UT         * spin          = spinner;
    register          UT           check         = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( UT, UT ) = pred;
    register          UT           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        // If we are oversubscribed, or have waited a bit
        // (and KMP_LIBRARY=throughput), then yield.
        // The pause is in the following code.
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

template< typename UT >
static kmp_uint32 __kmp_eq( UT value, UT checker) {
    return value == checker;
}

template< typename UT >
static kmp_uint32 __kmp_neq( UT value, UT checker) {
    return value != checker;
}

template< typename UT >
static kmp_uint32 __kmp_lt( UT value, UT checker) {
    return value < checker;
}

template< typename UT >
static kmp_uint32 __kmp_ge( UT value, UT checker) {
    return value >= checker;
}

template< typename UT >
static kmp_uint32 __kmp_le( UT value, UT checker) {
    return value <= checker;
}
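
// Example of combining the spin loop with one of the predicates above (this
// exact call is made by __kmp_dispatch_deo below): block until the shared
// ordered iteration counter reaches this thread's lower bound.
//     __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
//                             USE_ITT_BUILD_ARG( NULL )
//                             );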


/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

static void
__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    KMP_DEBUG_ASSERT( gtid_ref );

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_root -> r.r_active
          && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t    ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
//    int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );
        UT  lower;

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }
        lower = pr->u.p.ordered_lower;

        #if ! defined( KMP_GOMP_COMPAT )
            if ( __kmp_env_consistency_check ) {
                if ( pr->ordered_bumped ) {
                    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                    __kmp_error_construct2(
                        kmp_i18n_msg_CnsMultipleNesting,
                        ct_ordered_in_pdo, loc_ref,
                        & p->stack_data[ p->w_top ]
                    );
                }
            }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif

        __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                USE_ITT_BUILD_ARG( NULL )
                                );
        KMP_MB();  /* is this necessary? */
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif
    }
    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
}

static void
__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
            __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t    ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
//    int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
            __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }

        KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
        #if ! defined( KMP_GOMP_COMPAT )
            if ( __kmp_env_consistency_check ) {
                if ( pr->ordered_bumped != 0 ) {
                    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                    /* How to test it? - OM */
                    __kmp_error_construct2(
                        kmp_i18n_msg_CnsMultipleNesting,
                        ct_ordered_in_pdo, loc_ref,
                        & p->stack_data[ p->w_top ]
                    );
                }
            }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();       /* Flush all pending memory write invalidates.  */

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        KMP_MB();       /* Flush all pending memory write invalidates.  */

        /* TODO use general release procedure? */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );

        KMP_MB();       /* Flush all pending memory write invalidates.  */
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}

/* Computes and returns x to the power of y, where y must be a non-negative integer */
template< typename UT >
static __forceinline long double
__kmp_pow(long double x, UT y) {
    long double s=1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}
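
// __kmp_pow uses binary exponentiation (square-and-multiply), so it needs
// only O(log2(y)) multiplications. For example, y = 13 (binary 1101) gives
//     x^13 = x^8 * x^4 * x^1
// where a factor is accumulated into s whenever the low bit of y is set.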

/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken here: if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
*/
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
       least for ICL 8.1, long double arithmetic may not really have
       long double precision, even with /Qlong_double.  Currently, we
       workaround that in the caller code, by manipulating the FPCW for
       Windows* OS on IA-32 architecture.  The lack of precision is not
       expected to be a correctness issue, though.
    */
    typedef typename traits_t< T >::unsigned_t  UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}
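
// In other words, the function computes ceil( tc * base^idx ): if tc
// iterations shrink by a factor of "base" per chunk, roughly tc * base^idx
// iterations remain after idx chunks. For example, tc = 1000, base = 0.875,
// idx = 8 gives 1000 * 0.875^8 ~= 343.6, so 344 iterations are unassigned.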

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
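
// Worked example (illustrative): with nproc = 4, chunk = 7 and the default
// n = 2, the kmp_sch_guided_iterative_chunked setup below yields
//     parm2 = 2 * 4 * (7 + 1) = 64     // switch to dynamic when <= 64 iters remain
//     parm3 = 0.5 / 4         = 0.125  // fraction of remaining iters grabbed at once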

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template< typename T >
static void
__kmp_dispatch_init(
    ident_t                        * loc,
    int                              gtid,
    enum sched_type                  schedule,
    T                                lb,
    T                                ub,
    typename traits_t< T >::signed_t st,
    typename traits_t< T >::signed_t chunk,
    int                              push_ws
) {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    typedef typename traits_t< T >::floating_t  DBL;
    static const int ___kmp_size_type = sizeof( UT );

    int                                            active;
    T                                              tc;
    kmp_info_t *                                   th;
    kmp_team_t *                                   team;
    kmp_uint32                                     my_buffer_index;
    dispatch_private_info_template< T >          * pr;
    dispatch_shared_info_template< UT > volatile * sh;

    KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
    KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );

    if ( ! TCR_4( __kmp_init_parallel ) )
        __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_INIT();
#endif
    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
            traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
        __kmp_str_free( &buff );
    }
    #endif
    /* setup data */
    th     = __kmp_threads[ gtid ];
    team   = th -> th.th_team;
    active = ! team -> t.t_serialized;
    th->th.th_ident = loc;

#if USE_ITT_BUILD
    kmp_uint64 cur_chunk = chunk;
#endif
    if ( ! active ) {
        pr = reinterpret_cast< dispatch_private_info_template< T >* >
            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
    } else {
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        my_buffer_index = th->th.th_dispatch->th_disp_index ++;

        /* What happens when number of threads changes, need to resize buffer? */
        pr = reinterpret_cast< dispatch_private_info_template< T >  * >
            ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
        sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
            ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
    }

    /* Pick up the nomerge/ordered bits from the scheduling type */
    if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
        pr->nomerge = TRUE;
        schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
    } else {
        pr->nomerge = FALSE;
    }
    pr->type_size = ___kmp_size_type; // remember the size of variables
    if ( kmp_ord_lower & schedule ) {
        pr->ordered = TRUE;
        schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
    } else {
        pr->ordered = FALSE;
    }
    if ( schedule == kmp_sch_static ) {
        schedule = __kmp_static;
    } else {
        if ( schedule == kmp_sch_runtime ) {
            // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
            schedule = team -> t.t_sched.r_sched_type;
            // Detail the schedule if needed (global controls are differentiated appropriately)
            if ( schedule == kmp_sch_guided_chunked ) {
                schedule = __kmp_guided;
            } else if ( schedule == kmp_sch_static ) {
                schedule = __kmp_static;
            }
            // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
            chunk = team -> t.t_sched.chunk;

            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
                    traits_t< ST >::spec );
                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
                __kmp_str_free( &buff );
            }
            #endif
        } else {
            if ( schedule == kmp_sch_guided_chunked ) {
                schedule = __kmp_guided;
            }
            if ( chunk <= 0 ) {
                chunk = KMP_DEFAULT_CHUNK;
            }
        }

        if ( schedule == kmp_sch_auto ) {
            // mapping and differentiation: in the __kmp_do_serial_initialize()
            schedule = __kmp_auto;
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
                    traits_t< ST >::spec );
                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
                __kmp_str_free( &buff );
            }
            #endif
        }

        /* guided analytical not safe for too many threads */
        if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
            schedule = kmp_sch_guided_iterative_chunked;
            KMP_WARNING( DispatchManyThreads );
        }
        pr->u.p.parm1 = chunk;
    }
    KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
                "unknown scheduling type" );

    pr->u.p.count = 0;

    if ( __kmp_env_consistency_check ) {
        if ( st == 0 ) {
            __kmp_error_construct(
                kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
            );
        }
    }

    tc = ( ub - lb + st );
    if ( st != 1 ) {
        if ( st < 0 ) {
            if ( lb < ub ) {
                tc = 0;            // zero-trip
            } else {   // lb >= ub
                tc = (ST)tc / st;  // convert to signed division
            }
        } else {       // st > 0
            if ( ub < lb ) {
                tc = 0;            // zero-trip
            } else {   // ub >= lb
                tc /= st;
            }
        }
    } else if ( ub < lb ) {        // st == 1
        tc = 0;                    // zero-trip
    }
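
    // Worked example: lb = 0, ub = 9, st = 3 gives tc = (9 - 0 + 3) / 3 = 4,
    // i.e. the four iterations 0, 3, 6, 9. A negative stride is handled by
    // the signed division above, e.g. lb = 9, ub = 0, st = -3 also gives
    // tc = (0 - 9 + (-3)) / (-3) = 4.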

    pr->u.p.lb = lb;
    pr->u.p.ub = ub;
    pr->u.p.st = st;
    pr->u.p.tc = tc;

    #if KMP_OS_WINDOWS
    pr->u.p.last_upper = ub + st;
    #endif /* KMP_OS_WINDOWS */

    /* NOTE: only the active parallel region(s) have active ordered sections */

    if ( active ) {
        if ( pr->ordered == 0 ) {
            th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
            th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
        } else {
            pr->ordered_bumped = 0;

            pr->u.p.ordered_lower = 1;
            pr->u.p.ordered_upper = 0;

            th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
            th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
        }
    }

    if ( __kmp_env_consistency_check ) {
        enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
        if ( push_ws ) {
            __kmp_push_workshare( gtid, ws, loc );
            pr->pushed_ws = ws;
        } else {
            __kmp_check_workshare( gtid, ws, loc );
            pr->pushed_ws = ct_none;
        }
    }

    switch ( schedule ) {
    #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
    case kmp_sch_static_steal:
        {
            T nproc = team->t.t_nproc;
            T ntc, init;

            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );

            ntc = (tc % chunk ? 1 : 0) + tc / chunk;
            if ( nproc > 1 && ntc >= nproc ) {
                T id = __kmp_tid_from_gtid(gtid);
                T small_chunk, extras;

                small_chunk = ntc / nproc;
                extras = ntc % nproc;

                init = id * small_chunk + ( id < extras ? id : extras );
                pr->u.p.count = init;
                pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
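                // Worked example (illustrative): ntc = 10 chunks, nproc = 4
                // gives small_chunk = 2, extras = 2; threads then own the
                // half-open chunk-index ranges [0,3), [3,6), [6,8), [8,10),
                // so the two extra chunks go to the lowest-numbered threads.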

                pr->u.p.parm2 = lb;
                //pr->pfields.parm3 = 0; // it's not used in static_steal
                pr->u.p.parm4 = id;
                pr->u.p.st = st;
                break;
            } else {
                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
                               gtid ) );
                schedule = kmp_sch_static_balanced;
                /* too few iterations: fall-through to kmp_sch_static_balanced */
            } // if
            /* FALL-THROUGH to static balanced */
        } // case
    #endif
    case kmp_sch_static_balanced:
        {
            T nproc = team->t.t_nproc;
            T init, limit;

            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
                            gtid ) );

            if ( nproc > 1 ) {
                T id = __kmp_tid_from_gtid(gtid);

                if ( tc < nproc ) {
                    if ( id < tc ) {
                        init = id;
                        limit = id;
                        pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
                    } else {
                        pr->u.p.count = 1;  /* means no more chunks to execute */
                        pr->u.p.parm1 = FALSE;
                        break;
                    }
                } else {
                    T small_chunk = tc / nproc;
                    T extras = tc % nproc;
                    init = id * small_chunk + (id < extras ? id : extras);
                    limit = init + small_chunk - (id < extras ? 0 : 1);
                    pr->u.p.parm1 = (id == nproc - 1);
                }
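                // Worked example (illustrative): tc = 10, nproc = 4 gives
                // small_chunk = 2, extras = 2; the per-thread iteration
                // ranges are [0,2], [3,5], [6,7], [8,9], so again the extra
                // iterations go to the lowest-numbered threads.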
            } else {
                if ( tc > 0 ) {
                    init = 0;
                    limit = tc - 1;
                    pr->u.p.parm1 = TRUE;
                } else {
                    // zero trip count
                    pr->u.p.count = 1;  /* means no more chunks to execute */
                    pr->u.p.parm1 = FALSE;
                    break;
                }
            }
#if USE_ITT_BUILD
            // Calculate chunk for metadata report
            if(  __itt_metadata_add_ptr  && __kmp_forkjoin_frames_mode == 3 ) {
                cur_chunk = limit - init + 1;
            }
#endif
            if ( st == 1 ) {
                pr->u.p.lb = lb + init;
                pr->u.p.ub = lb + limit;
            } else {
                T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
                pr->u.p.lb = lb + init * st;
                // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
                if ( st > 0 ) {
                    pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
                } else {
                    pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
                }
            }
            if ( pr->ordered ) {
                pr->u.p.ordered_lower = init;
                pr->u.p.ordered_upper = limit;
            }
            break;
        } // case
    case kmp_sch_guided_iterative_chunked :
        {
            T nproc = team->t.t_nproc;
            KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));

            if ( nproc > 1 ) {
                if ( (2L * chunk + 1 ) * nproc >= tc ) {
                    /* chunk size too large, switch to dynamic */
                    schedule = kmp_sch_dynamic_chunked;
                } else {
                    // when remaining iters become less than parm2 - switch to dynamic
                    pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
                    *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
                }
            } else {
                KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
                schedule = kmp_sch_static_greedy;
                /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
                KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
                pr->u.p.parm1 = tc;
            } // if
        } // case
        break;
    case kmp_sch_guided_analytical_chunked:
        {
            T nproc = team->t.t_nproc;
            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));

            if ( nproc > 1 ) {
                if ( (2L * chunk + 1 ) * nproc >= tc ) {
                    /* chunk size too large, switch to dynamic */
                    schedule = kmp_sch_dynamic_chunked;
                } else {
                    /* commonly used term: (2 nproc - 1)/(2 nproc) */
                    DBL x;

                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
                    // save original FPCW and set precision to 64-bit, as
                    // Windows* OS on IA-32 architecture defaults to 53-bit
                    unsigned int oldFpcw = _control87(0,0);
                    _control87(_PC_64,_MCW_PC); // 0,0x30000
                    #endif
                    /* value used for comparison in solver for cross-over point */
                    long double target = ((long double)chunk * 2 + 1) * nproc / tc;

                    /* crossover point--chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
                    UT   cross;

                    /* commonly used term: (2 nproc - 1)/(2 nproc) */
                    x = (long double)1.0 - (long double)0.5 / nproc;

                    #ifdef KMP_DEBUG
                    { // test natural alignment
                        struct _test_a {
                            char a;
                            union {
                                char b;
                                DBL  d;
                            };
                        } t;
                        ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
                        //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
                        KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
                    }
                    #endif // KMP_DEBUG

                    /* save the term in thread private dispatch structure */
                    *(DBL*)&pr->u.p.parm3 = x;

                    /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
                    {
                        UT          left, right, mid;
                        long double p;

                        /* estimate initial upper and lower bound */

                        /* doesn't matter what value right is as long as it is positive, but
                           it affects performance of the solver
                        */
                        right = 229;
                        p = __kmp_pow< UT >(x,right);
                        if ( p > target ) {
                            do{
                                p *= p;
                                right <<= 1;
                            } while(p>target && right < (1<<27));
                            left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
                        } else {
                            left = 0;
                        }

                        /* bisection root-finding method */
                        while ( left + 1 < right ) {
                            mid = (left + right) / 2;
                            if ( __kmp_pow< UT >(x,mid) > target ) {
                                left = mid;
                            } else {
                                right = mid;
                            }
                        } // while
                        cross = right;
                    }
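                    // The bisection above maintains the bracket invariant
                    //     pow(x, left) > target >= pow(x, right)
                    // (ignoring the 1<<27 safety cap) and halves the bracket
                    // each pass, so it finishes in at most ~27 iterations;
                    // the assert below checks exactly this invariant at cross.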
                    /* assert sanity of computed crossover point */
                    KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);

                    /* save the crossover point in thread private dispatch structure */
                    pr->u.p.parm2 = cross;

                    // C75803
                    #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
                        #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
                    #else
                        #define GUIDED_ANALYTICAL_WORKAROUND (x)
                    #endif
                    /* dynamic-style scheduling offset */
                    pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                        // restore FPCW
                        _control87(oldFpcw,_MCW_PC);
                    #endif
                } // if
            } else {
                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
                               gtid ) );
                schedule = kmp_sch_static_greedy;
                /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
                pr->u.p.parm1 = tc;
            } // if
        } // case
        break;
    case kmp_sch_static_greedy:
        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
            pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
                ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
                tc;
        break;
    case kmp_sch_static_chunked :
    case kmp_sch_dynamic_chunked :
        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
        break;
    case kmp_sch_trapezoidal :
        {
            /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

            T parm1, parm2, parm3, parm4;
            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );

            parm1 = chunk;

            /* F : size of the first cycle */
            parm2 = ( tc / (2 * team->t.t_nproc) );

            if ( parm2 < 1 ) {
                parm2 = 1;
            }

            /* L : size of the last cycle.  Make sure the last cycle
             *     is not larger than the first cycle.
             */
            if ( parm1 < 1 ) {
                parm1 = 1;
            } else if ( parm1 > parm2 ) {
                parm1 = parm2;
            }

            /* N : number of cycles */
            parm3 = ( parm2 + parm1 );
            parm3 = ( 2 * tc + parm3 - 1) / parm3;

            if ( parm3 < 2 ) {
                parm3 = 2;
            }

            /* sigma : decreasing incr of the trapezoid */
            parm4 = ( parm3 - 1 );
            parm4 = ( parm2 - parm1 ) / parm4;

            // pointless check, because parm4 >= 0 always
            //if ( parm4 < 0 ) {
            //    parm4 = 0;
            //}

            pr->u.p.parm1 = parm1;
            pr->u.p.parm2 = parm2;
            pr->u.p.parm3 = parm3;
            pr->u.p.parm4 = parm4;
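
            // Worked example (illustrative): tc = 128, nproc = 4, chunk = 2
            // gives parm2 = 128/8 = 16 (first chunk), parm1 = 2 (minimum
            // chunk), parm3 = (256 + 17)/18 = 15 cycles, parm4 = 1, i.e.
            // chunk sizes decrease 16, 15, 14, ... toward the minimum.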
        } // case
        break;

    default:
        {
            __kmp_msg(
                kmp_ms_fatal,                        // Severity
                KMP_MSG( UnknownSchedTypeDetected ), // Primary message
                KMP_HNT( GetNewerLibrary ),          // Hint
                __kmp_msg_null                       // Variadic argument list terminator
            );
        }
        break;
    } // switch
    pr->schedule = schedule;
    if ( active ) {
        /* This buffer is free to use only once sh->buffer_index reaches my_buffer_index */

        KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
                        gtid, my_buffer_index, sh->buffer_index) );
        __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
                                        USE_ITT_BUILD_ARG( NULL )
                                        );
            // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and my_buffer_index are
            // *always* 32-bit integers.
        KMP_MB();  /* is this necessary? */
        KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
                        gtid, my_buffer_index, sh->buffer_index) );

        th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
        th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
#if USE_ITT_BUILD
        if ( pr->ordered ) {
            __kmp_itt_ordered_init( gtid );
        }; // if
#endif /* USE_ITT_BUILD */
    }; // if

#if USE_ITT_BUILD
    // Report loop metadata
    if( __itt_metadata_add_ptr  && __kmp_forkjoin_frames_mode == 3 ) {
        kmp_uint32 tid  = __kmp_tid_from_gtid( gtid );
        if (KMP_MASTER_TID(tid)) {
            kmp_uint64 schedtype = 0;

            switch ( schedule ) {
            case kmp_sch_static_chunked:
            case kmp_sch_static_balanced: // Chunk is calculated in the switch above
                break;
            case kmp_sch_static_greedy:
                cur_chunk = pr->u.p.parm1;
                break;
            case kmp_sch_dynamic_chunked:
                schedtype = 1;
                break;
            case kmp_sch_guided_iterative_chunked:
            case kmp_sch_guided_analytical_chunked:
                schedtype = 2;
                break;
            default:
//            Should we put this case under "static"?
//            case kmp_sch_static_steal:
                schedtype = 3;
                break;
            }
            __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
        }
    }
#endif /* USE_ITT_BUILD */

    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
            " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
            " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
            traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
            traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
            traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
            traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
        KD_TRACE(10, ( buff,
            gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
            pr->u.p.st, pr->u.p.tc, pr->u.p.count,
            pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
            pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
        __kmp_str_free( &buff );
    }
    #endif
    #if ( KMP_STATIC_STEAL_ENABLED )
    if ( ___kmp_size_type < 8 ) {
      // It cannot be guaranteed that after execution of a loop with some other schedule kind
      // all the parm3 variables will contain the same value.
      // Even if all parm3 values were the same, a bad case could still arise, e.g. reuse of
      // the values 0 and 1 instead of a program-lifetime increment.
      // So a dedicated variable is required; 'static_steal_counter' is used.
      if( schedule == kmp_sch_static_steal ) {
        // Other threads will inspect this variable when searching for a victim.
        // This is a flag showing that other threads may steal from this thread since then.
        volatile T * p = &pr->u.p.static_steal_counter;
        *p = *p + 1;
      }
    }
    #endif // ( KMP_STATIC_STEAL_ENABLED )
}

/*
 * For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed.
 */
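
/* A sketch of the expected call pattern (illustrative only, not actual
   compiler output): for an ordered loop the generated code behaves roughly as
       while ( __kmp_dispatch_next( loc, gtid, &last, &lb, &ub, &st ) ) {
           for ( i = lb; i <= ub; i += st ) {
               ... body with ordered region ...
               __kmp_dispatch_finish< UT >( gtid, loc ); // release next iteration
           }
       }
 */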
template< typename UT >
static void
__kmp_dispatch_finish( int gtid, ident_t *loc )
{
    typedef typename traits_t< UT >::signed_t ST;
    kmp_info_t *th = __kmp_threads[ gtid ];

    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
    if ( ! th -> th.th_team -> t.t_serialized ) {

        dispatch_private_info_template< UT > * pr =
            reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th->th.th_dispatch->th_dispatch_pr_current );
        dispatch_shared_info_template< UT > volatile * sh =
            reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
            ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( pr );
        KMP_DEBUG_ASSERT( sh );
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        if ( pr->ordered_bumped ) {
            KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
                            gtid ) );
            pr->ordered_bumped = 0;
        } else {
            UT lower = pr->u.p.ordered_lower;

            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
                __kmp_str_free( &buff );
            }
            #endif

            __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                   USE_ITT_BUILD_ARG(NULL)
                                   );
            KMP_MB();  /* is this necessary? */
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
                __kmp_str_free( &buff );
            }
            #endif

            test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
        } // if
    } // if
    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
}

#ifdef KMP_GOMP_COMPAT

template< typename UT >
static void
__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
{
    typedef typename traits_t< UT >::signed_t ST;
    kmp_info_t *th = __kmp_threads[ gtid ];

    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
    if ( ! th -> th.th_team -> t.t_serialized ) {
//        int cid;
        dispatch_private_info_template< UT > * pr =
            reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th->th.th_dispatch->th_dispatch_pr_current );
        dispatch_shared_info_template< UT > volatile * sh =
            reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
            ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( pr );
        KMP_DEBUG_ASSERT( sh );
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

//        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
            UT lower = pr->u.p.ordered_lower;
            UT upper = pr->u.p.ordered_upper;
            UT inc = upper - lower + 1;

            if ( pr->ordered_bumped == inc ) {
                KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
                  gtid ) );
                pr->ordered_bumped = 0;
            } else {
                inc -= pr->ordered_bumped;

                #ifdef KMP_DEBUG
                {
                    const char * buff;
                    // create format specifiers before the debug output
                    buff = __kmp_str_format(
                        "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
                        "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
                        traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
                    KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
                    __kmp_str_free( &buff );
                }
                #endif

                __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                       USE_ITT_BUILD_ARG(NULL)
                                       );

                KMP_MB();  /* is this necessary? */
                KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
                  gtid ) );
                pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
                #ifdef KMP_DEBUG
                {
                    const char * buff;
                    // create format specifiers before the debug output
                    buff = __kmp_str_format(
                        "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
                        "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
                        traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
                    KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
                    __kmp_str_free( &buff );
                }
                #endif

                test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
            }
//        }
    }
    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
}

#endif /* KMP_GOMP_COMPAT */

template< typename T >
static int
__kmp_dispatch_next(
    ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
) {

    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    typedef typename traits_t< T >::floating_t  DBL;
    static const int ___kmp_size_type = sizeof( UT );

    int                                   status;
    dispatch_private_info_template< T > * pr;
    kmp_info_t                          * th   = __kmp_threads[ gtid ];
    kmp_team_t                          * team = th -> th.th_team;

    KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
        __kmp_str_free( &buff );
    }
    #endif

    if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1373         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1374             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1375         KMP_DEBUG_ASSERT( pr );
1376 
1377         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1378             *p_lb = 0;
1379             *p_ub = 0;
1380 //            if ( p_last != NULL )
1381 //                *p_last = 0;
1382             if ( p_st != NULL )
1383                 *p_st = 0;
1384             if ( __kmp_env_consistency_check ) {
1385                 if ( pr->pushed_ws != ct_none ) {
1386                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1387                 }
1388             }
1389         } else if ( pr->nomerge ) {
1390             kmp_int32 last;
1391             T         start;
1392             UT        limit, trip, init;
1393             ST        incr;
1394             T         chunk = pr->u.p.parm1;
1395 
1396             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1397 
1398             init = chunk * pr->u.p.count++;
1399             trip = pr->u.p.tc - 1;
1400 
1401             if ( (status = (init <= trip)) == 0 ) {
1402                 *p_lb = 0;
1403                 *p_ub = 0;
1404 //                if ( p_last != NULL )
1405 //                    *p_last = 0;
1406                 if ( p_st != NULL )
1407                     *p_st = 0;
1408                 if ( __kmp_env_consistency_check ) {
1409                     if ( pr->pushed_ws != ct_none ) {
1410                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1411                     }
1412                 }
1413             } else {
1414                 start = pr->u.p.lb;
1415                 limit = chunk + init - 1;
1416                 incr  = pr->u.p.st;
1417 
1418                 if ( (last = (limit >= trip)) != 0 ) {
1419                     limit = trip;
1420                     #if KMP_OS_WINDOWS
1421                     pr->u.p.last_upper = pr->u.p.ub;
1422                     #endif /* KMP_OS_WINDOWS */
1423                 }
1424                 if ( p_last != NULL )
1425                     *p_last = last;
1426                 if ( p_st != NULL )
1427                     *p_st = incr;
1428                 if ( incr == 1 ) {
1429                     *p_lb = start + init;
1430                     *p_ub = start + limit;
1431                 } else {
1432                     *p_lb = start + init * incr;
1433                     *p_ub = start + limit * incr;
1434                 }
1435 
1436                 if ( pr->ordered ) {
1437                     pr->u.p.ordered_lower = init;
1438                     pr->u.p.ordered_upper = limit;
1439                     #ifdef KMP_DEBUG
1440                     {
1441                         const char * buff;
1442                         // create format specifiers before the debug output
1443                         buff = __kmp_str_format(
1444                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1445                             traits_t< UT >::spec, traits_t< UT >::spec );
1446                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1447                         __kmp_str_free( &buff );
1448                     }
1449                     #endif
1450                 } // if
1451             } // if
1452         } else {
1453             pr->u.p.tc = 0;
1454             *p_lb = pr->u.p.lb;
1455             *p_ub = pr->u.p.ub;
1456             #if KMP_OS_WINDOWS
1457             pr->u.p.last_upper = *p_ub;
1458             #endif /* KMP_OS_WINDOWS */
1459             if ( p_last != NULL )
1460                 *p_last = TRUE;
1461             if ( p_st != NULL )
1462                 *p_st = pr->u.p.st;
1463         } // if
1464         #ifdef KMP_DEBUG
1465         {
1466             const char * buff;
1467             // create format specifiers before the debug output
1468             buff = __kmp_str_format(
1469                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1470                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1471                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1472             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1473             __kmp_str_free( &buff );
1474         }
1475         #endif
1476 #if INCLUDE_SSC_MARKS
1477         SSC_MARK_DISPATCH_NEXT();
1478 #endif
1479         return status;
1480     } else {
1481         kmp_int32 last = 0;
1482         dispatch_shared_info_template< UT > *sh;
1483         T         start;
1484         ST        incr;
1485         UT        limit, trip, init;
1486 
1487         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1488                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1489 
1490         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1491             ( th->th.th_dispatch->th_dispatch_pr_current );
1492         KMP_DEBUG_ASSERT( pr );
1493         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1494             ( th->th.th_dispatch->th_dispatch_sh_current );
1495         KMP_DEBUG_ASSERT( sh );
1496 
1497         if ( pr->u.p.tc == 0 ) {
1498             // zero trip count
1499             status = 0;
1500         } else {
1501             switch (pr->schedule) {
1502             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1503             case kmp_sch_static_steal:
1504                 {
1505                     T chunk = pr->u.p.parm1;
1506 
1507                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1508 
1509                     trip = pr->u.p.tc - 1;
1510 
1511                     if ( ___kmp_size_type > 4 ) {
                        // Other threads do not look into this thread's data,
                        // so no volatile cast is necessary.
1514                         init   = ( pr->u.p.count )++;
1515                         status = ( init < (UT)pr->u.p.ub );
1516                     } else {
1517                         typedef union {
1518                             struct {
1519                                 UT count;
1520                                 T  ub;
1521                             } p;
1522                             kmp_int64 b;
1523                         } union_i4;
                        // All operations on 'count' and 'ub' must update both fields atomically together.
                        // Stealing is implemented only for 4-byte indexes.
1526                         {
1527                             union_i4 vold, vnew;
1528                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1529                             vnew = vold;
1530                             vnew.p.count++;
1531                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1532                                         ( volatile kmp_int64* )&pr->u.p.count,
1533                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1534                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1535                                 KMP_CPU_PAUSE();
1536                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1537                                 vnew = vold;
1538                                 vnew.p.count++;
1539                             }
1540                             vnew = vold;
1541                             init   = vnew.p.count;
1542                             status = ( init < (UT)vnew.p.ub ) ;
1543                         }
1544 
1545                         if( !status ) {
1546                             kmp_info_t   **other_threads = team->t.t_threads;
1547                             int          while_limit = 10;
1548                             int          while_index = 0;
1549 
1550                             // TODO: algorithm of searching for a victim
1551                             // should be cleaned up and measured
1552                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1553                                 union_i4  vold, vnew;
                                kmp_int32 remaining; // kmp_int32 because stealing is implemented for 4-byte indexes only
1555                                 T         victimIdx    = pr->u.p.parm4;
1556                                 T         oldVictimIdx = victimIdx;
1557                                 dispatch_private_info_template< T > * victim;
1558 
1559                                 do {
1560                                     if( !victimIdx ) {
1561                                         victimIdx = team->t.t_nproc - 1;
1562                                     } else {
1563                                         --victimIdx;
1564                                     }
1565                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1566                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1567                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1568                                 // TODO: think about a proper place of this test
1569                                 if ( ( !victim ) ||
1570                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1571                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1572                                     // TODO: delay would be nice
1573                                     continue;
1574                                     // the victim is not ready yet to participate in stealing
1575                                     // because the victim is still in kmp_init_dispatch
1576                                 }
1577                                 if ( oldVictimIdx == victimIdx ) {
1578                                     break;
1579                                 }
1580                                 pr->u.p.parm4 = victimIdx;
1581 
1582                                 while( 1 ) {
1583                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1584                                     vnew = vold;
1585 
1586                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1587                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1588                                         break;
1589                                     }
1590                                     vnew.p.ub -= (remaining >> 2);
1591                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1592                                     #pragma warning( push )
1593                                     // disable warning on pointless comparison of unsigned with 0
1594                                     #pragma warning( disable: 186 )
1595                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1596                                     #pragma warning( pop )
1597                                     // TODO: Should this be acquire or release?
1598                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1599                                             ( volatile kmp_int64 * )&victim->u.p.count,
1600                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1601                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1602                                         status = 1;
1603                                         while_index = 0;
1604                                         // now update own count and ub
1605                                         #if KMP_ARCH_X86
                                            // Stealing is normally executed on non-KMP_ARCH_X86 only.
                                            // An atomic 64-bit write on ia32 is unavailable,
                                            // so we do this in steps.  This code is not tested.
1610                                             init = vold.p.count;
1611                                             pr->u.p.ub = 0;
1612                                             pr->u.p.count = init + 1;
1613                                             pr->u.p.ub = vnew.p.count;
1614                                         #else
1615                                             init = vnew.p.ub;
1616                                             vold.p.count = init + 1;
1617                                             // TODO: is it safe and enough?
1618                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1619                                         #endif // KMP_ARCH_X86
1620                                         break;
1621                                     } // if
1622                                 KMP_CPU_PAUSE();
1623                                 } // while (1)
1624                             } // while
1625                         } // if
1626                     } // if
1627                     if ( !status ) {
1628                         *p_lb = 0;
1629                         *p_ub = 0;
1630                         if ( p_st != NULL ) *p_st = 0;
1631                     } else {
1632                         start = pr->u.p.parm2;
1633                         init *= chunk;
1634                         limit = chunk + init - 1;
1635                         incr  = pr->u.p.st;
1636 
1637                         KMP_DEBUG_ASSERT(init <= trip);
1638                         if ( (last = (limit >= trip)) != 0 )
1639                             limit = trip;
1640                         if ( p_st != NULL ) *p_st = incr;
1641 
1642                         if ( incr == 1 ) {
1643                             *p_lb = start + init;
1644                             *p_ub = start + limit;
1645                         } else {
1646                             *p_lb = start + init * incr;
1647                             *p_ub = start + limit * incr;
1648                         }
1649 
1650                         if ( pr->ordered ) {
1651                             pr->u.p.ordered_lower = init;
1652                             pr->u.p.ordered_upper = limit;
1653                             #ifdef KMP_DEBUG
1654                             {
1655                                 const char * buff;
1656                                 // create format specifiers before the debug output
1657                                 buff = __kmp_str_format(
1658                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1659                                     traits_t< UT >::spec, traits_t< UT >::spec );
1660                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1661                                 __kmp_str_free( &buff );
1662                             }
1663                             #endif
1664                         } // if
1665                     } // if
1666                     break;
1667                 } // case
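            /*
             * Worked example of the 4-byte stealing path (illustrative only):
             * suppose a victim still owns chunk indexes [count, ub) == [4, 20),
             * so remaining == 16.  The thief shrinks the victim's range by a
             * quarter of the remainder (remaining >> 2 == 4), CAS-ing the
             * victim's ub from 20 down to 16, and takes chunk indexes 16..19
             * for itself: it executes chunk 16 now (init == 16) and stores
             * count == 17, ub == 20 in its own buffer for subsequent calls.
             */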
1668             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1669             case kmp_sch_static_balanced:
1670                 {
1671                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1672                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1673                         pr->u.p.count = 1;
1674                         *p_lb = pr->u.p.lb;
1675                         *p_ub = pr->u.p.ub;
1676                         last = pr->u.p.parm1;
1677                         if ( p_st != NULL )
1678                             *p_st = pr->u.p.st;
1679                     } else {  /* no iterations to do */
1680                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1681                     }
1682                     if ( pr->ordered ) {
1683                         #ifdef KMP_DEBUG
1684                         {
1685                             const char * buff;
1686                             // create format specifiers before the debug output
1687                             buff = __kmp_str_format(
1688                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1689                                 traits_t< UT >::spec, traits_t< UT >::spec );
1690                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1691                             __kmp_str_free( &buff );
1692                         }
1693                         #endif
1694                     } // if
1695                 } // case
1696                 break;
1697             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1698             case kmp_sch_static_chunked:
1699                 {
1700                     T parm1;
1701 
                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
1703                                    gtid ) );
1704                     parm1 = pr->u.p.parm1;
1705 
1706                     trip  = pr->u.p.tc - 1;
1707                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1708 
1709                     if ( (status = (init <= trip)) != 0 ) {
1710                         start = pr->u.p.lb;
1711                         incr  = pr->u.p.st;
1712                         limit = parm1 + init - 1;
1713 
1714                         if ( (last = (limit >= trip)) != 0 )
1715                             limit = trip;
1716 
1717                         if ( p_st != NULL ) *p_st = incr;
1718 
1719                         pr->u.p.count += team->t.t_nproc;
1720 
1721                         if ( incr == 1 ) {
1722                             *p_lb = start + init;
1723                             *p_ub = start + limit;
1724                         }
1725                         else {
1726                             *p_lb = start + init * incr;
1727                             *p_ub = start + limit * incr;
1728                         }
1729 
1730                         if ( pr->ordered ) {
1731                             pr->u.p.ordered_lower = init;
1732                             pr->u.p.ordered_upper = limit;
1733                             #ifdef KMP_DEBUG
1734                             {
1735                                 const char * buff;
1736                                 // create format specifiers before the debug output
1737                                 buff = __kmp_str_format(
1738                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1739                                     traits_t< UT >::spec, traits_t< UT >::spec );
1740                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1741                                 __kmp_str_free( &buff );
1742                             }
1743                             #endif
1744                         } // if
1745                     } // if
1746                 } // case
1747                 break;
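            /*
             * Worked example (illustrative only): with nproc == 4, parm1 == 10
             * and tid == 2, the first call computes init == 10 * (0 + 2) == 20,
             * i.e. iterations 20..29; count then advances by nproc, so the next
             * call yields iterations 60..69, and so on round-robin.
             */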
1748 
1749             case kmp_sch_dynamic_chunked:
1750                 {
1751                     T chunk = pr->u.p.parm1;
1752 
1753                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1754                                    gtid ) );
1755 
1756                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1757                     trip = pr->u.p.tc - 1;
1758 
1759                     if ( (status = (init <= trip)) == 0 ) {
1760                         *p_lb = 0;
1761                         *p_ub = 0;
1762                         if ( p_st != NULL ) *p_st = 0;
1763                     } else {
1764                         start = pr->u.p.lb;
1765                         limit = chunk + init - 1;
1766                         incr  = pr->u.p.st;
1767 
1768                         if ( (last = (limit >= trip)) != 0 )
1769                             limit = trip;
1770 
1771                         if ( p_st != NULL ) *p_st = incr;
1772 
1773                         if ( incr == 1 ) {
1774                             *p_lb = start + init;
1775                             *p_ub = start + limit;
1776                         } else {
1777                             *p_lb = start + init * incr;
1778                             *p_ub = start + limit * incr;
1779                         }
1780 
1781                         if ( pr->ordered ) {
1782                             pr->u.p.ordered_lower = init;
1783                             pr->u.p.ordered_upper = limit;
1784                             #ifdef KMP_DEBUG
1785                             {
1786                                 const char * buff;
1787                                 // create format specifiers before the debug output
1788                                 buff = __kmp_str_format(
1789                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1790                                     traits_t< UT >::spec, traits_t< UT >::spec );
1791                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1792                                 __kmp_str_free( &buff );
1793                             }
1794                             #endif
1795                         } // if
1796                     } // if
1797                 } // case
1798                 break;
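            /*
             * Worked example (illustrative only): with chunk == 8, each
             * successful call atomically bumps the shared sh->u.s.iteration;
             * the thread that observes the old value 3 gets init == 24 and
             * iterations 24..31, and whichever thread increments the counter
             * next gets 32..39, independent of thread id.
             */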
1799 
1800             case kmp_sch_guided_iterative_chunked:
1801                 {
1802                     T  chunkspec = pr->u.p.parm1;
1803                     KD_TRACE(100,
1804                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1805                     trip  = pr->u.p.tc;
1806                     // Start atomic part of calculations
1807                     while(1) {
1808                         ST  remaining;             // signed, because can be < 0
1809                         init = sh->u.s.iteration;  // shared value
1810                         remaining = trip - init;
1811                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1812                             // nothing to do, don't try atomic op
1813                             status = 0;
1814                             break;
1815                         }
1816                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                            // use dynamic-style schedule
                            // atomically increment iterations, get old value
1819                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1820                             remaining = trip - init;
1821                             if (remaining <= 0) {
1822                                 status = 0;    // all iterations got by other threads
1823                             } else {
1824                                 // got some iterations to work on
1825                                 status = 1;
1826                                 if ( (T)remaining > chunkspec ) {
1827                                     limit = init + chunkspec - 1;
1828                                 } else {
1829                                     last = 1;   // the last chunk
1830                                     limit = init + remaining - 1;
1831                                 } // if
1832                             } // if
1833                             break;
1834                         } // if
1835                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1836                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1837                             // CAS was successful, chunk obtained
1838                             status = 1;
1839                             --limit;
1840                             break;
1841                         } // if
1842                     } // while
1843                     if ( status != 0 ) {
1844                         start = pr->u.p.lb;
1845                         incr = pr->u.p.st;
1846                         if ( p_st != NULL )
1847                             *p_st = incr;
1848                         *p_lb = start + init * incr;
1849                         *p_ub = start + limit * incr;
1850                         if ( pr->ordered ) {
1851                             pr->u.p.ordered_lower = init;
1852                             pr->u.p.ordered_upper = limit;
1853                             #ifdef KMP_DEBUG
1854                             {
1855                                 const char * buff;
1856                                 // create format specifiers before the debug output
1857                                 buff = __kmp_str_format(
1858                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1859                                     traits_t< UT >::spec, traits_t< UT >::spec );
1860                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1861                                 __kmp_str_free( &buff );
1862                             }
1863                             #endif
1864                         } // if
1865                     } else {
1866                         *p_lb = 0;
1867                         *p_ub = 0;
1868                         if ( p_st != NULL )
1869                             *p_st = 0;
1870                     } // if
1871                 } // case
1872                 break;
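            /*
             * Worked example (illustrative only): with nproc == 4 and the
             * default K == 2, parm3 holds roughly 1/(K*nproc) == 1/8.  If
             * trip == 1000 and nothing has been handed out yet (init == 0),
             * the CAS tries to claim remaining/8 == 125 iterations, i.e. the
             * range 0..124 after the final --limit.  Once the remainder drops
             * below parm2, the code falls back to plain chunks of chunkspec.
             */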
1873 
1874             case kmp_sch_guided_analytical_chunked:
1875                 {
1876                     T   chunkspec = pr->u.p.parm1;
1877                     UT chunkIdx;
1878     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* for storing original FPCW value for Windows* OS on
                       IA-32 architecture, 8-byte version */
1881                     unsigned int oldFpcw;
1882                     unsigned int fpcwSet = 0;
1883     #endif
1884                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1885                                    gtid ) );
1886 
1887                     trip  = pr->u.p.tc;
1888 
1889                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1890                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1891 
1892                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1893                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1894                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1895                             --trip;
1896                             /* use dynamic-style scheduling */
1897                             init = chunkIdx * chunkspec + pr->u.p.count;
1898                             /* need to verify init > 0 in case of overflow in the above calculation */
1899                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1900                                 limit = init + chunkspec -1;
1901 
1902                                 if ( (last = (limit >= trip)) != 0 )
1903                                     limit = trip;
1904                             }
1905                             break;
1906                         } else {
1907                             /* use exponential-style scheduling */
1908                             /* The following check is to workaround the lack of long double precision on Windows* OS.
1909                                This check works around the possible effect that init != 0 for chunkIdx == 0.
1910                              */
1911     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                            /* If we haven't already done so, save original
                               FPCW and set precision to 64-bit, as Windows* OS
                               on IA-32 architecture defaults to 53-bit */
1915                             if ( !fpcwSet ) {
1916                                 oldFpcw = _control87(0,0);
1917                                 _control87(_PC_64,_MCW_PC);
1918                                 fpcwSet = 0x30000;
1919                             }
1920     #endif
1921                             if ( chunkIdx ) {
1922                                 init = __kmp_dispatch_guided_remaining< T >(
1923                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1924                                 KMP_DEBUG_ASSERT(init);
1925                                 init = trip - init;
1926                             } else
1927                                 init = 0;
1928                             limit = trip - __kmp_dispatch_guided_remaining< T >(
1929                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1930                             KMP_ASSERT(init <= limit);
1931                             if ( init < limit ) {
1932                                 KMP_DEBUG_ASSERT(limit <= trip);
1933                                 --limit;
1934                                 status = 1;
1935                                 break;
1936                             } // if
1937                         } // if
1938                     } // while (1)
1939     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1940                     /* restore FPCW if necessary
1941                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1942                     */
1943                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1944                         _control87(oldFpcw,_MCW_PC);
1945     #endif
1946                     if ( status != 0 ) {
1947                         start = pr->u.p.lb;
1948                         incr = pr->u.p.st;
1949                         if ( p_st != NULL )
1950                             *p_st = incr;
1951                         *p_lb = start + init * incr;
1952                         *p_ub = start + limit * incr;
1953                         if ( pr->ordered ) {
1954                             pr->u.p.ordered_lower = init;
1955                             pr->u.p.ordered_upper = limit;
1956                             #ifdef KMP_DEBUG
1957                             {
1958                                 const char * buff;
1959                                 // create format specifiers before the debug output
1960                                 buff = __kmp_str_format(
1961                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1962                                     traits_t< UT >::spec, traits_t< UT >::spec );
1963                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1964                                 __kmp_str_free( &buff );
1965                             }
1966                             #endif
1967                         }
1968                     } else {
1969                         *p_lb = 0;
1970                         *p_ub = 0;
1971                         if ( p_st != NULL )
1972                             *p_st = 0;
1973                     }
1974                 } // case
1975                 break;
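            /*
             * Note on the exponential phase above: chunk k spans the iterations
             * [trip - remaining(k), trip - remaining(k+1)), where remaining(k)
             * is computed by __kmp_dispatch_guided_remaining from the
             * coefficient cached in parm3.  Once chunkIdx reaches parm2 the
             * code switches to fixed, dynamic-style chunks of chunkspec.
             */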
1976 
1977             case kmp_sch_trapezoidal:
1978                 {
1979                     UT   index;
1980                     T    parm2 = pr->u.p.parm2;
1981                     T    parm3 = pr->u.p.parm3;
1982                     T    parm4 = pr->u.p.parm4;
1983                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1984                                    gtid ) );
1985 
1986                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1987 
1988                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1989                     trip = pr->u.p.tc - 1;
1990 
1991                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1992                         *p_lb = 0;
1993                         *p_ub = 0;
1994                         if ( p_st != NULL ) *p_st = 0;
1995                     } else {
1996                         start = pr->u.p.lb;
1997                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1998                         incr  = pr->u.p.st;
1999 
2000                         if ( (last = (limit >= trip)) != 0 )
2001                             limit = trip;
2002 
2003                         if ( p_st != NULL ) *p_st = incr;
2004 
2005                         if ( incr == 1 ) {
2006                             *p_lb = start + init;
2007                             *p_ub = start + limit;
2008                         } else {
2009                             *p_lb = start + init * incr;
2010                             *p_ub = start + limit * incr;
2011                         }
2012 
2013                         if ( pr->ordered ) {
2014                             pr->u.p.ordered_lower = init;
2015                             pr->u.p.ordered_upper = limit;
2016                             #ifdef KMP_DEBUG
2017                             {
2018                                 const char * buff;
2019                                 // create format specifiers before the debug output
2020                                 buff = __kmp_str_format(
2021                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2022                                     traits_t< UT >::spec, traits_t< UT >::spec );
2023                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2024                                 __kmp_str_free( &buff );
2025                             }
2026                             #endif
2027                         } // if
2028                     } // if
2029                 } // case
2030                 break;
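            /*
             * Worked example (illustrative only): with parm2 == 10 (size of the
             * first chunk) and parm4 == 2 (decrement per chunk), index 0 yields
             * init == 0 and limit == (1*(20 - 0))/2 - 1 == 9 (10 iterations);
             * index 1 yields init == (1*(20 - 0))/2 == 10 and
             * limit == (2*(20 - 2))/2 - 1 == 17 (8 iterations).  Each chunk
             * shrinks by parm4 until parm3 chunks have been handed out.
             */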
2031             default:
2032                 {
2033                     status = 0; // to avoid complaints on uninitialized variable use
2034                     __kmp_msg(
2035                         kmp_ms_fatal,                        // Severity
2036                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2037                         KMP_HNT( GetNewerLibrary ),          // Hint
2038                         __kmp_msg_null                       // Variadic argument list terminator
2039                     );
2040                 }
2041                 break;
2042             } // switch
        } // if tc == 0
2044 
2045         if ( status == 0 ) {
2046             UT   num_done;
2047 
2048             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2049             #ifdef KMP_DEBUG
2050             {
2051                 const char * buff;
2052                 // create format specifiers before the debug output
2053                 buff = __kmp_str_format(
2054                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2055                     traits_t< UT >::spec );
2056                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2057                 __kmp_str_free( &buff );
2058             }
2059             #endif
2060 
2061             if ( (ST)num_done == team->t.t_nproc-1 ) {
2062                 /* NOTE: release this buffer to be reused */
2063 
2064                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2065 
2066                 sh->u.s.num_done = 0;
2067                 sh->u.s.iteration = 0;
2068 
2069                 /* TODO replace with general release procedure? */
2070                 if ( pr->ordered ) {
2071                     sh->u.s.ordered_iteration = 0;
2072                 }
2073 
2074                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2075 
2076                 sh -> buffer_index += KMP_MAX_DISP_BUF;
2077                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2078                                 gtid, sh->buffer_index) );
2079 
2080                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2081 
2082             } // if
2083             if ( __kmp_env_consistency_check ) {
2084                 if ( pr->pushed_ws != ct_none ) {
2085                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2086                 }
2087             }
2088 
2089             th -> th.th_dispatch -> th_deo_fcn = NULL;
2090             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2091             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2092             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2093         } // if (status == 0)
2094 #if KMP_OS_WINDOWS
2095         else if ( last ) {
2096             pr->u.p.last_upper = pr->u.p.ub;
2097         }
2098 #endif /* KMP_OS_WINDOWS */
2099         if ( p_last != NULL && status != 0 )
2100             *p_last = last;
2101     } // if
2102 
2103     #ifdef KMP_DEBUG
2104     {
2105         const char * buff;
2106         // create format specifiers before the debug output
2107         buff = __kmp_str_format(
2108             "__kmp_dispatch_next: T#%%d normal case: " \
2109             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2110             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2111         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2112         __kmp_str_free( &buff );
2113     }
2114     #endif
2115 #if INCLUDE_SSC_MARKS
2116     SSC_MARK_DISPATCH_NEXT();
2117 #endif
2118     return status;
2119 }
2120 
2121 template< typename T >
2122 static void
2123 __kmp_dist_get_bounds(
2124     ident_t                          *loc,
2125     kmp_int32                         gtid,
2126     kmp_int32                        *plastiter,
2127     T                                *plower,
2128     T                                *pupper,
2129     typename traits_t< T >::signed_t  incr
2130 ) {
2131     KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2132     typedef typename traits_t< T >::unsigned_t  UT;
2133     typedef typename traits_t< T >::signed_t    ST;
2134     register kmp_uint32  team_id;
2135     register kmp_uint32  nteams;
2136     register UT          trip_count;
2137     register kmp_team_t *team;
2138     kmp_info_t * th;
2139 
2140     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2141     KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2142     #ifdef KMP_DEBUG
2143     {
2144         const char * buff;
2145         // create format specifiers before the debug output
2146         buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2147             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2148             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2149             traits_t< T >::spec );
2150         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2151         __kmp_str_free( &buff );
2152     }
2153     #endif
2154 
2155     if( __kmp_env_consistency_check ) {
2156         if( incr == 0 ) {
2157             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2158         }
2159         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2160             // The loop is illegal.
2161             // Some zero-trip loops maintained by compiler, e.g.:
2162             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2163             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2164             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2165             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2166             // Compiler does not check the following illegal loops:
2167             //   for(i=0;i<10;i+=incr) // where incr<0
2168             //   for(i=10;i>0;i-=incr) // where incr<0
2169             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2170         }
2171     }
2172     th = __kmp_threads[gtid];
2173     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2174     team = th->th.th_team;
2175     #if OMP_40_ENABLED
2176     nteams = th->th.th_teams_size.nteams;
2177     #endif
2178     team_id = team->t.t_master_tid;
2179     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2180 
2181     // compute global trip count
2182     if( incr == 1 ) {
2183         trip_count = *pupper - *plower + 1;
2184     } else if(incr == -1) {
2185         trip_count = *plower - *pupper + 1;
2186     } else {
2187         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2188     }
2189     if( trip_count <= nteams ) {
2190         KMP_DEBUG_ASSERT(
2191             __kmp_static == kmp_sch_static_greedy || \
2192             __kmp_static == kmp_sch_static_balanced
2193         ); // Unknown static scheduling type.
        // only some teams get a single iteration, others get nothing
2195         if( team_id < trip_count ) {
2196             *pupper = *plower = *plower + team_id * incr;
2197         } else {
2198             *plower = *pupper + incr; // zero-trip loop
2199         }
2200         if( plastiter != NULL )
2201             *plastiter = ( team_id == trip_count - 1 );
2202     } else {
2203         if( __kmp_static == kmp_sch_static_balanced ) {
2204             register UT chunk = trip_count / nteams;
2205             register UT extras = trip_count % nteams;
2206             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2207             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2208             if( plastiter != NULL )
2209                 *plastiter = ( team_id == nteams - 1 );
2210         } else {
2211             register T chunk_inc_count =
2212                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2213             register T upper = *pupper;
2214             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2215                 // Unknown static scheduling type.
2216             *plower += team_id * chunk_inc_count;
2217             *pupper = *plower + chunk_inc_count - incr;
2218             // Check/correct bounds if needed
2219             if( incr > 0 ) {
2220                 if( *pupper < *plower )
2221                     *pupper = i_maxmin< T >::mx;
2222                 if( plastiter != NULL )
2223                     *plastiter = *plower <= upper && *pupper > upper - incr;
2224                 if( *pupper > upper )
2225                     *pupper = upper; // tracker C73258
2226             } else {
2227                 if( *pupper > *plower )
2228                     *pupper = i_maxmin< T >::mn;
2229                 if( plastiter != NULL )
2230                     *plastiter = *plower >= upper && *pupper < upper - incr;
2231                 if( *pupper < upper )
2232                     *pupper = upper; // tracker C73258
2233             }
2234         }
2235     }
2236 }
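
/*
 * Worked example (illustrative only): for a loop with *plower == 0,
 * *pupper == 99 and incr == 1 split across nteams == 4 under
 * kmp_sch_static_balanced, trip_count == 100, chunk == 25 and extras == 0,
 * so team_id 2 receives the bounds [50, 74] and only team_id 3 reports
 * plastiter.
 */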
2237 
2238 //-----------------------------------------------------------------------------------------
2239 // Dispatch routines
2240 //    Transfer call to template< type T >
2241 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2242 //                         T lb, T ub, ST st, ST chunk )
2243 extern "C" {
2244 
2245 /*!
2246 @ingroup WORK_SHARING
2247 @{
2248 @param loc Source location
2249 @param gtid Global thread id
2250 @param schedule Schedule type
2251 @param lb  Lower bound
2252 @param ub  Upper bound
2253 @param st  Step (or increment if you prefer)
2254 @param chunk The chunk size to block with
2255 
2256 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2257 These functions are all identical apart from the types of the arguments.
2258 */
2259 
2260 void
2261 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2262                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2263 {
2264     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2265     KMP_DEBUG_ASSERT( __kmp_init_serial );
2266     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2267 }
2268 /*!
2269 See @ref __kmpc_dispatch_init_4
2270 */
2271 void
2272 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2273                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2274 {
2275     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2276     KMP_DEBUG_ASSERT( __kmp_init_serial );
2277     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2278 }
2279 
2280 /*!
2281 See @ref __kmpc_dispatch_init_4
2282 */
2283 void
2284 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2285                         kmp_int64 lb, kmp_int64 ub,
2286                         kmp_int64 st, kmp_int64 chunk )
2287 {
2288     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2289     KMP_DEBUG_ASSERT( __kmp_init_serial );
2290     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2291 }
2292 
2293 /*!
2294 See @ref __kmpc_dispatch_init_4
2295 */
2296 void
2297 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2298                          kmp_uint64 lb, kmp_uint64 ub,
2299                          kmp_int64 st, kmp_int64 chunk )
2300 {
2301     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2302     KMP_DEBUG_ASSERT( __kmp_init_serial );
2303     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2304 }
2305 
2306 /*!
2307 See @ref __kmpc_dispatch_init_4
2308 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite 'distribute parallel for' construct. Thus, before
dispatching the regular iterations, we need to calculate the per-team
iteration space.
2312 
2313 These functions are all identical apart from the types of the arguments.
2314 */
2315 void
2316 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2317     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2318 {
2319     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2320     KMP_DEBUG_ASSERT( __kmp_init_serial );
2321     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2322     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2323 }
2324 
2325 void
2326 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2327     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2328 {
2329     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2330     KMP_DEBUG_ASSERT( __kmp_init_serial );
2331     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2332     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2333 }
2334 
2335 void
2336 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2337     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2338 {
2339     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2340     KMP_DEBUG_ASSERT( __kmp_init_serial );
2341     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2342     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2343 }
2344 
2345 void
2346 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2347     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2348 {
2349     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2350     KMP_DEBUG_ASSERT( __kmp_init_serial );
2351     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2352     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2353 }
2354 
2355 /*!
2356 @param loc Source code location
2357 @param gtid Global thread id
2358 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2359 @param p_lb   Pointer to the lower bound for the next chunk of work
2360 @param p_ub   Pointer to the upper bound for the next chunk of work
2361 @param p_st   Pointer to the stride for the next chunk of work
2362 @return one if there is work to be done, zero otherwise
2363 
2364 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2366 */
2367 int
2368 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2369                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2370 {
2371     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2372 }
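
/*
 * Illustrative sketch (not part of the runtime): a compiler typically lowers
 *
 *     #pragma omp for schedule(dynamic, 8)
 *
 * over iterations 0..N-1 into something like
 *
 *     kmp_int32 last, lb, ub, st;
 *     __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, 8 );
 *     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
 *         for ( kmp_int32 i = lb; i <= ub; i += st )
 *             body( i );   // 'body' is a hypothetical loop body
 *     }
 *
 * See __kmpc_dispatch_fini_4 below for the additional per-chunk bookkeeping
 * an 'ordered' loop requires.
 */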
2373 
2374 /*!
2375 See @ref __kmpc_dispatch_next_4
2376 */
2377 int
2378 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2379                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2380 {
2381     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2382 }
2383 
2384 /*!
2385 See @ref __kmpc_dispatch_next_4
2386 */
2387 int
2388 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2389                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2390 {
2391     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2392 }
2393 
2394 /*!
2395 See @ref __kmpc_dispatch_next_4
2396 */
2397 int
2398 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2399                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2400 {
2401     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2402 }
2403 
2404 /*!
2405 @param loc Source code location
2406 @param gtid Global thread id
2407 
2408 Mark the end of a dynamic loop.
2409 */
2410 void
2411 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2412 {
2413     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2414 }
2415 
2416 /*!
2417 See @ref __kmpc_dispatch_fini_4
2418 */
2419 void
2420 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2421 {
2422     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2423 }
2424 
2425 /*!
2426 See @ref __kmpc_dispatch_fini_4
2427 */
2428 void
2429 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2430 {
2431     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2432 }
2433 
2434 /*!
2435 See @ref __kmpc_dispatch_fini_4
2436 */
2437 void
2438 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2439 {
2440     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2441 }
2442 /*! @} */
2443 
2444 //-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources
2446 
2447 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2448     return value == checker;
2449 }
2450 
2451 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2452     return value != checker;
2453 }
2454 
2455 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2456     return value < checker;
2457 }
2458 
2459 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2460     return value >= checker;
2461 }
2462 
2463 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2464     return value <= checker;
2465 }
2466 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2467     return value == checker;
2468 }
2469 
2470 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2471     return value != checker;
2472 }
2473 
2474 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2475     return value < checker;
2476 }
2477 
2478 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2479     return value >= checker;
2480 }
2481 
2482 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2483     return value <= checker;
2484 }
2485 
2486 kmp_uint32
2487 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2488                    kmp_uint32            checker,
2489                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2490                    , void        * obj    // Higher-level synchronization object, or NULL.
2491                    )
2492 {
2493     // note: we may not belong to a team at this point
2494     register volatile kmp_uint32         * spin          = spinner;
2495     register          kmp_uint32           check         = checker;
2496     register          kmp_uint32   spins;
2497     register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2498     register          kmp_uint32           r;
2499 
2500     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2501     KMP_INIT_YIELD( spins );
2502     // main wait spin loop
2503     while(!f(r = TCR_4(*spin), check)) {
2504         KMP_FSYNC_SPIN_PREPARE( obj );
2505         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2506            It causes problems with infinite recursion because of exit lock */
2507         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2508             __kmp_abort_thread(); */
2509 
2510         /* if we have waited a bit, or are oversubscribed, yield */
2511         /* pause is in the following code */
2512         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2513         KMP_YIELD_SPIN( spins );
2514     }
2515     KMP_FSYNC_SPIN_ACQUIRED( obj );
2516     return r;
2517 }
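
/*
 * Usage sketch (illustrative only), analogous to the templated
 * __kmp_wait_yield< UT > used by the dispatcher above:
 *
 *     // spin until counter >= 8, yielding if oversubscribed
 *     __kmp_wait_yield_4( &counter, 8, __kmp_ge_4, NULL );
 *
 * where 'counter' is a hypothetical volatile kmp_uint32 and any of the
 * __kmp_{eq,neq,lt,ge,le}_4 predicates above may be substituted.
 */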
2518 
2519 kmp_uint64
2520 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2521                     kmp_uint64            checker,
2522                     kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2523                     , void        * obj    // Higher-level synchronization object, or NULL.
2524                     )
2525 {
2526     // note: we may not belong to a team at this point
2527     register volatile kmp_uint64         * spin          = spinner;
2528     register          kmp_uint64           check         = checker;
2529     register          kmp_uint32   spins;
2530     register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2531     register          kmp_uint64           r;
2532 
2533     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2534     KMP_INIT_YIELD( spins );
2535     // main wait spin loop
2536     while(!f(r = *spin, check))
2537     {
2538         KMP_FSYNC_SPIN_PREPARE( obj );
2539         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2540            It causes problems with infinite recursion because of exit lock */
2541         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2542             __kmp_abort_thread(); */
2543 
2544         // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2546         // pause is in the following code
2547         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2548         KMP_YIELD_SPIN( spins );
2549     }
2550     KMP_FSYNC_SPIN_ACQUIRED( obj );
2551     return r;
2552 }
2553 
2554 } // extern "C"
2555 
2556 #ifdef KMP_GOMP_COMPAT
2557 
2558 void
2559 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2560                            kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2561                            kmp_int32 chunk, int push_ws )
2562 {
2563     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2564                                       push_ws );
2565 }
2566 
2567 void
2568 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2569                             kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2570                             kmp_int32 chunk, int push_ws )
2571 {
2572     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2573                                        push_ws );
2574 }
2575 
2576 void
2577 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2578                            kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2579                            kmp_int64 chunk, int push_ws )
2580 {
2581     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2582                                       push_ws );
2583 }
2584 
2585 void
2586 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2587                             kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2588                             kmp_int64 chunk, int push_ws )
2589 {
2590     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2591                                        push_ws );
2592 }
2593 
2594 void
2595 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2596 {
2597     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2598 }
2599 
2600 void
2601 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2602 {
2603     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2604 }
2605 
2606 void
2607 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2608 {
2609     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2610 }
2611 
2612 void
2613 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2614 {
2615     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2616 }
2617 
2618 #endif /* KMP_GOMP_COMPAT */
2619 
2620 /* ------------------------------------------------------------------------ */
2621 /* ------------------------------------------------------------------------ */
2622 
2623