1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
19  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
20  *       it may change values between parallel regions.  __kmp_max_nth
21  *       is the largest value __kmp_nth may take, 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 #include "kmp.h"
29 #include "kmp_i18n.h"
30 #include "kmp_itt.h"
31 #include "kmp_str.h"
32 #include "kmp_error.h"
33 #include "kmp_stats.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35     #include <float.h>
36 #endif
37 
38 /* ------------------------------------------------------------------------ */
39 /* ------------------------------------------------------------------------ */
40 
41 // template for type limits
42 template< typename T >
43 struct i_maxmin {
44     static const T mx;
45     static const T mn;
46 };
47 template<>
48 struct i_maxmin< int > {
49     static const int mx = 0x7fffffff;
50     static const int mn = 0x80000000;
51 };
52 template<>
53 struct i_maxmin< unsigned int > {
54     static const unsigned int mx = 0xffffffff;
55     static const unsigned int mn = 0x00000000;
56 };
57 template<>
58 struct i_maxmin< long long > {
59     static const long long mx = 0x7fffffffffffffffLL;
60     static const long long mn = 0x8000000000000000LL;
61 };
62 template<>
63 struct i_maxmin< unsigned long long > {
64     static const unsigned long long mx = 0xffffffffffffffffLL;
65     static const unsigned long long mn = 0x0000000000000000LL;
66 };
67 //-------------------------------------------------------------------------
68 
69 #ifdef KMP_STATIC_STEAL_ENABLED
70 
71     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
72     template< typename T >
73     struct dispatch_private_infoXX_template {
74         typedef typename traits_t< T >::unsigned_t  UT;
75         typedef typename traits_t< T >::signed_t    ST;
76         UT count;                // unsigned
77         T  ub;
78         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
79         T  lb;
80         ST st;                   // signed
81         UT tc;                   // unsigned
82         T  static_steal_counter; // for static_steal only; maybe better to put after ub
83 
84         /* parm[1-4] are used in different ways by different scheduling algorithms */
85 
86         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
87         //    a) parm3 is properly aligned and
88         //    b) all parm1-4 are in the same cache line.
89         // Because of parm1-4 are used together, performance seems to be better
90         // if they are in the same line (not measured though).
91 
92         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
93             T  parm1;
94             T  parm2;
95             T  parm3;
96             T  parm4;
97         };
98 
99         UT ordered_lower; // unsigned
100         UT ordered_upper; // unsigned
101         #if KMP_OS_WINDOWS
102         T  last_upper;
103         #endif /* KMP_OS_WINDOWS */
104     };
105 
106 #else /* KMP_STATIC_STEAL_ENABLED */
107 
108     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
109     template< typename T >
110     struct dispatch_private_infoXX_template {
111         typedef typename traits_t< T >::unsigned_t  UT;
112         typedef typename traits_t< T >::signed_t    ST;
113         T  lb;
114         T  ub;
115         ST st;            // signed
116         UT tc;            // unsigned
117 
118         T  parm1;
119         T  parm2;
120         T  parm3;
121         T  parm4;
122 
123         UT count;         // unsigned
124 
125         UT ordered_lower; // unsigned
126         UT ordered_upper; // unsigned
127         #if KMP_OS_WINDOWS
128 	T  last_upper;
129         #endif /* KMP_OS_WINDOWS */
130     };
131 
132 #endif /* KMP_STATIC_STEAL_ENABLED */
133 
134 // replaces dispatch_private_info structure and dispatch_private_info_t type
135 template< typename T >
136 struct KMP_ALIGN_CACHE dispatch_private_info_template {
137     // duplicate alignment here, otherwise size of structure is not correct in our compiler
138     union KMP_ALIGN_CACHE private_info_tmpl {
139         dispatch_private_infoXX_template< T > p;
140         dispatch_private_info64_t             p64;
141     } u;
142     enum sched_type schedule;  /* scheduling algorithm */
143     kmp_uint32      ordered;   /* ordered clause specified */
144     kmp_uint32      ordered_bumped;
145     kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order
146     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
147     kmp_uint32      nomerge;   /* don't merge iters if serialized */
148     kmp_uint32      type_size;
149     enum cons_type  pushed_ws;
150 };
151 
152 
153 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
154 template< typename UT >
155 struct dispatch_shared_infoXX_template {
156     /* chunk index under dynamic, number of idle threads under static-steal;
157        iteration index otherwise */
158     volatile UT     iteration;
159     volatile UT     num_done;
160     volatile UT     ordered_iteration;
161     UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
162 };
163 
164 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
165 template< typename UT >
166 struct dispatch_shared_info_template {
167     // we need union here to keep the structure size
168     union shared_info_tmpl {
169         dispatch_shared_infoXX_template< UT >  s;
170         dispatch_shared_info64_t               s64;
171     } u;
172     volatile kmp_uint32     buffer_index;
173 };
174 
175 /* ------------------------------------------------------------------------ */
176 /* ------------------------------------------------------------------------ */
177 
178 #undef USE_TEST_LOCKS
179 
180 // test_then_add template (general template should NOT be used)
181 template< typename T >
182 static __forceinline T
183 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
184 
185 template<>
186 __forceinline kmp_int32
187 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
188 {
189     kmp_int32 r;
190     r = KMP_TEST_THEN_ADD32( p, d );
191     return r;
192 }
193 
194 template<>
195 __forceinline kmp_int64
196 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
197 {
198     kmp_int64 r;
199     r = KMP_TEST_THEN_ADD64( p, d );
200     return r;
201 }
202 
203 // test_then_inc_acq template (general template should NOT be used)
204 template< typename T >
205 static __forceinline T
206 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
207 
208 template<>
209 __forceinline kmp_int32
210 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
211 {
212     kmp_int32 r;
213     r = KMP_TEST_THEN_INC_ACQ32( p );
214     return r;
215 }
216 
217 template<>
218 __forceinline kmp_int64
219 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
220 {
221     kmp_int64 r;
222     r = KMP_TEST_THEN_INC_ACQ64( p );
223     return r;
224 }
225 
226 // test_then_inc template (general template should NOT be used)
227 template< typename T >
228 static __forceinline T
229 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
230 
231 template<>
232 __forceinline kmp_int32
233 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
234 {
235     kmp_int32 r;
236     r = KMP_TEST_THEN_INC32( p );
237     return r;
238 }
239 
240 template<>
241 __forceinline kmp_int64
242 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
243 {
244     kmp_int64 r;
245     r = KMP_TEST_THEN_INC64( p );
246     return r;
247 }
248 
249 // compare_and_swap template (general template should NOT be used)
250 template< typename T >
251 static __forceinline kmp_int32
252 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
253 
254 template<>
255 __forceinline kmp_int32
256 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
257 {
258     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
259 }
260 
261 template<>
262 __forceinline kmp_int32
263 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
264 {
265     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
266 }
267 
268 /*
269     Spin wait loop that first does pause, then yield.
270     Waits until function returns non-zero when called with *spinner and check.
271     Does NOT put threads to sleep.
272 #if USE_ITT_BUILD
273     Arguments:
274         obj -- is higher-level synchronization object to report to ittnotify. It is used to report
275             locks consistently. For example, if lock is acquired immediately, its address is
276             reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired
277             immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same
278             address, not an address of low-level spinner.
279 #endif // USE_ITT_BUILD
280 */
281 template< typename UT >
282 // ToDo: make inline function (move to header file for icl)
283 static UT  // unsigned 4- or 8-byte type
284 __kmp_wait_yield( volatile UT * spinner,
285                   UT            checker,
286                   kmp_uint32 (* pred)( UT, UT )
287                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
288                   )
289 {
290     // note: we may not belong to a team at this point
291     register volatile UT         * spin          = spinner;
292     register          UT           check         = checker;
293     register          kmp_uint32   spins;
294     register          kmp_uint32 (*f) ( UT, UT ) = pred;
295     register          UT           r;
296 
297     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
298     KMP_INIT_YIELD( spins );
299     // main wait spin loop
300     while(!f(r = *spin, check))
301     {
302         KMP_FSYNC_SPIN_PREPARE( obj );
303         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
304            It causes problems with infinite recursion because of exit lock */
305         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
306             __kmp_abort_thread(); */
307 
308         // if we are oversubscribed,
309         // or have waited a bit (and KMP_LIBRARY=throughput, then yield
310         // pause is in the following code
311         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
312         KMP_YIELD_SPIN( spins );
313     }
314     KMP_FSYNC_SPIN_ACQUIRED( obj );
315     return r;
316 }
317 
318 template< typename UT >
319 static kmp_uint32 __kmp_eq( UT value, UT checker) {
320     return value == checker;
321 }
322 
323 template< typename UT >
324 static kmp_uint32 __kmp_neq( UT value, UT checker) {
325     return value != checker;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_lt( UT value, UT checker) {
330     return value < checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_ge( UT value, UT checker) {
335     return value >= checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_le( UT value, UT checker) {
340     return value <= checker;
341 }
342 
343 
344 /* ------------------------------------------------------------------------ */
345 /* ------------------------------------------------------------------------ */
346 
347 static void
348 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
349 {
350     kmp_info_t *th;
351 
352     KMP_DEBUG_ASSERT( gtid_ref );
353 
354     if ( __kmp_env_consistency_check ) {
355         th = __kmp_threads[*gtid_ref];
356         if ( th -> th.th_root -> r.r_active
357           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
358             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
359         }
360     }
361 }
362 
363 template< typename UT >
364 static void
365 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
366 {
367     typedef typename traits_t< UT >::signed_t    ST;
368     dispatch_private_info_template< UT > * pr;
369 
370     int gtid = *gtid_ref;
371 //    int  cid = *cid_ref;
372     kmp_info_t *th = __kmp_threads[ gtid ];
373     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
374 
375     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
376     if ( __kmp_env_consistency_check ) {
377         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
378             ( th -> th.th_dispatch -> th_dispatch_pr_current );
379         if ( pr -> pushed_ws != ct_none ) {
380             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
381         }
382     }
383 
384     if ( ! th -> th.th_team -> t.t_serialized ) {
385         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
386             ( th -> th.th_dispatch -> th_dispatch_sh_current );
387         UT  lower;
388 
389         if ( ! __kmp_env_consistency_check ) {
390                 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
391                     ( th -> th.th_dispatch -> th_dispatch_pr_current );
392         }
393         lower = pr->u.p.ordered_lower;
394 
395         #if ! defined( KMP_GOMP_COMPAT )
396             if ( __kmp_env_consistency_check ) {
397                 if ( pr->ordered_bumped ) {
398                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
399                     __kmp_error_construct2(
400                         kmp_i18n_msg_CnsMultipleNesting,
401                         ct_ordered_in_pdo, loc_ref,
402                         & p->stack_data[ p->w_top ]
403                     );
404                 }
405             }
406         #endif /* !defined(KMP_GOMP_COMPAT) */
407 
408         KMP_MB();
409         #ifdef KMP_DEBUG
410         {
411             const char * buff;
412             // create format specifiers before the debug output
413             buff = __kmp_str_format(
414                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
415                 traits_t< UT >::spec, traits_t< UT >::spec );
416             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
417             __kmp_str_free( &buff );
418         }
419         #endif
420 
421         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
422                                 USE_ITT_BUILD_ARG( NULL )
423                                 );
424         KMP_MB();  /* is this necessary? */
425         #ifdef KMP_DEBUG
426         {
427             const char * buff;
428             // create format specifiers before the debug output
429             buff = __kmp_str_format(
430                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
431                 traits_t< UT >::spec, traits_t< UT >::spec );
432             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
433             __kmp_str_free( &buff );
434         }
435         #endif
436     }
437     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
438 }
439 
440 static void
441 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
442 {
443     kmp_info_t *th;
444 
445     if ( __kmp_env_consistency_check ) {
446         th = __kmp_threads[*gtid_ref];
447         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
448             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
449         }
450     }
451 }
452 
453 template< typename UT >
454 static void
455 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
456 {
457     typedef typename traits_t< UT >::signed_t    ST;
458     dispatch_private_info_template< UT > * pr;
459 
460     int gtid = *gtid_ref;
461 //    int  cid = *cid_ref;
462     kmp_info_t *th = __kmp_threads[ gtid ];
463     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
464 
465     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
466     if ( __kmp_env_consistency_check ) {
467         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
468             ( th -> th.th_dispatch -> th_dispatch_pr_current );
469         if ( pr -> pushed_ws != ct_none ) {
470             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
471         }
472     }
473 
474     if ( ! th -> th.th_team -> t.t_serialized ) {
475         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
476             ( th -> th.th_dispatch -> th_dispatch_sh_current );
477 
478         if ( ! __kmp_env_consistency_check ) {
479             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
480                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
481         }
482 
483         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
484         #if ! defined( KMP_GOMP_COMPAT )
485             if ( __kmp_env_consistency_check ) {
486                 if ( pr->ordered_bumped != 0 ) {
487                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
488                     /* How to test it? - OM */
489                     __kmp_error_construct2(
490                         kmp_i18n_msg_CnsMultipleNesting,
491                         ct_ordered_in_pdo, loc_ref,
492                         & p->stack_data[ p->w_top ]
493                     );
494                 }
495             }
496         #endif /* !defined(KMP_GOMP_COMPAT) */
497 
498         KMP_MB();       /* Flush all pending memory write invalidates.  */
499 
500         pr->ordered_bumped += 1;
501 
502         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
503                         gtid, pr->ordered_bumped ) );
504 
505         KMP_MB();       /* Flush all pending memory write invalidates.  */
506 
507         /* TODO use general release procedure? */
508         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
509 
510         KMP_MB();       /* Flush all pending memory write invalidates.  */
511     }
512     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
513 }
514 
515 /* Computes and returns x to the power of y, where y must a non-negative integer */
516 template< typename UT >
517 static __forceinline long double
518 __kmp_pow(long double x, UT y) {
519     long double s=1.0L;
520 
521     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
522     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
523     while(y) {
524         if ( y & 1 )
525             s *= x;
526         x *= x;
527         y >>= 1;
528     }
529     return s;
530 }
531 
532 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
533    (the total number of unassigned iterations in chunks with index greater than or equal to idx).
534    __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
535    (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
536 */
537 template< typename T >
538 static __inline typename traits_t< T >::unsigned_t
539 __kmp_dispatch_guided_remaining(
540     T                                  tc,
541     typename traits_t< T >::floating_t base,
542     typename traits_t< T >::unsigned_t idx
543 ) {
544     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
545        least for ICL 8.1, long double arithmetic may not really have
546        long double precision, even with /Qlong_double.  Currently, we
547        workaround that in the caller code, by manipulating the FPCW for
548        Windows* OS on IA-32 architecture.  The lack of precision is not
549        expected to be a correctness issue, though.
550     */
551     typedef typename traits_t< T >::unsigned_t  UT;
552 
553     long double x = tc * __kmp_pow< UT >(base, idx);
554     UT r = (UT) x;
555     if ( x == r )
556         return r;
557     return r + 1;
558 }
559 
560 // Parameters of the guided-iterative algorithm:
561 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
562 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
563 // by default n = 2. For example with n = 3 the chunks distribution will be more flat.
564 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
565 static int guided_int_param = 2;
566 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
567 
568 // UT - unsigned flavor of T, ST - signed flavor of T,
569 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
570 template< typename T >
571 static void
572 __kmp_dispatch_init(
573     ident_t                        * loc,
574     int                              gtid,
575     enum sched_type                  schedule,
576     T                                lb,
577     T                                ub,
578     typename traits_t< T >::signed_t st,
579     typename traits_t< T >::signed_t chunk,
580     int                              push_ws
581 ) {
582     typedef typename traits_t< T >::unsigned_t  UT;
583     typedef typename traits_t< T >::signed_t    ST;
584     typedef typename traits_t< T >::floating_t  DBL;
585     static const int ___kmp_size_type = sizeof( UT );
586 
587     int                                            active;
588     T                                              tc;
589     kmp_info_t *                                   th;
590     kmp_team_t *                                   team;
591     kmp_uint32                                     my_buffer_index;
592     dispatch_private_info_template< T >          * pr;
593     dispatch_shared_info_template< UT > volatile * sh;
594 
595     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
596     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
597 
598     if ( ! TCR_4( __kmp_init_parallel ) )
599         __kmp_parallel_initialize();
600 
601 #if INCLUDE_SSC_MARKS
602     SSC_MARK_DISPATCH_INIT();
603 #endif
604     #ifdef KMP_DEBUG
605     {
606         const char * buff;
607         // create format specifiers before the debug output
608         buff = __kmp_str_format(
609             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
610             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
611         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
612         __kmp_str_free( &buff );
613     }
614     #endif
615     /* setup data */
616     th     = __kmp_threads[ gtid ];
617     team   = th -> th.th_team;
618     active = ! team -> t.t_serialized;
619     th->th.th_ident = loc;
620 
621 #if USE_ITT_BUILD
622     kmp_uint64 cur_chunk = chunk;
623 #endif
624     if ( ! active ) {
625         pr = reinterpret_cast< dispatch_private_info_template< T >* >
626             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
627     } else {
628         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
629                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
630 
631         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
632 
633         /* What happens when number of threads changes, need to resize buffer? */
634         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
635             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
636         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
637             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
638     }
639 
640     /* Pick up the nomerge/ordered bits from the scheduling type */
641     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
642         pr->nomerge = TRUE;
643         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
644     } else {
645         pr->nomerge = FALSE;
646     }
647     pr->type_size = ___kmp_size_type; // remember the size of variables
648     if ( kmp_ord_lower & schedule ) {
649         pr->ordered = TRUE;
650         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
651     } else {
652         pr->ordered = FALSE;
653     }
654     if ( schedule == kmp_sch_static ) {
655         schedule = __kmp_static;
656     } else {
657         if ( schedule == kmp_sch_runtime ) {
658             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
659             schedule = team -> t.t_sched.r_sched_type;
660             // Detail the schedule if needed (global controls are differentiated appropriately)
661             if ( schedule == kmp_sch_guided_chunked ) {
662                 schedule = __kmp_guided;
663             } else if ( schedule == kmp_sch_static ) {
664                 schedule = __kmp_static;
665             }
666             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
667             chunk = team -> t.t_sched.chunk;
668 
669             #ifdef KMP_DEBUG
670             {
671                 const char * buff;
672                 // create format specifiers before the debug output
673                 buff = __kmp_str_format(
674                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
675                     traits_t< ST >::spec );
676                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
677                 __kmp_str_free( &buff );
678             }
679             #endif
680         } else {
681             if ( schedule == kmp_sch_guided_chunked ) {
682                 schedule = __kmp_guided;
683             }
684             if ( chunk <= 0 ) {
685                 chunk = KMP_DEFAULT_CHUNK;
686             }
687         }
688 
689         if ( schedule == kmp_sch_auto ) {
690             // mapping and differentiation: in the __kmp_do_serial_initialize()
691             schedule = __kmp_auto;
692             #ifdef KMP_DEBUG
693             {
694                 const char * buff;
695                 // create format specifiers before the debug output
696                 buff = __kmp_str_format(
697                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
698                     traits_t< ST >::spec );
699                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
700                 __kmp_str_free( &buff );
701             }
702             #endif
703         }
704 
705         /* guided analytical not safe for too many threads */
706         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
707             schedule = kmp_sch_guided_iterative_chunked;
708             KMP_WARNING( DispatchManyThreads );
709         }
710         pr->u.p.parm1 = chunk;
711     }
712     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
713                 "unknown scheduling type" );
714 
715     pr->u.p.count = 0;
716 
717     if ( __kmp_env_consistency_check ) {
718         if ( st == 0 ) {
719             __kmp_error_construct(
720                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
721                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
722             );
723         }
724     }
725 
726     tc = ( ub - lb + st );
727     if ( st != 1 ) {
728         if ( st < 0 ) {
729             if ( lb < ub ) {
730                 tc = 0;            // zero-trip
731             } else {   // lb >= ub
732                 tc = (ST)tc / st;  // convert to signed division
733             }
734         } else {       // st > 0
735             if ( ub < lb ) {
736                 tc = 0;            // zero-trip
737             } else {   // lb >= ub
738                 tc /= st;
739             }
740         }
741     } else if ( ub < lb ) {        // st == 1
742         tc = 0;                    // zero-trip
743     }
744 
745     pr->u.p.lb = lb;
746     pr->u.p.ub = ub;
747     pr->u.p.st = st;
748     pr->u.p.tc = tc;
749 
750     #if KMP_OS_WINDOWS
751     pr->u.p.last_upper = ub + st;
752     #endif /* KMP_OS_WINDOWS */
753 
754     /* NOTE: only the active parallel region(s) has active ordered sections */
755 
756     if ( active ) {
757         if ( pr->ordered == 0 ) {
758             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
759             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
760         } else {
761             pr->ordered_bumped = 0;
762 
763             pr->u.p.ordered_lower = 1;
764             pr->u.p.ordered_upper = 0;
765 
766             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
767             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
768         }
769     }
770 
771     if ( __kmp_env_consistency_check ) {
772         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
773         if ( push_ws ) {
774             __kmp_push_workshare( gtid, ws, loc );
775             pr->pushed_ws = ws;
776         } else {
777             __kmp_check_workshare( gtid, ws, loc );
778             pr->pushed_ws = ct_none;
779         }
780     }
781 
782     switch ( schedule ) {
783     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
784     case kmp_sch_static_steal:
785         {
786             T nproc = team->t.t_nproc;
787             T ntc, init;
788 
789             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
790 
791             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
792             if ( nproc > 1 && ntc >= nproc ) {
793                 T id = __kmp_tid_from_gtid(gtid);
794                 T small_chunk, extras;
795 
796                 small_chunk = ntc / nproc;
797                 extras = ntc % nproc;
798 
799                 init = id * small_chunk + ( id < extras ? id : extras );
800                 pr->u.p.count = init;
801                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
802 
803                 pr->u.p.parm2 = lb;
804                 //pr->pfields.parm3 = 0; // it's not used in static_steal
805                 pr->u.p.parm4 = id;
806                 pr->u.p.st = st;
807                 break;
808             } else {
809                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
810                                gtid ) );
811                 schedule = kmp_sch_static_balanced;
812                 /* too few iterations: fall-through to kmp_sch_static_balanced */
813             } // if
814             /* FALL-THROUGH to static balanced */
815         } // case
816     #endif
817     case kmp_sch_static_balanced:
818         {
819             T nproc = team->t.t_nproc;
820             T init, limit;
821 
822             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
823                             gtid ) );
824 
825             if ( nproc > 1 ) {
826                 T id = __kmp_tid_from_gtid(gtid);
827 
828                 if ( tc < nproc ) {
829                     if ( id < tc ) {
830                         init = id;
831                         limit = id;
832                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
833                     } else {
834                         pr->u.p.count = 1;  /* means no more chunks to execute */
835                         pr->u.p.parm1 = FALSE;
836                         break;
837                     }
838                 } else {
839                     T small_chunk = tc / nproc;
840                     T extras = tc % nproc;
841                     init = id * small_chunk + (id < extras ? id : extras);
842                     limit = init + small_chunk - (id < extras ? 0 : 1);
843                     pr->u.p.parm1 = (id == nproc - 1);
844                 }
845             } else {
846                 if ( tc > 0 ) {
847                     init = 0;
848                     limit = tc - 1;
849                     pr->u.p.parm1 = TRUE;
850                 } else {
851                     // zero trip count
852                     pr->u.p.count = 1;  /* means no more chunks to execute */
853                     pr->u.p.parm1 = FALSE;
854                     break;
855                 }
856             }
857 #if USE_ITT_BUILD
858             // Calculate chunk for metadata report
859             if(  __itt_metadata_add_ptr  && __kmp_forkjoin_frames_mode == 3 ) {
860                 cur_chunk = limit - init + 1;
861             }
862 #endif
863             if ( st == 1 ) {
864                 pr->u.p.lb = lb + init;
865                 pr->u.p.ub = lb + limit;
866             } else {
867                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
868                 pr->u.p.lb = lb + init * st;
869                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
870                 if ( st > 0 ) {
871                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
872                 } else {
873                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
874                 }
875             }
876             if ( pr->ordered ) {
877                 pr->u.p.ordered_lower = init;
878                 pr->u.p.ordered_upper = limit;
879             }
880             break;
881         } // case
882     case kmp_sch_guided_iterative_chunked :
883         {
884             T nproc = team->t.t_nproc;
885             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
886 
887             if ( nproc > 1 ) {
888                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
889                     /* chunk size too large, switch to dynamic */
890                     schedule = kmp_sch_dynamic_chunked;
891                 } else {
892                     // when remaining iters become less than parm2 - switch to dynamic
893                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
894                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
895                 }
896             } else {
897                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
898                 schedule = kmp_sch_static_greedy;
899                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
900                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
901                 pr->u.p.parm1 = tc;
902             } // if
903         } // case
904         break;
905     case kmp_sch_guided_analytical_chunked:
906         {
907             T nproc = team->t.t_nproc;
908             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
909 
910             if ( nproc > 1 ) {
911                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
912                     /* chunk size too large, switch to dynamic */
913                     schedule = kmp_sch_dynamic_chunked;
914                 } else {
915                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
916                     DBL x;
917 
918                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
919                     /* Linux* OS already has 64-bit computation by default for
920 		       long double, and on Windows* OS on Intel(R) 64,
921 		       /Qlong_double doesn't work.  On Windows* OS
922 		       on IA-32 architecture, we need to set precision to
923 		       64-bit instead of the default 53-bit. Even though long
924 		       double doesn't work on Windows* OS on Intel(R) 64, the
925 		       resulting lack of precision is not expected to impact
926 		       the correctness of the algorithm, but this has not been
927 		       mathematically proven.
928                     */
929                     // save original FPCW and set precision to 64-bit, as
930                     // Windows* OS on IA-32 architecture defaults to 53-bit
931                     unsigned int oldFpcw = _control87(0,0);
932                     _control87(_PC_64,_MCW_PC); // 0,0x30000
933                     #endif
934                     /* value used for comparison in solver for cross-over point */
935                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
936 
937                     /* crossover point--chunk indexes equal to or greater than
938 		       this point switch to dynamic-style scheduling */
939                     UT   cross;
940 
941                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
942                     x = (long double)1.0 - (long double)0.5 / nproc;
943 
944                     #ifdef KMP_DEBUG
945                     { // test natural alignment
946                         struct _test_a {
947                             char a;
948                             union {
949                                 char b;
950                                 DBL  d;
951                             };
952                         } t;
953                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
954                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
955                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
956                     }
957                     #endif // KMP_DEBUG
958 
959                     /* save the term in thread private dispatch structure */
960                     *(DBL*)&pr->u.p.parm3 = x;
961 
962                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
963                     {
964                         UT          left, right, mid;
965                         long double p;
966 
967                         /* estimate initial upper and lower bound */
968 
969                         /* doesn't matter what value right is as long as it is positive, but
970                            it affects performance of the solver
971                         */
972                         right = 229;
973                         p = __kmp_pow< UT >(x,right);
974                         if ( p > target ) {
975                             do{
976                                 p *= p;
977                                 right <<= 1;
978                             } while(p>target && right < (1<<27));
979                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
980                         } else {
981                             left = 0;
982                         }
983 
984                         /* bisection root-finding method */
985                         while ( left + 1 < right ) {
986                             mid = (left + right) / 2;
987                             if ( __kmp_pow< UT >(x,mid) > target ) {
988                                 left = mid;
989                             } else {
990                                 right = mid;
991                             }
992                         } // while
993                         cross = right;
994                     }
995                     /* assert sanity of computed crossover point */
996                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
997 
998                     /* save the crossover point in thread private dispatch structure */
999                     pr->u.p.parm2 = cross;
1000 
1001                     // C75803
1002                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1003                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1004                     #else
1005                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1006                     #endif
1007                     /* dynamic-style scheduling offset */
1008                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1009                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1010                         // restore FPCW
1011                         _control87(oldFpcw,_MCW_PC);
1012                     #endif
1013                 } // if
1014             } else {
1015                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1016                                gtid ) );
1017                 schedule = kmp_sch_static_greedy;
1018                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1019                 pr->u.p.parm1 = tc;
1020             } // if
1021         } // case
1022         break;
1023     case kmp_sch_static_greedy:
1024         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1025             pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1026                 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1027                 tc;
1028         break;
1029     case kmp_sch_static_chunked :
1030     case kmp_sch_dynamic_chunked :
1031         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1032         break;
1033     case kmp_sch_trapezoidal :
1034         {
1035             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1036 
1037             T parm1, parm2, parm3, parm4;
1038             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1039 
1040             parm1 = chunk;
1041 
1042             /* F : size of the first cycle */
1043             parm2 = ( tc / (2 * team->t.t_nproc) );
1044 
1045             if ( parm2 < 1 ) {
1046                 parm2 = 1;
1047             }
1048 
1049             /* L : size of the last cycle.  Make sure the last cycle
1050              *     is not larger than the first cycle.
1051              */
1052             if ( parm1 < 1 ) {
1053                 parm1 = 1;
1054             } else if ( parm1 > parm2 ) {
1055                 parm1 = parm2;
1056             }
1057 
1058             /* N : number of cycles */
1059             parm3 = ( parm2 + parm1 );
1060             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1061 
1062             if ( parm3 < 2 ) {
1063                 parm3 = 2;
1064             }
1065 
1066             /* sigma : decreasing incr of the trapezoid */
1067             parm4 = ( parm3 - 1 );
1068             parm4 = ( parm2 - parm1 ) / parm4;
1069 
1070             // pointless check, because parm4 >= 0 always
1071             //if ( parm4 < 0 ) {
1072             //    parm4 = 0;
1073             //}
1074 
1075             pr->u.p.parm1 = parm1;
1076             pr->u.p.parm2 = parm2;
1077             pr->u.p.parm3 = parm3;
1078             pr->u.p.parm4 = parm4;
1079         } // case
1080         break;
1081 
1082     default:
1083         {
1084             __kmp_msg(
1085                 kmp_ms_fatal,                        // Severity
1086                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1087                 KMP_HNT( GetNewerLibrary ),          // Hint
1088                 __kmp_msg_null                       // Variadic argument list terminator
1089             );
1090         }
1091         break;
1092     } // switch
1093     pr->schedule = schedule;
1094     if ( active ) {
1095         /* The name of this buffer should be my_buffer_index when it's free to use it */
1096 
1097         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1098                         gtid, my_buffer_index, sh->buffer_index) );
1099         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1100                                         USE_ITT_BUILD_ARG( NULL )
1101                                         );
1102             // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1103             // *always* 32-bit integers.
1104         KMP_MB();  /* is this necessary? */
1105         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1106                         gtid, my_buffer_index, sh->buffer_index) );
1107 
1108         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1109         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1110 #if USE_ITT_BUILD
1111         if ( pr->ordered ) {
1112             __kmp_itt_ordered_init( gtid );
1113         }; // if
1114 #endif /* USE_ITT_BUILD */
1115     }; // if
1116 
1117 #if USE_ITT_BUILD
1118     // Report loop metadata
1119     if( __itt_metadata_add_ptr  && __kmp_forkjoin_frames_mode == 3 ) {
1120         kmp_uint32 tid  = __kmp_tid_from_gtid( gtid );
1121         if (KMP_MASTER_TID(tid)) {
1122             kmp_uint64 schedtype = 0;
1123 
1124             switch ( schedule ) {
1125             case kmp_sch_static_chunked:
1126             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1127                 break;
1128             case kmp_sch_static_greedy:
1129                 cur_chunk = pr->u.p.parm1;
1130                 break;
1131             case kmp_sch_dynamic_chunked:
1132                 schedtype = 1;
1133                 break;
1134             case kmp_sch_guided_iterative_chunked:
1135             case kmp_sch_guided_analytical_chunked:
1136                 schedtype = 2;
1137                 break;
1138             default:
1139 //            Should we put this case under "static"?
1140 //            case kmp_sch_static_steal:
1141                 schedtype = 3;
1142                 break;
1143             }
1144             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1145         }
1146     }
1147 #endif /* USE_ITT_BUILD */
1148 
1149     #ifdef KMP_DEBUG
1150     {
1151         const char * buff;
1152         // create format specifiers before the debug output
1153         buff = __kmp_str_format(
1154             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1155             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1156             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1157             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1158             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1159             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1160             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1161         KD_TRACE(10, ( buff,
1162             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1163             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1164             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1165             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1166         __kmp_str_free( &buff );
1167     }
1168     #endif
1169     #if ( KMP_STATIC_STEAL_ENABLED )
1170     if ( ___kmp_size_type < 8 ) {
1171       // It cannot be guaranteed that after execution of a loop with some other schedule kind
1172       // all the parm3 variables will contain the same value.
1173       // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1
1174       // rather than program life-time increment.
1175       // So the dedicated variable is required. The 'static_steal_counter' is used.
1176       if( schedule == kmp_sch_static_steal ) {
1177         // Other threads will inspect this variable when searching for a victim.
1178         // This is a flag showing that other threads may steal from this thread since then.
1179         volatile T * p = &pr->u.p.static_steal_counter;
1180         *p = *p + 1;
1181       }
1182     }
1183     #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
1184 }
1185 
1186 /*
1187  * For ordered loops, either __kmp_dispatch_finish() should be called after
1188  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1189  * every chunk of iterations.  If the ordered section(s) were not executed
1190  * for this iteration (or every iteration in this chunk), we need to set the
1191  * ordered iteration counters so that the next thread can proceed.
1192  */
1193 template< typename UT >
1194 static void
1195 __kmp_dispatch_finish( int gtid, ident_t *loc )
1196 {
1197     typedef typename traits_t< UT >::signed_t ST;
1198     kmp_info_t *th = __kmp_threads[ gtid ];
1199 
1200     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1201     if ( ! th -> th.th_team -> t.t_serialized ) {
1202 
1203         dispatch_private_info_template< UT > * pr =
1204             reinterpret_cast< dispatch_private_info_template< UT >* >
1205             ( th->th.th_dispatch->th_dispatch_pr_current );
1206         dispatch_shared_info_template< UT > volatile * sh =
1207             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1208             ( th->th.th_dispatch->th_dispatch_sh_current );
1209         KMP_DEBUG_ASSERT( pr );
1210         KMP_DEBUG_ASSERT( sh );
1211         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1212                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1213 
1214         if ( pr->ordered_bumped ) {
1215             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1216                             gtid ) );
1217             pr->ordered_bumped = 0;
1218         } else {
1219             UT lower = pr->u.p.ordered_lower;
1220 
1221             #ifdef KMP_DEBUG
1222             {
1223                 const char * buff;
1224                 // create format specifiers before the debug output
1225                 buff = __kmp_str_format(
1226                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1227                     traits_t< UT >::spec, traits_t< UT >::spec );
1228                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1229                 __kmp_str_free( &buff );
1230             }
1231             #endif
1232 
1233             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1234                                    USE_ITT_BUILD_ARG(NULL)
1235                                    );
1236             KMP_MB();  /* is this necessary? */
1237             #ifdef KMP_DEBUG
1238             {
1239                 const char * buff;
1240                 // create format specifiers before the debug output
1241                 buff = __kmp_str_format(
1242                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1243                     traits_t< UT >::spec, traits_t< UT >::spec );
1244                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1245                 __kmp_str_free( &buff );
1246             }
1247             #endif
1248 
1249             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1250         } // if
1251     } // if
1252     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1253 }
1254 
1255 #ifdef KMP_GOMP_COMPAT
1256 
1257 template< typename UT >
1258 static void
1259 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1260 {
1261     typedef typename traits_t< UT >::signed_t ST;
1262     kmp_info_t *th = __kmp_threads[ gtid ];
1263 
1264     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1265     if ( ! th -> th.th_team -> t.t_serialized ) {
1266 //        int cid;
1267         dispatch_private_info_template< UT > * pr =
1268             reinterpret_cast< dispatch_private_info_template< UT >* >
1269             ( th->th.th_dispatch->th_dispatch_pr_current );
1270         dispatch_shared_info_template< UT > volatile * sh =
1271             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1272             ( th->th.th_dispatch->th_dispatch_sh_current );
1273         KMP_DEBUG_ASSERT( pr );
1274         KMP_DEBUG_ASSERT( sh );
1275         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1276                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1277 
1278 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1279             UT lower = pr->u.p.ordered_lower;
1280             UT upper = pr->u.p.ordered_upper;
1281             UT inc = upper - lower + 1;
1282 
1283             if ( pr->ordered_bumped == inc ) {
1284                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1285                   gtid ) );
1286                 pr->ordered_bumped = 0;
1287             } else {
1288                 inc -= pr->ordered_bumped;
1289 
1290                 #ifdef KMP_DEBUG
1291                 {
1292                     const char * buff;
1293                     // create format specifiers before the debug output
1294                     buff = __kmp_str_format(
1295                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1296                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1297                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1298                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1299                     __kmp_str_free( &buff );
1300                 }
1301                 #endif
1302 
1303                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1304                                        USE_ITT_BUILD_ARG(NULL)
1305                                        );
1306 
1307                 KMP_MB();  /* is this necessary? */
1308                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1309                   gtid ) );
1310                 pr->ordered_bumped = 0;
1311 //!!!!! TODO check if the inc should be unsigned, or signed???
1312                 #ifdef KMP_DEBUG
1313                 {
1314                     const char * buff;
1315                     // create format specifiers before the debug output
1316                     buff = __kmp_str_format(
1317                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1318                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1319                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1320                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1321                     __kmp_str_free( &buff );
1322                 }
1323                 #endif
1324 
1325                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1326             }
1327 //        }
1328     }
1329     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1330 }
1331 
1332 #endif /* KMP_GOMP_COMPAT */
1333 
1334 template< typename T >
1335 static int
1336 __kmp_dispatch_next(
1337     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1338 ) {
1339 
1340     typedef typename traits_t< T >::unsigned_t  UT;
1341     typedef typename traits_t< T >::signed_t    ST;
1342     typedef typename traits_t< T >::floating_t  DBL;
1343     static const int ___kmp_size_type = sizeof( UT );
1344 
1345     int                                   status;
1346     dispatch_private_info_template< T > * pr;
1347     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1348     kmp_team_t                          * team = th -> th.th_team;
1349 
1350     KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1351     #ifdef KMP_DEBUG
1352     {
1353         const char * buff;
1354         // create format specifiers before the debug output
1355         buff = __kmp_str_format(
1356             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1357             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1358         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1359         __kmp_str_free( &buff );
1360     }
1361     #endif
1362 
1363     if ( team -> t.t_serialized ) {
1364         /* NOTE: serialize this dispatch becase we are not at the active level */
1365         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1366             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1367         KMP_DEBUG_ASSERT( pr );
1368 
1369         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1370             *p_lb = 0;
1371             *p_ub = 0;
1372 //            if ( p_last != NULL )
1373 //                *p_last = 0;
1374             if ( p_st != NULL )
1375                 *p_st = 0;
1376             if ( __kmp_env_consistency_check ) {
1377                 if ( pr->pushed_ws != ct_none ) {
1378                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1379                 }
1380             }
1381         } else if ( pr->nomerge ) {
1382             kmp_int32 last;
1383             T         start;
1384             UT        limit, trip, init;
1385             ST        incr;
1386             T         chunk = pr->u.p.parm1;
1387 
1388             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1389 
1390             init = chunk * pr->u.p.count++;
1391             trip = pr->u.p.tc - 1;
1392 
1393             if ( (status = (init <= trip)) == 0 ) {
1394                 *p_lb = 0;
1395                 *p_ub = 0;
1396 //                if ( p_last != NULL )
1397 //                    *p_last = 0;
1398                 if ( p_st != NULL )
1399                     *p_st = 0;
1400                 if ( __kmp_env_consistency_check ) {
1401                     if ( pr->pushed_ws != ct_none ) {
1402                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1403                     }
1404                 }
1405             } else {
1406                 start = pr->u.p.lb;
1407                 limit = chunk + init - 1;
1408                 incr  = pr->u.p.st;
1409 
1410                 if ( (last = (limit >= trip)) != 0 ) {
1411                     limit = trip;
1412                     #if KMP_OS_WINDOWS
1413                     pr->u.p.last_upper = pr->u.p.ub;
1414                     #endif /* KMP_OS_WINDOWS */
1415                 }
1416                 if ( p_last != NULL )
1417                     *p_last = last;
1418                 if ( p_st != NULL )
1419                     *p_st = incr;
1420                 if ( incr == 1 ) {
1421                     *p_lb = start + init;
1422                     *p_ub = start + limit;
1423                 } else {
1424                     *p_lb = start + init * incr;
1425                     *p_ub = start + limit * incr;
1426                 }
1427 
1428                 if ( pr->ordered ) {
1429                     pr->u.p.ordered_lower = init;
1430                     pr->u.p.ordered_upper = limit;
1431                     #ifdef KMP_DEBUG
1432                     {
1433                         const char * buff;
1434                         // create format specifiers before the debug output
1435                         buff = __kmp_str_format(
1436                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1437                             traits_t< UT >::spec, traits_t< UT >::spec );
1438                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1439                         __kmp_str_free( &buff );
1440                     }
1441                     #endif
1442                 } // if
1443             } // if
1444         } else {
1445             pr->u.p.tc = 0;
1446             *p_lb = pr->u.p.lb;
1447             *p_ub = pr->u.p.ub;
1448             #if KMP_OS_WINDOWS
1449             pr->u.p.last_upper = *p_ub;
1450             #endif /* KMP_OS_WINDOWS */
1451             if ( p_last != NULL )
1452                 *p_last = TRUE;
1453             if ( p_st != NULL )
1454                 *p_st = pr->u.p.st;
1455         } // if
1456         #ifdef KMP_DEBUG
1457         {
1458             const char * buff;
1459             // create format specifiers before the debug output
1460             buff = __kmp_str_format(
1461                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1462                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1463                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1464             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1465             __kmp_str_free( &buff );
1466         }
1467         #endif
1468 #if INCLUDE_SSC_MARKS
1469         SSC_MARK_DISPATCH_NEXT();
1470 #endif
1471         return status;
1472     } else {
1473         kmp_int32 last = 0;
1474         dispatch_shared_info_template< UT > *sh;
1475         T         start;
1476         ST        incr;
1477         UT        limit, trip, init;
1478 
1479         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1480                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1481 
1482         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1483             ( th->th.th_dispatch->th_dispatch_pr_current );
1484         KMP_DEBUG_ASSERT( pr );
1485         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1486             ( th->th.th_dispatch->th_dispatch_sh_current );
1487         KMP_DEBUG_ASSERT( sh );
1488 
1489         if ( pr->u.p.tc == 0 ) {
1490             // zero trip count
1491             status = 0;
1492         } else {
1493             switch (pr->schedule) {
1494             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1495             case kmp_sch_static_steal:
1496                 {
1497                     T chunk = pr->u.p.parm1;
1498 
1499                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1500 
1501                     trip = pr->u.p.tc - 1;
1502 
1503                     if ( ___kmp_size_type > 4 ) {
1504                         // Other threads do not look into the data of this thread,
1505                         //  so it's not necessary to make volatile casting.
1506                         init   = ( pr->u.p.count )++;
1507                         status = ( init < (UT)pr->u.p.ub );
1508                     } else {
1509                         typedef union {
1510                             struct {
1511                                 UT count;
1512                                 T  ub;
1513                             } p;
1514                             kmp_int64 b;
1515                         } union_i4;
1516                         // All operations on 'count' or 'ub' must be combined atomically together.
1517                         // stealing implemented only for 4-byte indexes
1518                         {
1519                             union_i4 vold, vnew;
1520                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1521                             vnew = vold;
1522                             vnew.p.count++;
1523                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1524                                         ( volatile kmp_int64* )&pr->u.p.count,
1525                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1526                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1527                                 KMP_CPU_PAUSE();
1528                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1529                                 vnew = vold;
1530                                 vnew.p.count++;
1531                             }
1532                             vnew = vold;
1533                             init   = vnew.p.count;
1534                             status = ( init < (UT)vnew.p.ub ) ;
1535                         }
1536 
1537                         if( !status ) {
1538                             kmp_info_t   **other_threads = team->t.t_threads;
1539                             int          while_limit = 10;
1540                             int          while_index = 0;
1541 
1542                             // TODO: algorithm of searching for a victim
1543                             // should be cleaned up and measured
1544                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1545                                 union_i4  vold, vnew;
1546                                 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1547                                 T         victimIdx    = pr->u.p.parm4;
1548                                 T         oldVictimIdx = victimIdx;
1549                                 dispatch_private_info_template< T > * victim;
1550 
1551                                 do {
1552                                     if( !victimIdx ) {
1553                                         victimIdx = team->t.t_nproc - 1;
1554                                     } else {
1555                                         --victimIdx;
1556                                     }
1557                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1558                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1559                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1560                                 // TODO: think about a proper place of this test
1561                                 if ( ( !victim ) ||
1562                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1563                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1564                                     // TODO: delay would be nice
1565                                     continue;
1566                                     // the victim is not ready yet to participate in stealing
1567                                     // because the victim is still in kmp_init_dispatch
1568                                 }
1569                                 if ( oldVictimIdx == victimIdx ) {
1570                                     break;
1571                                 }
1572                                 pr->u.p.parm4 = victimIdx;
1573 
1574                                 while( 1 ) {
1575                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1576                                     vnew = vold;
1577 
1578                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1579                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1580                                         break;
1581                                     }
1582                                     vnew.p.ub -= (remaining >> 2);
1583                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1584                                     #pragma warning( push )
1585                                     // disable warning on pointless comparison of unsigned with 0
1586                                     #pragma warning( disable: 186 )
1587                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1588                                     #pragma warning( pop )
1589                                     // TODO: Should this be acquire or release?
1590                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1591                                             ( volatile kmp_int64 * )&victim->u.p.count,
1592                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1593                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1594                                         status = 1;
1595                                         while_index = 0;
1596                                         // now update own count and ub
1597                                         #if KMP_ARCH_X86
1598                                         // stealing executed on non-KMP_ARCH_X86 only
1599                                             // Atomic 64-bit write on ia32 is
1600                                             // unavailable, so we do this in steps.
1601                                             //     This code is not tested.
1602                                             init = vold.p.count;
1603                                             pr->u.p.ub = 0;
1604                                             pr->u.p.count = init + 1;
1605                                             pr->u.p.ub = vnew.p.count;
1606                                         #else
1607                                             init = vnew.p.ub;
1608                                             vold.p.count = init + 1;
1609                                             // TODO: is it safe and enough?
1610                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1611                                         #endif // KMP_ARCH_X86
1612                                         break;
1613                                     } // if
1614                                 KMP_CPU_PAUSE();
1615                                 } // while (1)
1616                             } // while
1617                         } // if
1618                     } // if
1619                     if ( !status ) {
1620                         *p_lb = 0;
1621                         *p_ub = 0;
1622                         if ( p_st != NULL ) *p_st = 0;
1623                     } else {
1624                         start = pr->u.p.parm2;
1625                         init *= chunk;
1626                         limit = chunk + init - 1;
1627                         incr  = pr->u.p.st;
1628 
1629                         KMP_DEBUG_ASSERT(init <= trip);
1630                         if ( (last = (limit >= trip)) != 0 )
1631                             limit = trip;
1632                         if ( p_st != NULL ) *p_st = incr;
1633 
1634                         if ( incr == 1 ) {
1635                             *p_lb = start + init;
1636                             *p_ub = start + limit;
1637                         } else {
1638                             *p_lb = start + init * incr;
1639                             *p_ub = start + limit * incr;
1640                         }
1641 
1642                         if ( pr->ordered ) {
1643                             pr->u.p.ordered_lower = init;
1644                             pr->u.p.ordered_upper = limit;
1645                             #ifdef KMP_DEBUG
1646                             {
1647                                 const char * buff;
1648                                 // create format specifiers before the debug output
1649                                 buff = __kmp_str_format(
1650                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1651                                     traits_t< UT >::spec, traits_t< UT >::spec );
1652                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1653                                 __kmp_str_free( &buff );
1654                             }
1655                             #endif
1656                         } // if
1657                     } // if
1658                     break;
1659                 } // case
1660             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1661             case kmp_sch_static_balanced:
1662                 {
1663                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1664                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1665                         pr->u.p.count = 1;
1666                         *p_lb = pr->u.p.lb;
1667                         *p_ub = pr->u.p.ub;
1668                         last = pr->u.p.parm1;
1669                         if ( p_st != NULL )
1670                             *p_st = pr->u.p.st;
1671                     } else {  /* no iterations to do */
1672                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1673                     }
1674                     if ( pr->ordered ) {
1675                         #ifdef KMP_DEBUG
1676                         {
1677                             const char * buff;
1678                             // create format specifiers before the debug output
1679                             buff = __kmp_str_format(
1680                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1681                                 traits_t< UT >::spec, traits_t< UT >::spec );
1682                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1683                             __kmp_str_free( &buff );
1684                         }
1685                         #endif
1686                     } // if
1687                 } // case
1688                 break;
1689             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1690             case kmp_sch_static_chunked:
1691                 {
1692                     T parm1;
1693 
1694                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1695                                    gtid ) );
1696                     parm1 = pr->u.p.parm1;
1697 
1698                     trip  = pr->u.p.tc - 1;
1699                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1700 
1701                     if ( (status = (init <= trip)) != 0 ) {
1702                         start = pr->u.p.lb;
1703                         incr  = pr->u.p.st;
1704                         limit = parm1 + init - 1;
1705 
1706                         if ( (last = (limit >= trip)) != 0 )
1707                             limit = trip;
1708 
1709                         if ( p_st != NULL ) *p_st = incr;
1710 
1711                         pr->u.p.count += team->t.t_nproc;
1712 
1713                         if ( incr == 1 ) {
1714                             *p_lb = start + init;
1715                             *p_ub = start + limit;
1716                         }
1717                         else {
1718                             *p_lb = start + init * incr;
1719                             *p_ub = start + limit * incr;
1720                         }
1721 
1722                         if ( pr->ordered ) {
1723                             pr->u.p.ordered_lower = init;
1724                             pr->u.p.ordered_upper = limit;
1725                             #ifdef KMP_DEBUG
1726                             {
1727                                 const char * buff;
1728                                 // create format specifiers before the debug output
1729                                 buff = __kmp_str_format(
1730                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1731                                     traits_t< UT >::spec, traits_t< UT >::spec );
1732                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1733                                 __kmp_str_free( &buff );
1734                             }
1735                             #endif
1736                         } // if
1737                     } // if
1738                 } // case
1739                 break;
1740 
1741             case kmp_sch_dynamic_chunked:
1742                 {
1743                     T chunk = pr->u.p.parm1;
1744 
1745                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1746                                    gtid ) );
1747 
1748                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1749                     trip = pr->u.p.tc - 1;
1750 
1751                     if ( (status = (init <= trip)) == 0 ) {
1752                         *p_lb = 0;
1753                         *p_ub = 0;
1754                         if ( p_st != NULL ) *p_st = 0;
1755                     } else {
1756                         start = pr->u.p.lb;
1757                         limit = chunk + init - 1;
1758                         incr  = pr->u.p.st;
1759 
1760                         if ( (last = (limit >= trip)) != 0 )
1761                             limit = trip;
1762 
1763                         if ( p_st != NULL ) *p_st = incr;
1764 
1765                         if ( incr == 1 ) {
1766                             *p_lb = start + init;
1767                             *p_ub = start + limit;
1768                         } else {
1769                             *p_lb = start + init * incr;
1770                             *p_ub = start + limit * incr;
1771                         }
1772 
1773                         if ( pr->ordered ) {
1774                             pr->u.p.ordered_lower = init;
1775                             pr->u.p.ordered_upper = limit;
1776                             #ifdef KMP_DEBUG
1777                             {
1778                                 const char * buff;
1779                                 // create format specifiers before the debug output
1780                                 buff = __kmp_str_format(
1781                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1782                                     traits_t< UT >::spec, traits_t< UT >::spec );
1783                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1784                                 __kmp_str_free( &buff );
1785                             }
1786                             #endif
1787                         } // if
1788                     } // if
1789                 } // case
1790                 break;
1791 
1792             case kmp_sch_guided_iterative_chunked:
1793                 {
1794                     T  chunkspec = pr->u.p.parm1;
1795                     KD_TRACE(100,
1796                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1797                     trip  = pr->u.p.tc;
1798                     // Start atomic part of calculations
1799                     while(1) {
1800                         ST  remaining;             // signed, because can be < 0
1801                         init = sh->u.s.iteration;  // shared value
1802                         remaining = trip - init;
1803                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1804                             // nothing to do, don't try atomic op
1805                             status = 0;
1806                             break;
1807                         }
1808                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1809                             // use dynamic-style shcedule
1810                             // atomically inrement iterations, get old value
1811                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1812                             remaining = trip - init;
1813                             if (remaining <= 0) {
1814                                 status = 0;    // all iterations got by other threads
1815                             } else {
1816                                 // got some iterations to work on
1817                                 status = 1;
1818                                 if ( (T)remaining > chunkspec ) {
1819                                     limit = init + chunkspec - 1;
1820                                 } else {
1821                                     last = 1;   // the last chunk
1822                                     limit = init + remaining - 1;
1823                                 } // if
1824                             } // if
1825                             break;
1826                         } // if
1827                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1828                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1829                             // CAS was successful, chunk obtained
1830                             status = 1;
1831                             --limit;
1832                             break;
1833                         } // if
1834                     } // while
1835                     if ( status != 0 ) {
1836                         start = pr->u.p.lb;
1837                         incr = pr->u.p.st;
1838                         if ( p_st != NULL )
1839                             *p_st = incr;
1840                         *p_lb = start + init * incr;
1841                         *p_ub = start + limit * incr;
1842                         if ( pr->ordered ) {
1843                             pr->u.p.ordered_lower = init;
1844                             pr->u.p.ordered_upper = limit;
1845                             #ifdef KMP_DEBUG
1846                             {
1847                                 const char * buff;
1848                                 // create format specifiers before the debug output
1849                                 buff = __kmp_str_format(
1850                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1851                                     traits_t< UT >::spec, traits_t< UT >::spec );
1852                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1853                                 __kmp_str_free( &buff );
1854                             }
1855                             #endif
1856                         } // if
1857                     } else {
1858                         *p_lb = 0;
1859                         *p_ub = 0;
1860                         if ( p_st != NULL )
1861                             *p_st = 0;
1862                     } // if
1863                 } // case
1864                 break;
1865 
1866             case kmp_sch_guided_analytical_chunked:
1867                 {
1868                     T   chunkspec = pr->u.p.parm1;
1869                     UT chunkIdx;
1870     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1871                     /* for storing original FPCW value for Windows* OS on
1872 		       IA-32 architecture 8-byte version */
1873                     unsigned int oldFpcw;
1874                     unsigned int fpcwSet = 0;
1875     #endif
1876                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1877                                    gtid ) );
1878 
1879                     trip  = pr->u.p.tc;
1880 
1881                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1882                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1883 
1884                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1885                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1886                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1887                             --trip;
1888                             /* use dynamic-style scheduling */
1889                             init = chunkIdx * chunkspec + pr->u.p.count;
1890                             /* need to verify init > 0 in case of overflow in the above calculation */
1891                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1892                                 limit = init + chunkspec -1;
1893 
1894                                 if ( (last = (limit >= trip)) != 0 )
1895                                     limit = trip;
1896                             }
1897                             break;
1898                         } else {
1899                             /* use exponential-style scheduling */
1900                             /* The following check is to workaround the lack of long double precision on Windows* OS.
1901                                This check works around the possible effect that init != 0 for chunkIdx == 0.
1902                              */
1903     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1904                             /* If we haven't already done so, save original
1905 			       FPCW and set precision to 64-bit, as Windows* OS
1906 			       on IA-32 architecture defaults to 53-bit */
1907                             if ( !fpcwSet ) {
1908                                 oldFpcw = _control87(0,0);
1909                                 _control87(_PC_64,_MCW_PC);
1910                                 fpcwSet = 0x30000;
1911                             }
1912     #endif
1913                             if ( chunkIdx ) {
1914                                 init = __kmp_dispatch_guided_remaining< T >(
1915                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1916                                 KMP_DEBUG_ASSERT(init);
1917                                 init = trip - init;
1918                             } else
1919                                 init = 0;
1920                             limit = trip - __kmp_dispatch_guided_remaining< T >(
1921                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1922                             KMP_ASSERT(init <= limit);
1923                             if ( init < limit ) {
1924                                 KMP_DEBUG_ASSERT(limit <= trip);
1925                                 --limit;
1926                                 status = 1;
1927                                 break;
1928                             } // if
1929                         } // if
1930                     } // while (1)
1931     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1932                     /* restore FPCW if necessary
1933                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1934                     */
1935                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1936                         _control87(oldFpcw,_MCW_PC);
1937     #endif
1938                     if ( status != 0 ) {
1939                         start = pr->u.p.lb;
1940                         incr = pr->u.p.st;
1941                         if ( p_st != NULL )
1942                             *p_st = incr;
1943                         *p_lb = start + init * incr;
1944                         *p_ub = start + limit * incr;
1945                         if ( pr->ordered ) {
1946                             pr->u.p.ordered_lower = init;
1947                             pr->u.p.ordered_upper = limit;
1948                             #ifdef KMP_DEBUG
1949                             {
1950                                 const char * buff;
1951                                 // create format specifiers before the debug output
1952                                 buff = __kmp_str_format(
1953                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1954                                     traits_t< UT >::spec, traits_t< UT >::spec );
1955                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1956                                 __kmp_str_free( &buff );
1957                             }
1958                             #endif
1959                         }
1960                     } else {
1961                         *p_lb = 0;
1962                         *p_ub = 0;
1963                         if ( p_st != NULL )
1964                             *p_st = 0;
1965                     }
1966                 } // case
1967                 break;
1968 
1969             case kmp_sch_trapezoidal:
1970                 {
1971                     UT   index;
1972                     T    parm2 = pr->u.p.parm2;
1973                     T    parm3 = pr->u.p.parm3;
1974                     T    parm4 = pr->u.p.parm4;
1975                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1976                                    gtid ) );
1977 
1978                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1979 
1980                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1981                     trip = pr->u.p.tc - 1;
1982 
1983                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1984                         *p_lb = 0;
1985                         *p_ub = 0;
1986                         if ( p_st != NULL ) *p_st = 0;
1987                     } else {
1988                         start = pr->u.p.lb;
1989                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1990                         incr  = pr->u.p.st;
1991 
1992                         if ( (last = (limit >= trip)) != 0 )
1993                             limit = trip;
1994 
1995                         if ( p_st != NULL ) *p_st = incr;
1996 
1997                         if ( incr == 1 ) {
1998                             *p_lb = start + init;
1999                             *p_ub = start + limit;
2000                         } else {
2001                             *p_lb = start + init * incr;
2002                             *p_ub = start + limit * incr;
2003                         }
2004 
2005                         if ( pr->ordered ) {
2006                             pr->u.p.ordered_lower = init;
2007                             pr->u.p.ordered_upper = limit;
2008                             #ifdef KMP_DEBUG
2009                             {
2010                                 const char * buff;
2011                                 // create format specifiers before the debug output
2012                                 buff = __kmp_str_format(
2013                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2014                                     traits_t< UT >::spec, traits_t< UT >::spec );
2015                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2016                                 __kmp_str_free( &buff );
2017                             }
2018                             #endif
2019                         } // if
2020                     } // if
2021                 } // case
2022                 break;
2023             default:
2024                 {
2025                     status = 0; // to avoid complaints on uninitialized variable use
2026                     __kmp_msg(
2027                         kmp_ms_fatal,                        // Severity
2028                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2029                         KMP_HNT( GetNewerLibrary ),          // Hint
2030                         __kmp_msg_null                       // Variadic argument list terminator
2031                     );
2032                 }
2033                 break;
2034             } // switch
2035         } // if tc == 0;
2036 
2037         if ( status == 0 ) {
2038             UT   num_done;
2039 
2040             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2041             #ifdef KMP_DEBUG
2042             {
2043                 const char * buff;
2044                 // create format specifiers before the debug output
2045                 buff = __kmp_str_format(
2046                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2047                     traits_t< UT >::spec );
2048                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2049                 __kmp_str_free( &buff );
2050             }
2051             #endif
2052 
2053             if ( (ST)num_done == team->t.t_nproc-1 ) {
2054                 /* NOTE: release this buffer to be reused */
2055 
2056                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2057 
2058                 sh->u.s.num_done = 0;
2059                 sh->u.s.iteration = 0;
2060 
2061                 /* TODO replace with general release procedure? */
2062                 if ( pr->ordered ) {
2063                     sh->u.s.ordered_iteration = 0;
2064                 }
2065 
2066                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2067 
2068                 sh -> buffer_index += KMP_MAX_DISP_BUF;
2069                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2070                                 gtid, sh->buffer_index) );
2071 
2072                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2073 
2074             } // if
2075             if ( __kmp_env_consistency_check ) {
2076                 if ( pr->pushed_ws != ct_none ) {
2077                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2078                 }
2079             }
2080 
2081             th -> th.th_dispatch -> th_deo_fcn = NULL;
2082             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2083             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2084             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2085         } // if (status == 0)
2086 #if KMP_OS_WINDOWS
2087         else if ( last ) {
2088             pr->u.p.last_upper = pr->u.p.ub;
2089         }
2090 #endif /* KMP_OS_WINDOWS */
2091         if ( p_last != NULL && status != 0 )
2092             *p_last = last;
2093     } // if
2094 
2095     #ifdef KMP_DEBUG
2096     {
2097         const char * buff;
2098         // create format specifiers before the debug output
2099         buff = __kmp_str_format(
2100             "__kmp_dispatch_next: T#%%d normal case: " \
2101             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2102             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2103         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2104         __kmp_str_free( &buff );
2105     }
2106     #endif
2107 #if INCLUDE_SSC_MARKS
2108     SSC_MARK_DISPATCH_NEXT();
2109 #endif
2110     return status;
2111 }
2112 
2113 template< typename T >
2114 static void
2115 __kmp_dist_get_bounds(
2116     ident_t                          *loc,
2117     kmp_int32                         gtid,
2118     kmp_int32                        *plastiter,
2119     T                                *plower,
2120     T                                *pupper,
2121     typename traits_t< T >::signed_t  incr
2122 ) {
2123     KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2124     typedef typename traits_t< T >::unsigned_t  UT;
2125     typedef typename traits_t< T >::signed_t    ST;
2126     register kmp_uint32  team_id;
2127     register kmp_uint32  nteams;
2128     register UT          trip_count;
2129     register kmp_team_t *team;
2130     kmp_info_t * th;
2131 
2132     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2133     KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2134     #ifdef KMP_DEBUG
2135     {
2136         const char * buff;
2137         // create format specifiers before the debug output
2138         buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2139             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2140             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2141             traits_t< T >::spec );
2142         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2143         __kmp_str_free( &buff );
2144     }
2145     #endif
2146 
2147     if( __kmp_env_consistency_check ) {
2148         if( incr == 0 ) {
2149             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2150         }
2151         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2152             // The loop is illegal.
2153             // Some zero-trip loops maintained by compiler, e.g.:
2154             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2155             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2156             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2157             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2158             // Compiler does not check the following illegal loops:
2159             //   for(i=0;i<10;i+=incr) // where incr<0
2160             //   for(i=10;i>0;i-=incr) // where incr<0
2161             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2162         }
2163     }
2164     th = __kmp_threads[gtid];
2165     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2166     team = th->th.th_team;
2167     #if OMP_40_ENABLED
2168     nteams = th->th.th_teams_size.nteams;
2169     #endif
2170     team_id = team->t.t_master_tid;
2171     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2172 
2173     // compute global trip count
2174     if( incr == 1 ) {
2175         trip_count = *pupper - *plower + 1;
2176     } else if(incr == -1) {
2177         trip_count = *plower - *pupper + 1;
2178     } else {
2179         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2180     }
2181     if( trip_count <= nteams ) {
2182         KMP_DEBUG_ASSERT(
2183             __kmp_static == kmp_sch_static_greedy || \
2184             __kmp_static == kmp_sch_static_balanced
2185         ); // Unknown static scheduling type.
2186         // only some teams get single iteration, others get nothing
2187         if( team_id < trip_count ) {
2188             *pupper = *plower = *plower + team_id * incr;
2189         } else {
2190             *plower = *pupper + incr; // zero-trip loop
2191         }
2192         if( plastiter != NULL )
2193             *plastiter = ( team_id == trip_count - 1 );
2194     } else {
2195         if( __kmp_static == kmp_sch_static_balanced ) {
2196             register UT chunk = trip_count / nteams;
2197             register UT extras = trip_count % nteams;
2198             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2199             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2200             if( plastiter != NULL )
2201                 *plastiter = ( team_id == nteams - 1 );
2202         } else {
2203             register T chunk_inc_count =
2204                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2205             register T upper = *pupper;
2206             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2207                 // Unknown static scheduling type.
2208             *plower += team_id * chunk_inc_count;
2209             *pupper = *plower + chunk_inc_count - incr;
2210             // Check/correct bounds if needed
2211             if( incr > 0 ) {
2212                 if( *pupper < *plower )
2213                     *pupper = i_maxmin< T >::mx;
2214                 if( plastiter != NULL )
2215                     *plastiter = *plower <= upper && *pupper > upper - incr;
2216                 if( *pupper > upper )
2217                     *pupper = upper; // tracker C73258
2218             } else {
2219                 if( *pupper > *plower )
2220                     *pupper = i_maxmin< T >::mn;
2221                 if( plastiter != NULL )
2222                     *plastiter = *plower >= upper && *pupper < upper - incr;
2223                 if( *pupper < upper )
2224                     *pupper = upper; // tracker C73258
2225             }
2226         }
2227     }
2228 }
2229 
2230 //-----------------------------------------------------------------------------------------
2231 // Dispatch routines
2232 //    Transfer call to template< type T >
2233 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2234 //                         T lb, T ub, ST st, ST chunk )
2235 extern "C" {
2236 
2237 /*!
2238 @ingroup WORK_SHARING
2239 @{
2240 @param loc Source location
2241 @param gtid Global thread id
2242 @param schedule Schedule type
2243 @param lb  Lower bound
2244 @param ub  Upper bound
2245 @param st  Step (or increment if you prefer)
2246 @param chunk The chunk size to block with
2247 
2248 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2249 These functions are all identical apart from the types of the arguments.
2250 */
2251 
2252 void
2253 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2254                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2255 {
2256     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2257     KMP_DEBUG_ASSERT( __kmp_init_serial );
2258     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2259 }
2260 /*!
2261 See @ref __kmpc_dispatch_init_4
2262 */
2263 void
2264 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2265                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2266 {
2267     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2268     KMP_DEBUG_ASSERT( __kmp_init_serial );
2269     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2270 }
2271 
2272 /*!
2273 See @ref __kmpc_dispatch_init_4
2274 */
2275 void
2276 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2277                         kmp_int64 lb, kmp_int64 ub,
2278                         kmp_int64 st, kmp_int64 chunk )
2279 {
2280     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2281     KMP_DEBUG_ASSERT( __kmp_init_serial );
2282     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2283 }
2284 
2285 /*!
2286 See @ref __kmpc_dispatch_init_4
2287 */
2288 void
2289 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2290                          kmp_uint64 lb, kmp_uint64 ub,
2291                          kmp_int64 st, kmp_int64 chunk )
2292 {
2293     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2294     KMP_DEBUG_ASSERT( __kmp_init_serial );
2295     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2296 }
2297 
2298 /*!
2299 See @ref __kmpc_dispatch_init_4
2300 
2301 Difference from __kmpc_dispatch_init set of functions is these functions
2302 are called for composite distribute parallel for construct. Thus before
2303 regular iterations dispatching we need to calc per-team iteration space.
2304 
2305 These functions are all identical apart from the types of the arguments.
2306 */
2307 void
2308 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2309     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2310 {
2311     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2312     KMP_DEBUG_ASSERT( __kmp_init_serial );
2313     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2314     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2315 }
2316 
2317 void
2318 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2319     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2320 {
2321     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2322     KMP_DEBUG_ASSERT( __kmp_init_serial );
2323     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2324     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2325 }
2326 
2327 void
2328 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2329     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2330 {
2331     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2332     KMP_DEBUG_ASSERT( __kmp_init_serial );
2333     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2334     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2335 }
2336 
2337 void
2338 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2339     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2340 {
2341     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2342     KMP_DEBUG_ASSERT( __kmp_init_serial );
2343     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2344     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2345 }
2346 
2347 /*!
2348 @param loc Source code location
2349 @param gtid Global thread id
2350 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2351 @param p_lb   Pointer to the lower bound for the next chunk of work
2352 @param p_ub   Pointer to the upper bound for the next chunk of work
2353 @param p_st   Pointer to the stride for the next chunk of work
2354 @return one if there is work to be done, zero otherwise
2355 
2356 Get the next dynamically allocated chunk of work for this thread.
2357 If there is no more work, then the lb,ub and stride need not be modified.
2358 */
2359 int
2360 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2361                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2362 {
2363     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2364 }
2365 
2366 /*!
2367 See @ref __kmpc_dispatch_next_4
2368 */
2369 int
2370 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2371                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2372 {
2373     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2374 }
2375 
2376 /*!
2377 See @ref __kmpc_dispatch_next_4
2378 */
2379 int
2380 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2381                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2382 {
2383     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2384 }
2385 
2386 /*!
2387 See @ref __kmpc_dispatch_next_4
2388 */
2389 int
2390 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2391                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2392 {
2393     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2394 }
2395 
2396 /*!
2397 @param loc Source code location
2398 @param gtid Global thread id
2399 
2400 Mark the end of a dynamic loop.
2401 */
2402 void
2403 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2404 {
2405     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2406 }
2407 
2408 /*!
2409 See @ref __kmpc_dispatch_fini_4
2410 */
2411 void
2412 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2413 {
2414     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2415 }
2416 
2417 /*!
2418 See @ref __kmpc_dispatch_fini_4
2419 */
2420 void
2421 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2422 {
2423     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2424 }
2425 
2426 /*!
2427 See @ref __kmpc_dispatch_fini_4
2428 */
2429 void
2430 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2431 {
2432     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2433 }
2434 /*! @} */
2435 
2436 //-----------------------------------------------------------------------------------------
2437 //Non-template routines from kmp_dispatch.c used in other sources
2438 
2439 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2440     return value == checker;
2441 }
2442 
2443 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2444     return value != checker;
2445 }
2446 
2447 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2448     return value < checker;
2449 }
2450 
2451 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2452     return value >= checker;
2453 }
2454 
2455 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2456     return value <= checker;
2457 }
2458 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2459     return value == checker;
2460 }
2461 
2462 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2463     return value != checker;
2464 }
2465 
2466 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2467     return value < checker;
2468 }
2469 
2470 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2471     return value >= checker;
2472 }
2473 
2474 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2475     return value <= checker;
2476 }
2477 
2478 kmp_uint32
2479 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2480                    kmp_uint32            checker,
2481                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2482                    , void        * obj    // Higher-level synchronization object, or NULL.
2483                    )
2484 {
2485     // note: we may not belong to a team at this point
2486     register volatile kmp_uint32         * spin          = spinner;
2487     register          kmp_uint32           check         = checker;
2488     register          kmp_uint32   spins;
2489     register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2490     register          kmp_uint32           r;
2491 
2492     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2493     KMP_INIT_YIELD( spins );
2494     // main wait spin loop
2495     while(!f(r = TCR_4(*spin), check)) {
2496         KMP_FSYNC_SPIN_PREPARE( obj );
2497         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2498            It causes problems with infinite recursion because of exit lock */
2499         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2500             __kmp_abort_thread(); */
2501 
2502         /* if we have waited a bit, or are oversubscribed, yield */
2503         /* pause is in the following code */
2504         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2505         KMP_YIELD_SPIN( spins );
2506     }
2507     KMP_FSYNC_SPIN_ACQUIRED( obj );
2508     return r;
2509 }
2510 
2511 kmp_uint64
2512 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2513                     kmp_uint64            checker,
2514                     kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2515                     , void        * obj    // Higher-level synchronization object, or NULL.
2516                     )
2517 {
2518     // note: we may not belong to a team at this point
2519     register volatile kmp_uint64         * spin          = spinner;
2520     register          kmp_uint64           check         = checker;
2521     register          kmp_uint32   spins;
2522     register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2523     register          kmp_uint64           r;
2524 
2525     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2526     KMP_INIT_YIELD( spins );
2527     // main wait spin loop
2528     while(!f(r = *spin, check))
2529     {
2530         KMP_FSYNC_SPIN_PREPARE( obj );
2531         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2532            It causes problems with infinite recursion because of exit lock */
2533         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2534             __kmp_abort_thread(); */
2535 
2536         // if we are oversubscribed,
2537         // or have waited a bit (and KMP_LIBARRY=throughput, then yield
2538         // pause is in the following code
2539         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2540         KMP_YIELD_SPIN( spins );
2541     }
2542     KMP_FSYNC_SPIN_ACQUIRED( obj );
2543     return r;
2544 }
2545 
2546 } // extern "C"
2547 
2548 #ifdef KMP_GOMP_COMPAT
2549 
2550 void
2551 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2552                            kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2553                            kmp_int32 chunk, int push_ws )
2554 {
2555     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2556                                       push_ws );
2557 }
2558 
2559 void
2560 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2561                             kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2562                             kmp_int32 chunk, int push_ws )
2563 {
2564     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2565                                        push_ws );
2566 }
2567 
2568 void
2569 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2570                            kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2571                            kmp_int64 chunk, int push_ws )
2572 {
2573     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2574                                       push_ws );
2575 }
2576 
2577 void
2578 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2579                             kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2580                             kmp_int64 chunk, int push_ws )
2581 {
2582     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2583                                        push_ws );
2584 }
2585 
2586 void
2587 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2588 {
2589     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2590 }
2591 
2592 void
2593 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2594 {
2595     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2596 }
2597 
2598 void
2599 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2600 {
2601     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2602 }
2603 
2604 void
2605 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2606 {
2607     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2608 }
2609 
2610 #endif /* KMP_GOMP_COMPAT */
2611 
2612 /* ------------------------------------------------------------------------ */
2613 /* ------------------------------------------------------------------------ */
2614 
2615