1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  * $Revision: 43457 $
4  * $Date: 2014-09-17 03:57:22 -0500 (Wed, 17 Sep 2014) $
5  */
6 
7 
8 //===----------------------------------------------------------------------===//
9 //
10 //                     The LLVM Compiler Infrastructure
11 //
12 // This file is dual licensed under the MIT and the University of Illinois Open
13 // Source Licenses. See LICENSE.txt for details.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 
18 /*
19  * Dynamic scheduling initialization and dispatch.
20  *
 * NOTE: __kmp_nth is a constant inside any dispatch loop, but it may
 *       change between parallel regions.  __kmp_max_nth is the largest
 *       value __kmp_nth may take; 1 is the smallest.
24  *
25  */
26 
27 /* ------------------------------------------------------------------------ */
28 /* ------------------------------------------------------------------------ */
29 
30 #include "kmp.h"
31 #include "kmp_i18n.h"
32 #include "kmp_itt.h"
33 #include "kmp_str.h"
34 #include "kmp_error.h"
35 #include "kmp_stats.h"
36 #if KMP_OS_WINDOWS && KMP_ARCH_X86
37     #include <float.h>
38 #endif
39 
40 /* ------------------------------------------------------------------------ */
41 /* ------------------------------------------------------------------------ */
42 
43 // template for type limits
44 template< typename T >
45 struct i_maxmin {
46     static const T mx;
47     static const T mn;
48 };
49 template<>
50 struct i_maxmin< int > {
51     static const int mx = 0x7fffffff;
    static const int mn = -0x7fffffff - 1;  // INT_MIN; the literal 0x80000000 is unsigned
53 };
54 template<>
55 struct i_maxmin< unsigned int > {
56     static const unsigned int mx = 0xffffffff;
57     static const unsigned int mn = 0x00000000;
58 };
59 template<>
60 struct i_maxmin< long long > {
61     static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = -0x7fffffffffffffffLL - 1;  // LLONG_MIN; 0x8000000000000000 does not fit in long long
63 };
64 template<>
65 struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffULL;
    static const unsigned long long mn = 0x0000000000000000ULL;
68 };
69 //-------------------------------------------------------------------------
70 
71 #ifdef KMP_STATIC_STEAL_ENABLED
72 
73     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
74     template< typename T >
75     struct dispatch_private_infoXX_template {
76         typedef typename traits_t< T >::unsigned_t  UT;
77         typedef typename traits_t< T >::signed_t    ST;
78         UT count;                // unsigned
79         T  ub;
80         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
81         T  lb;
82         ST st;                   // signed
83         UT tc;                   // unsigned
84         T  static_steal_counter; // for static_steal only; maybe better to put after ub
85 
86         /* parm[1-4] are used in different ways by different scheduling algorithms */
87 
88         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
89         //    a) parm3 is properly aligned and
90         //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems better
        // when they share a cache line (not measured, though).
93 
94         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
95             T  parm1;
96             T  parm2;
97             T  parm3;
98             T  parm4;
99         };
100 
101         UT ordered_lower; // unsigned
102         UT ordered_upper; // unsigned
103         #if KMP_OS_WINDOWS
104         T  last_upper;
105         #endif /* KMP_OS_WINDOWS */
106     };
107 
108 #else /* KMP_STATIC_STEAL_ENABLED */
109 
110     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
111     template< typename T >
112     struct dispatch_private_infoXX_template {
113         typedef typename traits_t< T >::unsigned_t  UT;
114         typedef typename traits_t< T >::signed_t    ST;
115         T  lb;
116         T  ub;
117         ST st;            // signed
118         UT tc;            // unsigned
119 
120         T  parm1;
121         T  parm2;
122         T  parm3;
123         T  parm4;
124 
125         UT count;         // unsigned
126 
127         UT ordered_lower; // unsigned
128         UT ordered_upper; // unsigned
129         #if KMP_OS_WINDOWS
        T  last_upper;
131         #endif /* KMP_OS_WINDOWS */
132     };
133 
134 #endif /* KMP_STATIC_STEAL_ENABLED */
135 
136 // replaces dispatch_private_info structure and dispatch_private_info_t type
137 template< typename T >
138 struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate the alignment here; otherwise our compiler computes an incorrect structure size
140     union KMP_ALIGN_CACHE private_info_tmpl {
141         dispatch_private_infoXX_template< T > p;
142         dispatch_private_info64_t             p64;
143     } u;
144     enum sched_type schedule;  /* scheduling algorithm */
145     kmp_uint32      ordered;   /* ordered clause specified */
146     kmp_uint32      ordered_bumped;
    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
148     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
149     kmp_uint32      nomerge;   /* don't merge iters if serialized */
150     kmp_uint32      type_size;
151     enum cons_type  pushed_ws;
152 };
153 
154 
155 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
156 template< typename UT >
157 struct dispatch_shared_infoXX_template {
158     /* chunk index under dynamic, number of idle threads under static-steal;
159        iteration index otherwise */
160     volatile UT     iteration;
161     volatile UT     num_done;
162     volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
164 };
165 
166 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
167 template< typename UT >
168 struct dispatch_shared_info_template {
    // we need a union here to keep the structure size
170     union shared_info_tmpl {
171         dispatch_shared_infoXX_template< UT >  s;
172         dispatch_shared_info64_t               s64;
173     } u;
174     volatile kmp_uint32     buffer_index;
175 };
176 
177 /* ------------------------------------------------------------------------ */
178 /* ------------------------------------------------------------------------ */
179 
180 #undef USE_TEST_LOCKS
181 
182 // test_then_add template (general template should NOT be used)
183 template< typename T >
184 static __forceinline T
test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); }
186 
187 template<>
188 __forceinline kmp_int32
189 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
190 {
191     kmp_int32 r;
192     r = KMP_TEST_THEN_ADD32( p, d );
193     return r;
194 }
195 
196 template<>
197 __forceinline kmp_int64
198 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
199 {
200     kmp_int64 r;
201     r = KMP_TEST_THEN_ADD64( p, d );
202     return r;
203 }
204 
205 // test_then_inc_acq template (general template should NOT be used)
206 template< typename T >
207 static __forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); }
209 
210 template<>
211 __forceinline kmp_int32
212 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
213 {
214     kmp_int32 r;
215     r = KMP_TEST_THEN_INC_ACQ32( p );
216     return r;
217 }
218 
219 template<>
220 __forceinline kmp_int64
221 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
222 {
223     kmp_int64 r;
224     r = KMP_TEST_THEN_INC_ACQ64( p );
225     return r;
226 }
227 
228 // test_then_inc template (general template should NOT be used)
229 template< typename T >
230 static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); }
232 
233 template<>
234 __forceinline kmp_int32
235 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
236 {
237     kmp_int32 r;
238     r = KMP_TEST_THEN_INC32( p );
239     return r;
240 }
241 
242 template<>
243 __forceinline kmp_int64
244 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
245 {
246     kmp_int64 r;
247     r = KMP_TEST_THEN_INC64( p );
248     return r;
249 }
250 
251 // compare_and_swap template (general template should NOT be used)
252 template< typename T >
253 static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); }
255 
256 template<>
257 __forceinline kmp_int32
258 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
259 {
260     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
261 }
262 
263 template<>
264 __forceinline kmp_int32
265 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
266 {
267     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
268 }
269 
270 /*
271     Spin wait loop that first does pause, then yield.
    Waits until the predicate returns non-zero when called with *spinner and check.
273     Does NOT put threads to sleep.
274 #if USE_ITT_BUILD
275     Arguments:
        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
            same address, not the address of the low-level spinner.
281 #endif // USE_ITT_BUILD
282 */
283 template< typename UT >
284 // ToDo: make inline function (move to header file for icl)
285 static UT  // unsigned 4- or 8-byte type
286 __kmp_wait_yield( volatile UT * spinner,
287                   UT            checker,
288                   kmp_uint32 (* pred)( UT, UT )
289                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
290                   )
291 {
292     // note: we may not belong to a team at this point
293     register volatile UT         * spin          = spinner;
294     register          UT           check         = checker;
295     register          kmp_uint32   spins;
296     register          kmp_uint32 (*f) ( UT, UT ) = pred;
297     register          UT           r;
298 
299     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
300     KMP_INIT_YIELD( spins );
301     // main wait spin loop
302     while(!f(r = *spin, check))
303     {
304         KMP_FSYNC_SPIN_PREPARE( obj );
305         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
306            It causes problems with infinite recursion because of exit lock */
307         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
308             __kmp_abort_thread(); */
309 
        // If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput),
        // then yield; the pause is in the code below.
313         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
314         KMP_YIELD_SPIN( spins );
315     }
316     KMP_FSYNC_SPIN_ACQUIRED( obj );
317     return r;
318 }
319 
320 template< typename UT >
321 static kmp_uint32 __kmp_eq( UT value, UT checker) {
322     return value == checker;
323 }
324 
325 template< typename UT >
326 static kmp_uint32 __kmp_neq( UT value, UT checker) {
327     return value != checker;
328 }
329 
330 template< typename UT >
331 static kmp_uint32 __kmp_lt( UT value, UT checker) {
332     return value < checker;
333 }
334 
335 template< typename UT >
336 static kmp_uint32 __kmp_ge( UT value, UT checker) {
337     return value >= checker;
338 }
339 
340 template< typename UT >
341 static kmp_uint32 __kmp_le( UT value, UT checker) {
342     return value <= checker;
343 }
344 
345 
346 /* ------------------------------------------------------------------------ */
347 /* ------------------------------------------------------------------------ */
348 
349 static void
350 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
351 {
352     kmp_info_t *th;
353 
354     KMP_DEBUG_ASSERT( gtid_ref );
355 
356     if ( __kmp_env_consistency_check ) {
357         th = __kmp_threads[*gtid_ref];
358         if ( th -> th.th_root -> r.r_active
359           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
360             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
361         }
362     }
363 }
364 
365 template< typename UT >
366 static void
367 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
368 {
369     typedef typename traits_t< UT >::signed_t    ST;
370     dispatch_private_info_template< UT > * pr;
371 
372     int gtid = *gtid_ref;
373 //    int  cid = *cid_ref;
374     kmp_info_t *th = __kmp_threads[ gtid ];
375     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
376 
377     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
378     if ( __kmp_env_consistency_check ) {
379         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
380             ( th -> th.th_dispatch -> th_dispatch_pr_current );
381         if ( pr -> pushed_ws != ct_none ) {
382             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
383         }
384     }
385 
386     if ( ! th -> th.th_team -> t.t_serialized ) {
387         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
388             ( th -> th.th_dispatch -> th_dispatch_sh_current );
389         UT  lower;
390 
391         if ( ! __kmp_env_consistency_check ) {
392                 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
393                     ( th -> th.th_dispatch -> th_dispatch_pr_current );
394         }
395         lower = pr->u.p.ordered_lower;
396 
397         #if ! defined( KMP_GOMP_COMPAT )
398             if ( __kmp_env_consistency_check ) {
399                 if ( pr->ordered_bumped ) {
400                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
401                     __kmp_error_construct2(
402                         kmp_i18n_msg_CnsMultipleNesting,
403                         ct_ordered_in_pdo, loc_ref,
404                         & p->stack_data[ p->w_top ]
405                     );
406                 }
407             }
408         #endif /* !defined(KMP_GOMP_COMPAT) */
409 
410         KMP_MB();
411         #ifdef KMP_DEBUG
412         {
413             const char * buff;
414             // create format specifiers before the debug output
415             buff = __kmp_str_format(
416                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
417                 traits_t< UT >::spec, traits_t< UT >::spec );
418             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
419             __kmp_str_free( &buff );
420         }
421         #endif
422 
423         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
424                                 USE_ITT_BUILD_ARG( NULL )
425                                 );
426         KMP_MB();  /* is this necessary? */
427         #ifdef KMP_DEBUG
428         {
429             const char * buff;
430             // create format specifiers before the debug output
431             buff = __kmp_str_format(
432                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
433                 traits_t< UT >::spec, traits_t< UT >::spec );
434             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
435             __kmp_str_free( &buff );
436         }
437         #endif
438     }
439     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
440 }
441 
442 static void
443 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
444 {
445     kmp_info_t *th;
446 
447     if ( __kmp_env_consistency_check ) {
448         th = __kmp_threads[*gtid_ref];
449         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
450             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
451         }
452     }
453 }
454 
455 template< typename UT >
456 static void
457 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
458 {
459     typedef typename traits_t< UT >::signed_t    ST;
460     dispatch_private_info_template< UT > * pr;
461 
462     int gtid = *gtid_ref;
463 //    int  cid = *cid_ref;
464     kmp_info_t *th = __kmp_threads[ gtid ];
465     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
466 
467     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
468     if ( __kmp_env_consistency_check ) {
469         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
470             ( th -> th.th_dispatch -> th_dispatch_pr_current );
471         if ( pr -> pushed_ws != ct_none ) {
472             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
473         }
474     }
475 
476     if ( ! th -> th.th_team -> t.t_serialized ) {
477         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
478             ( th -> th.th_dispatch -> th_dispatch_sh_current );
479 
480         if ( ! __kmp_env_consistency_check ) {
481             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
482                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
483         }
484 
485         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
486         #if ! defined( KMP_GOMP_COMPAT )
487             if ( __kmp_env_consistency_check ) {
488                 if ( pr->ordered_bumped != 0 ) {
489                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
490                     /* How to test it? - OM */
491                     __kmp_error_construct2(
492                         kmp_i18n_msg_CnsMultipleNesting,
493                         ct_ordered_in_pdo, loc_ref,
494                         & p->stack_data[ p->w_top ]
495                     );
496                 }
497             }
498         #endif /* !defined(KMP_GOMP_COMPAT) */
499 
500         KMP_MB();       /* Flush all pending memory write invalidates.  */
501 
502         pr->ordered_bumped += 1;
503 
504         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
505                         gtid, pr->ordered_bumped ) );
506 
507         KMP_MB();       /* Flush all pending memory write invalidates.  */
508 
509         /* TODO use general release procedure? */
510         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
511 
512         KMP_MB();       /* Flush all pending memory write invalidates.  */
513     }
514     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
515 }
516 
/* Computes and returns x to the power of y, where y must be a non-negative integer */
518 template< typename UT >
519 static __forceinline long double
520 __kmp_pow(long double x, UT y) {
521     long double s=1.0L;
522 
523     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
524     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
525     while(y) {
526         if ( y & 1 )
527             s *= x;
528         x *= x;
529         y >>= 1;
530     }
531     return s;
532 }
533 
/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline appears to be broken here: if this function is __forceinline'd, the behavior is
   wrong (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
*/
539 template< typename T >
540 static __inline typename traits_t< T >::unsigned_t
541 __kmp_dispatch_guided_remaining(
542     T                                  tc,
543     typename traits_t< T >::floating_t base,
544     typename traits_t< T >::unsigned_t idx
545 ) {
546     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
547        least for ICL 8.1, long double arithmetic may not really have
548        long double precision, even with /Qlong_double.  Currently, we
549        workaround that in the caller code, by manipulating the FPCW for
550        Windows* OS on IA-32 architecture.  The lack of precision is not
551        expected to be a correctness issue, though.
552     */
553     typedef typename traits_t< T >::unsigned_t  UT;
554 
555     long double x = tc * __kmp_pow< UT >(base, idx);
556     UT r = (UT) x;
557     if ( x == r )
558         return r;
559     return r + 1;
560 }
561 
562 // Parameters of the guided-iterative algorithm:
563 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining-iterations multiplier
// By default n = 2. With n = 3, for example, the chunk distribution is flatter;
// with n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5; // = 1.0 / guided_int_param
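
// Worked example (illustrative numbers): with the default n = 2, nproc = 4, chunk = 7:
//   p2 = 2 * 4 * (7 + 1) = 64     // switch to dynamic once ~64 iterations remain
//   p3 = 0.5 / 4         = 0.125  // each thread grabs ~1/8 of the remaining iterations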
569 
570 // UT - unsigned flavor of T, ST - signed flavor of T,
571 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
572 template< typename T >
573 static void
574 __kmp_dispatch_init(
575     ident_t                        * loc,
576     int                              gtid,
577     enum sched_type                  schedule,
578     T                                lb,
579     T                                ub,
580     typename traits_t< T >::signed_t st,
581     typename traits_t< T >::signed_t chunk,
582     int                              push_ws
583 ) {
584     typedef typename traits_t< T >::unsigned_t  UT;
585     typedef typename traits_t< T >::signed_t    ST;
586     typedef typename traits_t< T >::floating_t  DBL;
587     static const int ___kmp_size_type = sizeof( UT );
588 
589     int                                            active;
590     T                                              tc;
591     kmp_info_t *                                   th;
592     kmp_team_t *                                   team;
593     kmp_uint32                                     my_buffer_index;
594     dispatch_private_info_template< T >          * pr;
595     dispatch_shared_info_template< UT > volatile * sh;
596 
597     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
598     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
599 
600     if ( ! TCR_4( __kmp_init_parallel ) )
601         __kmp_parallel_initialize();
602 
603 #if INCLUDE_SSC_MARKS
604     SSC_MARK_DISPATCH_INIT();
605 #endif
606     #ifdef KMP_DEBUG
607     {
608         const char * buff;
609         // create format specifiers before the debug output
610         buff = __kmp_str_format(
611             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
612             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
613         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
614         __kmp_str_free( &buff );
615     }
616     #endif
617     /* setup data */
618     th     = __kmp_threads[ gtid ];
619     team   = th -> th.th_team;
620     active = ! team -> t.t_serialized;
621     th->th.th_ident = loc;
622 
623 #if USE_ITT_BUILD
624     kmp_uint64 cur_chunk = chunk;
625 #endif
626     if ( ! active ) {
627         pr = reinterpret_cast< dispatch_private_info_template< T >* >
628             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
629     } else {
630         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
631                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
632 
633         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
634 
        /* What happens when the number of threads changes? Do we need to resize the buffer? */
636         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
637             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
638         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
639             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
640     }
641 
642     /* Pick up the nomerge/ordered bits from the scheduling type */
643     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
644         pr->nomerge = TRUE;
645         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
646     } else {
647         pr->nomerge = FALSE;
648     }
649     pr->type_size = ___kmp_size_type; // remember the size of variables
650     if ( kmp_ord_lower & schedule ) {
651         pr->ordered = TRUE;
652         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
653     } else {
654         pr->ordered = FALSE;
655     }
656     if ( schedule == kmp_sch_static ) {
657         schedule = __kmp_static;
658     } else {
659         if ( schedule == kmp_sch_runtime ) {
660             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
661             schedule = team -> t.t_sched.r_sched_type;
662             // Detail the schedule if needed (global controls are differentiated appropriately)
663             if ( schedule == kmp_sch_guided_chunked ) {
664                 schedule = __kmp_guided;
665             } else if ( schedule == kmp_sch_static ) {
666                 schedule = __kmp_static;
667             }
668             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
669             chunk = team -> t.t_sched.chunk;
670 
671             #ifdef KMP_DEBUG
672             {
673                 const char * buff;
674                 // create format specifiers before the debug output
675                 buff = __kmp_str_format(
676                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
677                     traits_t< ST >::spec );
678                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
679                 __kmp_str_free( &buff );
680             }
681             #endif
682         } else {
683             if ( schedule == kmp_sch_guided_chunked ) {
684                 schedule = __kmp_guided;
685             }
686             if ( chunk <= 0 ) {
687                 chunk = KMP_DEFAULT_CHUNK;
688             }
689         }
690 
691         if ( schedule == kmp_sch_auto ) {
692             // mapping and differentiation: in the __kmp_do_serial_initialize()
693             schedule = __kmp_auto;
694             #ifdef KMP_DEBUG
695             {
696                 const char * buff;
697                 // create format specifiers before the debug output
698                 buff = __kmp_str_format(
699                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
700                     traits_t< ST >::spec );
701                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
702                 __kmp_str_free( &buff );
703             }
704             #endif
705         }
706 
707         /* guided analytical not safe for too many threads */
708         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
709             schedule = kmp_sch_guided_iterative_chunked;
710             KMP_WARNING( DispatchManyThreads );
711         }
712         pr->u.p.parm1 = chunk;
713     }
714     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
715                 "unknown scheduling type" );
716 
717     pr->u.p.count = 0;
718 
719     if ( __kmp_env_consistency_check ) {
720         if ( st == 0 ) {
721             __kmp_error_construct(
722                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
723                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
724             );
725         }
726     }
727 
728     tc = ( ub - lb + st );
729     if ( st != 1 ) {
730         if ( st < 0 ) {
731             if ( lb < ub ) {
732                 tc = 0;            // zero-trip
733             } else {   // lb >= ub
734                 tc = (ST)tc / st;  // convert to signed division
735             }
736         } else {       // st > 0
737             if ( ub < lb ) {
738                 tc = 0;            // zero-trip
            } else {   // ub >= lb
740                 tc /= st;
741             }
742         }
743     } else if ( ub < lb ) {        // st == 1
744         tc = 0;                    // zero-trip
745     }
746 
747     pr->u.p.lb = lb;
748     pr->u.p.ub = ub;
749     pr->u.p.st = st;
750     pr->u.p.tc = tc;
751 
752     #if KMP_OS_WINDOWS
753     pr->u.p.last_upper = ub + st;
754     #endif /* KMP_OS_WINDOWS */
755 
    /* NOTE: only the active parallel region(s) have active ordered sections */
757 
758     if ( active ) {
759         if ( pr->ordered == 0 ) {
760             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
761             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
762         } else {
763             pr->ordered_bumped = 0;
764 
765             pr->u.p.ordered_lower = 1;
766             pr->u.p.ordered_upper = 0;
767 
768             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
769             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
770         }
771     }
772 
773     if ( __kmp_env_consistency_check ) {
774         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
775         if ( push_ws ) {
776             __kmp_push_workshare( gtid, ws, loc );
777             pr->pushed_ws = ws;
778         } else {
779             __kmp_check_workshare( gtid, ws, loc );
780             pr->pushed_ws = ct_none;
781         }
782     }
783 
784     switch ( schedule ) {
785     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
786     case kmp_sch_static_steal:
787         {
788             T nproc = team->t.t_nproc;
789             T ntc, init;
790 
791             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
792 
793             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
794             if ( nproc > 1 && ntc >= nproc ) {
795                 T id = __kmp_tid_from_gtid(gtid);
796                 T small_chunk, extras;
797 
798                 small_chunk = ntc / nproc;
799                 extras = ntc % nproc;
800 
801                 init = id * small_chunk + ( id < extras ? id : extras );
802                 pr->u.p.count = init;
803                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
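
                // Worked example (illustrative numbers): tc = 100, chunk = 7 gives
                // ntc = 15 chunks; with nproc = 4, small_chunk = 3 and extras = 3,
                // so threads 0..2 initially own 4 chunks each and thread 3 owns 3
                // (chunk-index ranges [0,4), [4,8), [8,12), [12,15)).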
804 
805                 pr->u.p.parm2 = lb;
806                 //pr->pfields.parm3 = 0; // it's not used in static_steal
807                 pr->u.p.parm4 = id;
808                 pr->u.p.st = st;
809                 break;
810             } else {
811                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
812                                gtid ) );
813                 schedule = kmp_sch_static_balanced;
814                 /* too few iterations: fall-through to kmp_sch_static_balanced */
815             } // if
816             /* FALL-THROUGH to static balanced */
817         } // case
818     #endif
819     case kmp_sch_static_balanced:
820         {
821             T nproc = team->t.t_nproc;
822             T init, limit;
823 
824             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
825                             gtid ) );
826 
827             if ( nproc > 1 ) {
828                 T id = __kmp_tid_from_gtid(gtid);
829 
830                 if ( tc < nproc ) {
831                     if ( id < tc ) {
832                         init = id;
833                         limit = id;
834                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
835                     } else {
836                         pr->u.p.count = 1;  /* means no more chunks to execute */
837                         pr->u.p.parm1 = FALSE;
838                         break;
839                     }
840                 } else {
841                     T small_chunk = tc / nproc;
842                     T extras = tc % nproc;
843                     init = id * small_chunk + (id < extras ? id : extras);
844                     limit = init + small_chunk - (id < extras ? 0 : 1);
845                     pr->u.p.parm1 = (id == nproc - 1);
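
                    // Worked example (illustrative numbers): tc = 10, nproc = 4 gives
                    // small_chunk = 2, extras = 2; the threads get iteration ranges
                    // [0,2], [3,5], [6,7], [8,9] -- the first 'extras' threads take one
                    // extra iteration, and only thread nproc-1 reports lastprivate.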
846                 }
847             } else {
848                 if ( tc > 0 ) {
849                     init = 0;
850                     limit = tc - 1;
851                     pr->u.p.parm1 = TRUE;
852                 } else {
853                     // zero trip count
854                     pr->u.p.count = 1;  /* means no more chunks to execute */
855                     pr->u.p.parm1 = FALSE;
856                     break;
857                 }
858             }
859 #if USE_ITT_BUILD
860             // Calculate chunk for metadata report
861             if(  __itt_metadata_add_ptr  && __kmp_forkjoin_frames_mode == 3 ) {
862                 cur_chunk = limit - init + 1;
863             }
864 #endif
865             if ( st == 1 ) {
866                 pr->u.p.lb = lb + init;
867                 pr->u.p.ub = lb + limit;
868             } else {
869                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
870                 pr->u.p.lb = lb + init * st;
871                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
872                 if ( st > 0 ) {
873                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
874                 } else {
875                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
876                 }
877             }
878             if ( pr->ordered ) {
879                 pr->u.p.ordered_lower = init;
880                 pr->u.p.ordered_upper = limit;
881             }
882             break;
883         } // case
884     case kmp_sch_guided_iterative_chunked :
885         {
886             T nproc = team->t.t_nproc;
887             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
888 
889             if ( nproc > 1 ) {
890                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
891                     /* chunk size too large, switch to dynamic */
892                     schedule = kmp_sch_dynamic_chunked;
893                 } else {
894                     // when remaining iters become less than parm2 - switch to dynamic
895                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
896                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
897                 }
898             } else {
899                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
900                 schedule = kmp_sch_static_greedy;
901                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
902                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
903                 pr->u.p.parm1 = tc;
904             } // if
905         } // case
906         break;
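
    // Note on the switch above: the (2*chunk + 1) * nproc >= tc test is a heuristic.
    // For example (illustrative numbers), chunk = 100, nproc = 8, tc = 1000 gives
    // (2*100 + 1) * 8 = 1608 >= 1000, so there is too little work for guided scheduling
    // to taper and plain dynamic chunking is used instead.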
907     case kmp_sch_guided_analytical_chunked:
908         {
909             T nproc = team->t.t_nproc;
910             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
911 
912             if ( nproc > 1 ) {
913                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
914                     /* chunk size too large, switch to dynamic */
915                     schedule = kmp_sch_dynamic_chunked;
916                 } else {
917                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
918                     DBL x;
919 
920                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
931                     // save original FPCW and set precision to 64-bit, as
932                     // Windows* OS on IA-32 architecture defaults to 53-bit
933                     unsigned int oldFpcw = _control87(0,0);
934                     _control87(_PC_64,_MCW_PC); // 0,0x30000
935                     #endif
936                     /* value used for comparison in solver for cross-over point */
937                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
938 
                    /* crossover point--chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
941                     UT   cross;
942 
943                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
944                     x = (long double)1.0 - (long double)0.5 / nproc;
945 
946                     #ifdef KMP_DEBUG
947                     { // test natural alignment
948                         struct _test_a {
949                             char a;
950                             union {
951                                 char b;
952                                 DBL  d;
953                             };
954                         } t;
955                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
956                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
957                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
958                     }
959                     #endif // KMP_DEBUG
960 
961                     /* save the term in thread private dispatch structure */
962                     *(DBL*)&pr->u.p.parm3 = x;
963 
964                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
965                     {
966                         UT          left, right, mid;
967                         long double p;
968 
969                         /* estimate initial upper and lower bound */
970 
971                         /* doesn't matter what value right is as long as it is positive, but
972                            it affects performance of the solver
973                         */
974                         right = 229;
975                         p = __kmp_pow< UT >(x,right);
976                         if ( p > target ) {
977                             do{
978                                 p *= p;
979                                 right <<= 1;
980                             } while(p>target && right < (1<<27));
981                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
982                         } else {
983                             left = 0;
984                         }
985 
986                         /* bisection root-finding method */
987                         while ( left + 1 < right ) {
988                             mid = (left + right) / 2;
989                             if ( __kmp_pow< UT >(x,mid) > target ) {
990                                 left = mid;
991                             } else {
992                                 right = mid;
993                             }
994                         } // while
995                         cross = right;
996                     }
997                     /* assert sanity of computed crossover point */
998                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
999 
1000                     /* save the crossover point in thread private dispatch structure */
1001                     pr->u.p.parm2 = cross;
1002 
1003                     // C75803
1004                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1005                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1006                     #else
1007                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1008                     #endif
1009                     /* dynamic-style scheduling offset */
1010                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1011                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1012                         // restore FPCW
1013                         _control87(oldFpcw,_MCW_PC);
1014                     #endif
1015                 } // if
1016             } else {
1017                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1018                                gtid ) );
1019                 schedule = kmp_sch_static_greedy;
1020                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1021                 pr->u.p.parm1 = tc;
1022             } // if
1023         } // case
1024         break;
1025     case kmp_sch_static_greedy:
1026         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1027             pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1028                 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1029                 tc;
1030         break;
1031     case kmp_sch_static_chunked :
1032     case kmp_sch_dynamic_chunked :
1033         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1034         break;
1035     case kmp_sch_trapezoidal :
1036         {
1037             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1038 
1039             T parm1, parm2, parm3, parm4;
1040             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1041 
1042             parm1 = chunk;
1043 
1044             /* F : size of the first cycle */
1045             parm2 = ( tc / (2 * team->t.t_nproc) );
1046 
1047             if ( parm2 < 1 ) {
1048                 parm2 = 1;
1049             }
1050 
1051             /* L : size of the last cycle.  Make sure the last cycle
1052              *     is not larger than the first cycle.
1053              */
1054             if ( parm1 < 1 ) {
1055                 parm1 = 1;
1056             } else if ( parm1 > parm2 ) {
1057                 parm1 = parm2;
1058             }
1059 
1060             /* N : number of cycles */
1061             parm3 = ( parm2 + parm1 );
1062             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1063 
1064             if ( parm3 < 2 ) {
1065                 parm3 = 2;
1066             }
1067 
1068             /* sigma : decreasing incr of the trapezoid */
1069             parm4 = ( parm3 - 1 );
1070             parm4 = ( parm2 - parm1 ) / parm4;
1071 
1072             // pointless check, because parm4 >= 0 always
1073             //if ( parm4 < 0 ) {
1074             //    parm4 = 0;
1075             //}
1076 
1077             pr->u.p.parm1 = parm1;
1078             pr->u.p.parm2 = parm2;
1079             pr->u.p.parm3 = parm3;
1080             pr->u.p.parm4 = parm4;
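
            // Worked example (illustrative numbers): tc = 100, nproc = 2, chunk = 1:
            //   F     = parm2 = 100 / (2*2)         = 25  (first-cycle size)
            //   L     = parm1 = 1                         (last-cycle size)
            //   N     = parm3 = (200 + 26 - 1) / 26 = 8   (number of cycles)
            //   sigma = parm4 = (25 - 1) / (8 - 1)  = 3   (per-cycle decrement)
            // Cycle sizes 25,22,19,16,13,10,7,4 sum to 116 >= tc; the final chunk is
            // clamped to the trip count at dispatch time.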
1081         } // case
1082         break;
1083 
1084     default:
1085         {
1086             __kmp_msg(
1087                 kmp_ms_fatal,                        // Severity
1088                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1089                 KMP_HNT( GetNewerLibrary ),          // Hint
1090                 __kmp_msg_null                       // Variadic argument list terminator
1091             );
1092         }
1093         break;
1094     } // switch
1095     pr->schedule = schedule;
1096     if ( active ) {
        /* Wait until the shared buffer's index equals my_buffer_index; only then is the buffer free for us to use */
1098 
1099         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1100                         gtid, my_buffer_index, sh->buffer_index) );
1101         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1102                                         USE_ITT_BUILD_ARG( NULL )
1103                                         );
        // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and my_buffer_index are
        // *always* 32-bit integers.
1106         KMP_MB();  /* is this necessary? */
1107         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1108                         gtid, my_buffer_index, sh->buffer_index) );
1109 
1110         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1111         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1112 #if USE_ITT_BUILD
1113         if ( pr->ordered ) {
1114             __kmp_itt_ordered_init( gtid );
1115         }; // if
1116 #endif /* USE_ITT_BUILD */
1117     }; // if
1118 
1119 #if USE_ITT_BUILD
1120     // Report loop metadata
1121     if( __itt_metadata_add_ptr  && __kmp_forkjoin_frames_mode == 3 ) {
1122         kmp_uint32 tid  = __kmp_tid_from_gtid( gtid );
1123         if (KMP_MASTER_TID(tid)) {
1124             kmp_uint64 schedtype = 0;
1125 
1126             switch ( schedule ) {
1127             case kmp_sch_static_chunked:
            case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1129                 break;
1130             case kmp_sch_static_greedy:
1131                 cur_chunk = pr->u.p.parm1;
1132                 break;
1133             case kmp_sch_dynamic_chunked:
1134                 schedtype = 1;
1135                 break;
1136             case kmp_sch_guided_iterative_chunked:
1137             case kmp_sch_guided_analytical_chunked:
1138                 schedtype = 2;
1139                 break;
1140             default:
1141 //            Should we put this case under "static"?
1142 //            case kmp_sch_static_steal:
1143                 schedtype = 3;
1144                 break;
1145             }
1146             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1147         }
1148     }
1149 #endif /* USE_ITT_BUILD */
1150 
1151     #ifdef KMP_DEBUG
1152     {
1153         const char * buff;
1154         // create format specifiers before the debug output
1155         buff = __kmp_str_format(
1156             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1157             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1158             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1159             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1160             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1161             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1162             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1163         KD_TRACE(10, ( buff,
1164             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1165             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1166             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1167             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1168         __kmp_str_free( &buff );
1169     }
1170     #endif
1171     #if ( KMP_STATIC_STEAL_ENABLED )
1172     if ( ___kmp_size_type < 8 ) {
      // It cannot be guaranteed that after execution of a loop with some other schedule kind
      // all the parm3 variables will contain the same value.
      // Even if all parm3 values were the same, a bad case could still arise, e.g. reusing
      // the values 0 and 1 rather than a counter incremented over the program's lifetime.
      // So a dedicated variable is required; 'static_steal_counter' is used.
1178       if( schedule == kmp_sch_static_steal ) {
1179         // Other threads will inspect this variable when searching for a victim.
        // This is a flag showing that other threads may steal from this thread from now on.
1181         volatile T * p = &pr->u.p.static_steal_counter;
1182         *p = *p + 1;
1183       }
1184     }
    #endif // KMP_STATIC_STEAL_ENABLED
1186 }
1187 
1188 /*
1189  * For ordered loops, either __kmp_dispatch_finish() should be called after
1190  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1191  * every chunk of iterations.  If the ordered section(s) were not executed
1192  * for this iteration (or every iteration in this chunk), we need to set the
1193  * ordered iteration counters so that the next thread can proceed.
1194  */
1195 template< typename UT >
1196 static void
1197 __kmp_dispatch_finish( int gtid, ident_t *loc )
1198 {
1199     typedef typename traits_t< UT >::signed_t ST;
1200     kmp_info_t *th = __kmp_threads[ gtid ];
1201 
1202     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1203     if ( ! th -> th.th_team -> t.t_serialized ) {
1204 
1205         dispatch_private_info_template< UT > * pr =
1206             reinterpret_cast< dispatch_private_info_template< UT >* >
1207             ( th->th.th_dispatch->th_dispatch_pr_current );
1208         dispatch_shared_info_template< UT > volatile * sh =
1209             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1210             ( th->th.th_dispatch->th_dispatch_sh_current );
1211         KMP_DEBUG_ASSERT( pr );
1212         KMP_DEBUG_ASSERT( sh );
1213         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1214                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1215 
1216         if ( pr->ordered_bumped ) {
1217             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1218                             gtid ) );
1219             pr->ordered_bumped = 0;
1220         } else {
1221             UT lower = pr->u.p.ordered_lower;
1222 
1223             #ifdef KMP_DEBUG
1224             {
1225                 const char * buff;
1226                 // create format specifiers before the debug output
1227                 buff = __kmp_str_format(
1228                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1229                     traits_t< UT >::spec, traits_t< UT >::spec );
1230                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1231                 __kmp_str_free( &buff );
1232             }
1233             #endif
1234 
1235             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1236                                    USE_ITT_BUILD_ARG(NULL)
1237                                    );
1238             KMP_MB();  /* is this necessary? */
1239             #ifdef KMP_DEBUG
1240             {
1241                 const char * buff;
1242                 // create format specifiers before the debug output
1243                 buff = __kmp_str_format(
1244                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1245                     traits_t< UT >::spec, traits_t< UT >::spec );
1246                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1247                 __kmp_str_free( &buff );
1248             }
1249             #endif
1250 
1251             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1252         } // if
1253     } // if
1254     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1255 }
1256 
1257 #ifdef KMP_GOMP_COMPAT
1258 
1259 template< typename UT >
1260 static void
1261 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1262 {
1263     typedef typename traits_t< UT >::signed_t ST;
1264     kmp_info_t *th = __kmp_threads[ gtid ];
1265 
1266     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1267     if ( ! th -> th.th_team -> t.t_serialized ) {
1268 //        int cid;
1269         dispatch_private_info_template< UT > * pr =
1270             reinterpret_cast< dispatch_private_info_template< UT >* >
1271             ( th->th.th_dispatch->th_dispatch_pr_current );
1272         dispatch_shared_info_template< UT > volatile * sh =
1273             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1274             ( th->th.th_dispatch->th_dispatch_sh_current );
1275         KMP_DEBUG_ASSERT( pr );
1276         KMP_DEBUG_ASSERT( sh );
1277         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1278                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1279 
1280 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1281             UT lower = pr->u.p.ordered_lower;
1282             UT upper = pr->u.p.ordered_upper;
1283             UT inc = upper - lower + 1;
1284 
1285             if ( pr->ordered_bumped == inc ) {
1286                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1287                   gtid ) );
1288                 pr->ordered_bumped = 0;
1289             } else {
1290                 inc -= pr->ordered_bumped;
1291 
1292                 #ifdef KMP_DEBUG
1293                 {
1294                     const char * buff;
1295                     // create format specifiers before the debug output
1296                     buff = __kmp_str_format(
1297                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1298                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1299                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1300                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1301                     __kmp_str_free( &buff );
1302                 }
1303                 #endif
1304 
1305                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1306                                        USE_ITT_BUILD_ARG(NULL)
1307                                        );
1308 
1309                 KMP_MB();  /* is this necessary? */
1310                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1311                   gtid ) );
1312                 pr->ordered_bumped = 0;
1313 //!!!!! TODO check if the inc should be unsigned, or signed???
1314                 #ifdef KMP_DEBUG
1315                 {
1316                     const char * buff;
1317                     // create format specifiers before the debug output
1318                     buff = __kmp_str_format(
1319                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1320                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1321                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1322                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1323                     __kmp_str_free( &buff );
1324                 }
1325                 #endif
1326 
1327                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1328             }
1329 //        }
1330     }
1331     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1332 }
1333 
1334 #endif /* KMP_GOMP_COMPAT */
1335 
1336 template< typename T >
1337 static int
1338 __kmp_dispatch_next(
1339     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1340 ) {
1341 
1342     typedef typename traits_t< T >::unsigned_t  UT;
1343     typedef typename traits_t< T >::signed_t    ST;
1344     typedef typename traits_t< T >::floating_t  DBL;
1345     static const int ___kmp_size_type = sizeof( UT );
1346 
1347     int                                   status;
1348     dispatch_private_info_template< T > * pr;
1349     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1350     kmp_team_t                          * team = th -> th.th_team;
1351 
1352     KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1353     #ifdef KMP_DEBUG
1354     {
1355         const char * buff;
1356         // create format specifiers before the debug output
1357         buff = __kmp_str_format(
1358             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1359             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1360         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1361         __kmp_str_free( &buff );
1362     }
1363     #endif
1364 
1365     if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1367         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1368             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1369         KMP_DEBUG_ASSERT( pr );
1370 
1371         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1372             *p_lb = 0;
1373             *p_ub = 0;
1374 //            if ( p_last != NULL )
1375 //                *p_last = 0;
1376             if ( p_st != NULL )
1377                 *p_st = 0;
1378             if ( __kmp_env_consistency_check ) {
1379                 if ( pr->pushed_ws != ct_none ) {
1380                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1381                 }
1382             }
1383         } else if ( pr->nomerge ) {
1384             kmp_int32 last;
1385             T         start;
1386             UT        limit, trip, init;
1387             ST        incr;
1388             T         chunk = pr->u.p.parm1;
1389 
1390             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1391 
1392             init = chunk * pr->u.p.count++;
1393             trip = pr->u.p.tc - 1;
1394 
1395             if ( (status = (init <= trip)) == 0 ) {
1396                 *p_lb = 0;
1397                 *p_ub = 0;
1398 //                if ( p_last != NULL )
1399 //                    *p_last = 0;
1400                 if ( p_st != NULL )
1401                     *p_st = 0;
1402                 if ( __kmp_env_consistency_check ) {
1403                     if ( pr->pushed_ws != ct_none ) {
1404                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1405                     }
1406                 }
1407             } else {
1408                 start = pr->u.p.lb;
1409                 limit = chunk + init - 1;
1410                 incr  = pr->u.p.st;
1411 
1412                 if ( (last = (limit >= trip)) != 0 ) {
1413                     limit = trip;
1414                     #if KMP_OS_WINDOWS
1415                     pr->u.p.last_upper = pr->u.p.ub;
1416                     #endif /* KMP_OS_WINDOWS */
1417                 }
1418                 if ( p_last != NULL )
1419                     *p_last = last;
1420                 if ( p_st != NULL )
1421                     *p_st = incr;
1422                 if ( incr == 1 ) {
1423                     *p_lb = start + init;
1424                     *p_ub = start + limit;
1425                 } else {
1426                     *p_lb = start + init * incr;
1427                     *p_ub = start + limit * incr;
1428                 }
1429 
1430                 if ( pr->ordered ) {
1431                     pr->u.p.ordered_lower = init;
1432                     pr->u.p.ordered_upper = limit;
1433                     #ifdef KMP_DEBUG
1434                     {
1435                         const char * buff;
1436                         // create format specifiers before the debug output
1437                         buff = __kmp_str_format(
1438                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1439                             traits_t< UT >::spec, traits_t< UT >::spec );
1440                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1441                         __kmp_str_free( &buff );
1442                     }
1443                     #endif
1444                 } // if
1445             } // if
1446         } else {
1447             pr->u.p.tc = 0;
1448             *p_lb = pr->u.p.lb;
1449             *p_ub = pr->u.p.ub;
1450             #if KMP_OS_WINDOWS
1451             pr->u.p.last_upper = *p_ub;
1452             #endif /* KMP_OS_WINDOWS */
1453             if ( p_last != NULL )
1454                 *p_last = TRUE;
1455             if ( p_st != NULL )
1456                 *p_st = pr->u.p.st;
1457         } // if
1458         #ifdef KMP_DEBUG
1459         {
1460             const char * buff;
1461             // create format specifiers before the debug output
1462             buff = __kmp_str_format(
1463                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1464                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1465                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1466             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1467             __kmp_str_free( &buff );
1468         }
1469         #endif
1470 #if INCLUDE_SSC_MARKS
1471         SSC_MARK_DISPATCH_NEXT();
1472 #endif
1473         return status;
1474     } else {
1475         kmp_int32 last = 0;
1476         dispatch_shared_info_template< UT > *sh;
1477         T         start;
1478         ST        incr;
1479         UT        limit, trip, init;
1480 
1481         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1482                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1483 
1484         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1485             ( th->th.th_dispatch->th_dispatch_pr_current );
1486         KMP_DEBUG_ASSERT( pr );
1487         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1488             ( th->th.th_dispatch->th_dispatch_sh_current );
1489         KMP_DEBUG_ASSERT( sh );
1490 
1491         if ( pr->u.p.tc == 0 ) {
1492             // zero trip count
1493             status = 0;
1494         } else {
1495             switch (pr->schedule) {
1496             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1497             case kmp_sch_static_steal:
1498                 {
1499                     T chunk = pr->u.p.parm1;
1500 
1501                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1502 
1503                     trip = pr->u.p.tc - 1;
1504 
1505                     if ( ___kmp_size_type > 4 ) {
                        // Other threads do not look at this thread's data here,
                        //  so a volatile cast is not necessary.
1508                         init   = ( pr->u.p.count )++;
1509                         status = ( init < (UT)pr->u.p.ub );
1510                     } else {
1511                         typedef union {
1512                             struct {
1513                                 UT count;
1514                                 T  ub;
1515                             } p;
1516                             kmp_int64 b;
1517                         } union_i4;
                        // All operations on 'count' and 'ub' must be performed together atomically.
                        // Stealing is implemented only for 4-byte indexes.
1520                         {
1521                             union_i4 vold, vnew;
1522                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1523                             vnew = vold;
1524                             vnew.p.count++;
1525                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1526                                         ( volatile kmp_int64* )&pr->u.p.count,
1527                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1528                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1529                                 KMP_CPU_PAUSE();
1530                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1531                                 vnew = vold;
1532                                 vnew.p.count++;
1533                             }
1534                             vnew = vold;
1535                             init   = vnew.p.count;
1536                             status = ( init < (UT)vnew.p.ub ) ;
1537                         }
1538 
1539                         if( !status ) {
1540                             kmp_info_t   **other_threads = team->t.t_threads;
1541                             int          while_limit = 10;
1542                             int          while_index = 0;
1543 
                            // TODO: the victim-search algorithm should be
                            // cleaned up and measured
1546                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1547                                 union_i4  vold, vnew;
1548                                 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1549                                 T         victimIdx    = pr->u.p.parm4;
1550                                 T         oldVictimIdx = victimIdx;
1551                                 dispatch_private_info_template< T > * victim;
1552 
1553                                 do {
1554                                     if( !victimIdx ) {
1555                                         victimIdx = team->t.t_nproc - 1;
1556                                     } else {
1557                                         --victimIdx;
1558                                     }
1559                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1560                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1561                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
                                // TODO: think about a proper place for this test
1563                                 if ( ( !victim ) ||
1564                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1565                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
                                    // The victim is not yet ready to participate in stealing
                                    // because it is still in __kmp_dispatch_init.
                                    // TODO: a delay would be nice here.
                                    continue;
1570                                 }
1571                                 if ( oldVictimIdx == victimIdx ) {
1572                                     break;
1573                                 }
1574                                 pr->u.p.parm4 = victimIdx;
1575 
1576                                 while( 1 ) {
1577                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1578                                     vnew = vold;
1579 
1580                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1581                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1582                                         break;
1583                                     }
1584                                     vnew.p.ub -= (remaining >> 2);
1585                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1586                                     #pragma warning( push )
1587                                     // disable warning on pointless comparison of unsigned with 0
1588                                     #pragma warning( disable: 186 )
1589                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1590                                     #pragma warning( pop )
1591                                     // TODO: Should this be acquire or release?
1592                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1593                                             ( volatile kmp_int64 * )&victim->u.p.count,
1594                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1595                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1596                                         status = 1;
1597                                         while_index = 0;
1598                                         // now update own count and ub
1599                                         #if KMP_ARCH_X86
                                        // Stealing is normally executed on non-KMP_ARCH_X86 targets only.
                                            // An atomic 64-bit write is unavailable on IA-32,
                                            // so we update 'count' and 'ub' in separate steps.
                                            // This code is not tested.
1604                                             init = vold.p.count;
1605                                             pr->u.p.ub = 0;
1606                                             pr->u.p.count = init + 1;
1607                                             pr->u.p.ub = vnew.p.count;
1608                                         #else
1609                                             init = vnew.p.ub;
1610                                             vold.p.count = init + 1;
1611                                             // TODO: is it safe and enough?
1612                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1613                                         #endif // KMP_ARCH_X86
1614                                         break;
1615                                     } // if
1616                                 KMP_CPU_PAUSE();
1617                                 } // while (1)
1618                             } // while
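                            // Stealing policy sketch (read off the code above, with
                            // illustrative numbers): a thief removes a quarter of the
                            // victim's remaining chunks in one CAS.  E.g., a victim at
                            // (count=10, ub=30) has remaining = 20, so the thief lowers
                            // the victim's ub to 25, returns chunk 25 itself, and keeps
                            // (count=26, ub=30) privately for its subsequent calls;
                            // victims with fewer than 4 remaining chunks are left alone.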
1619                         } // if
1620                     } // if
1621                     if ( !status ) {
1622                         *p_lb = 0;
1623                         *p_ub = 0;
1624                         if ( p_st != NULL ) *p_st = 0;
1625                     } else {
1626                         start = pr->u.p.parm2;
1627                         init *= chunk;
1628                         limit = chunk + init - 1;
1629                         incr  = pr->u.p.st;
1630 
1631                         KMP_DEBUG_ASSERT(init <= trip);
1632                         if ( (last = (limit >= trip)) != 0 )
1633                             limit = trip;
1634                         if ( p_st != NULL ) *p_st = incr;
1635 
1636                         if ( incr == 1 ) {
1637                             *p_lb = start + init;
1638                             *p_ub = start + limit;
1639                         } else {
1640                             *p_lb = start + init * incr;
1641                             *p_ub = start + limit * incr;
1642                         }
1643 
1644                         if ( pr->ordered ) {
1645                             pr->u.p.ordered_lower = init;
1646                             pr->u.p.ordered_upper = limit;
1647                             #ifdef KMP_DEBUG
1648                             {
1649                                 const char * buff;
1650                                 // create format specifiers before the debug output
1651                                 buff = __kmp_str_format(
1652                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1653                                     traits_t< UT >::spec, traits_t< UT >::spec );
1654                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1655                                 __kmp_str_free( &buff );
1656                             }
1657                             #endif
1658                         } // if
1659                     } // if
1660                     break;
1661                 } // case
1662             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1663             case kmp_sch_static_balanced:
1664                 {
1665                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1666                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1667                         pr->u.p.count = 1;
1668                         *p_lb = pr->u.p.lb;
1669                         *p_ub = pr->u.p.ub;
1670                         last = pr->u.p.parm1;
1671                         if ( p_st != NULL )
1672                             *p_st = pr->u.p.st;
1673                     } else {  /* no iterations to do */
1674                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1675                     }
1676                     if ( pr->ordered ) {
1677                         #ifdef KMP_DEBUG
1678                         {
1679                             const char * buff;
1680                             // create format specifiers before the debug output
1681                             buff = __kmp_str_format(
1682                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1683                                 traits_t< UT >::spec, traits_t< UT >::spec );
1684                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1685                             __kmp_str_free( &buff );
1686                         }
1687                         #endif
1688                     } // if
1689                 } // case
1690                 break;
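            // Note on kmp_sch_static_balanced: the whole per-thread range was
            // precomputed in __kmp_dispatch_init, so u.p.count doubles as a
            // "range already handed out" flag and u.p.parm1 records whether
            // this thread owns the last iteration; each thread therefore gets
            // at most one non-empty answer from this case.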
1691             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1692             case kmp_sch_static_chunked:
1693                 {
1694                     T parm1;
1695 
1696                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1697                                    gtid ) );
1698                     parm1 = pr->u.p.parm1;
1699 
1700                     trip  = pr->u.p.tc - 1;
1701                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1702 
1703                     if ( (status = (init <= trip)) != 0 ) {
1704                         start = pr->u.p.lb;
1705                         incr  = pr->u.p.st;
1706                         limit = parm1 + init - 1;
1707 
1708                         if ( (last = (limit >= trip)) != 0 )
1709                             limit = trip;
1710 
1711                         if ( p_st != NULL ) *p_st = incr;
1712 
1713                         pr->u.p.count += team->t.t_nproc;
1714 
1715                         if ( incr == 1 ) {
1716                             *p_lb = start + init;
1717                             *p_ub = start + limit;
1718                         }
1719                         else {
1720                             *p_lb = start + init * incr;
1721                             *p_ub = start + limit * incr;
1722                         }
1723 
1724                         if ( pr->ordered ) {
1725                             pr->u.p.ordered_lower = init;
1726                             pr->u.p.ordered_upper = limit;
1727                             #ifdef KMP_DEBUG
1728                             {
1729                                 const char * buff;
1730                                 // create format specifiers before the debug output
1731                                 buff = __kmp_str_format(
1732                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1733                                     traits_t< UT >::spec, traits_t< UT >::spec );
1734                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1735                                 __kmp_str_free( &buff );
1736                             }
1737                             #endif
1738                         } // if
1739                     } // if
1740                 } // case
1741                 break;
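            // Worked example for the static_chunked indexing above
            // (illustrative numbers): with parm1 (chunk) = 4 and t_nproc = 2,
            // thread 0 claims iteration blocks [0..3], [8..11], ... and
            // thread 1 claims [4..7], [12..15], ...; each call advances the
            // private count by t_nproc, reproducing the round-robin
            // schedule(static, 4) layout without any shared counter.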
1742 
1743             case kmp_sch_dynamic_chunked:
1744                 {
1745                     T chunk = pr->u.p.parm1;
1746 
1747                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1748                                    gtid ) );
1749 
1750                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1751                     trip = pr->u.p.tc - 1;
1752 
1753                     if ( (status = (init <= trip)) == 0 ) {
1754                         *p_lb = 0;
1755                         *p_ub = 0;
1756                         if ( p_st != NULL ) *p_st = 0;
1757                     } else {
1758                         start = pr->u.p.lb;
1759                         limit = chunk + init - 1;
1760                         incr  = pr->u.p.st;
1761 
1762                         if ( (last = (limit >= trip)) != 0 )
1763                             limit = trip;
1764 
1765                         if ( p_st != NULL ) *p_st = incr;
1766 
1767                         if ( incr == 1 ) {
1768                             *p_lb = start + init;
1769                             *p_ub = start + limit;
1770                         } else {
1771                             *p_lb = start + init * incr;
1772                             *p_ub = start + limit * incr;
1773                         }
1774 
1775                         if ( pr->ordered ) {
1776                             pr->u.p.ordered_lower = init;
1777                             pr->u.p.ordered_upper = limit;
1778                             #ifdef KMP_DEBUG
1779                             {
1780                                 const char * buff;
1781                                 // create format specifiers before the debug output
1782                                 buff = __kmp_str_format(
1783                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1784                                     traits_t< UT >::spec, traits_t< UT >::spec );
1785                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1786                                 __kmp_str_free( &buff );
1787                             }
1788                             #endif
1789                         } // if
1790                     } // if
1791                 } // case
1792                 break;
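            // Worked example for the dynamic_chunked case above (illustrative
            // numbers): sh->u.s.iteration is a shared chunk counter that each
            // call atomically fetches-and-increments, so with chunk = 4 the
            // k-th successful call (in global arrival order) covers iterations
            // [4k .. min(4k+3, trip)], scaled by lb and st for non-unit loops.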
1793 
1794             case kmp_sch_guided_iterative_chunked:
1795                 {
1796                     T  chunkspec = pr->u.p.parm1;
1797                     KD_TRACE(100,
1798                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1799                     trip  = pr->u.p.tc;
1800                     // Start atomic part of calculations
1801                     while(1) {
1802                         ST  remaining;             // signed, because can be < 0
1803                         init = sh->u.s.iteration;  // shared value
1804                         remaining = trip - init;
1805                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1806                             // nothing to do, don't try atomic op
1807                             status = 0;
1808                             break;
1809                         }
1810                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                            // use dynamic-style schedule
                            // atomically increment iterations, get old value
1813                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1814                             remaining = trip - init;
1815                             if (remaining <= 0) {
1816                                 status = 0;    // all iterations got by other threads
1817                             } else {
1818                                 // got some iterations to work on
1819                                 status = 1;
1820                                 if ( (T)remaining > chunkspec ) {
1821                                     limit = init + chunkspec - 1;
1822                                 } else {
1823                                     last = 1;   // the last chunk
1824                                     limit = init + remaining - 1;
1825                                 } // if
1826                             } // if
1827                             break;
1828                         } // if
1829                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1830                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1831                             // CAS was successful, chunk obtained
1832                             status = 1;
1833                             --limit;
1834                             break;
1835                         } // if
1836                     } // while
1837                     if ( status != 0 ) {
1838                         start = pr->u.p.lb;
1839                         incr = pr->u.p.st;
1840                         if ( p_st != NULL )
1841                             *p_st = incr;
1842                         *p_lb = start + init * incr;
1843                         *p_ub = start + limit * incr;
1844                         if ( pr->ordered ) {
1845                             pr->u.p.ordered_lower = init;
1846                             pr->u.p.ordered_upper = limit;
1847                             #ifdef KMP_DEBUG
1848                             {
1849                                 const char * buff;
1850                                 // create format specifiers before the debug output
1851                                 buff = __kmp_str_format(
1852                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1853                                     traits_t< UT >::spec, traits_t< UT >::spec );
1854                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1855                                 __kmp_str_free( &buff );
1856                             }
1857                             #endif
1858                         } // if
1859                     } else {
1860                         *p_lb = 0;
1861                         *p_ub = 0;
1862                         if ( p_st != NULL )
1863                             *p_st = 0;
1864                     } // if
1865                 } // case
1866                 break;
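            // Sketch of the guided arithmetic above (parm2 and parm3 are set
            // in __kmp_dispatch_init, not shown here; the numbers below are
            // illustrative): parm3 caches a factor of roughly 1/(K*nproc),
            // K = 2 by default, so each CAS claims about remaining/(K*nproc)
            // iterations -- e.g. trip = 1000 with nproc = 4 makes the first
            // chunk about 125 iterations.  Once fewer than parm2, roughly
            // K*nproc*(chunk+1), iterations remain, the code falls back to
            // plain dynamic chunks of size chunkspec.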
1867 
1868             case kmp_sch_guided_analytical_chunked:
1869                 {
1870                     T   chunkspec = pr->u.p.parm1;
1871                     UT chunkIdx;
1872     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* for storing the original FPCW value for Windows* OS on
                       IA-32 architecture (8-byte version) */
1875                     unsigned int oldFpcw;
1876                     unsigned int fpcwSet = 0;
1877     #endif
1878                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1879                                    gtid ) );
1880 
1881                     trip  = pr->u.p.tc;
1882 
1883                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1884                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1885 
1886                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1887                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1888                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1889                             --trip;
1890                             /* use dynamic-style scheduling */
1891                             init = chunkIdx * chunkspec + pr->u.p.count;
1892                             /* need to verify init > 0 in case of overflow in the above calculation */
1893                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1894                                 limit = init + chunkspec -1;
1895 
1896                                 if ( (last = (limit >= trip)) != 0 )
1897                                     limit = trip;
1898                             }
1899                             break;
1900                         } else {
1901                             /* use exponential-style scheduling */
                            /* The following check works around the lack of long double precision on Windows* OS.
                               It guards against the possible effect that init != 0 for chunkIdx == 0.
                             */
1905     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                            /* If we haven't already done so, save original
                               FPCW and set precision to 64-bit, as Windows* OS
                               on IA-32 architecture defaults to 53-bit */
1909                             if ( !fpcwSet ) {
1910                                 oldFpcw = _control87(0,0);
1911                                 _control87(_PC_64,_MCW_PC);
1912                                 fpcwSet = 0x30000;
1913                             }
1914     #endif
1915                             if ( chunkIdx ) {
1916                                 init = __kmp_dispatch_guided_remaining< T >(
1917                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1918                                 KMP_DEBUG_ASSERT(init);
1919                                 init = trip - init;
1920                             } else
1921                                 init = 0;
1922                             limit = trip - __kmp_dispatch_guided_remaining< T >(
1923                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1924                             KMP_ASSERT(init <= limit);
1925                             if ( init < limit ) {
1926                                 KMP_DEBUG_ASSERT(limit <= trip);
1927                                 --limit;
1928                                 status = 1;
1929                                 break;
1930                             } // if
1931                         } // if
1932                     } // while (1)
1933     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1934                     /* restore FPCW if necessary
1935                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1936                     */
1937                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1938                         _control87(oldFpcw,_MCW_PC);
1939     #endif
1940                     if ( status != 0 ) {
1941                         start = pr->u.p.lb;
1942                         incr = pr->u.p.st;
1943                         if ( p_st != NULL )
1944                             *p_st = incr;
1945                         *p_lb = start + init * incr;
1946                         *p_ub = start + limit * incr;
1947                         if ( pr->ordered ) {
1948                             pr->u.p.ordered_lower = init;
1949                             pr->u.p.ordered_upper = limit;
1950                             #ifdef KMP_DEBUG
1951                             {
1952                                 const char * buff;
1953                                 // create format specifiers before the debug output
1954                                 buff = __kmp_str_format(
1955                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1956                                     traits_t< UT >::spec, traits_t< UT >::spec );
1957                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1958                                 __kmp_str_free( &buff );
1959                             }
1960                             #endif
1961                         }
1962                     } else {
1963                         *p_lb = 0;
1964                         *p_ub = 0;
1965                         if ( p_st != NULL )
1966                             *p_st = 0;
1967                     }
1968                 } // case
1969                 break;
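            // Notes on the analytical variant above: chunk boundaries follow a
            // geometric progression -- __kmp_dispatch_guided_remaining(trip, base,
            // k) reports how many iterations remain before chunk k, so successive
            // calls carve off exponentially shrinking pieces until chunkIdx
            // reaches parm2, after which fixed chunks of size chunkspec are used.
            // The long double computation is why the FPCW handling exists: on
            // Windows* OS for IA-32 the default 53-bit significand can make
            // init != 0 for chunkIdx == 0, so precision is temporarily raised
            // to 64 bits.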
1970 
1971             case kmp_sch_trapezoidal:
1972                 {
1973                     UT   index;
1974                     T    parm2 = pr->u.p.parm2;
1975                     T    parm3 = pr->u.p.parm3;
1976                     T    parm4 = pr->u.p.parm4;
1977                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1978                                    gtid ) );
1979 
1980                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1981 
1982                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1983                     trip = pr->u.p.tc - 1;
1984 
1985                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1986                         *p_lb = 0;
1987                         *p_ub = 0;
1988                         if ( p_st != NULL ) *p_st = 0;
1989                     } else {
1990                         start = pr->u.p.lb;
1991                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1992                         incr  = pr->u.p.st;
1993 
1994                         if ( (last = (limit >= trip)) != 0 )
1995                             limit = trip;
1996 
1997                         if ( p_st != NULL ) *p_st = incr;
1998 
1999                         if ( incr == 1 ) {
2000                             *p_lb = start + init;
2001                             *p_ub = start + limit;
2002                         } else {
2003                             *p_lb = start + init * incr;
2004                             *p_ub = start + limit * incr;
2005                         }
2006 
2007                         if ( pr->ordered ) {
2008                             pr->u.p.ordered_lower = init;
2009                             pr->u.p.ordered_upper = limit;
2010                             #ifdef KMP_DEBUG
2011                             {
2012                                 const char * buff;
2013                                 // create format specifiers before the debug output
2014                                 buff = __kmp_str_format(
2015                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2016                                     traits_t< UT >::spec, traits_t< UT >::spec );
2017                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2018                                 __kmp_str_free( &buff );
2019                             }
2020                             #endif
2021                         } // if
2022                     } // if
2023                 } // case
2024                 break;
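            // Worked equation for the trapezoid above: chunk sizes shrink
            // linearly from parm2 (the first chunk) by parm4 per chunk over
            // parm3 chunks, so chunk 'index' starts at the partial sum of an
            // arithmetic series: init = index*(2*parm2 - (index-1)*parm4)/2.
            // With parm2 = 10 and parm4 = 2 (illustrative values) the chunk
            // sizes are 10, 8, 6, ... and chunk 2 starts at 2*(20 - 2)/2 = 18.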
2025             default:
2026                 {
2027                     status = 0; // to avoid complaints on uninitialized variable use
2028                     __kmp_msg(
2029                         kmp_ms_fatal,                        // Severity
2030                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2031                         KMP_HNT( GetNewerLibrary ),          // Hint
2032                         __kmp_msg_null                       // Variadic argument list terminator
2033                     );
2034                 }
2035                 break;
2036             } // switch
2037         } // if tc == 0;
2038 
2039         if ( status == 0 ) {
2040             UT   num_done;
2041 
2042             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2043             #ifdef KMP_DEBUG
2044             {
2045                 const char * buff;
2046                 // create format specifiers before the debug output
2047                 buff = __kmp_str_format(
2048                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2049                     traits_t< UT >::spec );
2050                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2051                 __kmp_str_free( &buff );
2052             }
2053             #endif
2054 
2055             if ( (ST)num_done == team->t.t_nproc-1 ) {
2056                 /* NOTE: release this buffer to be reused */
2057 
2058                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2059 
2060                 sh->u.s.num_done = 0;
2061                 sh->u.s.iteration = 0;
2062 
2063                 /* TODO replace with general release procedure? */
2064                 if ( pr->ordered ) {
2065                     sh->u.s.ordered_iteration = 0;
2066                 }
2067 
2068                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2069 
2070                 sh -> buffer_index += KMP_MAX_DISP_BUF;
2071                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2072                                 gtid, sh->buffer_index) );
2073 
2074                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2075 
2076             } // if
2077             if ( __kmp_env_consistency_check ) {
2078                 if ( pr->pushed_ws != ct_none ) {
2079                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2080                 }
2081             }
2082 
2083             th -> th.th_dispatch -> th_deo_fcn = NULL;
2084             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2085             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2086             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2087         } // if (status == 0)
2088 #if KMP_OS_WINDOWS
2089         else if ( last ) {
2090             pr->u.p.last_upper = pr->u.p.ub;
2091         }
2092 #endif /* KMP_OS_WINDOWS */
2093         if ( p_last != NULL && status != 0 )
2094             *p_last = last;
2095     } // if
2096 
2097     #ifdef KMP_DEBUG
2098     {
2099         const char * buff;
2100         // create format specifiers before the debug output
2101         buff = __kmp_str_format(
2102             "__kmp_dispatch_next: T#%%d normal case: " \
2103             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2104             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2105         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2106         __kmp_str_free( &buff );
2107     }
2108     #endif
2109 #if INCLUDE_SSC_MARKS
2110     SSC_MARK_DISPATCH_NEXT();
2111 #endif
2112     return status;
2113 }
2114 
2115 template< typename T >
2116 static void
2117 __kmp_dist_get_bounds(
2118     ident_t                          *loc,
2119     kmp_int32                         gtid,
2120     kmp_int32                        *plastiter,
2121     T                                *plower,
2122     T                                *pupper,
2123     typename traits_t< T >::signed_t  incr
2124 ) {
2125     KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2126     typedef typename traits_t< T >::unsigned_t  UT;
2127     typedef typename traits_t< T >::signed_t    ST;
2128     register kmp_uint32  team_id;
2129     register kmp_uint32  nteams;
2130     register UT          trip_count;
2131     register kmp_team_t *team;
2132     kmp_info_t * th;
2133 
2134     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2135     KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2136     #ifdef KMP_DEBUG
2137     {
2138         const char * buff;
2139         // create format specifiers before the debug output
2140         buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2141             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2142             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2143             traits_t< T >::spec );
2144         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2145         __kmp_str_free( &buff );
2146     }
2147     #endif
2148 
2149     if( __kmp_env_consistency_check ) {
2150         if( incr == 0 ) {
2151             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2152         }
2153         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2154             // The loop is illegal.
            // Some zero-trip loops are maintained by the compiler, e.g.:
2156             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2157             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2158             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2159             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2160             // Compiler does not check the following illegal loops:
2161             //   for(i=0;i<10;i+=incr) // where incr<0
2162             //   for(i=10;i>0;i-=incr) // where incr<0
2163             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2164         }
2165     }
2166     th = __kmp_threads[gtid];
2167     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2168     team = th->th.th_team;
2169     #if OMP_40_ENABLED
2170     nteams = th->th.th_teams_size.nteams;
2171     #endif
2172     team_id = team->t.t_master_tid;
2173     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2174 
2175     // compute global trip count
2176     if( incr == 1 ) {
2177         trip_count = *pupper - *plower + 1;
2178     } else if(incr == -1) {
2179         trip_count = *plower - *pupper + 1;
2180     } else {
2181         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2182     }
2183     if( trip_count <= nteams ) {
2184         KMP_DEBUG_ASSERT(
2185             __kmp_static == kmp_sch_static_greedy || \
2186             __kmp_static == kmp_sch_static_balanced
2187         ); // Unknown static scheduling type.
        // only some teams get a single iteration, the rest get nothing
2189         if( team_id < trip_count ) {
2190             *pupper = *plower = *plower + team_id * incr;
2191         } else {
2192             *plower = *pupper + incr; // zero-trip loop
2193         }
2194         if( plastiter != NULL )
2195             *plastiter = ( team_id == trip_count - 1 );
2196     } else {
2197         if( __kmp_static == kmp_sch_static_balanced ) {
2198             register UT chunk = trip_count / nteams;
2199             register UT extras = trip_count % nteams;
2200             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2201             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2202             if( plastiter != NULL )
2203                 *plastiter = ( team_id == nteams - 1 );
2204         } else {
2205             register T chunk_inc_count =
2206                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2207             register T upper = *pupper;
2208             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2209                 // Unknown static scheduling type.
2210             *plower += team_id * chunk_inc_count;
2211             *pupper = *plower + chunk_inc_count - incr;
2212             // Check/correct bounds if needed
2213             if( incr > 0 ) {
2214                 if( *pupper < *plower )
2215                     *pupper = i_maxmin< T >::mx;
2216                 if( plastiter != NULL )
2217                     *plastiter = *plower <= upper && *pupper > upper - incr;
2218                 if( *pupper > upper )
2219                     *pupper = upper; // tracker C73258
2220             } else {
2221                 if( *pupper > *plower )
2222                     *pupper = i_maxmin< T >::mn;
2223                 if( plastiter != NULL )
2224                     *plastiter = *plower >= upper && *pupper < upper - incr;
2225                 if( *pupper < upper )
2226                     *pupper = upper; // tracker C73258
2227             }
2228         }
2229     }
2230 }
2231 
2232 //-----------------------------------------------------------------------------------------
2233 // Dispatch routines
2234 //    Transfer call to template< type T >
2235 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2236 //                         T lb, T ub, ST st, ST chunk )
2237 extern "C" {
2238 
2239 /*!
2240 @ingroup WORK_SHARING
2241 @{
2242 @param loc Source location
2243 @param gtid Global thread id
2244 @param schedule Schedule type
2245 @param lb  Lower bound
2246 @param ub  Upper bound
2247 @param st  Step (or increment if you prefer)
2248 @param chunk The chunk size to block with
2249 
2250 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2251 These functions are all identical apart from the types of the arguments.
2252 */
2253 
2254 void
2255 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2256                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2257 {
2258     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2259     KMP_DEBUG_ASSERT( __kmp_init_serial );
2260     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2261 }
2262 /*!
2263 See @ref __kmpc_dispatch_init_4
2264 */
2265 void
2266 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2267                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2268 {
2269     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2270     KMP_DEBUG_ASSERT( __kmp_init_serial );
2271     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2272 }
2273 
2274 /*!
2275 See @ref __kmpc_dispatch_init_4
2276 */
2277 void
2278 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2279                         kmp_int64 lb, kmp_int64 ub,
2280                         kmp_int64 st, kmp_int64 chunk )
2281 {
2282     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2283     KMP_DEBUG_ASSERT( __kmp_init_serial );
2284     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2285 }
2286 
2287 /*!
2288 See @ref __kmpc_dispatch_init_4
2289 */
2290 void
2291 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2292                          kmp_uint64 lb, kmp_uint64 ub,
2293                          kmp_int64 st, kmp_int64 chunk )
2294 {
2295     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2296     KMP_DEBUG_ASSERT( __kmp_init_serial );
2297     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2298 }
2299 
2300 /*!
2301 See @ref __kmpc_dispatch_init_4
2302 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, we need to compute the per-team iteration space.
2306 
2307 These functions are all identical apart from the types of the arguments.
2308 */
2309 void
2310 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2311     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2312 {
2313     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2314     KMP_DEBUG_ASSERT( __kmp_init_serial );
2315     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2316     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2317 }
2318 
2319 void
2320 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2321     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2322 {
2323     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2324     KMP_DEBUG_ASSERT( __kmp_init_serial );
2325     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2326     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2327 }
2328 
2329 void
2330 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2331     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2332 {
2333     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2334     KMP_DEBUG_ASSERT( __kmp_init_serial );
2335     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2336     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2337 }
2338 
2339 void
2340 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2341     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2342 {
2343     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2344     KMP_DEBUG_ASSERT( __kmp_init_serial );
2345     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2346     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2347 }
2348 
2349 /*!
2350 @param loc Source code location
2351 @param gtid Global thread id
2352 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2353 @param p_lb   Pointer to the lower bound for the next chunk of work
2354 @param p_ub   Pointer to the upper bound for the next chunk of work
2355 @param p_st   Pointer to the stride for the next chunk of work
2356 @return one if there is work to be done, zero otherwise
2357 
2358 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2360 */
2361 int
2362 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2363                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2364 {
2365     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2366 }
2367 
2368 /*!
2369 See @ref __kmpc_dispatch_next_4
2370 */
2371 int
2372 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2373                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2374 {
2375     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2376 }
2377 
2378 /*!
2379 See @ref __kmpc_dispatch_next_4
2380 */
2381 int
2382 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2383                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2384 {
2385     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2386 }
2387 
2388 /*!
2389 See @ref __kmpc_dispatch_next_4
2390 */
2391 int
2392 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2393                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2394 {
2395     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2396 }
2397 
2398 /*!
2399 @param loc Source code location
2400 @param gtid Global thread id
2401 
2402 Mark the end of a dynamic loop.
2403 */
2404 void
2405 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2406 {
2407     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2408 }
2409 
2410 /*!
2411 See @ref __kmpc_dispatch_fini_4
2412 */
2413 void
2414 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2415 {
2416     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2417 }
2418 
2419 /*!
2420 See @ref __kmpc_dispatch_fini_4
2421 */
2422 void
2423 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2424 {
2425     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2426 }
2427 
2428 /*!
2429 See @ref __kmpc_dispatch_fini_4
2430 */
2431 void
2432 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2433 {
2434     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2435 }
2436 /*! @} */
2437 
2438 //-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources
2440 
2441 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2442     return value == checker;
2443 }
2444 
2445 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2446     return value != checker;
2447 }
2448 
2449 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2450     return value < checker;
2451 }
2452 
2453 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2454     return value >= checker;
2455 }
2456 
2457 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2458     return value <= checker;
2459 }
2460 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2461     return value == checker;
2462 }
2463 
2464 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2465     return value != checker;
2466 }
2467 
2468 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2469     return value < checker;
2470 }
2471 
2472 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2473     return value >= checker;
2474 }
2475 
2476 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2477     return value <= checker;
2478 }
2479 
2480 kmp_uint32
2481 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2482                    kmp_uint32            checker,
2483                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2484                    , void        * obj    // Higher-level synchronization object, or NULL.
2485                    )
2486 {
2487     // note: we may not belong to a team at this point
2488     register volatile kmp_uint32         * spin          = spinner;
2489     register          kmp_uint32           check         = checker;
2490     register          kmp_uint32   spins;
2491     register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2492     register          kmp_uint32           r;
2493 
2494     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2495     KMP_INIT_YIELD( spins );
2496     // main wait spin loop
2497     while(!f(r = TCR_4(*spin), check)) {
2498         KMP_FSYNC_SPIN_PREPARE( obj );
2499         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2500            It causes problems with infinite recursion because of exit lock */
2501         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2502             __kmp_abort_thread(); */
2503 
2504         /* if we have waited a bit, or are oversubscribed, yield */
2505         /* pause is in the following code */
2506         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2507         KMP_YIELD_SPIN( spins );
2508     }
2509     KMP_FSYNC_SPIN_ACQUIRED( obj );
2510     return r;
2511 }
2512 
2513 kmp_uint64
2514 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2515                     kmp_uint64            checker,
2516                     kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2517                     , void        * obj    // Higher-level synchronization object, or NULL.
2518                     )
2519 {
2520     // note: we may not belong to a team at this point
2521     register volatile kmp_uint64         * spin          = spinner;
2522     register          kmp_uint64           check         = checker;
2523     register          kmp_uint32   spins;
2524     register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2525     register          kmp_uint64           r;
2526 
2527     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2528     KMP_INIT_YIELD( spins );
2529     // main wait spin loop
2530     while(!f(r = *spin, check))
2531     {
2532         KMP_FSYNC_SPIN_PREPARE( obj );
2533         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2534            It causes problems with infinite recursion because of exit lock */
2535         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2536             __kmp_abort_thread(); */
2537 
        // If we are oversubscribed, or have waited a bit
        // (and KMP_LIBRARY=throughput), then yield.
        // The pause is in the following code.
2541         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2542         KMP_YIELD_SPIN( spins );
2543     }
2544     KMP_FSYNC_SPIN_ACQUIRED( obj );
2545     return r;
2546 }
2547 
2548 } // extern "C"
2549 
2550 #ifdef KMP_GOMP_COMPAT
2551 
2552 void
2553 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2554                            kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2555                            kmp_int32 chunk, int push_ws )
2556 {
2557     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2558                                       push_ws );
2559 }
2560 
2561 void
2562 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2563                             kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2564                             kmp_int32 chunk, int push_ws )
2565 {
2566     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2567                                        push_ws );
2568 }
2569 
2570 void
2571 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2572                            kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2573                            kmp_int64 chunk, int push_ws )
2574 {
2575     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2576                                       push_ws );
2577 }
2578 
2579 void
2580 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2581                             kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2582                             kmp_int64 chunk, int push_ws )
2583 {
2584     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2585                                        push_ws );
2586 }
2587 
2588 void
2589 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2590 {
2591     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2592 }
2593 
2594 void
2595 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2596 {
2597     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2598 }
2599 
2600 void
2601 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2602 {
2603     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2604 }
2605 
2606 void
2607 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2608 {
2609     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2610 }
2611 
2612 #endif /* KMP_GOMP_COMPAT */
2613 
2614 /* ------------------------------------------------------------------------ */
2615 /* ------------------------------------------------------------------------ */
2616 
2617