1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
19  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
20  *       it may change values between parallel regions.  __kmp_max_nth
21  *       is the largest value __kmp_nth may take, 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 #include "kmp.h"
29 #include "kmp_i18n.h"
30 #include "kmp_itt.h"
31 #include "kmp_str.h"
32 #include "kmp_error.h"
33 #include "kmp_stats.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35     #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-internal.h"
40 #include "ompt-specific.h"
41 #endif
42 
43 /* ------------------------------------------------------------------------ */
44 /* ------------------------------------------------------------------------ */
45 
// Integer type limits, selected by type.
// The primary template only declares the two members; values are supplied by the
// explicit specializations for int, unsigned int, long long and unsigned long long.
// The constants assume 32-bit int and 64-bit long long, as the original values did.
template< typename T >
struct i_maxmin {
    static const T mx;   // largest value of T
    static const T mn;   // smallest value of T
};
template<>
struct i_maxmin< int > {
    static const int mx = 0x7fffffff;
    // Spelled as (-max - 1): the literal 0x80000000 has type unsigned int, so using it
    // directly depends on an implementation-defined unsigned-to-signed conversion.
    static const int mn = -0x7fffffff - 1;
};
template<>
struct i_maxmin< unsigned int > {
    static const unsigned int mx = 0xffffffffu;
    static const unsigned int mn = 0u;
};
template<>
struct i_maxmin< long long > {
    static const long long mx = 0x7fffffffffffffffLL;
    // Same reasoning as for int: 0x8000000000000000LL does not fit in long long and
    // would silently become an unsigned long long literal.
    static const long long mn = -0x7fffffffffffffffLL - 1;
};
template<>
struct i_maxmin< unsigned long long > {
    // ULL (not LL) so the suffix matches the unsigned type being initialized.
    static const unsigned long long mx = 0xffffffffffffffffULL;
    static const unsigned long long mn = 0ULL;
};
72 //-------------------------------------------------------------------------
73 
74 #ifdef KMP_STATIC_STEAL_ENABLED
75 
76     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77     template< typename T >
78     struct dispatch_private_infoXX_template {
79         typedef typename traits_t< T >::unsigned_t  UT;
80         typedef typename traits_t< T >::signed_t    ST;
81         UT count;                // unsigned
82         T  ub;
83         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84         T  lb;
85         ST st;                   // signed
86         UT tc;                   // unsigned
87         T  static_steal_counter; // for static_steal only; maybe better to put after ub
88 
89         /* parm[1-4] are used in different ways by different scheduling algorithms */
90 
91         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92         //    a) parm3 is properly aligned and
93         //    b) all parm1-4 are in the same cache line.
94         // Because of parm1-4 are used together, performance seems to be better
95         // if they are in the same line (not measured though).
96 
97         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98             T  parm1;
99             T  parm2;
100             T  parm3;
101             T  parm4;
102         };
103 
104         UT ordered_lower; // unsigned
105         UT ordered_upper; // unsigned
106         #if KMP_OS_WINDOWS
107         T  last_upper;
108         #endif /* KMP_OS_WINDOWS */
109     };
110 
111 #else /* KMP_STATIC_STEAL_ENABLED */
112 
113     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114     template< typename T >
115     struct dispatch_private_infoXX_template {
116         typedef typename traits_t< T >::unsigned_t  UT;
117         typedef typename traits_t< T >::signed_t    ST;
118         T  lb;
119         T  ub;
120         ST st;            // signed
121         UT tc;            // unsigned
122 
123         T  parm1;
124         T  parm2;
125         T  parm3;
126         T  parm4;
127 
128         UT count;         // unsigned
129 
130         UT ordered_lower; // unsigned
131         UT ordered_upper; // unsigned
132         #if KMP_OS_WINDOWS
133 	T  last_upper;
134         #endif /* KMP_OS_WINDOWS */
135     };
136 
137 #endif /* KMP_STATIC_STEAL_ENABLED */
138 
139 // replaces dispatch_private_info structure and dispatch_private_info_t type
140 template< typename T >
141 struct KMP_ALIGN_CACHE dispatch_private_info_template {
142     // duplicate alignment here, otherwise size of structure is not correct in our compiler
143     union KMP_ALIGN_CACHE private_info_tmpl {
144         dispatch_private_infoXX_template< T > p;
145         dispatch_private_info64_t             p64;
146     } u;
147     enum sched_type schedule;  /* scheduling algorithm */
148     kmp_uint32      ordered;   /* ordered clause specified */
149     kmp_uint32      ordered_bumped;
150     kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order
151     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152     kmp_uint32      nomerge;   /* don't merge iters if serialized */
153     kmp_uint32      type_size;
154     enum cons_type  pushed_ws;
155 };
156 
157 
158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159 template< typename UT >
160 struct dispatch_shared_infoXX_template {
161     /* chunk index under dynamic, number of idle threads under static-steal;
162        iteration index otherwise */
163     volatile UT     iteration;
164     volatile UT     num_done;
165     volatile UT     ordered_iteration;
166     UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
167 };
168 
169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
170 template< typename UT >
171 struct dispatch_shared_info_template {
172     // we need union here to keep the structure size
173     union shared_info_tmpl {
174         dispatch_shared_infoXX_template< UT >  s;
175         dispatch_shared_info64_t               s64;
176     } u;
177     volatile kmp_uint32     buffer_index;
178 };
179 
180 /* ------------------------------------------------------------------------ */
181 /* ------------------------------------------------------------------------ */
182 
183 #undef USE_TEST_LOCKS
184 
185 // test_then_add template (general template should NOT be used)
186 template< typename T >
187 static __forceinline T
188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
189 
190 template<>
191 __forceinline kmp_int32
192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193 {
194     kmp_int32 r;
195     r = KMP_TEST_THEN_ADD32( p, d );
196     return r;
197 }
198 
199 template<>
200 __forceinline kmp_int64
201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202 {
203     kmp_int64 r;
204     r = KMP_TEST_THEN_ADD64( p, d );
205     return r;
206 }
207 
208 // test_then_inc_acq template (general template should NOT be used)
209 template< typename T >
210 static __forceinline T
211 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
212 
213 template<>
214 __forceinline kmp_int32
215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216 {
217     kmp_int32 r;
218     r = KMP_TEST_THEN_INC_ACQ32( p );
219     return r;
220 }
221 
222 template<>
223 __forceinline kmp_int64
224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225 {
226     kmp_int64 r;
227     r = KMP_TEST_THEN_INC_ACQ64( p );
228     return r;
229 }
230 
231 // test_then_inc template (general template should NOT be used)
232 template< typename T >
233 static __forceinline T
234 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
235 
236 template<>
237 __forceinline kmp_int32
238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239 {
240     kmp_int32 r;
241     r = KMP_TEST_THEN_INC32( p );
242     return r;
243 }
244 
245 template<>
246 __forceinline kmp_int64
247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248 {
249     kmp_int64 r;
250     r = KMP_TEST_THEN_INC64( p );
251     return r;
252 }
253 
254 // compare_and_swap template (general template should NOT be used)
255 template< typename T >
256 static __forceinline kmp_int32
257 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
258 
259 template<>
260 __forceinline kmp_int32
261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262 {
263     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264 }
265 
266 template<>
267 __forceinline kmp_int32
268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269 {
270     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271 }
272 
273 /*
274     Spin wait loop that first does pause, then yield.
275     Waits until function returns non-zero when called with *spinner and check.
276     Does NOT put threads to sleep.
277 #if USE_ITT_BUILD
278     Arguments:
279         obj -- is higher-level synchronization object to report to ittnotify. It is used to report
280             locks consistently. For example, if lock is acquired immediately, its address is
281             reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired
282             immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same
283             address, not an address of low-level spinner.
284 #endif // USE_ITT_BUILD
285 */
286 template< typename UT >
287 // ToDo: make inline function (move to header file for icl)
288 static UT  // unsigned 4- or 8-byte type
289 __kmp_wait_yield( volatile UT * spinner,
290                   UT            checker,
291                   kmp_uint32 (* pred)( UT, UT )
292                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
293                   )
294 {
295     // note: we may not belong to a team at this point
296     register volatile UT         * spin          = spinner;
297     register          UT           check         = checker;
298     register          kmp_uint32   spins;
299     register          kmp_uint32 (*f) ( UT, UT ) = pred;
300     register          UT           r;
301 
302     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303     KMP_INIT_YIELD( spins );
304     // main wait spin loop
305     while(!f(r = *spin, check))
306     {
307         KMP_FSYNC_SPIN_PREPARE( obj );
308         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309            It causes problems with infinite recursion because of exit lock */
310         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311             __kmp_abort_thread(); */
312 
313         // if we are oversubscribed,
314         // or have waited a bit (and KMP_LIBRARY=throughput, then yield
315         // pause is in the following code
316         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317         KMP_YIELD_SPIN( spins );
318     }
319     KMP_FSYNC_SPIN_ACQUIRED( obj );
320     return r;
321 }
322 
323 template< typename UT >
324 static kmp_uint32 __kmp_eq( UT value, UT checker) {
325     return value == checker;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_neq( UT value, UT checker) {
330     return value != checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_lt( UT value, UT checker) {
335     return value < checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_ge( UT value, UT checker) {
340     return value >= checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_le( UT value, UT checker) {
345     return value <= checker;
346 }
347 
348 
349 /* ------------------------------------------------------------------------ */
350 /* ------------------------------------------------------------------------ */
351 
352 static void
353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354 {
355     kmp_info_t *th;
356 
357     KMP_DEBUG_ASSERT( gtid_ref );
358 
359     if ( __kmp_env_consistency_check ) {
360         th = __kmp_threads[*gtid_ref];
361         if ( th -> th.th_root -> r.r_active
362           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
363 #if KMP_USE_DYNAMIC_LOCK
364             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365 #else
366             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
367 #endif
368         }
369     }
370 }
371 
372 template< typename UT >
373 static void
374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375 {
376     typedef typename traits_t< UT >::signed_t    ST;
377     dispatch_private_info_template< UT > * pr;
378 
379     int gtid = *gtid_ref;
380 //    int  cid = *cid_ref;
381     kmp_info_t *th = __kmp_threads[ gtid ];
382     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383 
384     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385     if ( __kmp_env_consistency_check ) {
386         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387             ( th -> th.th_dispatch -> th_dispatch_pr_current );
388         if ( pr -> pushed_ws != ct_none ) {
389 #if KMP_USE_DYNAMIC_LOCK
390             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391 #else
392             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
393 #endif
394         }
395     }
396 
397     if ( ! th -> th.th_team -> t.t_serialized ) {
398         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399             ( th -> th.th_dispatch -> th_dispatch_sh_current );
400         UT  lower;
401 
402         if ( ! __kmp_env_consistency_check ) {
403                 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404                     ( th -> th.th_dispatch -> th_dispatch_pr_current );
405         }
406         lower = pr->u.p.ordered_lower;
407 
408         #if ! defined( KMP_GOMP_COMPAT )
409             if ( __kmp_env_consistency_check ) {
410                 if ( pr->ordered_bumped ) {
411                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412                     __kmp_error_construct2(
413                         kmp_i18n_msg_CnsMultipleNesting,
414                         ct_ordered_in_pdo, loc_ref,
415                         & p->stack_data[ p->w_top ]
416                     );
417                 }
418             }
419         #endif /* !defined(KMP_GOMP_COMPAT) */
420 
421         KMP_MB();
422         #ifdef KMP_DEBUG
423         {
424             const char * buff;
425             // create format specifiers before the debug output
426             buff = __kmp_str_format(
427                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428                 traits_t< UT >::spec, traits_t< UT >::spec );
429             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430             __kmp_str_free( &buff );
431         }
432         #endif
433 
434         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435                                 USE_ITT_BUILD_ARG( NULL )
436                                 );
437         KMP_MB();  /* is this necessary? */
438         #ifdef KMP_DEBUG
439         {
440             const char * buff;
441             // create format specifiers before the debug output
442             buff = __kmp_str_format(
443                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444                 traits_t< UT >::spec, traits_t< UT >::spec );
445             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446             __kmp_str_free( &buff );
447         }
448         #endif
449     }
450     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451 }
452 
453 static void
454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455 {
456     kmp_info_t *th;
457 
458     if ( __kmp_env_consistency_check ) {
459         th = __kmp_threads[*gtid_ref];
460         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462         }
463     }
464 }
465 
466 template< typename UT >
467 static void
468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469 {
470     typedef typename traits_t< UT >::signed_t    ST;
471     dispatch_private_info_template< UT > * pr;
472 
473     int gtid = *gtid_ref;
474 //    int  cid = *cid_ref;
475     kmp_info_t *th = __kmp_threads[ gtid ];
476     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477 
478     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479     if ( __kmp_env_consistency_check ) {
480         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481             ( th -> th.th_dispatch -> th_dispatch_pr_current );
482         if ( pr -> pushed_ws != ct_none ) {
483             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484         }
485     }
486 
487     if ( ! th -> th.th_team -> t.t_serialized ) {
488         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489             ( th -> th.th_dispatch -> th_dispatch_sh_current );
490 
491         if ( ! __kmp_env_consistency_check ) {
492             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
494         }
495 
496         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497         #if ! defined( KMP_GOMP_COMPAT )
498             if ( __kmp_env_consistency_check ) {
499                 if ( pr->ordered_bumped != 0 ) {
500                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501                     /* How to test it? - OM */
502                     __kmp_error_construct2(
503                         kmp_i18n_msg_CnsMultipleNesting,
504                         ct_ordered_in_pdo, loc_ref,
505                         & p->stack_data[ p->w_top ]
506                     );
507                 }
508             }
509         #endif /* !defined(KMP_GOMP_COMPAT) */
510 
511         KMP_MB();       /* Flush all pending memory write invalidates.  */
512 
513         pr->ordered_bumped += 1;
514 
515         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516                         gtid, pr->ordered_bumped ) );
517 
518         KMP_MB();       /* Flush all pending memory write invalidates.  */
519 
520         /* TODO use general release procedure? */
521         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522 
523         KMP_MB();       /* Flush all pending memory write invalidates.  */
524     }
525     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526 }
527 
528 /* Computes and returns x to the power of y, where y must a non-negative integer */
529 template< typename UT >
530 static __forceinline long double
531 __kmp_pow(long double x, UT y) {
532     long double s=1.0L;
533 
534     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536     while(y) {
537         if ( y & 1 )
538             s *= x;
539         x *= x;
540         y >>= 1;
541     }
542     return s;
543 }
544 
545 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
546    (the total number of unassigned iterations in chunks with index greater than or equal to idx).
547    __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
548    (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
549 */
550 template< typename T >
551 static __inline typename traits_t< T >::unsigned_t
552 __kmp_dispatch_guided_remaining(
553     T                                  tc,
554     typename traits_t< T >::floating_t base,
555     typename traits_t< T >::unsigned_t idx
556 ) {
557     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558        least for ICL 8.1, long double arithmetic may not really have
559        long double precision, even with /Qlong_double.  Currently, we
560        workaround that in the caller code, by manipulating the FPCW for
561        Windows* OS on IA-32 architecture.  The lack of precision is not
562        expected to be a correctness issue, though.
563     */
564     typedef typename traits_t< T >::unsigned_t  UT;
565 
566     long double x = tc * __kmp_pow< UT >(base, idx);
567     UT r = (UT) x;
568     if ( x == r )
569         return r;
570     return r + 1;
571 }
572 
// Tuning parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// n defaults to 2. A larger n (for example 3) makes the chunk distribution flatter;
// with n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
static int    guided_int_param = 2;     // n
static double guided_flt_param = 0.5;   // = 1.0 / guided_int_param
580 
581 // UT - unsigned flavor of T, ST - signed flavor of T,
582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583 template< typename T >
584 static void
585 __kmp_dispatch_init(
586     ident_t                        * loc,
587     int                              gtid,
588     enum sched_type                  schedule,
589     T                                lb,
590     T                                ub,
591     typename traits_t< T >::signed_t st,
592     typename traits_t< T >::signed_t chunk,
593     int                              push_ws
594 ) {
595     typedef typename traits_t< T >::unsigned_t  UT;
596     typedef typename traits_t< T >::signed_t    ST;
597     typedef typename traits_t< T >::floating_t  DBL;
598     static const int ___kmp_size_type = sizeof( UT );
599 
600     int                                            active;
601     T                                              tc;
602     kmp_info_t *                                   th;
603     kmp_team_t *                                   team;
604     kmp_uint32                                     my_buffer_index;
605     dispatch_private_info_template< T >          * pr;
606     dispatch_shared_info_template< UT > volatile * sh;
607 
608     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610 
611     if ( ! TCR_4( __kmp_init_parallel ) )
612         __kmp_parallel_initialize();
613 
614 #if INCLUDE_SSC_MARKS
615     SSC_MARK_DISPATCH_INIT();
616 #endif
617     #ifdef KMP_DEBUG
618     {
619         const char * buff;
620         // create format specifiers before the debug output
621         buff = __kmp_str_format(
622             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625         __kmp_str_free( &buff );
626     }
627     #endif
628     /* setup data */
629     th     = __kmp_threads[ gtid ];
630     team   = th -> th.th_team;
631     active = ! team -> t.t_serialized;
632     th->th.th_ident = loc;
633 
634 #if USE_ITT_BUILD
635     kmp_uint64 cur_chunk = chunk;
636 #endif
637     if ( ! active ) {
638         pr = reinterpret_cast< dispatch_private_info_template< T >* >
639             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
640     } else {
641         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
642                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
643 
644         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
645 
646         /* What happens when number of threads changes, need to resize buffer? */
647         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
648             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
649         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
650             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
651     }
652 
653     /* Pick up the nomerge/ordered bits from the scheduling type */
654     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
655         pr->nomerge = TRUE;
656         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
657     } else {
658         pr->nomerge = FALSE;
659     }
660     pr->type_size = ___kmp_size_type; // remember the size of variables
661     if ( kmp_ord_lower & schedule ) {
662         pr->ordered = TRUE;
663         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
664     } else {
665         pr->ordered = FALSE;
666     }
667     if ( schedule == kmp_sch_static ) {
668         schedule = __kmp_static;
669     } else {
670         if ( schedule == kmp_sch_runtime ) {
671             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
672             schedule = team -> t.t_sched.r_sched_type;
673             // Detail the schedule if needed (global controls are differentiated appropriately)
674             if ( schedule == kmp_sch_guided_chunked ) {
675                 schedule = __kmp_guided;
676             } else if ( schedule == kmp_sch_static ) {
677                 schedule = __kmp_static;
678             }
679             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
680             chunk = team -> t.t_sched.chunk;
681 
682             #ifdef KMP_DEBUG
683             {
684                 const char * buff;
685                 // create format specifiers before the debug output
686                 buff = __kmp_str_format(
687                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
688                     traits_t< ST >::spec );
689                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
690                 __kmp_str_free( &buff );
691             }
692             #endif
693         } else {
694             if ( schedule == kmp_sch_guided_chunked ) {
695                 schedule = __kmp_guided;
696             }
697             if ( chunk <= 0 ) {
698                 chunk = KMP_DEFAULT_CHUNK;
699             }
700         }
701 
702         if ( schedule == kmp_sch_auto ) {
703             // mapping and differentiation: in the __kmp_do_serial_initialize()
704             schedule = __kmp_auto;
705             #ifdef KMP_DEBUG
706             {
707                 const char * buff;
708                 // create format specifiers before the debug output
709                 buff = __kmp_str_format(
710                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
711                     traits_t< ST >::spec );
712                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
713                 __kmp_str_free( &buff );
714             }
715             #endif
716         }
717 
718         /* guided analytical not safe for too many threads */
719         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
720             schedule = kmp_sch_guided_iterative_chunked;
721             KMP_WARNING( DispatchManyThreads );
722         }
723         pr->u.p.parm1 = chunk;
724     }
725     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
726                 "unknown scheduling type" );
727 
728     pr->u.p.count = 0;
729 
730     if ( __kmp_env_consistency_check ) {
731         if ( st == 0 ) {
732             __kmp_error_construct(
733                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
734                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
735             );
736         }
737     }
738 
739     tc = ( ub - lb + st );
740     if ( st != 1 ) {
741         if ( st < 0 ) {
742             if ( lb < ub ) {
743                 tc = 0;            // zero-trip
744             } else {   // lb >= ub
745                 tc = (ST)tc / st;  // convert to signed division
746             }
747         } else {       // st > 0
748             if ( ub < lb ) {
749                 tc = 0;            // zero-trip
750             } else {   // lb >= ub
751                 tc /= st;
752             }
753         }
754     } else if ( ub < lb ) {        // st == 1
755         tc = 0;                    // zero-trip
756     }
757 
758     pr->u.p.lb = lb;
759     pr->u.p.ub = ub;
760     pr->u.p.st = st;
761     pr->u.p.tc = tc;
762 
763     #if KMP_OS_WINDOWS
764     pr->u.p.last_upper = ub + st;
765     #endif /* KMP_OS_WINDOWS */
766 
767     /* NOTE: only the active parallel region(s) has active ordered sections */
768 
769     if ( active ) {
770         if ( pr->ordered == 0 ) {
771             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
772             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
773         } else {
774             pr->ordered_bumped = 0;
775 
776             pr->u.p.ordered_lower = 1;
777             pr->u.p.ordered_upper = 0;
778 
779             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
780             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
781         }
782     }
783 
784     if ( __kmp_env_consistency_check ) {
785         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
786         if ( push_ws ) {
787             __kmp_push_workshare( gtid, ws, loc );
788             pr->pushed_ws = ws;
789         } else {
790             __kmp_check_workshare( gtid, ws, loc );
791             pr->pushed_ws = ct_none;
792         }
793     }
794 
795     switch ( schedule ) {
796     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
797     case kmp_sch_static_steal:
798         {
799             T nproc = team->t.t_nproc;
800             T ntc, init;
801 
802             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
803 
804             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
805             if ( nproc > 1 && ntc >= nproc ) {
806                 T id = __kmp_tid_from_gtid(gtid);
807                 T small_chunk, extras;
808 
809                 small_chunk = ntc / nproc;
810                 extras = ntc % nproc;
811 
812                 init = id * small_chunk + ( id < extras ? id : extras );
813                 pr->u.p.count = init;
814                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
815 
816                 pr->u.p.parm2 = lb;
817                 //pr->pfields.parm3 = 0; // it's not used in static_steal
818                 pr->u.p.parm4 = id;
819                 pr->u.p.st = st;
820                 break;
821             } else {
822                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
823                                gtid ) );
824                 schedule = kmp_sch_static_balanced;
825                 /* too few iterations: fall-through to kmp_sch_static_balanced */
826             } // if
827             /* FALL-THROUGH to static balanced */
828         } // case
829     #endif
830     case kmp_sch_static_balanced:
831         {
832             T nproc = team->t.t_nproc;
833             T init, limit;
834 
835             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
836                             gtid ) );
837 
838             if ( nproc > 1 ) {
839                 T id = __kmp_tid_from_gtid(gtid);
840 
841                 if ( tc < nproc ) {
842                     if ( id < tc ) {
843                         init = id;
844                         limit = id;
845                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
846                     } else {
847                         pr->u.p.count = 1;  /* means no more chunks to execute */
848                         pr->u.p.parm1 = FALSE;
849                         break;
850                     }
851                 } else {
852                     T small_chunk = tc / nproc;
853                     T extras = tc % nproc;
854                     init = id * small_chunk + (id < extras ? id : extras);
855                     limit = init + small_chunk - (id < extras ? 0 : 1);
856                     pr->u.p.parm1 = (id == nproc - 1);
857                 }
858             } else {
859                 if ( tc > 0 ) {
860                     init = 0;
861                     limit = tc - 1;
862                     pr->u.p.parm1 = TRUE;
863                 } else {
864                     // zero trip count
865                     pr->u.p.count = 1;  /* means no more chunks to execute */
866                     pr->u.p.parm1 = FALSE;
867                     break;
868                 }
869             }
870 #if USE_ITT_BUILD
871             // Calculate chunk for metadata report
872             if(  __itt_metadata_add_ptr  && __kmp_forkjoin_frames_mode == 3 ) {
873                 cur_chunk = limit - init + 1;
874             }
875 #endif
876             if ( st == 1 ) {
877                 pr->u.p.lb = lb + init;
878                 pr->u.p.ub = lb + limit;
879             } else {
880                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
881                 pr->u.p.lb = lb + init * st;
882                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
883                 if ( st > 0 ) {
884                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
885                 } else {
886                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
887                 }
888             }
889             if ( pr->ordered ) {
890                 pr->u.p.ordered_lower = init;
891                 pr->u.p.ordered_upper = limit;
892             }
893             break;
894         } // case
895     case kmp_sch_guided_iterative_chunked :
896         {
897             T nproc = team->t.t_nproc;
898             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
899 
900             if ( nproc > 1 ) {
901                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
902                     /* chunk size too large, switch to dynamic */
903                     schedule = kmp_sch_dynamic_chunked;
904                 } else {
905                     // when remaining iters become less than parm2 - switch to dynamic
906                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
907                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
908                 }
909             } else {
910                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
911                 schedule = kmp_sch_static_greedy;
912                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
913                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
914                 pr->u.p.parm1 = tc;
915             } // if
916         } // case
917         break;
918     case kmp_sch_guided_analytical_chunked:
919         {
920             T nproc = team->t.t_nproc;
921             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
922 
923             if ( nproc > 1 ) {
924                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
925                     /* chunk size too large, switch to dynamic */
926                     schedule = kmp_sch_dynamic_chunked;
927                 } else {
928                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
929                     DBL x;
930 
931                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
932                     /* Linux* OS already has 64-bit computation by default for
933 		       long double, and on Windows* OS on Intel(R) 64,
934 		       /Qlong_double doesn't work.  On Windows* OS
935 		       on IA-32 architecture, we need to set precision to
936 		       64-bit instead of the default 53-bit. Even though long
937 		       double doesn't work on Windows* OS on Intel(R) 64, the
938 		       resulting lack of precision is not expected to impact
939 		       the correctness of the algorithm, but this has not been
940 		       mathematically proven.
941                     */
942                     // save original FPCW and set precision to 64-bit, as
943                     // Windows* OS on IA-32 architecture defaults to 53-bit
944                     unsigned int oldFpcw = _control87(0,0);
945                     _control87(_PC_64,_MCW_PC); // 0,0x30000
946                     #endif
947                     /* value used for comparison in solver for cross-over point */
948                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
949 
950                     /* crossover point--chunk indexes equal to or greater than
951 		       this point switch to dynamic-style scheduling */
952                     UT   cross;
953 
954                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
955                     x = (long double)1.0 - (long double)0.5 / nproc;
956 
957                     #ifdef KMP_DEBUG
958                     { // test natural alignment
959                         struct _test_a {
960                             char a;
961                             union {
962                                 char b;
963                                 DBL  d;
964                             };
965                         } t;
966                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
967                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
968                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
969                     }
970                     #endif // KMP_DEBUG
971 
972                     /* save the term in thread private dispatch structure */
973                     *(DBL*)&pr->u.p.parm3 = x;
974 
975                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
976                     {
977                         UT          left, right, mid;
978                         long double p;
979 
980                         /* estimate initial upper and lower bound */
981 
982                         /* doesn't matter what value right is as long as it is positive, but
983                            it affects performance of the solver
984                         */
985                         right = 229;
986                         p = __kmp_pow< UT >(x,right);
987                         if ( p > target ) {
988                             do{
989                                 p *= p;
990                                 right <<= 1;
991                             } while(p>target && right < (1<<27));
992                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
993                         } else {
994                             left = 0;
995                         }
996 
997                         /* bisection root-finding method */
998                         while ( left + 1 < right ) {
999                             mid = (left + right) / 2;
1000                             if ( __kmp_pow< UT >(x,mid) > target ) {
1001                                 left = mid;
1002                             } else {
1003                                 right = mid;
1004                             }
1005                         } // while
1006                         cross = right;
1007                     }
1008                     /* assert sanity of computed crossover point */
1009                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1010 
1011                     /* save the crossover point in thread private dispatch structure */
1012                     pr->u.p.parm2 = cross;
1013 
1014                     // C75803
1015                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1016                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1017                     #else
1018                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1019                     #endif
1020                     /* dynamic-style scheduling offset */
1021                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1022                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1023                         // restore FPCW
1024                         _control87(oldFpcw,_MCW_PC);
1025                     #endif
1026                 } // if
1027             } else {
1028                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1029                                gtid ) );
1030                 schedule = kmp_sch_static_greedy;
1031                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1032                 pr->u.p.parm1 = tc;
1033             } // if
1034         } // case
1035         break;
1036     case kmp_sch_static_greedy:
1037         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1038             pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1039                 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1040                 tc;
1041         break;
1042     case kmp_sch_static_chunked :
1043     case kmp_sch_dynamic_chunked :
1044         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1045         break;
1046     case kmp_sch_trapezoidal :
1047         {
1048             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1049 
1050             T parm1, parm2, parm3, parm4;
1051             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1052 
1053             parm1 = chunk;
1054 
1055             /* F : size of the first cycle */
1056             parm2 = ( tc / (2 * team->t.t_nproc) );
1057 
1058             if ( parm2 < 1 ) {
1059                 parm2 = 1;
1060             }
1061 
1062             /* L : size of the last cycle.  Make sure the last cycle
1063              *     is not larger than the first cycle.
1064              */
1065             if ( parm1 < 1 ) {
1066                 parm1 = 1;
1067             } else if ( parm1 > parm2 ) {
1068                 parm1 = parm2;
1069             }
1070 
1071             /* N : number of cycles */
1072             parm3 = ( parm2 + parm1 );
1073             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1074 
1075             if ( parm3 < 2 ) {
1076                 parm3 = 2;
1077             }
1078 
1079             /* sigma : decreasing incr of the trapezoid */
1080             parm4 = ( parm3 - 1 );
1081             parm4 = ( parm2 - parm1 ) / parm4;
1082 
1083             // pointless check, because parm4 >= 0 always
1084             //if ( parm4 < 0 ) {
1085             //    parm4 = 0;
1086             //}
1087 
1088             pr->u.p.parm1 = parm1;
1089             pr->u.p.parm2 = parm2;
1090             pr->u.p.parm3 = parm3;
1091             pr->u.p.parm4 = parm4;
1092         } // case
1093         break;
1094 
1095     default:
1096         {
1097             __kmp_msg(
1098                 kmp_ms_fatal,                        // Severity
1099                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1100                 KMP_HNT( GetNewerLibrary ),          // Hint
1101                 __kmp_msg_null                       // Variadic argument list terminator
1102             );
1103         }
1104         break;
1105     } // switch
1106     pr->schedule = schedule;
1107     if ( active ) {
1108         /* The name of this buffer should be my_buffer_index when it's free to use it */
1109 
1110         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1111                         gtid, my_buffer_index, sh->buffer_index) );
1112         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1113                                         USE_ITT_BUILD_ARG( NULL )
1114                                         );
            // Note: KMP_WAIT_YIELD() cannot be used here: sh->buffer_index and my_buffer_index are
            // *always* 32-bit integers.
1117         KMP_MB();  /* is this necessary? */
1118         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1119                         gtid, my_buffer_index, sh->buffer_index) );
1120 
1121         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1122         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1123 #if USE_ITT_BUILD
1124         if ( pr->ordered ) {
1125             __kmp_itt_ordered_init( gtid );
1126         }; // if
1127 #endif /* USE_ITT_BUILD */
1128     }; // if
1129 
1130 #if USE_ITT_BUILD
1131     // Report loop metadata
1132     if( __itt_metadata_add_ptr  && __kmp_forkjoin_frames_mode == 3 ) {
1133         kmp_uint32 tid  = __kmp_tid_from_gtid( gtid );
1134         if (KMP_MASTER_TID(tid)) {
1135             kmp_uint64 schedtype = 0;
1136 
1137             switch ( schedule ) {
1138             case kmp_sch_static_chunked:
1139             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1140                 break;
1141             case kmp_sch_static_greedy:
1142                 cur_chunk = pr->u.p.parm1;
1143                 break;
1144             case kmp_sch_dynamic_chunked:
1145                 schedtype = 1;
1146                 break;
1147             case kmp_sch_guided_iterative_chunked:
1148             case kmp_sch_guided_analytical_chunked:
1149                 schedtype = 2;
1150                 break;
1151             default:
1152 //            Should we put this case under "static"?
1153 //            case kmp_sch_static_steal:
1154                 schedtype = 3;
1155                 break;
1156             }
1157             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1158         }
1159     }
1160 #endif /* USE_ITT_BUILD */
1161 
1162     #ifdef KMP_DEBUG
1163     {
1164         const char * buff;
1165         // create format specifiers before the debug output
1166         buff = __kmp_str_format(
1167             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1168             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1169             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1170             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1171             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1172             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1173             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1174         KD_TRACE(10, ( buff,
1175             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1176             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1177             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1178             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1179         __kmp_str_free( &buff );
1180     }
1181     #endif
1182     #if ( KMP_STATIC_STEAL_ENABLED )
1183     if ( ___kmp_size_type < 8 ) {
      // It cannot be guaranteed that, after executing a loop with some other schedule kind,
      // all the parm3 variables contain the same value.
      // Even if all parm3 values were the same, there would still be a bad case, such as using
      // 0 and 1 rather than a counter incremented over the program's lifetime.
      // So a dedicated variable is required: 'static_steal_counter' is used.
1189       if( schedule == kmp_sch_static_steal ) {
1190         // Other threads will inspect this variable when searching for a victim.
1191         // This is a flag showing that other threads may steal from this thread since then.
1192         volatile T * p = &pr->u.p.static_steal_counter;
1193         *p = *p + 1;
1194       }
1195     }
1196     #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
1197 
1198 #if OMPT_SUPPORT && OMPT_TRACE
1199     if ((ompt_status == ompt_status_track_callback) &&
1200         ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1201         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1202         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1203         ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1204             team_info->parallel_id, task_info->task_id, team_info->microtask);
1205     }
1206 #endif
1207 }
1208 
1209 /*
1210  * For ordered loops, either __kmp_dispatch_finish() should be called after
1211  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1212  * every chunk of iterations.  If the ordered section(s) were not executed
1213  * for this iteration (or every iteration in this chunk), we need to set the
1214  * ordered iteration counters so that the next thread can proceed.
1215  */
1216 template< typename UT >
1217 static void
1218 __kmp_dispatch_finish( int gtid, ident_t *loc )
1219 {
1220     typedef typename traits_t< UT >::signed_t ST;
1221     kmp_info_t *th = __kmp_threads[ gtid ];
1222 
1223     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1224     if ( ! th -> th.th_team -> t.t_serialized ) {
1225 
1226         dispatch_private_info_template< UT > * pr =
1227             reinterpret_cast< dispatch_private_info_template< UT >* >
1228             ( th->th.th_dispatch->th_dispatch_pr_current );
1229         dispatch_shared_info_template< UT > volatile * sh =
1230             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1231             ( th->th.th_dispatch->th_dispatch_sh_current );
1232         KMP_DEBUG_ASSERT( pr );
1233         KMP_DEBUG_ASSERT( sh );
1234         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1235                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1236 
1237         if ( pr->ordered_bumped ) {
1238             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1239                             gtid ) );
1240             pr->ordered_bumped = 0;
1241         } else {
1242             UT lower = pr->u.p.ordered_lower;
1243 
1244             #ifdef KMP_DEBUG
1245             {
1246                 const char * buff;
1247                 // create format specifiers before the debug output
1248                 buff = __kmp_str_format(
1249                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1250                     traits_t< UT >::spec, traits_t< UT >::spec );
1251                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1252                 __kmp_str_free( &buff );
1253             }
1254             #endif
1255 
1256             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1257                                    USE_ITT_BUILD_ARG(NULL)
1258                                    );
1259             KMP_MB();  /* is this necessary? */
1260             #ifdef KMP_DEBUG
1261             {
1262                 const char * buff;
1263                 // create format specifiers before the debug output
1264                 buff = __kmp_str_format(
1265                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1266                     traits_t< UT >::spec, traits_t< UT >::spec );
1267                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1268                 __kmp_str_free( &buff );
1269             }
1270             #endif
1271 
1272             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1273         } // if
1274     } // if
1275     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1276 }
1277 
1278 #ifdef KMP_GOMP_COMPAT
1279 
1280 template< typename UT >
1281 static void
1282 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1283 {
1284     typedef typename traits_t< UT >::signed_t ST;
1285     kmp_info_t *th = __kmp_threads[ gtid ];
1286 
1287     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1288     if ( ! th -> th.th_team -> t.t_serialized ) {
1289 //        int cid;
1290         dispatch_private_info_template< UT > * pr =
1291             reinterpret_cast< dispatch_private_info_template< UT >* >
1292             ( th->th.th_dispatch->th_dispatch_pr_current );
1293         dispatch_shared_info_template< UT > volatile * sh =
1294             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1295             ( th->th.th_dispatch->th_dispatch_sh_current );
1296         KMP_DEBUG_ASSERT( pr );
1297         KMP_DEBUG_ASSERT( sh );
1298         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1299                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1300 
1301 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1302             UT lower = pr->u.p.ordered_lower;
1303             UT upper = pr->u.p.ordered_upper;
1304             UT inc = upper - lower + 1;
1305 
1306             if ( pr->ordered_bumped == inc ) {
1307                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1308                   gtid ) );
1309                 pr->ordered_bumped = 0;
1310             } else {
1311                 inc -= pr->ordered_bumped;
1312 
1313                 #ifdef KMP_DEBUG
1314                 {
1315                     const char * buff;
1316                     // create format specifiers before the debug output
1317                     buff = __kmp_str_format(
1318                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1319                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1320                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1321                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1322                     __kmp_str_free( &buff );
1323                 }
1324                 #endif
1325 
1326                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1327                                        USE_ITT_BUILD_ARG(NULL)
1328                                        );
1329 
1330                 KMP_MB();  /* is this necessary? */
1331                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1332                   gtid ) );
1333                 pr->ordered_bumped = 0;
1334 //!!!!! TODO check if the inc should be unsigned, or signed???
1335                 #ifdef KMP_DEBUG
1336                 {
1337                     const char * buff;
1338                     // create format specifiers before the debug output
1339                     buff = __kmp_str_format(
1340                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1341                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1342                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1343                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1344                     __kmp_str_free( &buff );
1345                 }
1346                 #endif
1347 
1348                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1349             }
1350 //        }
1351     }
1352     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1353 }
1354 
1355 #endif /* KMP_GOMP_COMPAT */
1356 
/* OMPT_LOOP_END: macro used when exiting __kmp_dispatch_next().
 *
 * Expects an integer `status` in the enclosing scope.  If status is 0
 * (no more work), ompt_status is ompt_status_track_callback and a callback is
 * registered for ompt_event_loop_end, that callback is invoked with the
 * current parallel id and task id, telling OMPT the loop is over.  This is
 * done here because in some cases kmp_dispatch_fini() is not called.
 *
 * Both variants are wrapped in do { } while (0) so the macro always expands
 * to exactly one statement; write it as `OMPT_LOOP_END;`.
 */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                          \
    do {                                                                       \
        if (status == 0) {                                                     \
            if ((ompt_status == ompt_status_track_callback) &&                 \
                ompt_callbacks.ompt_callback(ompt_event_loop_end)) {           \
                ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);    \
                ompt_task_info_t *task_info = __ompt_get_taskinfo(0);          \
                ompt_callbacks.ompt_callback(ompt_event_loop_end)(             \
                    team_info->parallel_id, task_info->task_id);               \
            }                                                                  \
        }                                                                      \
    } while (0)
#else
#define OMPT_LOOP_END do { } while (0) // no-op
#endif
1374 
1375 template< typename T >
1376 static int
1377 __kmp_dispatch_next(
1378     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1379 ) {
1380 
1381     typedef typename traits_t< T >::unsigned_t  UT;
1382     typedef typename traits_t< T >::signed_t    ST;
1383     typedef typename traits_t< T >::floating_t  DBL;
1384     static const int ___kmp_size_type = sizeof( UT );
1385 
1386     int                                   status;
1387     dispatch_private_info_template< T > * pr;
1388     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1389     kmp_team_t                          * team = th -> th.th_team;
1390 
1391     KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1392     #ifdef KMP_DEBUG
1393     {
1394         const char * buff;
1395         // create format specifiers before the debug output
1396         buff = __kmp_str_format(
1397             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1398             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1399         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1400         __kmp_str_free( &buff );
1401     }
1402     #endif
1403 
1404     if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1406         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1407             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1408         KMP_DEBUG_ASSERT( pr );
1409 
1410         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1411             *p_lb = 0;
1412             *p_ub = 0;
1413 //            if ( p_last != NULL )
1414 //                *p_last = 0;
1415             if ( p_st != NULL )
1416                 *p_st = 0;
1417             if ( __kmp_env_consistency_check ) {
1418                 if ( pr->pushed_ws != ct_none ) {
1419                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1420                 }
1421             }
1422         } else if ( pr->nomerge ) {
1423             kmp_int32 last;
1424             T         start;
1425             UT        limit, trip, init;
1426             ST        incr;
1427             T         chunk = pr->u.p.parm1;
1428 
1429             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1430 
1431             init = chunk * pr->u.p.count++;
1432             trip = pr->u.p.tc - 1;
1433 
1434             if ( (status = (init <= trip)) == 0 ) {
1435                 *p_lb = 0;
1436                 *p_ub = 0;
1437 //                if ( p_last != NULL )
1438 //                    *p_last = 0;
1439                 if ( p_st != NULL )
1440                     *p_st = 0;
1441                 if ( __kmp_env_consistency_check ) {
1442                     if ( pr->pushed_ws != ct_none ) {
1443                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1444                     }
1445                 }
1446             } else {
1447                 start = pr->u.p.lb;
1448                 limit = chunk + init - 1;
1449                 incr  = pr->u.p.st;
1450 
1451                 if ( (last = (limit >= trip)) != 0 ) {
1452                     limit = trip;
1453                     #if KMP_OS_WINDOWS
1454                     pr->u.p.last_upper = pr->u.p.ub;
1455                     #endif /* KMP_OS_WINDOWS */
1456                 }
1457                 if ( p_last != NULL )
1458                     *p_last = last;
1459                 if ( p_st != NULL )
1460                     *p_st = incr;
1461                 if ( incr == 1 ) {
1462                     *p_lb = start + init;
1463                     *p_ub = start + limit;
1464                 } else {
1465                     *p_lb = start + init * incr;
1466                     *p_ub = start + limit * incr;
1467                 }
1468 
1469                 if ( pr->ordered ) {
1470                     pr->u.p.ordered_lower = init;
1471                     pr->u.p.ordered_upper = limit;
1472                     #ifdef KMP_DEBUG
1473                     {
1474                         const char * buff;
1475                         // create format specifiers before the debug output
1476                         buff = __kmp_str_format(
1477                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1478                             traits_t< UT >::spec, traits_t< UT >::spec );
1479                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1480                         __kmp_str_free( &buff );
1481                     }
1482                     #endif
1483                 } // if
1484             } // if
1485         } else {
1486             pr->u.p.tc = 0;
1487             *p_lb = pr->u.p.lb;
1488             *p_ub = pr->u.p.ub;
1489             #if KMP_OS_WINDOWS
1490             pr->u.p.last_upper = *p_ub;
1491             #endif /* KMP_OS_WINDOWS */
1492             if ( p_last != NULL )
1493                 *p_last = TRUE;
1494             if ( p_st != NULL )
1495                 *p_st = pr->u.p.st;
1496         } // if
1497         #ifdef KMP_DEBUG
1498         {
1499             const char * buff;
1500             // create format specifiers before the debug output
1501             buff = __kmp_str_format(
1502                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1503                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1504                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1505             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1506             __kmp_str_free( &buff );
1507         }
1508         #endif
1509 #if INCLUDE_SSC_MARKS
1510         SSC_MARK_DISPATCH_NEXT();
1511 #endif
1512         OMPT_LOOP_END;
1513         return status;
1514     } else {
1515         kmp_int32 last = 0;
1516         dispatch_shared_info_template< UT > *sh;
1517         T         start;
1518         ST        incr;
1519         UT        limit, trip, init;
1520 
1521         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1522                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1523 
1524         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1525             ( th->th.th_dispatch->th_dispatch_pr_current );
1526         KMP_DEBUG_ASSERT( pr );
1527         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1528             ( th->th.th_dispatch->th_dispatch_sh_current );
1529         KMP_DEBUG_ASSERT( sh );
1530 
1531         if ( pr->u.p.tc == 0 ) {
1532             // zero trip count
1533             status = 0;
1534         } else {
1535             switch (pr->schedule) {
1536             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1537             case kmp_sch_static_steal:
1538                 {
1539                     T chunk = pr->u.p.parm1;
1540 
1541                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1542 
1543                     trip = pr->u.p.tc - 1;
1544 
1545                     if ( ___kmp_size_type > 4 ) {
1546                         // Other threads do not look into the data of this thread,
1547                         //  so it's not necessary to make volatile casting.
1548                         init   = ( pr->u.p.count )++;
1549                         status = ( init < (UT)pr->u.p.ub );
1550                     } else {
1551                         typedef union {
1552                             struct {
1553                                 UT count;
1554                                 T  ub;
1555                             } p;
1556                             kmp_int64 b;
1557                         } union_i4;
1558                         // All operations on 'count' or 'ub' must be combined atomically together.
1559                         // stealing implemented only for 4-byte indexes
1560                         {
1561                             union_i4 vold, vnew;
1562                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1563                             vnew = vold;
1564                             vnew.p.count++;
1565                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1566                                         ( volatile kmp_int64* )&pr->u.p.count,
1567                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1568                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1569                                 KMP_CPU_PAUSE();
1570                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1571                                 vnew = vold;
1572                                 vnew.p.count++;
1573                             }
1574                             vnew = vold;
1575                             init   = vnew.p.count;
1576                             status = ( init < (UT)vnew.p.ub ) ;
1577                         }
1578 
1579                         if( !status ) {
1580                             kmp_info_t   **other_threads = team->t.t_threads;
1581                             int          while_limit = 10;
1582                             int          while_index = 0;
1583 
1584                             // TODO: algorithm of searching for a victim
1585                             // should be cleaned up and measured
1586                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1587                                 union_i4  vold, vnew;
1588                                 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1589                                 T         victimIdx    = pr->u.p.parm4;
1590                                 T         oldVictimIdx = victimIdx;
1591                                 dispatch_private_info_template< T > * victim;
1592 
1593                                 do {
1594                                     if( !victimIdx ) {
1595                                         victimIdx = team->t.t_nproc - 1;
1596                                     } else {
1597                                         --victimIdx;
1598                                     }
1599                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1600                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1601                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1602                                 // TODO: think about a proper place of this test
1603                                 if ( ( !victim ) ||
1604                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1605                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1606                                     // TODO: delay would be nice
1607                                     continue;
1608                                     // the victim is not ready yet to participate in stealing
1609                                     // because the victim is still in kmp_init_dispatch
1610                                 }
1611                                 if ( oldVictimIdx == victimIdx ) {
1612                                     break;
1613                                 }
1614                                 pr->u.p.parm4 = victimIdx;
1615 
1616                                 while( 1 ) {
1617                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1618                                     vnew = vold;
1619 
1620                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1621                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1622                                         break;
1623                                     }
1624                                     vnew.p.ub -= (remaining >> 2);
1625                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1626                                     #pragma warning( push )
1627                                     // disable warning on pointless comparison of unsigned with 0
1628                                     #pragma warning( disable: 186 )
1629                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1630                                     #pragma warning( pop )
1631                                     // TODO: Should this be acquire or release?
1632                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1633                                             ( volatile kmp_int64 * )&victim->u.p.count,
1634                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1635                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1636                                         status = 1;
1637                                         while_index = 0;
1638                                         // now update own count and ub
1639                                         #if KMP_ARCH_X86
1640                                         // stealing executed on non-KMP_ARCH_X86 only
1641                                             // Atomic 64-bit write on ia32 is
1642                                             // unavailable, so we do this in steps.
1643                                             //     This code is not tested.
1644                                             init = vold.p.count;
1645                                             pr->u.p.ub = 0;
1646                                             pr->u.p.count = init + 1;
1647                                             pr->u.p.ub = vnew.p.count;
1648                                         #else
1649                                             init = vnew.p.ub;
1650                                             vold.p.count = init + 1;
1651                                             // TODO: is it safe and enough?
1652                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1653                                         #endif // KMP_ARCH_X86
1654                                         break;
1655                                     } // if
1656                                 KMP_CPU_PAUSE();
1657                                 } // while (1)
1658                             } // while
1659                         } // if
1660                     } // if
1661                     if ( !status ) {
1662                         *p_lb = 0;
1663                         *p_ub = 0;
1664                         if ( p_st != NULL ) *p_st = 0;
1665                     } else {
1666                         start = pr->u.p.parm2;
1667                         init *= chunk;
1668                         limit = chunk + init - 1;
1669                         incr  = pr->u.p.st;
1670 
1671                         KMP_DEBUG_ASSERT(init <= trip);
1672                         if ( (last = (limit >= trip)) != 0 )
1673                             limit = trip;
1674                         if ( p_st != NULL ) *p_st = incr;
1675 
1676                         if ( incr == 1 ) {
1677                             *p_lb = start + init;
1678                             *p_ub = start + limit;
1679                         } else {
1680                             *p_lb = start + init * incr;
1681                             *p_ub = start + limit * incr;
1682                         }
1683 
1684                         if ( pr->ordered ) {
1685                             pr->u.p.ordered_lower = init;
1686                             pr->u.p.ordered_upper = limit;
1687                             #ifdef KMP_DEBUG
1688                             {
1689                                 const char * buff;
1690                                 // create format specifiers before the debug output
1691                                 buff = __kmp_str_format(
1692                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1693                                     traits_t< UT >::spec, traits_t< UT >::spec );
1694                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1695                                 __kmp_str_free( &buff );
1696                             }
1697                             #endif
1698                         } // if
1699                     } // if
1700                     break;
1701                 } // case
1702             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1703             case kmp_sch_static_balanced:
1704                 {
1705                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1706                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1707                         pr->u.p.count = 1;
1708                         *p_lb = pr->u.p.lb;
1709                         *p_ub = pr->u.p.ub;
1710                         last = pr->u.p.parm1;
1711                         if ( p_st != NULL )
1712                             *p_st = pr->u.p.st;
1713                     } else {  /* no iterations to do */
1714                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1715                     }
1716                     if ( pr->ordered ) {
1717                         #ifdef KMP_DEBUG
1718                         {
1719                             const char * buff;
1720                             // create format specifiers before the debug output
1721                             buff = __kmp_str_format(
1722                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1723                                 traits_t< UT >::spec, traits_t< UT >::spec );
1724                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1725                             __kmp_str_free( &buff );
1726                         }
1727                         #endif
1728                     } // if
1729                 } // case
1730                 break;
1731             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1732             case kmp_sch_static_chunked:
1733                 {
1734                     T parm1;
1735 
1736                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1737                                    gtid ) );
1738                     parm1 = pr->u.p.parm1;
1739 
1740                     trip  = pr->u.p.tc - 1;
1741                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1742 
1743                     if ( (status = (init <= trip)) != 0 ) {
1744                         start = pr->u.p.lb;
1745                         incr  = pr->u.p.st;
1746                         limit = parm1 + init - 1;
1747 
1748                         if ( (last = (limit >= trip)) != 0 )
1749                             limit = trip;
1750 
1751                         if ( p_st != NULL ) *p_st = incr;
1752 
1753                         pr->u.p.count += team->t.t_nproc;
1754 
1755                         if ( incr == 1 ) {
1756                             *p_lb = start + init;
1757                             *p_ub = start + limit;
1758                         }
1759                         else {
1760                             *p_lb = start + init * incr;
1761                             *p_ub = start + limit * incr;
1762                         }
1763 
1764                         if ( pr->ordered ) {
1765                             pr->u.p.ordered_lower = init;
1766                             pr->u.p.ordered_upper = limit;
1767                             #ifdef KMP_DEBUG
1768                             {
1769                                 const char * buff;
1770                                 // create format specifiers before the debug output
1771                                 buff = __kmp_str_format(
1772                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1773                                     traits_t< UT >::spec, traits_t< UT >::spec );
1774                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1775                                 __kmp_str_free( &buff );
1776                             }
1777                             #endif
1778                         } // if
1779                     } // if
1780                 } // case
1781                 break;
1782 
1783             case kmp_sch_dynamic_chunked:
1784                 {
1785                     T chunk = pr->u.p.parm1;
1786 
1787                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1788                                    gtid ) );
1789 
1790                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1791                     trip = pr->u.p.tc - 1;
1792 
1793                     if ( (status = (init <= trip)) == 0 ) {
1794                         *p_lb = 0;
1795                         *p_ub = 0;
1796                         if ( p_st != NULL ) *p_st = 0;
1797                     } else {
1798                         start = pr->u.p.lb;
1799                         limit = chunk + init - 1;
1800                         incr  = pr->u.p.st;
1801 
1802                         if ( (last = (limit >= trip)) != 0 )
1803                             limit = trip;
1804 
1805                         if ( p_st != NULL ) *p_st = incr;
1806 
1807                         if ( incr == 1 ) {
1808                             *p_lb = start + init;
1809                             *p_ub = start + limit;
1810                         } else {
1811                             *p_lb = start + init * incr;
1812                             *p_ub = start + limit * incr;
1813                         }
1814 
1815                         if ( pr->ordered ) {
1816                             pr->u.p.ordered_lower = init;
1817                             pr->u.p.ordered_upper = limit;
1818                             #ifdef KMP_DEBUG
1819                             {
1820                                 const char * buff;
1821                                 // create format specifiers before the debug output
1822                                 buff = __kmp_str_format(
1823                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1824                                     traits_t< UT >::spec, traits_t< UT >::spec );
1825                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1826                                 __kmp_str_free( &buff );
1827                             }
1828                             #endif
1829                         } // if
1830                     } // if
1831                 } // case
1832                 break;
1833 
1834             case kmp_sch_guided_iterative_chunked:
1835                 {
1836                     T  chunkspec = pr->u.p.parm1;
1837                     KD_TRACE(100,
1838                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1839                     trip  = pr->u.p.tc;
1840                     // Start atomic part of calculations
1841                     while(1) {
1842                         ST  remaining;             // signed, because can be < 0
1843                         init = sh->u.s.iteration;  // shared value
1844                         remaining = trip - init;
1845                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1846                             // nothing to do, don't try atomic op
1847                             status = 0;
1848                             break;
1849                         }
1850                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1851                             // use dynamic-style schedule
1852                             // atomically increment iterations, get old value
1853                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1854                             remaining = trip - init;
1855                             if (remaining <= 0) {
1856                                 status = 0;    // all iterations got by other threads
1857                             } else {
1858                                 // got some iterations to work on
1859                                 status = 1;
1860                                 if ( (T)remaining > chunkspec ) {
1861                                     limit = init + chunkspec - 1;
1862                                 } else {
1863                                     last = 1;   // the last chunk
1864                                     limit = init + remaining - 1;
1865                                 } // if
1866                             } // if
1867                             break;
1868                         } // if
1869                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1870                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1871                             // CAS was successful, chunk obtained
1872                             status = 1;
1873                             --limit;
1874                             break;
1875                         } // if
1876                     } // while
1877                     if ( status != 0 ) {
1878                         start = pr->u.p.lb;
1879                         incr = pr->u.p.st;
1880                         if ( p_st != NULL )
1881                             *p_st = incr;
1882                         *p_lb = start + init * incr;
1883                         *p_ub = start + limit * incr;
1884                         if ( pr->ordered ) {
1885                             pr->u.p.ordered_lower = init;
1886                             pr->u.p.ordered_upper = limit;
1887                             #ifdef KMP_DEBUG
1888                             {
1889                                 const char * buff;
1890                                 // create format specifiers before the debug output
1891                                 buff = __kmp_str_format(
1892                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1893                                     traits_t< UT >::spec, traits_t< UT >::spec );
1894                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1895                                 __kmp_str_free( &buff );
1896                             }
1897                             #endif
1898                         } // if
1899                     } else {
1900                         *p_lb = 0;
1901                         *p_ub = 0;
1902                         if ( p_st != NULL )
1903                             *p_st = 0;
1904                     } // if
1905                 } // case
1906                 break;
1907 
1908             case kmp_sch_guided_analytical_chunked:
1909                 {
1910                     T   chunkspec = pr->u.p.parm1;
1911                     UT chunkIdx;
1912     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1913                     /* for storing original FPCW value for Windows* OS on
1914 		       IA-32 architecture 8-byte version */
1915                     unsigned int oldFpcw;
1916                     unsigned int fpcwSet = 0;
1917     #endif
1918                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1919                                    gtid ) );
1920 
1921                     trip  = pr->u.p.tc;
1922 
1923                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1924                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1925 
1926                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1927                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1928                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1929                             --trip;
1930                             /* use dynamic-style scheduling */
1931                             init = chunkIdx * chunkspec + pr->u.p.count;
1932                             /* need to verify init > 0 in case of overflow in the above calculation */
1933                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1934                                 limit = init + chunkspec -1;
1935 
1936                                 if ( (last = (limit >= trip)) != 0 )
1937                                     limit = trip;
1938                             }
1939                             break;
1940                         } else {
1941                             /* use exponential-style scheduling */
1942                             /* The following check is to workaround the lack of long double precision on Windows* OS.
1943                                This check works around the possible effect that init != 0 for chunkIdx == 0.
1944                              */
1945     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1946                             /* If we haven't already done so, save original
1947 			       FPCW and set precision to 64-bit, as Windows* OS
1948 			       on IA-32 architecture defaults to 53-bit */
1949                             if ( !fpcwSet ) {
1950                                 oldFpcw = _control87(0,0);
1951                                 _control87(_PC_64,_MCW_PC);
1952                                 fpcwSet = 0x30000;
1953                             }
1954     #endif
1955                             if ( chunkIdx ) {
1956                                 init = __kmp_dispatch_guided_remaining< T >(
1957                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1958                                 KMP_DEBUG_ASSERT(init);
1959                                 init = trip - init;
1960                             } else
1961                                 init = 0;
1962                             limit = trip - __kmp_dispatch_guided_remaining< T >(
1963                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1964                             KMP_ASSERT(init <= limit);
1965                             if ( init < limit ) {
1966                                 KMP_DEBUG_ASSERT(limit <= trip);
1967                                 --limit;
1968                                 status = 1;
1969                                 break;
1970                             } // if
1971                         } // if
1972                     } // while (1)
1973     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1974                     /* restore FPCW if necessary
1975                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1976                     */
1977                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1978                         _control87(oldFpcw,_MCW_PC);
1979     #endif
1980                     if ( status != 0 ) {
1981                         start = pr->u.p.lb;
1982                         incr = pr->u.p.st;
1983                         if ( p_st != NULL )
1984                             *p_st = incr;
1985                         *p_lb = start + init * incr;
1986                         *p_ub = start + limit * incr;
1987                         if ( pr->ordered ) {
1988                             pr->u.p.ordered_lower = init;
1989                             pr->u.p.ordered_upper = limit;
1990                             #ifdef KMP_DEBUG
1991                             {
1992                                 const char * buff;
1993                                 // create format specifiers before the debug output
1994                                 buff = __kmp_str_format(
1995                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1996                                     traits_t< UT >::spec, traits_t< UT >::spec );
1997                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1998                                 __kmp_str_free( &buff );
1999                             }
2000                             #endif
2001                         }
2002                     } else {
2003                         *p_lb = 0;
2004                         *p_ub = 0;
2005                         if ( p_st != NULL )
2006                             *p_st = 0;
2007                     }
2008                 } // case
2009                 break;
2010 
2011             case kmp_sch_trapezoidal:
2012                 {
2013                     UT   index;
2014                     T    parm2 = pr->u.p.parm2;
2015                     T    parm3 = pr->u.p.parm3;
2016                     T    parm4 = pr->u.p.parm4;
2017                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2018                                    gtid ) );
2019 
2020                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2021 
2022                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2023                     trip = pr->u.p.tc - 1;
2024 
2025                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2026                         *p_lb = 0;
2027                         *p_ub = 0;
2028                         if ( p_st != NULL ) *p_st = 0;
2029                     } else {
2030                         start = pr->u.p.lb;
2031                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2032                         incr  = pr->u.p.st;
2033 
2034                         if ( (last = (limit >= trip)) != 0 )
2035                             limit = trip;
2036 
2037                         if ( p_st != NULL ) *p_st = incr;
2038 
2039                         if ( incr == 1 ) {
2040                             *p_lb = start + init;
2041                             *p_ub = start + limit;
2042                         } else {
2043                             *p_lb = start + init * incr;
2044                             *p_ub = start + limit * incr;
2045                         }
2046 
2047                         if ( pr->ordered ) {
2048                             pr->u.p.ordered_lower = init;
2049                             pr->u.p.ordered_upper = limit;
2050                             #ifdef KMP_DEBUG
2051                             {
2052                                 const char * buff;
2053                                 // create format specifiers before the debug output
2054                                 buff = __kmp_str_format(
2055                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2056                                     traits_t< UT >::spec, traits_t< UT >::spec );
2057                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2058                                 __kmp_str_free( &buff );
2059                             }
2060                             #endif
2061                         } // if
2062                     } // if
2063                 } // case
2064                 break;
2065             default:
2066                 {
2067                     status = 0; // to avoid complaints on uninitialized variable use
2068                     __kmp_msg(
2069                         kmp_ms_fatal,                        // Severity
2070                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2071                         KMP_HNT( GetNewerLibrary ),          // Hint
2072                         __kmp_msg_null                       // Variadic argument list terminator
2073                     );
2074                 }
2075                 break;
2076             } // switch
2077         } // if tc == 0;
2078 
2079         if ( status == 0 ) {
2080             UT   num_done;
2081 
2082             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2083             #ifdef KMP_DEBUG
2084             {
2085                 const char * buff;
2086                 // create format specifiers before the debug output
2087                 buff = __kmp_str_format(
2088                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2089                     traits_t< UT >::spec );
2090                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2091                 __kmp_str_free( &buff );
2092             }
2093             #endif
2094 
2095             if ( (ST)num_done == team->t.t_nproc-1 ) {
2096                 /* NOTE: release this buffer to be reused */
2097 
2098                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2099 
2100                 sh->u.s.num_done = 0;
2101                 sh->u.s.iteration = 0;
2102 
2103                 /* TODO replace with general release procedure? */
2104                 if ( pr->ordered ) {
2105                     sh->u.s.ordered_iteration = 0;
2106                 }
2107 
2108                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2109 
2110                 sh -> buffer_index += KMP_MAX_DISP_BUF;
2111                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2112                                 gtid, sh->buffer_index) );
2113 
2114                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2115 
2116             } // if
2117             if ( __kmp_env_consistency_check ) {
2118                 if ( pr->pushed_ws != ct_none ) {
2119                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2120                 }
2121             }
2122 
2123             th -> th.th_dispatch -> th_deo_fcn = NULL;
2124             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2125             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2126             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2127         } // if (status == 0)
2128 #if KMP_OS_WINDOWS
2129         else if ( last ) {
2130             pr->u.p.last_upper = pr->u.p.ub;
2131         }
2132 #endif /* KMP_OS_WINDOWS */
2133         if ( p_last != NULL && status != 0 )
2134             *p_last = last;
2135     } // if
2136 
2137     #ifdef KMP_DEBUG
2138     {
2139         const char * buff;
2140         // create format specifiers before the debug output
2141         buff = __kmp_str_format(
2142             "__kmp_dispatch_next: T#%%d normal case: " \
2143             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2144             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2145         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2146         __kmp_str_free( &buff );
2147     }
2148     #endif
2149 #if INCLUDE_SSC_MARKS
2150     SSC_MARK_DISPATCH_NEXT();
2151 #endif
2152     OMPT_LOOP_END;
2153     return status;
2154 }
2155 
2156 template< typename T >
2157 static void
2158 __kmp_dist_get_bounds(
2159     ident_t                          *loc,
2160     kmp_int32                         gtid,
2161     kmp_int32                        *plastiter,
2162     T                                *plower,
2163     T                                *pupper,
2164     typename traits_t< T >::signed_t  incr
2165 ) {
2166     KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2167     typedef typename traits_t< T >::unsigned_t  UT;
2168     typedef typename traits_t< T >::signed_t    ST;
2169     register kmp_uint32  team_id;
2170     register kmp_uint32  nteams;
2171     register UT          trip_count;
2172     register kmp_team_t *team;
2173     kmp_info_t * th;
2174 
2175     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2176     KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2177     #ifdef KMP_DEBUG
2178     {
2179         const char * buff;
2180         // create format specifiers before the debug output
2181         buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2182             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2183             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2184             traits_t< T >::spec );
2185         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2186         __kmp_str_free( &buff );
2187     }
2188     #endif
2189 
2190     if( __kmp_env_consistency_check ) {
2191         if( incr == 0 ) {
2192             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2193         }
2194         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2195             // The loop is illegal.
2196             // Some zero-trip loops maintained by compiler, e.g.:
2197             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2198             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2199             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2200             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2201             // Compiler does not check the following illegal loops:
2202             //   for(i=0;i<10;i+=incr) // where incr<0
2203             //   for(i=10;i>0;i-=incr) // where incr<0
2204             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2205         }
2206     }
2207     th = __kmp_threads[gtid];
2208     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2209     team = th->th.th_team;
2210     #if OMP_40_ENABLED
2211     nteams = th->th.th_teams_size.nteams;
2212     #endif
2213     team_id = team->t.t_master_tid;
2214     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2215 
2216     // compute global trip count
2217     if( incr == 1 ) {
2218         trip_count = *pupper - *plower + 1;
2219     } else if(incr == -1) {
2220         trip_count = *plower - *pupper + 1;
2221     } else {
2222         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2223     }
2224     if( trip_count <= nteams ) {
2225         KMP_DEBUG_ASSERT(
2226             __kmp_static == kmp_sch_static_greedy || \
2227             __kmp_static == kmp_sch_static_balanced
2228         ); // Unknown static scheduling type.
2229         // only some teams get single iteration, others get nothing
2230         if( team_id < trip_count ) {
2231             *pupper = *plower = *plower + team_id * incr;
2232         } else {
2233             *plower = *pupper + incr; // zero-trip loop
2234         }
2235         if( plastiter != NULL )
2236             *plastiter = ( team_id == trip_count - 1 );
2237     } else {
2238         if( __kmp_static == kmp_sch_static_balanced ) {
2239             register UT chunk = trip_count / nteams;
2240             register UT extras = trip_count % nteams;
2241             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2242             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2243             if( plastiter != NULL )
2244                 *plastiter = ( team_id == nteams - 1 );
2245         } else {
2246             register T chunk_inc_count =
2247                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2248             register T upper = *pupper;
2249             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2250                 // Unknown static scheduling type.
2251             *plower += team_id * chunk_inc_count;
2252             *pupper = *plower + chunk_inc_count - incr;
2253             // Check/correct bounds if needed
2254             if( incr > 0 ) {
2255                 if( *pupper < *plower )
2256                     *pupper = i_maxmin< T >::mx;
2257                 if( plastiter != NULL )
2258                     *plastiter = *plower <= upper && *pupper > upper - incr;
2259                 if( *pupper > upper )
2260                     *pupper = upper; // tracker C73258
2261             } else {
2262                 if( *pupper > *plower )
2263                     *pupper = i_maxmin< T >::mn;
2264                 if( plastiter != NULL )
2265                     *plastiter = *plower >= upper && *pupper < upper - incr;
2266                 if( *pupper < upper )
2267                     *pupper = upper; // tracker C73258
2268             }
2269         }
2270     }
2271 }
2272 
2273 //-----------------------------------------------------------------------------------------
2274 // Dispatch routines
2275 //    Transfer call to template< type T >
2276 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2277 //                         T lb, T ub, ST st, ST chunk )
2278 extern "C" {
2279 
2280 /*!
2281 @ingroup WORK_SHARING
2282 @{
2283 @param loc Source location
2284 @param gtid Global thread id
2285 @param schedule Schedule type
2286 @param lb  Lower bound
2287 @param ub  Upper bound
2288 @param st  Step (or increment if you prefer)
2289 @param chunk The chunk size to block with
2290 
2291 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2292 These functions are all identical apart from the types of the arguments.
2293 */
2294 
2295 void
2296 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2297                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2298 {
2299     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2300     KMP_DEBUG_ASSERT( __kmp_init_serial );
2301     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2302 }
2303 /*!
2304 See @ref __kmpc_dispatch_init_4
2305 */
2306 void
2307 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2308                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2309 {
2310     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2311     KMP_DEBUG_ASSERT( __kmp_init_serial );
2312     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2313 }
2314 
2315 /*!
2316 See @ref __kmpc_dispatch_init_4
2317 */
2318 void
2319 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2320                         kmp_int64 lb, kmp_int64 ub,
2321                         kmp_int64 st, kmp_int64 chunk )
2322 {
2323     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2324     KMP_DEBUG_ASSERT( __kmp_init_serial );
2325     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2326 }
2327 
2328 /*!
2329 See @ref __kmpc_dispatch_init_4
2330 */
2331 void
2332 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2333                          kmp_uint64 lb, kmp_uint64 ub,
2334                          kmp_int64 st, kmp_int64 chunk )
2335 {
2336     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2337     KMP_DEBUG_ASSERT( __kmp_init_serial );
2338     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2339 }
2340 
2341 /*!
2342 See @ref __kmpc_dispatch_init_4
2343 
2344 Difference from __kmpc_dispatch_init set of functions is these functions
2345 are called for composite distribute parallel for construct. Thus before
2346 regular iterations dispatching we need to calc per-team iteration space.
2347 
2348 These functions are all identical apart from the types of the arguments.
2349 */
2350 void
2351 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2352     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2353 {
2354     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2355     KMP_DEBUG_ASSERT( __kmp_init_serial );
2356     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2357     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2358 }
2359 
2360 void
2361 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2362     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2363 {
2364     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2365     KMP_DEBUG_ASSERT( __kmp_init_serial );
2366     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2367     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2368 }
2369 
2370 void
2371 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2372     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2373 {
2374     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2375     KMP_DEBUG_ASSERT( __kmp_init_serial );
2376     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2377     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2378 }
2379 
2380 void
2381 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2382     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2383 {
2384     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2385     KMP_DEBUG_ASSERT( __kmp_init_serial );
2386     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2387     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2388 }
2389 
2390 /*!
2391 @param loc Source code location
2392 @param gtid Global thread id
2393 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2394 @param p_lb   Pointer to the lower bound for the next chunk of work
2395 @param p_ub   Pointer to the upper bound for the next chunk of work
2396 @param p_st   Pointer to the stride for the next chunk of work
2397 @return one if there is work to be done, zero otherwise
2398 
2399 Get the next dynamically allocated chunk of work for this thread.
2400 If there is no more work, then the lb,ub and stride need not be modified.
2401 */
2402 int
2403 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2404                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2405 {
2406     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2407 }
2408 
2409 /*!
2410 See @ref __kmpc_dispatch_next_4
2411 */
2412 int
2413 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2414                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2415 {
2416     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2417 }
2418 
2419 /*!
2420 See @ref __kmpc_dispatch_next_4
2421 */
2422 int
2423 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2424                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2425 {
2426     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2427 }
2428 
2429 /*!
2430 See @ref __kmpc_dispatch_next_4
2431 */
2432 int
2433 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2434                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2435 {
2436     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2437 }
2438 
2439 /*!
2440 @param loc Source code location
2441 @param gtid Global thread id
2442 
2443 Mark the end of a dynamic loop.
2444 */
2445 void
2446 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2447 {
2448     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2449 }
2450 
2451 /*!
2452 See @ref __kmpc_dispatch_fini_4
2453 */
2454 void
2455 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2456 {
2457     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2458 }
2459 
2460 /*!
2461 See @ref __kmpc_dispatch_fini_4
2462 */
2463 void
2464 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2465 {
2466     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2467 }
2468 
2469 /*!
2470 See @ref __kmpc_dispatch_fini_4
2471 */
2472 void
2473 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2474 {
2475     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2476 }
2477 /*! @} */
2478 
2479 //-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources
2481 
2482 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2483     return value == checker;
2484 }
2485 
2486 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2487     return value != checker;
2488 }
2489 
2490 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2491     return value < checker;
2492 }
2493 
2494 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2495     return value >= checker;
2496 }
2497 
2498 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2499     return value <= checker;
2500 }
2501 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2502     return value == checker;
2503 }
2504 
2505 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2506     return value != checker;
2507 }
2508 
2509 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2510     return value < checker;
2511 }
2512 
2513 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2514     return value >= checker;
2515 }
2516 
2517 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2518     return value <= checker;
2519 }
2520 
2521 kmp_uint32
2522 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2523                    kmp_uint32            checker,
2524                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2525                    , void        * obj    // Higher-level synchronization object, or NULL.
2526                    )
2527 {
2528     // note: we may not belong to a team at this point
2529     register volatile kmp_uint32         * spin          = spinner;
2530     register          kmp_uint32           check         = checker;
2531     register          kmp_uint32   spins;
2532     register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2533     register          kmp_uint32           r;
2534 
2535     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2536     KMP_INIT_YIELD( spins );
2537     // main wait spin loop
2538     while(!f(r = TCR_4(*spin), check)) {
2539         KMP_FSYNC_SPIN_PREPARE( obj );
2540         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2541            It causes problems with infinite recursion because of exit lock */
2542         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2543             __kmp_abort_thread(); */
2544 
2545         /* if we have waited a bit, or are oversubscribed, yield */
2546         /* pause is in the following code */
2547         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2548         KMP_YIELD_SPIN( spins );
2549     }
2550     KMP_FSYNC_SPIN_ACQUIRED( obj );
2551     return r;
2552 }
2553 
2554 kmp_uint64
2555 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2556                     kmp_uint64            checker,
2557                     kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2558                     , void        * obj    // Higher-level synchronization object, or NULL.
2559                     )
2560 {
2561     // note: we may not belong to a team at this point
2562     register volatile kmp_uint64         * spin          = spinner;
2563     register          kmp_uint64           check         = checker;
2564     register          kmp_uint32   spins;
2565     register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2566     register          kmp_uint64           r;
2567 
2568     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2569     KMP_INIT_YIELD( spins );
2570     // main wait spin loop
2571     while(!f(r = *spin, check))
2572     {
2573         KMP_FSYNC_SPIN_PREPARE( obj );
2574         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2575            It causes problems with infinite recursion because of exit lock */
2576         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2577             __kmp_abort_thread(); */
2578 
2579         // if we are oversubscribed,
2580         // or have waited a bit (and KMP_LIBARRY=throughput, then yield
2581         // pause is in the following code
2582         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2583         KMP_YIELD_SPIN( spins );
2584     }
2585     KMP_FSYNC_SPIN_ACQUIRED( obj );
2586     return r;
2587 }
2588 
2589 } // extern "C"
2590 
2591 #ifdef KMP_GOMP_COMPAT
2592 
2593 void
2594 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2595                            kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2596                            kmp_int32 chunk, int push_ws )
2597 {
2598     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2599                                       push_ws );
2600 }
2601 
2602 void
2603 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2604                             kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2605                             kmp_int32 chunk, int push_ws )
2606 {
2607     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2608                                        push_ws );
2609 }
2610 
2611 void
2612 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2613                            kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2614                            kmp_int64 chunk, int push_ws )
2615 {
2616     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2617                                       push_ws );
2618 }
2619 
2620 void
2621 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2622                             kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2623                             kmp_int64 chunk, int push_ws )
2624 {
2625     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2626                                        push_ws );
2627 }
2628 
2629 void
2630 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2631 {
2632     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2633 }
2634 
2635 void
2636 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2637 {
2638     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2639 }
2640 
2641 void
2642 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2643 {
2644     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2645 }
2646 
2647 void
2648 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2649 {
2650     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2651 }
2652 
2653 #endif /* KMP_GOMP_COMPAT */
2654 
2655 /* ------------------------------------------------------------------------ */
2656 /* ------------------------------------------------------------------------ */
2657 
2658