1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
 * NOTE: __kmp_nth is constant inside any dispatch loop; however, it may
 *       change between parallel regions.  __kmp_max_nth is the largest value
 *       __kmp_nth may take, and 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 #include "kmp.h"
29 #include "kmp_i18n.h"
30 #include "kmp_itt.h"
31 #include "kmp_str.h"
32 #include "kmp_error.h"
33 #include "kmp_stats.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35     #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-internal.h"
40 #include "ompt-specific.h"
41 #endif
42 
43 /* ------------------------------------------------------------------------ */
44 /* ------------------------------------------------------------------------ */
45 
46 // template for type limits
47 template< typename T >
48 struct i_maxmin {
49     static const T mx;
50     static const T mn;
51 };
52 template<>
53 struct i_maxmin< int > {
54     static const int mx = 0x7fffffff;
55     static const int mn = 0x80000000;
56 };
57 template<>
58 struct i_maxmin< unsigned int > {
59     static const unsigned int mx = 0xffffffff;
60     static const unsigned int mn = 0x00000000;
61 };
62 template<>
63 struct i_maxmin< long long > {
64     static const long long mx = 0x7fffffffffffffffLL;
65     static const long long mn = 0x8000000000000000LL;
66 };
67 template<>
68 struct i_maxmin< unsigned long long > {
69     static const unsigned long long mx = 0xffffffffffffffffLL;
70     static const unsigned long long mn = 0x0000000000000000LL;
71 };
72 //-------------------------------------------------------------------------
73 
74 #ifdef KMP_STATIC_STEAL_ENABLED
75 
76     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77     template< typename T >
78     struct dispatch_private_infoXX_template {
79         typedef typename traits_t< T >::unsigned_t  UT;
80         typedef typename traits_t< T >::signed_t    ST;
81         UT count;                // unsigned
82         T  ub;
83         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84         T  lb;
85         ST st;                   // signed
86         UT tc;                   // unsigned
87         T  static_steal_counter; // for static_steal only; maybe better to put after ub
88 
89         /* parm[1-4] are used in different ways by different scheduling algorithms */
90 
91         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92         //    a) parm3 is properly aligned and
93         //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // if they are in the same cache line (not measured, though).
96 
97         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98             T  parm1;
99             T  parm2;
100             T  parm3;
101             T  parm4;
102         };
103 
104         UT ordered_lower; // unsigned
105         UT ordered_upper; // unsigned
106         #if KMP_OS_WINDOWS
107         T  last_upper;
108         #endif /* KMP_OS_WINDOWS */
109     };
110 
111 #else /* KMP_STATIC_STEAL_ENABLED */
112 
113     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114     template< typename T >
115     struct dispatch_private_infoXX_template {
116         typedef typename traits_t< T >::unsigned_t  UT;
117         typedef typename traits_t< T >::signed_t    ST;
118         T  lb;
119         T  ub;
120         ST st;            // signed
121         UT tc;            // unsigned
122 
123         T  parm1;
124         T  parm2;
125         T  parm3;
126         T  parm4;
127 
128         UT count;         // unsigned
129 
130         UT ordered_lower; // unsigned
131         UT ordered_upper; // unsigned
132         #if KMP_OS_WINDOWS
        T  last_upper;
134         #endif /* KMP_OS_WINDOWS */
135     };
136 
137 #endif /* KMP_STATIC_STEAL_ENABLED */
138 
139 // replaces dispatch_private_info structure and dispatch_private_info_t type
140 template< typename T >
141 struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate alignment here, otherwise the structure size is not correct in our compiler
143     union KMP_ALIGN_CACHE private_info_tmpl {
144         dispatch_private_infoXX_template< T > p;
145         dispatch_private_info64_t             p64;
146     } u;
147     enum sched_type schedule;  /* scheduling algorithm */
148     kmp_uint32      ordered;   /* ordered clause specified */
149     kmp_uint32      ordered_bumped;
    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
151     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152     kmp_uint32      nomerge;   /* don't merge iters if serialized */
153     kmp_uint32      type_size;
154     enum cons_type  pushed_ws;
155 };
156 
157 
158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159 template< typename UT >
160 struct dispatch_shared_infoXX_template {
161     /* chunk index under dynamic, number of idle threads under static-steal;
162        iteration index otherwise */
163     volatile UT     iteration;
164     volatile UT     num_done;
165     volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
167 };
168 
169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
170 template< typename UT >
171 struct dispatch_shared_info_template {
172     // we need union here to keep the structure size
173     union shared_info_tmpl {
174         dispatch_shared_infoXX_template< UT >  s;
175         dispatch_shared_info64_t               s64;
176     } u;
177     volatile kmp_uint32     buffer_index;
178 };
179 
180 /* ------------------------------------------------------------------------ */
181 /* ------------------------------------------------------------------------ */
182 
183 #undef USE_TEST_LOCKS
184 
185 // test_then_add template (general template should NOT be used)
186 template< typename T >
187 static __forceinline T
188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
189 
190 template<>
191 __forceinline kmp_int32
192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193 {
194     kmp_int32 r;
195     r = KMP_TEST_THEN_ADD32( p, d );
196     return r;
197 }
198 
199 template<>
200 __forceinline kmp_int64
201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202 {
203     kmp_int64 r;
204     r = KMP_TEST_THEN_ADD64( p, d );
205     return r;
206 }
207 
208 // test_then_inc_acq template (general template should NOT be used)
209 template< typename T >
210 static __forceinline T
211 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
212 
213 template<>
214 __forceinline kmp_int32
215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216 {
217     kmp_int32 r;
218     r = KMP_TEST_THEN_INC_ACQ32( p );
219     return r;
220 }
221 
222 template<>
223 __forceinline kmp_int64
224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225 {
226     kmp_int64 r;
227     r = KMP_TEST_THEN_INC_ACQ64( p );
228     return r;
229 }
230 
231 // test_then_inc template (general template should NOT be used)
232 template< typename T >
233 static __forceinline T
234 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
235 
236 template<>
237 __forceinline kmp_int32
238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239 {
240     kmp_int32 r;
241     r = KMP_TEST_THEN_INC32( p );
242     return r;
243 }
244 
245 template<>
246 __forceinline kmp_int64
247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248 {
249     kmp_int64 r;
250     r = KMP_TEST_THEN_INC64( p );
251     return r;
252 }
253 
254 // compare_and_swap template (general template should NOT be used)
255 template< typename T >
256 static __forceinline kmp_int32
257 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
258 
259 template<>
260 __forceinline kmp_int32
261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262 {
263     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264 }
265 
266 template<>
267 __forceinline kmp_int32
268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269 {
270     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271 }
272 
273 /*
274     Spin wait loop that first does pause, then yield.
275     Waits until function returns non-zero when called with *spinner and check.
276     Does NOT put threads to sleep.
277 #if USE_ITT_BUILD
278     Arguments:
279         obj -- is higher-level synchronization object to report to ittnotify. It is used to report
280             locks consistently. For example, if lock is acquired immediately, its address is
281             reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired
282             immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same
283             address, not an address of low-level spinner.
284 #endif // USE_ITT_BUILD
285 */
286 template< typename UT >
287 // ToDo: make inline function (move to header file for icl)
288 static UT  // unsigned 4- or 8-byte type
289 __kmp_wait_yield( volatile UT * spinner,
290                   UT            checker,
291                   kmp_uint32 (* pred)( UT, UT )
292                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
293                   )
294 {
295     // note: we may not belong to a team at this point
296     register volatile UT         * spin          = spinner;
297     register          UT           check         = checker;
298     register          kmp_uint32   spins;
299     register          kmp_uint32 (*f) ( UT, UT ) = pred;
300     register          UT           r;
301 
302     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303     KMP_INIT_YIELD( spins );
304     // main wait spin loop
305     while(!f(r = *spin, check))
306     {
307         KMP_FSYNC_SPIN_PREPARE( obj );
308         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309            It causes problems with infinite recursion because of exit lock */
310         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311             __kmp_abort_thread(); */
312 
        // If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput),
        // then yield; the pause is in the following code.
316         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317         KMP_YIELD_SPIN( spins );
318     }
319     KMP_FSYNC_SPIN_ACQUIRED( obj );
320     return r;
321 }
322 
323 template< typename UT >
324 static kmp_uint32 __kmp_eq( UT value, UT checker) {
325     return value == checker;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_neq( UT value, UT checker) {
330     return value != checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_lt( UT value, UT checker) {
335     return value < checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_ge( UT value, UT checker) {
340     return value >= checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_le( UT value, UT checker) {
345     return value <= checker;
346 }
347 
348 
349 /* ------------------------------------------------------------------------ */
350 /* ------------------------------------------------------------------------ */
351 
352 static void
353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354 {
355     kmp_info_t *th;
356 
357     KMP_DEBUG_ASSERT( gtid_ref );
358 
359     if ( __kmp_env_consistency_check ) {
360         th = __kmp_threads[*gtid_ref];
361         if ( th -> th.th_root -> r.r_active
362           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
363 #if KMP_USE_DYNAMIC_LOCK
364             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365 #else
366             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
367 #endif
368         }
369     }
370 }
371 
372 template< typename UT >
373 static void
374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375 {
376     typedef typename traits_t< UT >::signed_t    ST;
377     dispatch_private_info_template< UT > * pr;
378 
379     int gtid = *gtid_ref;
380 //    int  cid = *cid_ref;
381     kmp_info_t *th = __kmp_threads[ gtid ];
382     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383 
384     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385     if ( __kmp_env_consistency_check ) {
386         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387             ( th -> th.th_dispatch -> th_dispatch_pr_current );
388         if ( pr -> pushed_ws != ct_none ) {
389 #if KMP_USE_DYNAMIC_LOCK
390             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391 #else
392             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
393 #endif
394         }
395     }
396 
397     if ( ! th -> th.th_team -> t.t_serialized ) {
398         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399             ( th -> th.th_dispatch -> th_dispatch_sh_current );
400         UT  lower;
401 
402         if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
405         }
406         lower = pr->u.p.ordered_lower;
407 
408         #if ! defined( KMP_GOMP_COMPAT )
409             if ( __kmp_env_consistency_check ) {
410                 if ( pr->ordered_bumped ) {
411                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412                     __kmp_error_construct2(
413                         kmp_i18n_msg_CnsMultipleNesting,
414                         ct_ordered_in_pdo, loc_ref,
415                         & p->stack_data[ p->w_top ]
416                     );
417                 }
418             }
419         #endif /* !defined(KMP_GOMP_COMPAT) */
420 
421         KMP_MB();
422         #ifdef KMP_DEBUG
423         {
424             const char * buff;
425             // create format specifiers before the debug output
426             buff = __kmp_str_format(
427                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428                 traits_t< UT >::spec, traits_t< UT >::spec );
429             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430             __kmp_str_free( &buff );
431         }
432         #endif
433 
434         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435                                 USE_ITT_BUILD_ARG( NULL )
436                                 );
437         KMP_MB();  /* is this necessary? */
438         #ifdef KMP_DEBUG
439         {
440             const char * buff;
441             // create format specifiers before the debug output
442             buff = __kmp_str_format(
443                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444                 traits_t< UT >::spec, traits_t< UT >::spec );
445             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446             __kmp_str_free( &buff );
447         }
448         #endif
449     }
450     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451 }
452 
453 static void
454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455 {
456     kmp_info_t *th;
457 
458     if ( __kmp_env_consistency_check ) {
459         th = __kmp_threads[*gtid_ref];
460         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462         }
463     }
464 }
465 
466 template< typename UT >
467 static void
468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469 {
470     typedef typename traits_t< UT >::signed_t    ST;
471     dispatch_private_info_template< UT > * pr;
472 
473     int gtid = *gtid_ref;
474 //    int  cid = *cid_ref;
475     kmp_info_t *th = __kmp_threads[ gtid ];
476     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477 
478     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479     if ( __kmp_env_consistency_check ) {
480         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481             ( th -> th.th_dispatch -> th_dispatch_pr_current );
482         if ( pr -> pushed_ws != ct_none ) {
483             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484         }
485     }
486 
487     if ( ! th -> th.th_team -> t.t_serialized ) {
488         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489             ( th -> th.th_dispatch -> th_dispatch_sh_current );
490 
491         if ( ! __kmp_env_consistency_check ) {
492             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
494         }
495 
496         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497         #if ! defined( KMP_GOMP_COMPAT )
498             if ( __kmp_env_consistency_check ) {
499                 if ( pr->ordered_bumped != 0 ) {
500                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501                     /* How to test it? - OM */
502                     __kmp_error_construct2(
503                         kmp_i18n_msg_CnsMultipleNesting,
504                         ct_ordered_in_pdo, loc_ref,
505                         & p->stack_data[ p->w_top ]
506                     );
507                 }
508             }
509         #endif /* !defined(KMP_GOMP_COMPAT) */
510 
511         KMP_MB();       /* Flush all pending memory write invalidates.  */
512 
513         pr->ordered_bumped += 1;
514 
515         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516                         gtid, pr->ordered_bumped ) );
517 
518         KMP_MB();       /* Flush all pending memory write invalidates.  */
519 
520         /* TODO use general release procedure? */
521         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522 
523         KMP_MB();       /* Flush all pending memory write invalidates.  */
524     }
525     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526 }
527 
/* Computes and returns x to the power of y, where y must be a non-negative integer */
529 template< typename UT >
530 static __forceinline long double
531 __kmp_pow(long double x, UT y) {
532     long double s=1.0L;
533 
534     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536     while(y) {
537         if ( y & 1 )
538             s *= x;
539         x *= x;
540         y >>= 1;
541     }
542     return s;
543 }
544 
/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken here: if this function is __forceinline'd, the behavior is
   wrong (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
*/
550 template< typename T >
551 static __inline typename traits_t< T >::unsigned_t
552 __kmp_dispatch_guided_remaining(
553     T                                  tc,
554     typename traits_t< T >::floating_t base,
555     typename traits_t< T >::unsigned_t idx
556 ) {
557     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558        least for ICL 8.1, long double arithmetic may not really have
559        long double precision, even with /Qlong_double.  Currently, we
560        workaround that in the caller code, by manipulating the FPCW for
561        Windows* OS on IA-32 architecture.  The lack of precision is not
562        expected to be a correctness issue, though.
563     */
564     typedef typename traits_t< T >::unsigned_t  UT;
565 
566     long double x = tc * __kmp_pow< UT >(base, idx);
567     UT r = (UT) x;
568     if ( x == r )
569         return r;
570     return r + 1;
571 }
572 
// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
578 static int guided_int_param = 2;
579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
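// Illustration (hypothetical values): with the default n = 2, nproc = 8 and chunk = 7,
// the guided-iterative schedule below uses
//   parm2 = 2 * 8 * (7 + 1) = 128   // switch to dynamic once fewer than parm2 iterations remain
//   parm3 = 0.5 / 8 = 0.0625        // each chunk is roughly 1/16 of the remaining iterations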
580 
581 // UT - unsigned flavor of T, ST - signed flavor of T,
582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583 template< typename T >
584 static void
585 __kmp_dispatch_init(
586     ident_t                        * loc,
587     int                              gtid,
588     enum sched_type                  schedule,
589     T                                lb,
590     T                                ub,
591     typename traits_t< T >::signed_t st,
592     typename traits_t< T >::signed_t chunk,
593     int                              push_ws
594 ) {
595     typedef typename traits_t< T >::unsigned_t  UT;
596     typedef typename traits_t< T >::signed_t    ST;
597     typedef typename traits_t< T >::floating_t  DBL;
598     static const int ___kmp_size_type = sizeof( UT );
599 
600     int                                            active;
601     T                                              tc;
602     kmp_info_t *                                   th;
603     kmp_team_t *                                   team;
604     kmp_uint32                                     my_buffer_index;
605     dispatch_private_info_template< T >          * pr;
606     dispatch_shared_info_template< UT > volatile * sh;
607 
608     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610 
611     if ( ! TCR_4( __kmp_init_parallel ) )
612         __kmp_parallel_initialize();
613 
614 #if INCLUDE_SSC_MARKS
615     SSC_MARK_DISPATCH_INIT();
616 #endif
617     #ifdef KMP_DEBUG
618     {
619         const char * buff;
620         // create format specifiers before the debug output
621         buff = __kmp_str_format(
622             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625         __kmp_str_free( &buff );
626     }
627     #endif
628     /* setup data */
629     th     = __kmp_threads[ gtid ];
630     team   = th -> th.th_team;
631     active = ! team -> t.t_serialized;
632     th->th.th_ident = loc;
633 
634 #if USE_ITT_BUILD
635     kmp_uint64 cur_chunk = chunk;
636     int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
637         KMP_MASTER_GTID(gtid) &&
638 #if OMP_40_ENABLED
639         th->th.th_teams_microtask == NULL &&
640 #endif
641         team->t.t_active_level == 1;
642 #endif
643     if ( ! active ) {
644         pr = reinterpret_cast< dispatch_private_info_template< T >* >
645             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
646     } else {
647         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
648                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
649 
650         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
651 
        /* What happens when the number of threads changes? Do we need to resize the buffer? */
653         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
654             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
655         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
656             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
657     }
658 
659     /* Pick up the nomerge/ordered bits from the scheduling type */
660     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
661         pr->nomerge = TRUE;
662         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
663     } else {
664         pr->nomerge = FALSE;
665     }
666     pr->type_size = ___kmp_size_type; // remember the size of variables
667     if ( kmp_ord_lower & schedule ) {
668         pr->ordered = TRUE;
669         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
670     } else {
671         pr->ordered = FALSE;
672     }
673     if ( schedule == kmp_sch_static ) {
674         schedule = __kmp_static;
675     } else {
676         if ( schedule == kmp_sch_runtime ) {
677             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
678             schedule = team -> t.t_sched.r_sched_type;
679             // Detail the schedule if needed (global controls are differentiated appropriately)
680             if ( schedule == kmp_sch_guided_chunked ) {
681                 schedule = __kmp_guided;
682             } else if ( schedule == kmp_sch_static ) {
683                 schedule = __kmp_static;
684             }
685             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
686             chunk = team -> t.t_sched.chunk;
687 
688             #ifdef KMP_DEBUG
689             {
690                 const char * buff;
691                 // create format specifiers before the debug output
692                 buff = __kmp_str_format(
693                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
694                     traits_t< ST >::spec );
695                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
696                 __kmp_str_free( &buff );
697             }
698             #endif
699         } else {
700             if ( schedule == kmp_sch_guided_chunked ) {
701                 schedule = __kmp_guided;
702             }
703             if ( chunk <= 0 ) {
704                 chunk = KMP_DEFAULT_CHUNK;
705             }
706         }
707 
708         if ( schedule == kmp_sch_auto ) {
709             // mapping and differentiation: in the __kmp_do_serial_initialize()
710             schedule = __kmp_auto;
711             #ifdef KMP_DEBUG
712             {
713                 const char * buff;
714                 // create format specifiers before the debug output
715                 buff = __kmp_str_format(
716                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
717                     traits_t< ST >::spec );
718                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
719                 __kmp_str_free( &buff );
720             }
721             #endif
722         }
723 
724         /* guided analytical not safe for too many threads */
725         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
726             schedule = kmp_sch_guided_iterative_chunked;
727             KMP_WARNING( DispatchManyThreads );
728         }
729         pr->u.p.parm1 = chunk;
730     }
731     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
732                 "unknown scheduling type" );
733 
734     pr->u.p.count = 0;
735 
736     if ( __kmp_env_consistency_check ) {
737         if ( st == 0 ) {
738             __kmp_error_construct(
739                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
740                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
741             );
742         }
743     }
744 
745     tc = ( ub - lb + st );
746     if ( st != 1 ) {
747         if ( st < 0 ) {
748             if ( lb < ub ) {
749                 tc = 0;            // zero-trip
750             } else {   // lb >= ub
751                 tc = (ST)tc / st;  // convert to signed division
752             }
753         } else {       // st > 0
754             if ( ub < lb ) {
755                 tc = 0;            // zero-trip
            } else {   // ub >= lb
757                 tc /= st;
758             }
759         }
760     } else if ( ub < lb ) {        // st == 1
761         tc = 0;                    // zero-trip
762     }
763 
764     pr->u.p.lb = lb;
765     pr->u.p.ub = ub;
766     pr->u.p.st = st;
767     pr->u.p.tc = tc;
768 
769     #if KMP_OS_WINDOWS
770     pr->u.p.last_upper = ub + st;
771     #endif /* KMP_OS_WINDOWS */
772 
    /* NOTE: only the active parallel region(s) have active ordered sections */
774 
775     if ( active ) {
776         if ( pr->ordered == 0 ) {
777             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
778             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
779         } else {
780             pr->ordered_bumped = 0;
781 
782             pr->u.p.ordered_lower = 1;
783             pr->u.p.ordered_upper = 0;
784 
785             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
786             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
787         }
788     }
789 
790     if ( __kmp_env_consistency_check ) {
791         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
792         if ( push_ws ) {
793             __kmp_push_workshare( gtid, ws, loc );
794             pr->pushed_ws = ws;
795         } else {
796             __kmp_check_workshare( gtid, ws, loc );
797             pr->pushed_ws = ct_none;
798         }
799     }
800 
801     switch ( schedule ) {
802     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
803     case kmp_sch_static_steal:
804         {
805             T nproc = team->t.t_nproc;
806             T ntc, init;
807 
808             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
809 
810             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
811             if ( nproc > 1 && ntc >= nproc ) {
812                 T id = __kmp_tid_from_gtid(gtid);
813                 T small_chunk, extras;
814 
815                 small_chunk = ntc / nproc;
816                 extras = ntc % nproc;
817 
818                 init = id * small_chunk + ( id < extras ? id : extras );
819                 pr->u.p.count = init;
820                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
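                // Illustration (hypothetical values): with tc = 25, chunk = 3 and nproc = 4
                // there are ntc = 9 chunks; small_chunk = 2 and extras = 1, so the initial
                // per-thread chunk ranges are [0,3), [3,5), [5,7) and [7,9) for ids 0..3;
                // chunks outside the local range may later be obtained by stealing.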
821 
822                 pr->u.p.parm2 = lb;
823                 //pr->pfields.parm3 = 0; // it's not used in static_steal
824                 pr->u.p.parm4 = id;
825                 pr->u.p.st = st;
826                 break;
827             } else {
828                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
829                                gtid ) );
830                 schedule = kmp_sch_static_balanced;
831                 /* too few iterations: fall-through to kmp_sch_static_balanced */
832             } // if
833             /* FALL-THROUGH to static balanced */
834         } // case
835     #endif
836     case kmp_sch_static_balanced:
837         {
838             T nproc = team->t.t_nproc;
839             T init, limit;
840 
841             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
842                             gtid ) );
843 
844             if ( nproc > 1 ) {
845                 T id = __kmp_tid_from_gtid(gtid);
846 
847                 if ( tc < nproc ) {
848                     if ( id < tc ) {
849                         init = id;
850                         limit = id;
851                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
852                     } else {
853                         pr->u.p.count = 1;  /* means no more chunks to execute */
854                         pr->u.p.parm1 = FALSE;
855                         break;
856                     }
857                 } else {
858                     T small_chunk = tc / nproc;
859                     T extras = tc % nproc;
860                     init = id * small_chunk + (id < extras ? id : extras);
861                     limit = init + small_chunk - (id < extras ? 0 : 1);
862                     pr->u.p.parm1 = (id == nproc - 1);
863                 }
864             } else {
865                 if ( tc > 0 ) {
866                     init = 0;
867                     limit = tc - 1;
868                     pr->u.p.parm1 = TRUE;
869                 } else {
870                     // zero trip count
871                     pr->u.p.count = 1;  /* means no more chunks to execute */
872                     pr->u.p.parm1 = FALSE;
873                     break;
874                 }
875             }
876 #if USE_ITT_BUILD
877             // Calculate chunk for metadata report
878             if ( itt_need_metadata_reporting )
879                 cur_chunk = limit - init + 1;
880 #endif
881             if ( st == 1 ) {
882                 pr->u.p.lb = lb + init;
883                 pr->u.p.ub = lb + limit;
884             } else {
885                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
886                 pr->u.p.lb = lb + init * st;
887                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
888                 if ( st > 0 ) {
889                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
890                 } else {
891                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
892                 }
893             }
894             if ( pr->ordered ) {
895                 pr->u.p.ordered_lower = init;
896                 pr->u.p.ordered_upper = limit;
897             }
898             break;
899         } // case
900     case kmp_sch_guided_iterative_chunked :
901         {
902             T nproc = team->t.t_nproc;
903             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
904 
905             if ( nproc > 1 ) {
906                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
907                     /* chunk size too large, switch to dynamic */
908                     schedule = kmp_sch_dynamic_chunked;
909                 } else {
                    // when remaining iterations become less than parm2, switch to dynamic
911                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
912                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
913                 }
914             } else {
915                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
916                 schedule = kmp_sch_static_greedy;
917                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
918                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
919                 pr->u.p.parm1 = tc;
920             } // if
921         } // case
922         break;
923     case kmp_sch_guided_analytical_chunked:
924         {
925             T nproc = team->t.t_nproc;
926             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
927 
928             if ( nproc > 1 ) {
929                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
930                     /* chunk size too large, switch to dynamic */
931                     schedule = kmp_sch_dynamic_chunked;
932                 } else {
933                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
934                     DBL x;
935 
936                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
947                     // save original FPCW and set precision to 64-bit, as
948                     // Windows* OS on IA-32 architecture defaults to 53-bit
949                     unsigned int oldFpcw = _control87(0,0);
950                     _control87(_PC_64,_MCW_PC); // 0,0x30000
951                     #endif
952                     /* value used for comparison in solver for cross-over point */
953                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
954 
955                     /* crossover point--chunk indexes equal to or greater than
956 		       this point switch to dynamic-style scheduling */
957                     UT   cross;
958 
959                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
960                     x = (long double)1.0 - (long double)0.5 / nproc;
961 
962                     #ifdef KMP_DEBUG
963                     { // test natural alignment
964                         struct _test_a {
965                             char a;
966                             union {
967                                 char b;
968                                 DBL  d;
969                             };
970                         } t;
971                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
972                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
973                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
974                     }
975                     #endif // KMP_DEBUG
976 
977                     /* save the term in thread private dispatch structure */
978                     *(DBL*)&pr->u.p.parm3 = x;
979 
980                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
981                     {
982                         UT          left, right, mid;
983                         long double p;
984 
985                         /* estimate initial upper and lower bound */
986 
987                         /* doesn't matter what value right is as long as it is positive, but
988                            it affects performance of the solver
989                         */
990                         right = 229;
991                         p = __kmp_pow< UT >(x,right);
992                         if ( p > target ) {
993                             do{
994                                 p *= p;
995                                 right <<= 1;
996                             } while(p>target && right < (1<<27));
997                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
998                         } else {
999                             left = 0;
1000                         }
1001 
1002                         /* bisection root-finding method */
1003                         while ( left + 1 < right ) {
1004                             mid = (left + right) / 2;
1005                             if ( __kmp_pow< UT >(x,mid) > target ) {
1006                                 left = mid;
1007                             } else {
1008                                 right = mid;
1009                             }
1010                         } // while
1011                         cross = right;
1012                     }
1013                     /* assert sanity of computed crossover point */
1014                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1015 
1016                     /* save the crossover point in thread private dispatch structure */
1017                     pr->u.p.parm2 = cross;
1018 
1019                     // C75803
1020                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1021                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1022                     #else
1023                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1024                     #endif
1025                     /* dynamic-style scheduling offset */
1026                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1027                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1028                         // restore FPCW
1029                         _control87(oldFpcw,_MCW_PC);
1030                     #endif
1031                 } // if
1032             } else {
1033                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1034                                gtid ) );
1035                 schedule = kmp_sch_static_greedy;
1036                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1037                 pr->u.p.parm1 = tc;
1038             } // if
1039         } // case
1040         break;
1041     case kmp_sch_static_greedy:
1042         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
        pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
            ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
            tc;
1046         break;
1047     case kmp_sch_static_chunked :
1048     case kmp_sch_dynamic_chunked :
1049         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1050         break;
1051     case kmp_sch_trapezoidal :
1052         {
1053             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1054 
1055             T parm1, parm2, parm3, parm4;
1056             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1057 
1058             parm1 = chunk;
1059 
1060             /* F : size of the first cycle */
1061             parm2 = ( tc / (2 * team->t.t_nproc) );
1062 
1063             if ( parm2 < 1 ) {
1064                 parm2 = 1;
1065             }
1066 
1067             /* L : size of the last cycle.  Make sure the last cycle
1068              *     is not larger than the first cycle.
1069              */
1070             if ( parm1 < 1 ) {
1071                 parm1 = 1;
1072             } else if ( parm1 > parm2 ) {
1073                 parm1 = parm2;
1074             }
1075 
1076             /* N : number of cycles */
1077             parm3 = ( parm2 + parm1 );
1078             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1079 
1080             if ( parm3 < 2 ) {
1081                 parm3 = 2;
1082             }
1083 
1084             /* sigma : decreasing incr of the trapezoid */
1085             parm4 = ( parm3 - 1 );
1086             parm4 = ( parm2 - parm1 ) / parm4;
1087 
1088             // pointless check, because parm4 >= 0 always
1089             //if ( parm4 < 0 ) {
1090             //    parm4 = 0;
1091             //}
1092 
1093             pr->u.p.parm1 = parm1;
1094             pr->u.p.parm2 = parm2;
1095             pr->u.p.parm3 = parm3;
1096             pr->u.p.parm4 = parm4;
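
            // Illustration (hypothetical values): with tc = 100, nproc = 2 and chunk = 1 the
            // formulas above give parm2 = 25 (first cycle), parm1 = 1 (last cycle), parm3 = 8
            // (number of cycles) and parm4 = 3 (per-cycle decrement); the cycle sizes
            // 25, 22, 19, 16, 13, 10, 7, 4 cover at least the 100 iterations.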
1097         } // case
1098         break;
1099 
1100     default:
1101         {
1102             __kmp_msg(
1103                 kmp_ms_fatal,                        // Severity
1104                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1105                 KMP_HNT( GetNewerLibrary ),          // Hint
1106                 __kmp_msg_null                       // Variadic argument list terminator
1107             );
1108         }
1109         break;
1110     } // switch
1111     pr->schedule = schedule;
1112     if ( active ) {
        /* Wait until sh->buffer_index reaches my_buffer_index, i.e. until this buffer is free to use */
1114 
1115         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1116                         gtid, my_buffer_index, sh->buffer_index) );
1117         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1118                                         USE_ITT_BUILD_ARG( NULL )
1119                                         );
            // Note: KMP_WAIT_YIELD() cannot be used here: buffer index and my_buffer_index are
            // *always* 32-bit integers.
1122         KMP_MB();  /* is this necessary? */
1123         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1124                         gtid, my_buffer_index, sh->buffer_index) );
1125 
1126         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1127         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1128 #if USE_ITT_BUILD
1129         if ( pr->ordered ) {
1130             __kmp_itt_ordered_init( gtid );
1131         }; // if
1132         // Report loop metadata
1133         if ( itt_need_metadata_reporting ) {
1134             // Only report metadata by master of active team at level 1
1135             kmp_uint64 schedtype = 0;
1136             switch ( schedule ) {
1137             case kmp_sch_static_chunked:
1138             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1139                 break;
1140             case kmp_sch_static_greedy:
1141                 cur_chunk = pr->u.p.parm1;
1142                 break;
1143             case kmp_sch_dynamic_chunked:
1144                 schedtype = 1;
1145                 break;
1146             case kmp_sch_guided_iterative_chunked:
1147             case kmp_sch_guided_analytical_chunked:
1148                 schedtype = 2;
1149                 break;
1150             default:
1151 //            Should we put this case under "static"?
1152 //            case kmp_sch_static_steal:
1153                 schedtype = 3;
1154                 break;
1155             }
1156             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1157         }
1158 #endif /* USE_ITT_BUILD */
1159     }; // if
1160 
1161     #ifdef KMP_DEBUG
1162     {
1163         const char * buff;
1164         // create format specifiers before the debug output
1165         buff = __kmp_str_format(
1166             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1167             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1168             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1169             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1170             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1171             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1172             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1173         KD_TRACE(10, ( buff,
1174             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1175             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1176             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1177             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1178         __kmp_str_free( &buff );
1179     }
1180     #endif
1181     #if ( KMP_STATIC_STEAL_ENABLED )
1182     if ( ___kmp_size_type < 8 ) {
      // It cannot be guaranteed that after execution of a loop with some other schedule kind
      // all the parm3 variables will contain the same value.
      // Even if all parm3 values were the same, there would still be a bad case, such as
      // toggling between 0 and 1 rather than incrementing over the program lifetime.
      // So a dedicated variable is required; 'static_steal_counter' is used.
1188       if( schedule == kmp_sch_static_steal ) {
        // Other threads will inspect this variable when searching for a victim.
        // This is a flag showing that other threads may steal from this thread from now on.
1191         volatile T * p = &pr->u.p.static_steal_counter;
1192         *p = *p + 1;
1193       }
1194     }
    #endif // ( KMP_STATIC_STEAL_ENABLED )
1196 
1197 #if OMPT_SUPPORT && OMPT_TRACE
1198     if ((ompt_status == ompt_status_track_callback) &&
1199         ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1200         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1201         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1202         ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1203             team_info->parallel_id, task_info->task_id, team_info->microtask);
1204     }
1205 #endif
1206 }
1207 
1208 /*
1209  * For ordered loops, either __kmp_dispatch_finish() should be called after
1210  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1211  * every chunk of iterations.  If the ordered section(s) were not executed
1212  * for this iteration (or every iteration in this chunk), we need to set the
1213  * ordered iteration counters so that the next thread can proceed.
1214  */
1215 template< typename UT >
1216 static void
1217 __kmp_dispatch_finish( int gtid, ident_t *loc )
1218 {
1219     typedef typename traits_t< UT >::signed_t ST;
1220     kmp_info_t *th = __kmp_threads[ gtid ];
1221 
1222     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1223     if ( ! th -> th.th_team -> t.t_serialized ) {
1224 
1225         dispatch_private_info_template< UT > * pr =
1226             reinterpret_cast< dispatch_private_info_template< UT >* >
1227             ( th->th.th_dispatch->th_dispatch_pr_current );
1228         dispatch_shared_info_template< UT > volatile * sh =
1229             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1230             ( th->th.th_dispatch->th_dispatch_sh_current );
1231         KMP_DEBUG_ASSERT( pr );
1232         KMP_DEBUG_ASSERT( sh );
1233         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1234                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1235 
1236         if ( pr->ordered_bumped ) {
1237             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1238                             gtid ) );
1239             pr->ordered_bumped = 0;
1240         } else {
1241             UT lower = pr->u.p.ordered_lower;
1242 
1243             #ifdef KMP_DEBUG
1244             {
1245                 const char * buff;
1246                 // create format specifiers before the debug output
1247                 buff = __kmp_str_format(
1248                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1249                     traits_t< UT >::spec, traits_t< UT >::spec );
1250                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1251                 __kmp_str_free( &buff );
1252             }
1253             #endif
1254 
1255             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1256                                    USE_ITT_BUILD_ARG(NULL)
1257                                    );
1258             KMP_MB();  /* is this necessary? */
1259             #ifdef KMP_DEBUG
1260             {
1261                 const char * buff;
1262                 // create format specifiers before the debug output
1263                 buff = __kmp_str_format(
1264                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1265                     traits_t< UT >::spec, traits_t< UT >::spec );
1266                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1267                 __kmp_str_free( &buff );
1268             }
1269             #endif
1270 
1271             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1272         } // if
1273     } // if
1274     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1275 }
1276 
1277 #ifdef KMP_GOMP_COMPAT
1278 
1279 template< typename UT >
1280 static void
1281 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1282 {
1283     typedef typename traits_t< UT >::signed_t ST;
1284     kmp_info_t *th = __kmp_threads[ gtid ];
1285 
1286     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1287     if ( ! th -> th.th_team -> t.t_serialized ) {
1288 //        int cid;
1289         dispatch_private_info_template< UT > * pr =
1290             reinterpret_cast< dispatch_private_info_template< UT >* >
1291             ( th->th.th_dispatch->th_dispatch_pr_current );
1292         dispatch_shared_info_template< UT > volatile * sh =
1293             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1294             ( th->th.th_dispatch->th_dispatch_sh_current );
1295         KMP_DEBUG_ASSERT( pr );
1296         KMP_DEBUG_ASSERT( sh );
1297         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1298                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1299 
1300 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1301             UT lower = pr->u.p.ordered_lower;
1302             UT upper = pr->u.p.ordered_upper;
1303             UT inc = upper - lower + 1;
1304 
1305             if ( pr->ordered_bumped == inc ) {
1306                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1307                   gtid ) );
1308                 pr->ordered_bumped = 0;
1309             } else {
1310                 inc -= pr->ordered_bumped;
1311 
1312                 #ifdef KMP_DEBUG
1313                 {
1314                     const char * buff;
1315                     // create format specifiers before the debug output
1316                     buff = __kmp_str_format(
1317                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1318                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1319                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1320                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1321                     __kmp_str_free( &buff );
1322                 }
1323                 #endif
1324 
1325                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1326                                        USE_ITT_BUILD_ARG(NULL)
1327                                        );
1328 
1329                 KMP_MB();  /* is this necessary? */
1330                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1331                   gtid ) );
1332                 pr->ordered_bumped = 0;
1333 //!!!!! TODO check if the inc should be unsigned, or signed???
1334                 #ifdef KMP_DEBUG
1335                 {
1336                     const char * buff;
1337                     // create format specifiers before the debug output
1338                     buff = __kmp_str_format(
1339                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1340                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1341                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1342                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1343                     __kmp_str_free( &buff );
1344                 }
1345                 #endif
1346 
1347                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1348             }
1349 //        }
1350     }
1351     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1352 }
1353 
1354 #endif /* KMP_GOMP_COMPAT */
1355 
1356 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1357  * (no more work), then tell OMPT the loop is over. In some cases
1358  * kmp_dispatch_fini() is not called. */
1359 #if OMPT_SUPPORT && OMPT_TRACE
1360 #define OMPT_LOOP_END                                                          \
1361     if (status == 0) {                                                         \
1362         if ((ompt_status == ompt_status_track_callback) &&                     \
1363             ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
1364             ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
1365             ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
1366             ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
1367                 team_info->parallel_id, task_info->task_id);                   \
1368         }                                                                      \
1369     }
1370 #else
1371 #define OMPT_LOOP_END // no-op
1372 #endif
1373 
1374 template< typename T >
1375 static int
1376 __kmp_dispatch_next(
1377     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1378 ) {
1379 
1380     typedef typename traits_t< T >::unsigned_t  UT;
1381     typedef typename traits_t< T >::signed_t    ST;
1382     typedef typename traits_t< T >::floating_t  DBL;
1383     static const int ___kmp_size_type = sizeof( UT );
1384 
1385     int                                   status;
1386     dispatch_private_info_template< T > * pr;
1387     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1388     kmp_team_t                          * team = th -> th.th_team;
1389 
1390     KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1391     #ifdef KMP_DEBUG
1392     {
1393         const char * buff;
1394         // create format specifiers before the debug output
1395         buff = __kmp_str_format(
1396             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1397             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1398         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1399         __kmp_str_free( &buff );
1400     }
1401     #endif
1402 
1403     if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1405         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1406             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1407         KMP_DEBUG_ASSERT( pr );
1408 
1409         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1410             *p_lb = 0;
1411             *p_ub = 0;
1412 //            if ( p_last != NULL )
1413 //                *p_last = 0;
1414             if ( p_st != NULL )
1415                 *p_st = 0;
1416             if ( __kmp_env_consistency_check ) {
1417                 if ( pr->pushed_ws != ct_none ) {
1418                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1419                 }
1420             }
1421         } else if ( pr->nomerge ) {
1422             kmp_int32 last;
1423             T         start;
1424             UT        limit, trip, init;
1425             ST        incr;
1426             T         chunk = pr->u.p.parm1;
1427 
1428             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1429 
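            // Each call on this serialized nomerge path hands out the next chunk of
            // 'chunk' iterations: u.p.count is this thread's private chunk counter,
            // so the chunk starts at iteration chunk * count (relative to the loop).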
1430             init = chunk * pr->u.p.count++;
1431             trip = pr->u.p.tc - 1;
1432 
1433             if ( (status = (init <= trip)) == 0 ) {
1434                 *p_lb = 0;
1435                 *p_ub = 0;
1436 //                if ( p_last != NULL )
1437 //                    *p_last = 0;
1438                 if ( p_st != NULL )
1439                     *p_st = 0;
1440                 if ( __kmp_env_consistency_check ) {
1441                     if ( pr->pushed_ws != ct_none ) {
1442                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1443                     }
1444                 }
1445             } else {
1446                 start = pr->u.p.lb;
1447                 limit = chunk + init - 1;
1448                 incr  = pr->u.p.st;
1449 
1450                 if ( (last = (limit >= trip)) != 0 ) {
1451                     limit = trip;
1452                     #if KMP_OS_WINDOWS
1453                     pr->u.p.last_upper = pr->u.p.ub;
1454                     #endif /* KMP_OS_WINDOWS */
1455                 }
1456                 if ( p_last != NULL )
1457                     *p_last = last;
1458                 if ( p_st != NULL )
1459                     *p_st = incr;
1460                 if ( incr == 1 ) {
1461                     *p_lb = start + init;
1462                     *p_ub = start + limit;
1463                 } else {
1464                     *p_lb = start + init * incr;
1465                     *p_ub = start + limit * incr;
1466                 }
1467 
1468                 if ( pr->ordered ) {
1469                     pr->u.p.ordered_lower = init;
1470                     pr->u.p.ordered_upper = limit;
1471                     #ifdef KMP_DEBUG
1472                     {
1473                         const char * buff;
1474                         // create format specifiers before the debug output
1475                         buff = __kmp_str_format(
1476                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1477                             traits_t< UT >::spec, traits_t< UT >::spec );
1478                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1479                         __kmp_str_free( &buff );
1480                     }
1481                     #endif
1482                 } // if
1483             } // if
1484         } else {
1485             pr->u.p.tc = 0;
1486             *p_lb = pr->u.p.lb;
1487             *p_ub = pr->u.p.ub;
1488             #if KMP_OS_WINDOWS
1489             pr->u.p.last_upper = *p_ub;
1490             #endif /* KMP_OS_WINDOWS */
1491             if ( p_last != NULL )
1492                 *p_last = TRUE;
1493             if ( p_st != NULL )
1494                 *p_st = pr->u.p.st;
1495         } // if
1496         #ifdef KMP_DEBUG
1497         {
1498             const char * buff;
1499             // create format specifiers before the debug output
1500             buff = __kmp_str_format(
1501                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1502                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1503                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1504             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1505             __kmp_str_free( &buff );
1506         }
1507         #endif
1508 #if INCLUDE_SSC_MARKS
1509         SSC_MARK_DISPATCH_NEXT();
1510 #endif
1511         OMPT_LOOP_END;
1512         return status;
1513     } else {
1514         kmp_int32 last = 0;
1515         dispatch_shared_info_template< UT > *sh;
1516         T         start;
1517         ST        incr;
1518         UT        limit, trip, init;
1519 
1520         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1521                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1522 
1523         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1524             ( th->th.th_dispatch->th_dispatch_pr_current );
1525         KMP_DEBUG_ASSERT( pr );
1526         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1527             ( th->th.th_dispatch->th_dispatch_sh_current );
1528         KMP_DEBUG_ASSERT( sh );
1529 
1530         if ( pr->u.p.tc == 0 ) {
1531             // zero trip count
1532             status = 0;
1533         } else {
1534             switch (pr->schedule) {
1535             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1536             case kmp_sch_static_steal:
1537                 {
1538                     T chunk = pr->u.p.parm1;
1539 
1540                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1541 
1542                     trip = pr->u.p.tc - 1;
1543 
1544                     if ( ___kmp_size_type > 4 ) {
1545                         // Other threads do not look into the data of this thread,
                        //  so a volatile cast is not necessary here.
1547                         init   = ( pr->u.p.count )++;
1548                         status = ( init < (UT)pr->u.p.ub );
1549                     } else {
1550                         typedef union {
1551                             struct {
1552                                 UT count;
1553                                 T  ub;
1554                             } p;
1555                             kmp_int64 b;
1556                         } union_i4;
1557                         // All operations on 'count' or 'ub' must be combined atomically together.
1558                         // stealing implemented only for 4-byte indexes
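                        // Packing 'count' and 'ub' into one 64-bit word lets a single
                        // CAS claim a chunk (increment count) while atomically observing
                        // the current ub, so a concurrent thief cannot invalidate the claim.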
1559                         {
1560                             union_i4 vold, vnew;
1561                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1562                             vnew = vold;
1563                             vnew.p.count++;
1564                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1565                                         ( volatile kmp_int64* )&pr->u.p.count,
1566                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1567                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1568                                 KMP_CPU_PAUSE();
1569                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1570                                 vnew = vold;
1571                                 vnew.p.count++;
1572                             }
1573                             vnew = vold;
1574                             init   = vnew.p.count;
1575                             status = ( init < (UT)vnew.p.ub ) ;
1576                         }
1577 
1578                         if( !status ) {
1579                             kmp_info_t   **other_threads = team->t.t_threads;
1580                             int          while_limit = 10;
1581                             int          while_index = 0;
1582 
1583                             // TODO: algorithm of searching for a victim
1584                             // should be cleaned up and measured
1585                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1586                                 union_i4  vold, vnew;
1587                                 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1588                                 T         victimIdx    = pr->u.p.parm4;
1589                                 T         oldVictimIdx = victimIdx;
1590                                 dispatch_private_info_template< T > * victim;
1591 
1592                                 do {
1593                                     if( !victimIdx ) {
1594                                         victimIdx = team->t.t_nproc - 1;
1595                                     } else {
1596                                         --victimIdx;
1597                                     }
1598                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1599                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1600                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1601                                 // TODO: think about a proper place of this test
1602                                 if ( ( !victim ) ||
1603                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1604                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1605                                     // TODO: delay would be nice
1606                                     continue;
1607                                     // the victim is not ready yet to participate in stealing
1608                                     // because the victim is still in kmp_init_dispatch
1609                                 }
1610                                 if ( oldVictimIdx == victimIdx ) {
1611                                     break;
1612                                 }
1613                                 pr->u.p.parm4 = victimIdx;
1614 
1615                                 while( 1 ) {
1616                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1617                                     vnew = vold;
1618 
1619                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1620                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1621                                         break;
1622                                     }
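                                    // Steal roughly a quarter of the victim's remaining
                                    // chunks by lowering its upper bound; the range
                                    // [new ub, old ub) becomes this thread's work if the
                                    // CAS below succeeds.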
1623                                     vnew.p.ub -= (remaining >> 2);
1624                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1625                                     #pragma warning( push )
1626                                     // disable warning on pointless comparison of unsigned with 0
1627                                     #pragma warning( disable: 186 )
1628                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1629                                     #pragma warning( pop )
1630                                     // TODO: Should this be acquire or release?
1631                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1632                                             ( volatile kmp_int64 * )&victim->u.p.count,
1633                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1634                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1635                                         status = 1;
1636                                         while_index = 0;
1637                                         // now update own count and ub
1638                                         #if KMP_ARCH_X86
                                        // stealing is enabled only for KMP_ARCH_X86_64 (see the
                                        // guard above), so this IA-32 branch is effectively unused
1640                                             // Atomic 64-bit write on ia32 is
1641                                             // unavailable, so we do this in steps.
1642                                             //     This code is not tested.
1643                                             init = vold.p.count;
1644                                             pr->u.p.ub = 0;
1645                                             pr->u.p.count = init + 1;
1646                                             pr->u.p.ub = vnew.p.count;
1647                                         #else
1648                                             init = vnew.p.ub;
1649                                             vold.p.count = init + 1;
1650                                             // TODO: is it safe and enough?
1651                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1652                                         #endif // KMP_ARCH_X86
1653                                         break;
1654                                     } // if
1655                                 KMP_CPU_PAUSE();
1656                                 } // while (1)
1657                             } // while
1658                         } // if
1659                     } // if
1660                     if ( !status ) {
1661                         *p_lb = 0;
1662                         *p_ub = 0;
1663                         if ( p_st != NULL ) *p_st = 0;
1664                     } else {
1665                         start = pr->u.p.parm2;
1666                         init *= chunk;
1667                         limit = chunk + init - 1;
1668                         incr  = pr->u.p.st;
1669 
1670                         KMP_DEBUG_ASSERT(init <= trip);
1671                         if ( (last = (limit >= trip)) != 0 )
1672                             limit = trip;
1673                         if ( p_st != NULL ) *p_st = incr;
1674 
1675                         if ( incr == 1 ) {
1676                             *p_lb = start + init;
1677                             *p_ub = start + limit;
1678                         } else {
1679                             *p_lb = start + init * incr;
1680                             *p_ub = start + limit * incr;
1681                         }
1682 
1683                         if ( pr->ordered ) {
1684                             pr->u.p.ordered_lower = init;
1685                             pr->u.p.ordered_upper = limit;
1686                             #ifdef KMP_DEBUG
1687                             {
1688                                 const char * buff;
1689                                 // create format specifiers before the debug output
1690                                 buff = __kmp_str_format(
1691                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1692                                     traits_t< UT >::spec, traits_t< UT >::spec );
1693                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1694                                 __kmp_str_free( &buff );
1695                             }
1696                             #endif
1697                         } // if
1698                     } // if
1699                     break;
1700                 } // case
1701             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1702             case kmp_sch_static_balanced:
1703                 {
1704                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1705                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1706                         pr->u.p.count = 1;
1707                         *p_lb = pr->u.p.lb;
1708                         *p_ub = pr->u.p.ub;
1709                         last = pr->u.p.parm1;
1710                         if ( p_st != NULL )
1711                             *p_st = pr->u.p.st;
1712                     } else {  /* no iterations to do */
1713                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1714                     }
1715                     if ( pr->ordered ) {
1716                         #ifdef KMP_DEBUG
1717                         {
1718                             const char * buff;
1719                             // create format specifiers before the debug output
1720                             buff = __kmp_str_format(
1721                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1722                                 traits_t< UT >::spec, traits_t< UT >::spec );
1723                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1724                             __kmp_str_free( &buff );
1725                         }
1726                         #endif
1727                     } // if
1728                 } // case
1729                 break;
1730             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1731             case kmp_sch_static_chunked:
1732                 {
1733                     T parm1;
1734 
                    KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
1736                                    gtid ) );
1737                     parm1 = pr->u.p.parm1;
1738 
1739                     trip  = pr->u.p.tc - 1;
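                    // Chunks of size parm1 are handed out round-robin by thread id:
                    // thread 'tid' takes chunks tid, tid + nproc, tid + 2*nproc, ...
                    // (u.p.count advances by t_nproc below once a chunk is claimed).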
1740                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1741 
1742                     if ( (status = (init <= trip)) != 0 ) {
1743                         start = pr->u.p.lb;
1744                         incr  = pr->u.p.st;
1745                         limit = parm1 + init - 1;
1746 
1747                         if ( (last = (limit >= trip)) != 0 )
1748                             limit = trip;
1749 
1750                         if ( p_st != NULL ) *p_st = incr;
1751 
1752                         pr->u.p.count += team->t.t_nproc;
1753 
1754                         if ( incr == 1 ) {
1755                             *p_lb = start + init;
1756                             *p_ub = start + limit;
1757                         }
1758                         else {
1759                             *p_lb = start + init * incr;
1760                             *p_ub = start + limit * incr;
1761                         }
1762 
1763                         if ( pr->ordered ) {
1764                             pr->u.p.ordered_lower = init;
1765                             pr->u.p.ordered_upper = limit;
1766                             #ifdef KMP_DEBUG
1767                             {
1768                                 const char * buff;
1769                                 // create format specifiers before the debug output
1770                                 buff = __kmp_str_format(
1771                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1772                                     traits_t< UT >::spec, traits_t< UT >::spec );
1773                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1774                                 __kmp_str_free( &buff );
1775                             }
1776                             #endif
1777                         } // if
1778                     } // if
1779                 } // case
1780                 break;
1781 
1782             case kmp_sch_dynamic_chunked:
1783                 {
1784                     T chunk = pr->u.p.parm1;
1785 
1786                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1787                                    gtid ) );
1788 
1789                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1790                     trip = pr->u.p.tc - 1;
1791 
1792                     if ( (status = (init <= trip)) == 0 ) {
1793                         *p_lb = 0;
1794                         *p_ub = 0;
1795                         if ( p_st != NULL ) *p_st = 0;
1796                     } else {
1797                         start = pr->u.p.lb;
1798                         limit = chunk + init - 1;
1799                         incr  = pr->u.p.st;
1800 
1801                         if ( (last = (limit >= trip)) != 0 )
1802                             limit = trip;
1803 
1804                         if ( p_st != NULL ) *p_st = incr;
1805 
1806                         if ( incr == 1 ) {
1807                             *p_lb = start + init;
1808                             *p_ub = start + limit;
1809                         } else {
1810                             *p_lb = start + init * incr;
1811                             *p_ub = start + limit * incr;
1812                         }
1813 
1814                         if ( pr->ordered ) {
1815                             pr->u.p.ordered_lower = init;
1816                             pr->u.p.ordered_upper = limit;
1817                             #ifdef KMP_DEBUG
1818                             {
1819                                 const char * buff;
1820                                 // create format specifiers before the debug output
1821                                 buff = __kmp_str_format(
1822                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1823                                     traits_t< UT >::spec, traits_t< UT >::spec );
1824                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1825                                 __kmp_str_free( &buff );
1826                             }
1827                             #endif
1828                         } // if
1829                     } // if
1830                 } // case
1831                 break;
1832 
1833             case kmp_sch_guided_iterative_chunked:
1834                 {
1835                     T  chunkspec = pr->u.p.parm1;
1836                     KD_TRACE(100,
1837                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1838                     trip  = pr->u.p.tc;
1839                     // Start atomic part of calculations
1840                     while(1) {
1841                         ST  remaining;             // signed, because can be < 0
1842                         init = sh->u.s.iteration;  // shared value
1843                         remaining = trip - init;
1844                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1845                             // nothing to do, don't try atomic op
1846                             status = 0;
1847                             break;
1848                         }
1849                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                            // use dynamic-style schedule
                            // atomically increment iterations, get old value
1852                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1853                             remaining = trip - init;
1854                             if (remaining <= 0) {
1855                                 status = 0;    // all iterations got by other threads
1856                             } else {
1857                                 // got some iterations to work on
1858                                 status = 1;
1859                                 if ( (T)remaining > chunkspec ) {
1860                                     limit = init + chunkspec - 1;
1861                                 } else {
1862                                     last = 1;   // the last chunk
1863                                     limit = init + remaining - 1;
1864                                 } // if
1865                             } // if
1866                             break;
1867                         } // if
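                        // parm3, reinterpreted as a double, holds the guided shrink factor
                        // (about 1/(K*nproc)); the CAS below advances the shared iteration
                        // counter to 'limit', leaving this thread a chunk of roughly
                        // remaining/(K*nproc) iterations.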
1868                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1869                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1870                             // CAS was successful, chunk obtained
1871                             status = 1;
1872                             --limit;
1873                             break;
1874                         } // if
1875                     } // while
1876                     if ( status != 0 ) {
1877                         start = pr->u.p.lb;
1878                         incr = pr->u.p.st;
1879                         if ( p_st != NULL )
1880                             *p_st = incr;
1881                         *p_lb = start + init * incr;
1882                         *p_ub = start + limit * incr;
1883                         if ( pr->ordered ) {
1884                             pr->u.p.ordered_lower = init;
1885                             pr->u.p.ordered_upper = limit;
1886                             #ifdef KMP_DEBUG
1887                             {
1888                                 const char * buff;
1889                                 // create format specifiers before the debug output
1890                                 buff = __kmp_str_format(
1891                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1892                                     traits_t< UT >::spec, traits_t< UT >::spec );
1893                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1894                                 __kmp_str_free( &buff );
1895                             }
1896                             #endif
1897                         } // if
1898                     } else {
1899                         *p_lb = 0;
1900                         *p_ub = 0;
1901                         if ( p_st != NULL )
1902                             *p_st = 0;
1903                     } // if
1904                 } // case
1905                 break;
1906 
1907             case kmp_sch_guided_analytical_chunked:
1908                 {
1909                     T   chunkspec = pr->u.p.parm1;
1910                     UT chunkIdx;
1911     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* for storing original FPCW value for Windows* OS on
                       IA-32 architecture 8-byte version */
1914                     unsigned int oldFpcw;
1915                     unsigned int fpcwSet = 0;
1916     #endif
1917                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1918                                    gtid ) );
1919 
1920                     trip  = pr->u.p.tc;
1921 
1922                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1923                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1924 
1925                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1926                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1927                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1928                             --trip;
1929                             /* use dynamic-style scheduling */
1930                             init = chunkIdx * chunkspec + pr->u.p.count;
1931                             /* need to verify init > 0 in case of overflow in the above calculation */
1932                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1933                                 limit = init + chunkspec -1;
1934 
1935                                 if ( (last = (limit >= trip)) != 0 )
1936                                     limit = trip;
1937                             }
1938                             break;
1939                         } else {
1940                             /* use exponential-style scheduling */
1941                             /* The following check is to workaround the lack of long double precision on Windows* OS.
1942                                This check works around the possible effect that init != 0 for chunkIdx == 0.
1943                              */
1944     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                            /* If we haven't already done so, save original
                               FPCW and set precision to 64-bit, as Windows* OS
                               on IA-32 architecture defaults to 53-bit */
1948                             if ( !fpcwSet ) {
1949                                 oldFpcw = _control87(0,0);
1950                                 _control87(_PC_64,_MCW_PC);
1951                                 fpcwSet = 0x30000;
1952                             }
1953     #endif
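                            // Exponential part: __kmp_dispatch_guided_remaining(trip, factor, k)
                            // returns the iterations still remaining before chunk k (the factor
                            // comes from parm3), so chunk 'chunkIdx' covers
                            // [trip - remaining(chunkIdx), trip - remaining(chunkIdx + 1)).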
1954                             if ( chunkIdx ) {
1955                                 init = __kmp_dispatch_guided_remaining< T >(
1956                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1957                                 KMP_DEBUG_ASSERT(init);
1958                                 init = trip - init;
1959                             } else
1960                                 init = 0;
1961                             limit = trip - __kmp_dispatch_guided_remaining< T >(
1962                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1963                             KMP_ASSERT(init <= limit);
1964                             if ( init < limit ) {
1965                                 KMP_DEBUG_ASSERT(limit <= trip);
1966                                 --limit;
1967                                 status = 1;
1968                                 break;
1969                             } // if
1970                         } // if
1971                     } // while (1)
1972     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1973                     /* restore FPCW if necessary
1974                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1975                     */
1976                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1977                         _control87(oldFpcw,_MCW_PC);
1978     #endif
1979                     if ( status != 0 ) {
1980                         start = pr->u.p.lb;
1981                         incr = pr->u.p.st;
1982                         if ( p_st != NULL )
1983                             *p_st = incr;
1984                         *p_lb = start + init * incr;
1985                         *p_ub = start + limit * incr;
1986                         if ( pr->ordered ) {
1987                             pr->u.p.ordered_lower = init;
1988                             pr->u.p.ordered_upper = limit;
1989                             #ifdef KMP_DEBUG
1990                             {
1991                                 const char * buff;
1992                                 // create format specifiers before the debug output
1993                                 buff = __kmp_str_format(
1994                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1995                                     traits_t< UT >::spec, traits_t< UT >::spec );
1996                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1997                                 __kmp_str_free( &buff );
1998                             }
1999                             #endif
2000                         }
2001                     } else {
2002                         *p_lb = 0;
2003                         *p_ub = 0;
2004                         if ( p_st != NULL )
2005                             *p_st = 0;
2006                     }
2007                 } // case
2008                 break;
2009 
2010             case kmp_sch_trapezoidal:
2011                 {
2012                     UT   index;
2013                     T    parm2 = pr->u.p.parm2;
2014                     T    parm3 = pr->u.p.parm3;
2015                     T    parm4 = pr->u.p.parm4;
2016                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2017                                    gtid ) );
2018 
2019                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2020 
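                    // Trapezoid self-scheduling: chunk k has size parm2 - k*parm4 and there
                    // are parm3 chunks in total, so the arithmetic-series sum below gives
                    // the first iteration of chunk 'index'.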
2021                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2022                     trip = pr->u.p.tc - 1;
2023 
2024                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2025                         *p_lb = 0;
2026                         *p_ub = 0;
2027                         if ( p_st != NULL ) *p_st = 0;
2028                     } else {
2029                         start = pr->u.p.lb;
2030                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2031                         incr  = pr->u.p.st;
2032 
2033                         if ( (last = (limit >= trip)) != 0 )
2034                             limit = trip;
2035 
2036                         if ( p_st != NULL ) *p_st = incr;
2037 
2038                         if ( incr == 1 ) {
2039                             *p_lb = start + init;
2040                             *p_ub = start + limit;
2041                         } else {
2042                             *p_lb = start + init * incr;
2043                             *p_ub = start + limit * incr;
2044                         }
2045 
2046                         if ( pr->ordered ) {
2047                             pr->u.p.ordered_lower = init;
2048                             pr->u.p.ordered_upper = limit;
2049                             #ifdef KMP_DEBUG
2050                             {
2051                                 const char * buff;
2052                                 // create format specifiers before the debug output
2053                                 buff = __kmp_str_format(
2054                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2055                                     traits_t< UT >::spec, traits_t< UT >::spec );
2056                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2057                                 __kmp_str_free( &buff );
2058                             }
2059                             #endif
2060                         } // if
2061                     } // if
2062                 } // case
2063                 break;
2064             default:
2065                 {
2066                     status = 0; // to avoid complaints on uninitialized variable use
2067                     __kmp_msg(
2068                         kmp_ms_fatal,                        // Severity
2069                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2070                         KMP_HNT( GetNewerLibrary ),          // Hint
2071                         __kmp_msg_null                       // Variadic argument list terminator
2072                     );
2073                 }
2074                 break;
2075             } // switch
2076         } // if tc == 0;
2077 
2078         if ( status == 0 ) {
2079             UT   num_done;
2080 
2081             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2082             #ifdef KMP_DEBUG
2083             {
2084                 const char * buff;
2085                 // create format specifiers before the debug output
2086                 buff = __kmp_str_format(
2087                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2088                     traits_t< UT >::spec );
2089                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2090                 __kmp_str_free( &buff );
2091             }
2092             #endif
2093 
2094             if ( (ST)num_done == team->t.t_nproc-1 ) {
2095                 /* NOTE: release this buffer to be reused */
2096 
2097                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2098 
2099                 sh->u.s.num_done = 0;
2100                 sh->u.s.iteration = 0;
2101 
2102                 /* TODO replace with general release procedure? */
2103                 if ( pr->ordered ) {
2104                     sh->u.s.ordered_iteration = 0;
2105                 }
2106 
2107                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2108 
2109                 sh -> buffer_index += KMP_MAX_DISP_BUF;
2110                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2111                                 gtid, sh->buffer_index) );
2112 
2113                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2114 
2115             } // if
2116             if ( __kmp_env_consistency_check ) {
2117                 if ( pr->pushed_ws != ct_none ) {
2118                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2119                 }
2120             }
2121 
2122             th -> th.th_dispatch -> th_deo_fcn = NULL;
2123             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2124             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2125             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2126         } // if (status == 0)
2127 #if KMP_OS_WINDOWS
2128         else if ( last ) {
2129             pr->u.p.last_upper = pr->u.p.ub;
2130         }
2131 #endif /* KMP_OS_WINDOWS */
2132         if ( p_last != NULL && status != 0 )
2133             *p_last = last;
2134     } // if
2135 
2136     #ifdef KMP_DEBUG
2137     {
2138         const char * buff;
2139         // create format specifiers before the debug output
2140         buff = __kmp_str_format(
2141             "__kmp_dispatch_next: T#%%d normal case: " \
2142             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2143             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2144         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2145         __kmp_str_free( &buff );
2146     }
2147     #endif
2148 #if INCLUDE_SSC_MARKS
2149     SSC_MARK_DISPATCH_NEXT();
2150 #endif
2151     OMPT_LOOP_END;
2152     return status;
2153 }
2154 
2155 template< typename T >
2156 static void
2157 __kmp_dist_get_bounds(
2158     ident_t                          *loc,
2159     kmp_int32                         gtid,
2160     kmp_int32                        *plastiter,
2161     T                                *plower,
2162     T                                *pupper,
2163     typename traits_t< T >::signed_t  incr
2164 ) {
2165     KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2166     typedef typename traits_t< T >::unsigned_t  UT;
2167     typedef typename traits_t< T >::signed_t    ST;
2168     register kmp_uint32  team_id;
2169     register kmp_uint32  nteams;
2170     register UT          trip_count;
2171     register kmp_team_t *team;
2172     kmp_info_t * th;
2173 
2174     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2175     KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2176     #ifdef KMP_DEBUG
2177     {
2178         const char * buff;
2179         // create format specifiers before the debug output
2180         buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2181             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2182             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2183             traits_t< T >::spec );
2184         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2185         __kmp_str_free( &buff );
2186     }
2187     #endif
2188 
2189     if( __kmp_env_consistency_check ) {
2190         if( incr == 0 ) {
2191             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2192         }
2193         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2194             // The loop is illegal.
            // Some zero-trip loops are maintained by the compiler, e.g.:
2196             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2197             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2198             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2199             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2200             // Compiler does not check the following illegal loops:
2201             //   for(i=0;i<10;i+=incr) // where incr<0
2202             //   for(i=10;i>0;i-=incr) // where incr<0
2203             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2204         }
2205     }
2206     th = __kmp_threads[gtid];
2207     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2208     team = th->th.th_team;
2209     #if OMP_40_ENABLED
2210     nteams = th->th.th_teams_size.nteams;
2211     #endif
2212     team_id = team->t.t_master_tid;
2213     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2214 
2215     // compute global trip count
2216     if( incr == 1 ) {
2217         trip_count = *pupper - *plower + 1;
2218     } else if(incr == -1) {
2219         trip_count = *plower - *pupper + 1;
2220     } else {
2221         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2222     }
2223     if( trip_count <= nteams ) {
2224         KMP_DEBUG_ASSERT(
2225             __kmp_static == kmp_sch_static_greedy || \
2226             __kmp_static == kmp_sch_static_balanced
2227         ); // Unknown static scheduling type.
        // only some teams get a single iteration, the others get nothing
2229         if( team_id < trip_count ) {
2230             *pupper = *plower = *plower + team_id * incr;
2231         } else {
2232             *plower = *pupper + incr; // zero-trip loop
2233         }
2234         if( plastiter != NULL )
2235             *plastiter = ( team_id == trip_count - 1 );
2236     } else {
2237         if( __kmp_static == kmp_sch_static_balanced ) {
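            // Balanced split across teams: each team gets either 'chunk' or 'chunk'+1
            // iterations, with the first 'extras' teams taking the larger share.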
2238             register UT chunk = trip_count / nteams;
2239             register UT extras = trip_count % nteams;
2240             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2241             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2242             if( plastiter != NULL )
2243                 *plastiter = ( team_id == nteams - 1 );
2244         } else {
2245             register T chunk_inc_count =
2246                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2247             register T upper = *pupper;
2248             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2249                 // Unknown static scheduling type.
2250             *plower += team_id * chunk_inc_count;
2251             *pupper = *plower + chunk_inc_count - incr;
2252             // Check/correct bounds if needed
2253             if( incr > 0 ) {
2254                 if( *pupper < *plower )
2255                     *pupper = i_maxmin< T >::mx;
2256                 if( plastiter != NULL )
2257                     *plastiter = *plower <= upper && *pupper > upper - incr;
2258                 if( *pupper > upper )
2259                     *pupper = upper; // tracker C73258
2260             } else {
2261                 if( *pupper > *plower )
2262                     *pupper = i_maxmin< T >::mn;
2263                 if( plastiter != NULL )
2264                     *plastiter = *plower >= upper && *pupper < upper - incr;
2265                 if( *pupper < upper )
2266                     *pupper = upper; // tracker C73258
2267             }
2268         }
2269     }
2270 }
2271 
2272 //-----------------------------------------------------------------------------------------
2273 // Dispatch routines
2274 //    Transfer call to template< type T >
2275 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2276 //                         T lb, T ub, ST st, ST chunk )
2277 extern "C" {
2278 
2279 /*!
2280 @ingroup WORK_SHARING
2281 @{
2282 @param loc Source location
2283 @param gtid Global thread id
2284 @param schedule Schedule type
2285 @param lb  Lower bound
2286 @param ub  Upper bound
2287 @param st  Step (or increment if you prefer)
2288 @param chunk The chunk size to block with
2289 
2290 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2291 These functions are all identical apart from the types of the arguments.
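
An illustrative sketch (not compiler-generated code) of how this interface is
typically driven for a dynamically scheduled loop; N, chunk and body() are
placeholders, and loc/gtid are the usual source-location and global-thread-id
arguments:
@code
    kmp_int32 lb, ub, st, last;
    __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, chunk);
    while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
        for (kmp_int32 i = lb; i <= ub; i += st)
            body(i);
    }
@endcode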
2292 */
2293 
2294 void
2295 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2296                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2297 {
2298     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2299     KMP_DEBUG_ASSERT( __kmp_init_serial );
2300     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2301 }
2302 /*!
2303 See @ref __kmpc_dispatch_init_4
2304 */
2305 void
2306 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2307                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2308 {
2309     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2310     KMP_DEBUG_ASSERT( __kmp_init_serial );
2311     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2312 }
2313 
2314 /*!
2315 See @ref __kmpc_dispatch_init_4
2316 */
2317 void
2318 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2319                         kmp_int64 lb, kmp_int64 ub,
2320                         kmp_int64 st, kmp_int64 chunk )
2321 {
2322     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2323     KMP_DEBUG_ASSERT( __kmp_init_serial );
2324     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2325 }
2326 
2327 /*!
2328 See @ref __kmpc_dispatch_init_4
2329 */
2330 void
2331 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2332                          kmp_uint64 lb, kmp_uint64 ub,
2333                          kmp_int64 st, kmp_int64 chunk )
2334 {
2335     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2336     KMP_DEBUG_ASSERT( __kmp_init_serial );
2337     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2338 }
2339 
2340 /*!
2341 See @ref __kmpc_dispatch_init_4
2342 
These functions differ from the __kmpc_dispatch_init set in that they are called
for the composite distribute parallel for construct. Thus, before dispatching the
regular iterations, the per-team iteration space has to be computed.
2346 
2347 These functions are all identical apart from the types of the arguments.
2348 */
2349 void
2350 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2351     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2352 {
2353     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2354     KMP_DEBUG_ASSERT( __kmp_init_serial );
2355     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2356     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2357 }
2358 
2359 void
2360 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2361     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2362 {
2363     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2364     KMP_DEBUG_ASSERT( __kmp_init_serial );
2365     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2366     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2367 }
2368 
2369 void
2370 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2371     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2372 {
2373     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2374     KMP_DEBUG_ASSERT( __kmp_init_serial );
2375     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2376     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2377 }
2378 
2379 void
2380 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2381     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2382 {
2383     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2384     KMP_DEBUG_ASSERT( __kmp_init_serial );
2385     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2386     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2387 }
2388 
2389 /*!
2390 @param loc Source code location
2391 @param gtid Global thread id
2392 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2393 @param p_lb   Pointer to the lower bound for the next chunk of work
2394 @param p_ub   Pointer to the upper bound for the next chunk of work
2395 @param p_st   Pointer to the stride for the next chunk of work
2396 @return one if there is work to be done, zero otherwise
2397 
2398 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2400 */
2401 int
2402 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2403                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2404 {
2405     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2406 }
2407 
2408 /*!
2409 See @ref __kmpc_dispatch_next_4
2410 */
2411 int
2412 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2413                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2414 {
2415     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2416 }
2417 
2418 /*!
2419 See @ref __kmpc_dispatch_next_4
2420 */
2421 int
2422 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2423                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2424 {
2425     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2426 }
2427 
2428 /*!
2429 See @ref __kmpc_dispatch_next_4
2430 */
2431 int
2432 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2433                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2434 {
2435     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2436 }
2437 
2438 /*!
2439 @param loc Source code location
2440 @param gtid Global thread id
2441 
2442 Mark the end of a dynamic loop.
2443 */
2444 void
2445 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2446 {
2447     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2448 }
2449 
2450 /*!
2451 See @ref __kmpc_dispatch_fini_4
2452 */
2453 void
2454 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2455 {
2456     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2457 }
2458 
2459 /*!
2460 See @ref __kmpc_dispatch_fini_4
2461 */
2462 void
2463 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2464 {
2465     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2466 }
2467 
2468 /*!
2469 See @ref __kmpc_dispatch_fini_4
2470 */
2471 void
2472 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2473 {
2474     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2475 }
2476 /*! @} */
2477 
2478 //-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources
2480 
2481 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2482     return value == checker;
2483 }
2484 
2485 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2486     return value != checker;
2487 }
2488 
2489 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2490     return value < checker;
2491 }
2492 
2493 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2494     return value >= checker;
2495 }
2496 
2497 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2498     return value <= checker;
2499 }
2500 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2501     return value == checker;
2502 }
2503 
2504 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2505     return value != checker;
2506 }
2507 
2508 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2509     return value < checker;
2510 }
2511 
2512 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2513     return value >= checker;
2514 }
2515 
2516 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2517     return value <= checker;
2518 }

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
                   kmp_uint32            checker,
                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
                   , void        * obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint32         * spin          = spinner;
    register          kmp_uint32           check         = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
    register          kmp_uint32           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = TCR_4(*spin), check)) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
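
// Illustrative use only (a hypothetical sketch, not a call site from this file): the
// __kmp_eq_4/__kmp_lt_4/... routines above are intended to be passed as the `pred`
// argument, e.g. to spin until a shared flag reaches an expected value:
//
//     volatile kmp_uint32 flag = 0;   // hypothetical shared flag set by another thread
//     ...
//     kmp_uint32 seen = __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );
//     // returns the first observed value for which __kmp_eq_4( value, 1 ) held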

kmp_uint64
__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
                    kmp_uint64            checker,
                    kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
                    , void        * obj    // Higher-level synchronization object, or NULL.
                    )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint64         * spin          = spinner;
    register          kmp_uint64           check         = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
    register          kmp_uint64           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */
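
// These aux wrappers let the C-level GOMP compatibility entry points (implemented in
// kmp_gsupport.c) reach the templated dispatcher without instantiating the templates
// themselves. Roughly (an illustrative sketch; the argument values are placeholders and
// the actual call sites live in kmp_gsupport.c):
//
//     // inside a GOMP_loop_dynamic_start-style entry point:
//     __kmp_aux_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked,
//                                (kmp_int32)lb, (kmp_int32)ub, (kmp_int32)str,
//                                (kmp_int32)chunk, TRUE );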

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
