1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
19  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
20  *       it may change values between parallel regions.  __kmp_max_nth
21  *       is the largest value __kmp_nth may take, 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
// Need to raise the Win version from XP (0x0501) to Server 2003 (0x0502) here so that InterlockedExchange64 is available
29 #if defined(_WIN32_WINNT) && defined(_M_IX86)
30 #undef _WIN32_WINNT
31 #define _WIN32_WINNT 0x0502
32 #endif
33 
34 #include "kmp.h"
35 #include "kmp_i18n.h"
36 #include "kmp_itt.h"
37 #include "kmp_str.h"
38 #include "kmp_error.h"
39 #include "kmp_stats.h"
40 #if KMP_OS_WINDOWS && KMP_ARCH_X86
41     #include <float.h>
42 #endif
43 
44 #if OMPT_SUPPORT
45 #include "ompt-internal.h"
46 #include "ompt-specific.h"
47 #endif
48 
49 /* ------------------------------------------------------------------------ */
50 /* ------------------------------------------------------------------------ */
51 
52 // template for type limits
53 template< typename T >
54 struct i_maxmin {
55     static const T mx;
56     static const T mn;
57 };
58 template<>
59 struct i_maxmin< int > {
60     static const int mx = 0x7fffffff;
    static const int mn = -0x7fffffff - 1; // 0x80000000 does not fit in a signed int literal
62 };
63 template<>
64 struct i_maxmin< unsigned int > {
65     static const unsigned int mx = 0xffffffff;
66     static const unsigned int mn = 0x00000000;
67 };
68 template<>
69 struct i_maxmin< long long > {
70     static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = -0x7fffffffffffffffLL - 1; // 0x8000000000000000 does not fit in a signed long long literal
72 };
73 template<>
74 struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffULL;
    static const unsigned long long mn = 0x0000000000000000ULL;
77 };
78 //-------------------------------------------------------------------------
79 
80 #if KMP_STATIC_STEAL_ENABLED
81 
82     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
83     template< typename T >
84     struct dispatch_private_infoXX_template {
85         typedef typename traits_t< T >::unsigned_t  UT;
86         typedef typename traits_t< T >::signed_t    ST;
87         UT count;                // unsigned
88         T  ub;
89         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
90         T  lb;
91         ST st;                   // signed
92         UT tc;                   // unsigned
93         T  static_steal_counter; // for static_steal only; maybe better to put after ub
94 
95         /* parm[1-4] are used in different ways by different scheduling algorithms */
96 
97         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
98         //    a) parm3 is properly aligned and
99         //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // when they are in the same cache line (not measured though).
102 
103         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
104             T  parm1;
105             T  parm2;
106             T  parm3;
107             T  parm4;
108         };
109 
110         UT ordered_lower; // unsigned
111         UT ordered_upper; // unsigned
112         #if KMP_OS_WINDOWS
113         T  last_upper;
114         #endif /* KMP_OS_WINDOWS */
115     };
116 
117 #else /* KMP_STATIC_STEAL_ENABLED */
118 
119     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
120     template< typename T >
121     struct dispatch_private_infoXX_template {
122         typedef typename traits_t< T >::unsigned_t  UT;
123         typedef typename traits_t< T >::signed_t    ST;
124         T  lb;
125         T  ub;
126         ST st;            // signed
127         UT tc;            // unsigned
128 
129         T  parm1;
130         T  parm2;
131         T  parm3;
132         T  parm4;
133 
134         UT count;         // unsigned
135 
136         UT ordered_lower; // unsigned
137         UT ordered_upper; // unsigned
138         #if KMP_OS_WINDOWS
        T  last_upper;
140         #endif /* KMP_OS_WINDOWS */
141     };
142 
143 #endif /* KMP_STATIC_STEAL_ENABLED */
144 
145 // replaces dispatch_private_info structure and dispatch_private_info_t type
146 template< typename T >
147 struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate the alignment here, otherwise the structure size is not computed correctly by our compiler
149     union KMP_ALIGN_CACHE private_info_tmpl {
150         dispatch_private_infoXX_template< T > p;
151         dispatch_private_info64_t             p64;
152     } u;
153     enum sched_type schedule;  /* scheduling algorithm */
154     kmp_uint32      ordered;   /* ordered clause specified */
155     kmp_uint32      ordered_bumped;
    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
157     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
158     kmp_uint32      nomerge;   /* don't merge iters if serialized */
159     kmp_uint32      type_size;
160     enum cons_type  pushed_ws;
161 };
162 
163 
164 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
165 template< typename UT >
166 struct dispatch_shared_infoXX_template {
167     /* chunk index under dynamic, number of idle threads under static-steal;
168        iteration index otherwise */
169     volatile UT     iteration;
170     volatile UT     num_done;
171     volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
173 };
174 
175 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
176 template< typename UT >
177 struct dispatch_shared_info_template {
178     // we need union here to keep the structure size
179     union shared_info_tmpl {
180         dispatch_shared_infoXX_template< UT >  s;
181         dispatch_shared_info64_t               s64;
182     } u;
183     volatile kmp_uint32     buffer_index;
184 #if OMP_45_ENABLED
185     volatile kmp_int32      doacross_buf_idx;  // teamwise index
186     kmp_uint32             *doacross_flags;    // array of iteration flags (0/1)
187     kmp_int32               doacross_num_done; // count finished threads
188 #endif
189 #if KMP_USE_HWLOC
    // When linking with libhwloc, the ORDERED EPCC test slows down on big
    // machines (> 48 cores). Performance analysis showed that a cache thrash
    // was occurring and this padding helps alleviate the problem.
193     char padding[64];
194 #endif
195 };
196 
197 /* ------------------------------------------------------------------------ */
198 /* ------------------------------------------------------------------------ */
199 
200 #undef USE_TEST_LOCKS
201 
202 // test_then_add template (general template should NOT be used)
203 template< typename T >
204 static __forceinline T
205 test_then_add( volatile T *p, T d );
206 
207 template<>
208 __forceinline kmp_int32
209 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
210 {
211     kmp_int32 r;
212     r = KMP_TEST_THEN_ADD32( p, d );
213     return r;
214 }
215 
216 template<>
217 __forceinline kmp_int64
218 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
219 {
220     kmp_int64 r;
221     r = KMP_TEST_THEN_ADD64( p, d );
222     return r;
223 }
224 
225 // test_then_inc_acq template (general template should NOT be used)
226 template< typename T >
227 static __forceinline T
228 test_then_inc_acq( volatile T *p );
229 
230 template<>
231 __forceinline kmp_int32
232 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
233 {
234     kmp_int32 r;
235     r = KMP_TEST_THEN_INC_ACQ32( p );
236     return r;
237 }
238 
239 template<>
240 __forceinline kmp_int64
241 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
242 {
243     kmp_int64 r;
244     r = KMP_TEST_THEN_INC_ACQ64( p );
245     return r;
246 }
247 
248 // test_then_inc template (general template should NOT be used)
249 template< typename T >
250 static __forceinline T
251 test_then_inc( volatile T *p );
252 
253 template<>
254 __forceinline kmp_int32
255 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
256 {
257     kmp_int32 r;
258     r = KMP_TEST_THEN_INC32( p );
259     return r;
260 }
261 
262 template<>
263 __forceinline kmp_int64
264 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
265 {
266     kmp_int64 r;
267     r = KMP_TEST_THEN_INC64( p );
268     return r;
269 }
270 
271 // compare_and_swap template (general template should NOT be used)
272 template< typename T >
273 static __forceinline kmp_int32
274 compare_and_swap( volatile T *p, T c, T s );
275 
276 template<>
277 __forceinline kmp_int32
278 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
279 {
280     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
281 }
282 
283 template<>
284 __forceinline kmp_int32
285 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
286 {
287     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
288 }
289 
290 /*
291     Spin wait loop that first does pause, then yield.
    Waits until the predicate function returns non-zero when called with *spinner and check.
293     Does NOT put threads to sleep.
294 #if USE_ITT_BUILD
295     Arguments:
        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
            same address, not the address of the low-level spinner.
301 #endif // USE_ITT_BUILD
302 */
303 template< typename UT >
304 // ToDo: make inline function (move to header file for icl)
305 static UT  // unsigned 4- or 8-byte type
306 __kmp_wait_yield( volatile UT * spinner,
307                   UT            checker,
308                   kmp_uint32 (* pred)( UT, UT )
309                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
310                   )
311 {
312     // note: we may not belong to a team at this point
313     register volatile UT         * spin          = spinner;
314     register          UT           check         = checker;
315     register          kmp_uint32   spins;
316     register          kmp_uint32 (*f) ( UT, UT ) = pred;
317     register          UT           r;
318 
319     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
320     KMP_INIT_YIELD( spins );
321     // main wait spin loop
322     while(!f(r = *spin, check))
323     {
324         KMP_FSYNC_SPIN_PREPARE( obj );
325         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
326            It causes problems with infinite recursion because of exit lock */
327         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
328             __kmp_abort_thread(); */
329 
        // If we are oversubscribed, or have waited a bit
        // (and KMP_LIBRARY=throughput), then yield.
        // The pause is in the following code.
333         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
334         KMP_YIELD_SPIN( spins );
335     }
336     KMP_FSYNC_SPIN_ACQUIRED( obj );
337     return r;
338 }
339 
340 template< typename UT >
341 static kmp_uint32 __kmp_eq( UT value, UT checker) {
342     return value == checker;
343 }
344 
345 template< typename UT >
346 static kmp_uint32 __kmp_neq( UT value, UT checker) {
347     return value != checker;
348 }
349 
350 template< typename UT >
351 static kmp_uint32 __kmp_lt( UT value, UT checker) {
352     return value < checker;
353 }
354 
355 template< typename UT >
356 static kmp_uint32 __kmp_ge( UT value, UT checker) {
357     return value >= checker;
358 }
359 
360 template< typename UT >
361 static kmp_uint32 __kmp_le( UT value, UT checker) {
362     return value <= checker;
363 }
364 
365 
366 /* ------------------------------------------------------------------------ */
367 /* ------------------------------------------------------------------------ */
368 
369 static void
370 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
371 {
372     kmp_info_t *th;
373 
374     KMP_DEBUG_ASSERT( gtid_ref );
375 
376     if ( __kmp_env_consistency_check ) {
377         th = __kmp_threads[*gtid_ref];
378         if ( th -> th.th_root -> r.r_active
379           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
380 #if KMP_USE_DYNAMIC_LOCK
381             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
382 #else
383             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
384 #endif
385         }
386     }
387 }
388 
389 template< typename UT >
390 static void
391 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
392 {
393     typedef typename traits_t< UT >::signed_t    ST;
394     dispatch_private_info_template< UT > * pr;
395 
396     int gtid = *gtid_ref;
397 //    int  cid = *cid_ref;
398     kmp_info_t *th = __kmp_threads[ gtid ];
399     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
400 
401     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
402     if ( __kmp_env_consistency_check ) {
403         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404             ( th -> th.th_dispatch -> th_dispatch_pr_current );
405         if ( pr -> pushed_ws != ct_none ) {
406 #if KMP_USE_DYNAMIC_LOCK
407             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
408 #else
409             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
410 #endif
411         }
412     }
413 
414     if ( ! th -> th.th_team -> t.t_serialized ) {
415         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
416             ( th -> th.th_dispatch -> th_dispatch_sh_current );
417         UT  lower;
418 
        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }
423         lower = pr->u.p.ordered_lower;
424 
425         #if ! defined( KMP_GOMP_COMPAT )
426             if ( __kmp_env_consistency_check ) {
427                 if ( pr->ordered_bumped ) {
428                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
429                     __kmp_error_construct2(
430                         kmp_i18n_msg_CnsMultipleNesting,
431                         ct_ordered_in_pdo, loc_ref,
432                         & p->stack_data[ p->w_top ]
433                     );
434                 }
435             }
436         #endif /* !defined(KMP_GOMP_COMPAT) */
437 
438         KMP_MB();
439         #ifdef KMP_DEBUG
440         {
441             const char * buff;
442             // create format specifiers before the debug output
443             buff = __kmp_str_format(
444                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
445                 traits_t< UT >::spec, traits_t< UT >::spec );
446             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
447             __kmp_str_free( &buff );
448         }
449         #endif
450 
451         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
452                                 USE_ITT_BUILD_ARG( NULL )
453                                 );
454         KMP_MB();  /* is this necessary? */
455         #ifdef KMP_DEBUG
456         {
457             const char * buff;
458             // create format specifiers before the debug output
459             buff = __kmp_str_format(
460                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
461                 traits_t< UT >::spec, traits_t< UT >::spec );
462             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
463             __kmp_str_free( &buff );
464         }
465         #endif
466     }
467     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
468 }
469 
470 static void
471 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
472 {
473     kmp_info_t *th;
474 
475     if ( __kmp_env_consistency_check ) {
476         th = __kmp_threads[*gtid_ref];
477         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
478             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
479         }
480     }
481 }
482 
483 template< typename UT >
484 static void
485 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
486 {
487     typedef typename traits_t< UT >::signed_t    ST;
488     dispatch_private_info_template< UT > * pr;
489 
490     int gtid = *gtid_ref;
491 //    int  cid = *cid_ref;
492     kmp_info_t *th = __kmp_threads[ gtid ];
493     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
494 
495     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
496     if ( __kmp_env_consistency_check ) {
497         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
498             ( th -> th.th_dispatch -> th_dispatch_pr_current );
499         if ( pr -> pushed_ws != ct_none ) {
500             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
501         }
502     }
503 
504     if ( ! th -> th.th_team -> t.t_serialized ) {
505         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
506             ( th -> th.th_dispatch -> th_dispatch_sh_current );
507 
508         if ( ! __kmp_env_consistency_check ) {
509             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
510                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
511         }
512 
513         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
514         #if ! defined( KMP_GOMP_COMPAT )
515             if ( __kmp_env_consistency_check ) {
516                 if ( pr->ordered_bumped != 0 ) {
517                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
518                     /* How to test it? - OM */
519                     __kmp_error_construct2(
520                         kmp_i18n_msg_CnsMultipleNesting,
521                         ct_ordered_in_pdo, loc_ref,
522                         & p->stack_data[ p->w_top ]
523                     );
524                 }
525             }
526         #endif /* !defined(KMP_GOMP_COMPAT) */
527 
528         KMP_MB();       /* Flush all pending memory write invalidates.  */
529 
530         pr->ordered_bumped += 1;
531 
532         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
533                         gtid, pr->ordered_bumped ) );
534 
535         KMP_MB();       /* Flush all pending memory write invalidates.  */
536 
537         /* TODO use general release procedure? */
538         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
539 
540         KMP_MB();       /* Flush all pending memory write invalidates.  */
541     }
542     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
543 }
544 
/* Computes and returns x to the power of y, where y must be a non-negative integer */
546 template< typename UT >
547 static __forceinline long double
548 __kmp_pow(long double x, UT y) {
549     long double s=1.0L;
550 
551     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
552     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
553     while(y) {
554         if ( y & 1 )
555             s *= x;
556         x *= x;
557         y >>= 1;
558     }
559     return s;
560 }
561 
/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken here: if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
*/
567 template< typename T >
568 static __inline typename traits_t< T >::unsigned_t
569 __kmp_dispatch_guided_remaining(
570     T                                  tc,
571     typename traits_t< T >::floating_t base,
572     typename traits_t< T >::unsigned_t idx
573 ) {
574     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
575        least for ICL 8.1, long double arithmetic may not really have
576        long double precision, even with /Qlong_double.  Currently, we
577        workaround that in the caller code, by manipulating the FPCW for
578        Windows* OS on IA-32 architecture.  The lack of precision is not
579        expected to be a correctness issue, though.
580     */
581     typedef typename traits_t< T >::unsigned_t  UT;
582 
583     long double x = tc * __kmp_pow< UT >(base, idx);
584     UT r = (UT) x;
585     if ( x == r )
586         return r;
587     return r + 1;
588 }
589 
// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
595 static int guided_int_param = 2;
596 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
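// Worked example (illustrative values): with the default n = 2, nproc = 4 threads,
// and chunk = 7, the guided-iterative code below sets the switch point
//   p2 = 2 * 4 * (7 + 1) = 64 iterations remaining,
// and until then sizes each chunk as remaining * p3 = remaining * (0.5 / 4),
// i.e. one eighth of the remaining iterations.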
597 
598 // UT - unsigned flavor of T, ST - signed flavor of T,
599 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
600 template< typename T >
601 static void
602 __kmp_dispatch_init(
603     ident_t                        * loc,
604     int                              gtid,
605     enum sched_type                  schedule,
606     T                                lb,
607     T                                ub,
608     typename traits_t< T >::signed_t st,
609     typename traits_t< T >::signed_t chunk,
610     int                              push_ws
611 ) {
612     typedef typename traits_t< T >::unsigned_t  UT;
613     typedef typename traits_t< T >::signed_t    ST;
614     typedef typename traits_t< T >::floating_t  DBL;
615     static const int ___kmp_size_type = sizeof( UT );
616 
617     int                                            active;
618     T                                              tc;
619     kmp_info_t *                                   th;
620     kmp_team_t *                                   team;
621     kmp_uint32                                     my_buffer_index;
622     dispatch_private_info_template< T >          * pr;
623     dispatch_shared_info_template< UT > volatile * sh;
624 
625     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
626     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
627 
628     if ( ! TCR_4( __kmp_init_parallel ) )
629         __kmp_parallel_initialize();
630 
631 #if INCLUDE_SSC_MARKS
632     SSC_MARK_DISPATCH_INIT();
633 #endif
634     #ifdef KMP_DEBUG
635     {
636         const char * buff;
637         // create format specifiers before the debug output
638         buff = __kmp_str_format(
639             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
640             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
641         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
642         __kmp_str_free( &buff );
643     }
644     #endif
645     /* setup data */
646     th     = __kmp_threads[ gtid ];
647     team   = th -> th.th_team;
648     active = ! team -> t.t_serialized;
649     th->th.th_ident = loc;
650 
651 #if USE_ITT_BUILD
652     kmp_uint64 cur_chunk = chunk;
653     int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
654         KMP_MASTER_GTID(gtid) &&
655 #if OMP_40_ENABLED
656         th->th.th_teams_microtask == NULL &&
657 #endif
658         team->t.t_active_level == 1;
659 #endif
660     if ( ! active ) {
661         pr = reinterpret_cast< dispatch_private_info_template< T >* >
662             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
663     } else {
664         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
665                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
666 
667         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
668 
        /* What happens when the number of threads changes -- does the buffer need to be resized? */
670         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
671             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
672         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
673             ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
674     }
675 
676     #if  ( KMP_STATIC_STEAL_ENABLED )
677     if ( SCHEDULE_HAS_NONMONOTONIC(schedule) )
678         // AC: we now have only one implementation of stealing, so use it
679         schedule = kmp_sch_static_steal;
680     else
681     #endif
682         schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
683 
684     /* Pick up the nomerge/ordered bits from the scheduling type */
685     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
686         pr->nomerge = TRUE;
687         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
688     } else {
689         pr->nomerge = FALSE;
690     }
691     pr->type_size = ___kmp_size_type; // remember the size of variables
692     if ( kmp_ord_lower & schedule ) {
693         pr->ordered = TRUE;
694         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
695     } else {
696         pr->ordered = FALSE;
697     }
698 
699     if ( schedule == kmp_sch_static ) {
700         schedule = __kmp_static;
701     } else {
702         if ( schedule == kmp_sch_runtime ) {
703             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
704             schedule = team -> t.t_sched.r_sched_type;
705             // Detail the schedule if needed (global controls are differentiated appropriately)
706             if ( schedule == kmp_sch_guided_chunked ) {
707                 schedule = __kmp_guided;
708             } else if ( schedule == kmp_sch_static ) {
709                 schedule = __kmp_static;
710             }
711             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
712             chunk = team -> t.t_sched.chunk;
713 #if USE_ITT_BUILD
714             cur_chunk = chunk;
715 #endif
716             #ifdef KMP_DEBUG
717             {
718                 const char * buff;
719                 // create format specifiers before the debug output
720                 buff = __kmp_str_format(
721                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
722                     traits_t< ST >::spec );
723                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
724                 __kmp_str_free( &buff );
725             }
726             #endif
727         } else {
728             if ( schedule == kmp_sch_guided_chunked ) {
729                 schedule = __kmp_guided;
730             }
731             if ( chunk <= 0 ) {
732                 chunk = KMP_DEFAULT_CHUNK;
733             }
734         }
735 
736         if ( schedule == kmp_sch_auto ) {
            // mapping and differentiation: done in __kmp_do_serial_initialize()
738             schedule = __kmp_auto;
739             #ifdef KMP_DEBUG
740             {
741                 const char * buff;
742                 // create format specifiers before the debug output
743                 buff = __kmp_str_format(
744                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
745                     traits_t< ST >::spec );
746                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
747                 __kmp_str_free( &buff );
748             }
749             #endif
750         }
751 
        /* guided analytical is not safe for too many threads */
753         if ( schedule == kmp_sch_guided_analytical_chunked && th->th.th_team_nproc > 1<<20 ) {
754             schedule = kmp_sch_guided_iterative_chunked;
755             KMP_WARNING( DispatchManyThreads );
756         }
757         pr->u.p.parm1 = chunk;
758     }
759     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
760                 "unknown scheduling type" );
761 
762     pr->u.p.count = 0;
763 
764     if ( __kmp_env_consistency_check ) {
765         if ( st == 0 ) {
766             __kmp_error_construct(
767                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
768                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
769             );
770         }
771     }
772     // compute trip count
773     if ( st == 1 ) {   // most common case
774         if ( ub >= lb ) {
775             tc = ub - lb + 1;
776         } else {   // ub < lb
777             tc = 0;            // zero-trip
778         }
779     } else if ( st < 0 ) {
780         if ( lb >= ub ) {
781             // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
782             //     where the division needs to be unsigned regardless of the result type
783             tc = (UT)(lb - ub) / (-st) + 1;
784         } else {   // lb < ub
785             tc = 0;            // zero-trip
786         }
787     } else {       // st > 0
788         if ( ub >= lb ) {
789             // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
790             //     where the division needs to be unsigned regardless of the result type
791             tc = (UT)(ub - lb) / st + 1;
792         } else {   // ub < lb
793             tc = 0;            // zero-trip
794         }
795     }
796 
797     // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
798     // when statistics are disabled.
799     if (schedule == __kmp_static)
800     {
801         KMP_COUNT_BLOCK(OMP_FOR_static);
802         KMP_COUNT_VALUE(FOR_static_iterations, tc);
803     }
804     else
805     {
806         KMP_COUNT_BLOCK(OMP_FOR_dynamic);
807         KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
808     }
809 
810     pr->u.p.lb = lb;
811     pr->u.p.ub = ub;
812     pr->u.p.st = st;
813     pr->u.p.tc = tc;
814 
815     #if KMP_OS_WINDOWS
816     pr->u.p.last_upper = ub + st;
817     #endif /* KMP_OS_WINDOWS */
818 
    /* NOTE: only the active parallel region(s) have active ordered sections */
820 
821     if ( active ) {
822         if ( pr->ordered == 0 ) {
823             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
824             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
825         } else {
826             pr->ordered_bumped = 0;
827 
828             pr->u.p.ordered_lower = 1;
829             pr->u.p.ordered_upper = 0;
830 
831             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
832             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
833         }
834     }
835 
836     if ( __kmp_env_consistency_check ) {
837         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
838         if ( push_ws ) {
839             __kmp_push_workshare( gtid, ws, loc );
840             pr->pushed_ws = ws;
841         } else {
842             __kmp_check_workshare( gtid, ws, loc );
843             pr->pushed_ws = ct_none;
844         }
845     }
846 
847     switch ( schedule ) {
848     #if  ( KMP_STATIC_STEAL_ENABLED )
849     case kmp_sch_static_steal:
850         {
851             T nproc = th->th.th_team_nproc;
852             T ntc, init;
853 
854             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
855 
856             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
857             if ( nproc > 1 && ntc >= nproc ) {
858                 T id = __kmp_tid_from_gtid(gtid);
859                 T small_chunk, extras;
860 
861                 small_chunk = ntc / nproc;
862                 extras = ntc % nproc;
863 
864                 init = id * small_chunk + ( id < extras ? id : extras );
865                 pr->u.p.count = init;
866                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
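                // Worked example (illustrative values): tc = 100 iterations with
                // chunk = 7 gives ntc = 15 chunks; with nproc = 4, small_chunk = 3
                // and extras = 3, so the initial [count, ub) chunk ranges are
                //   id 0: [0, 4)   id 1: [4, 8)   id 2: [8, 12)   id 3: [12, 15)
                // (the first 'extras' threads get one extra chunk); a thread that
                // exhausts its range later tries to steal from others, starting
                // with the neighbour recorded in parm4 below.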
867 
868                 pr->u.p.parm2 = lb;
869                 //pr->pfields.parm3 = 0; // it's not used in static_steal
870                 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
871                 pr->u.p.st = st;
872                 if ( ___kmp_size_type > 4 ) {
873                     // AC: TODO: check if 16-byte CAS available and use it to
874                     // improve performance (probably wait for explicit request
875                     // before spending time on this).
876                     // For now use dynamically allocated per-thread lock,
877                     // free memory in __kmp_dispatch_next when status==0.
878                     KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
879                     th->th.th_dispatch->th_steal_lock =
880                         (kmp_lock_t*)__kmp_allocate(sizeof(kmp_lock_t));
881                     __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
882                 }
883                 break;
884             } else {
885                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
886                                gtid ) );
887                 schedule = kmp_sch_static_balanced;
888                 /* too few iterations: fall-through to kmp_sch_static_balanced */
889             } // if
890             /* FALL-THROUGH to static balanced */
891         } // case
892     #endif
893     case kmp_sch_static_balanced:
894         {
895             T nproc = th->th.th_team_nproc;
896             T init, limit;
897 
898             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
899                             gtid ) );
900 
901             if ( nproc > 1 ) {
902                 T id = __kmp_tid_from_gtid(gtid);
903 
904                 if ( tc < nproc ) {
905                     if ( id < tc ) {
906                         init = id;
907                         limit = id;
908                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
909                     } else {
910                         pr->u.p.count = 1;  /* means no more chunks to execute */
911                         pr->u.p.parm1 = FALSE;
912                         break;
913                     }
914                 } else {
915                     T small_chunk = tc / nproc;
916                     T extras = tc % nproc;
917                     init = id * small_chunk + (id < extras ? id : extras);
918                     limit = init + small_chunk - (id < extras ? 0 : 1);
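                    // Worked example (illustrative values): tc = 10, nproc = 4 gives
                    // small_chunk = 2, extras = 2, so the [init, limit] ranges are
                    //   id 0: [0, 2]   id 1: [3, 5]   id 2: [6, 7]   id 3: [8, 9]
                    // -- the first 'extras' threads execute one extra iteration.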
919                     pr->u.p.parm1 = (id == nproc - 1);
920                 }
921             } else {
922                 if ( tc > 0 ) {
923                     init = 0;
924                     limit = tc - 1;
925                     pr->u.p.parm1 = TRUE;
926                 } else {
927                     // zero trip count
928                     pr->u.p.count = 1;  /* means no more chunks to execute */
929                     pr->u.p.parm1 = FALSE;
930                     break;
931                 }
932             }
933 #if USE_ITT_BUILD
934             // Calculate chunk for metadata report
935             if ( itt_need_metadata_reporting )
936                 cur_chunk = limit - init + 1;
937 #endif
938             if ( st == 1 ) {
939                 pr->u.p.lb = lb + init;
940                 pr->u.p.ub = lb + limit;
941             } else {
942                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
943                 pr->u.p.lb = lb + init * st;
944                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
945                 if ( st > 0 ) {
946                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
947                 } else {
948                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
949                 }
950             }
951             if ( pr->ordered ) {
952                 pr->u.p.ordered_lower = init;
953                 pr->u.p.ordered_upper = limit;
954             }
955             break;
956         } // case
957     case kmp_sch_guided_iterative_chunked :
958         {
959             T nproc = th->th.th_team_nproc;
960             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
961 
962             if ( nproc > 1 ) {
963                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
964                     /* chunk size too large, switch to dynamic */
965                     schedule = kmp_sch_dynamic_chunked;
966                 } else {
967                     // when remaining iters become less than parm2 - switch to dynamic
968                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
969                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
970                 }
971             } else {
972                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
973                 schedule = kmp_sch_static_greedy;
974                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
975                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
976                 pr->u.p.parm1 = tc;
977             } // if
978         } // case
979         break;
980     case kmp_sch_guided_analytical_chunked:
981         {
982             T nproc = th->th.th_team_nproc;
983             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
984 
985             if ( nproc > 1 ) {
986                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
987                     /* chunk size too large, switch to dynamic */
988                     schedule = kmp_sch_dynamic_chunked;
989                 } else {
990                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
991                     DBL x;
992 
993                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
1004                     // save original FPCW and set precision to 64-bit, as
1005                     // Windows* OS on IA-32 architecture defaults to 53-bit
1006                     unsigned int oldFpcw = _control87(0,0);
1007                     _control87(_PC_64,_MCW_PC); // 0,0x30000
1008                     #endif
1009                     /* value used for comparison in solver for cross-over point */
1010                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
1011 
                    /* crossover point -- chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
1014                     UT   cross;
1015 
1016                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
1017                     x = (long double)1.0 - (long double)0.5 / nproc;
1018 
1019                     #ifdef KMP_DEBUG
1020                     { // test natural alignment
1021                         struct _test_a {
1022                             char a;
1023                             union {
1024                                 char b;
1025                                 DBL  d;
1026                             };
1027                         } t;
1028                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
1029                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
1030                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
1031                     }
1032                     #endif // KMP_DEBUG
1033 
1034                     /* save the term in thread private dispatch structure */
1035                     *(DBL*)&pr->u.p.parm3 = x;
1036 
1037                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
1038                     {
1039                         UT          left, right, mid;
1040                         long double p;
1041 
1042                         /* estimate initial upper and lower bound */
1043 
                        /* It doesn't matter what value right is, as long as it is
                           positive; it only affects performance of the solver.
                        */
1047                         right = 229;
1048                         p = __kmp_pow< UT >(x,right);
1049                         if ( p > target ) {
1050                             do{
1051                                 p *= p;
1052                                 right <<= 1;
1053                             } while(p>target && right < (1<<27));
1054                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1055                         } else {
1056                             left = 0;
1057                         }
1058 
1059                         /* bisection root-finding method */
1060                         while ( left + 1 < right ) {
1061                             mid = (left + right) / 2;
1062                             if ( __kmp_pow< UT >(x,mid) > target ) {
1063                                 left = mid;
1064                             } else {
1065                                 right = mid;
1066                             }
1067                         } // while
1068                         cross = right;
1069                     }
1070                     /* assert sanity of computed crossover point */
1071                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1072 
1073                     /* save the crossover point in thread private dispatch structure */
1074                     pr->u.p.parm2 = cross;
1075 
1076                     // C75803
1077                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1078                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1079                     #else
1080                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1081                     #endif
1082                     /* dynamic-style scheduling offset */
1083                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1084                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1085                         // restore FPCW
1086                         _control87(oldFpcw,_MCW_PC);
1087                     #endif
1088                 } // if
1089             } else {
1090                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1091                                gtid ) );
1092                 schedule = kmp_sch_static_greedy;
1093                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1094                 pr->u.p.parm1 = tc;
1095             } // if
1096         } // case
1097         break;
1098     case kmp_sch_static_greedy:
1099         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
        pr->u.p.parm1 = ( th->th.th_team_nproc > 1 ) ?
            ( tc + th->th.th_team_nproc - 1 ) / th->th.th_team_nproc :
            tc;
1103         break;
1104     case kmp_sch_static_chunked :
1105     case kmp_sch_dynamic_chunked :
1106         if ( pr->u.p.parm1 <= 0 ) {
1107             pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1108         }
1109         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1110         break;
1111     case kmp_sch_trapezoidal :
1112         {
1113             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1114 
1115             T parm1, parm2, parm3, parm4;
1116             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1117 
1118             parm1 = chunk;
1119 
1120             /* F : size of the first cycle */
1121             parm2 = ( tc / (2 * th->th.th_team_nproc) );
1122 
1123             if ( parm2 < 1 ) {
1124                 parm2 = 1;
1125             }
1126 
1127             /* L : size of the last cycle.  Make sure the last cycle
1128              *     is not larger than the first cycle.
1129              */
1130             if ( parm1 < 1 ) {
1131                 parm1 = 1;
1132             } else if ( parm1 > parm2 ) {
1133                 parm1 = parm2;
1134             }
1135 
1136             /* N : number of cycles */
1137             parm3 = ( parm2 + parm1 );
1138             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1139 
1140             if ( parm3 < 2 ) {
1141                 parm3 = 2;
1142             }
1143 
1144             /* sigma : decreasing incr of the trapezoid */
1145             parm4 = ( parm3 - 1 );
1146             parm4 = ( parm2 - parm1 ) / parm4;
1147 
1148             // pointless check, because parm4 >= 0 always
1149             //if ( parm4 < 0 ) {
1150             //    parm4 = 0;
1151             //}
1152 
1153             pr->u.p.parm1 = parm1;
1154             pr->u.p.parm2 = parm2;
1155             pr->u.p.parm3 = parm3;
1156             pr->u.p.parm4 = parm4;
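            // Worked example (illustrative values): tc = 1000, nproc = 4, chunk = 1:
            //   parm2 (first chunk size) = 1000 / (2 * 4)          = 125
            //   parm1 (minimum chunk)    = chunk, clipped to [1, parm2] = 1
            //   parm3 (chunk count)      = (2000 + 126 - 1) / 126  = 16
            //   parm4 (decrement)        = (125 - 1) / (16 - 1)    = 8
            // so chunk sizes form the series 125, 117, ..., 5, which sums to
            // 16 * (125 + 5) / 2 = 1040 >= 1000, covering all iterations.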
1157         } // case
1158         break;
1159 
1160     default:
1161         {
1162             __kmp_msg(
1163                 kmp_ms_fatal,                        // Severity
1164                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1165                 KMP_HNT( GetNewerLibrary ),          // Hint
1166                 __kmp_msg_null                       // Variadic argument list terminator
1167             );
1168         }
1169         break;
1170     } // switch
1171     pr->schedule = schedule;
1172     if ( active ) {
        /* The buffer with index my_buffer_index is free to use once sh->buffer_index reaches it */
1174 
1175         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1176                         gtid, my_buffer_index, sh->buffer_index) );
1177         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1178                                         USE_ITT_BUILD_ARG( NULL )
1179                                         );
1180             // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1181             // *always* 32-bit integers.
1182         KMP_MB();  /* is this necessary? */
1183         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1184                         gtid, my_buffer_index, sh->buffer_index) );
1185 
1186         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1187         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1188 #if USE_ITT_BUILD
1189         if ( pr->ordered ) {
1190             __kmp_itt_ordered_init( gtid );
1191         }; // if
1192         // Report loop metadata
1193         if ( itt_need_metadata_reporting ) {
1194             // Only report metadata by master of active team at level 1
1195             kmp_uint64 schedtype = 0;
1196             switch ( schedule ) {
1197             case kmp_sch_static_chunked:
1198             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1199                 break;
1200             case kmp_sch_static_greedy:
1201                 cur_chunk = pr->u.p.parm1;
1202                 break;
1203             case kmp_sch_dynamic_chunked:
1204                 schedtype = 1;
1205                 break;
1206             case kmp_sch_guided_iterative_chunked:
1207             case kmp_sch_guided_analytical_chunked:
1208                 schedtype = 2;
1209                 break;
1210             default:
1211 //            Should we put this case under "static"?
1212 //            case kmp_sch_static_steal:
1213                 schedtype = 3;
1214                 break;
1215             }
1216             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1217         }
1218 #endif /* USE_ITT_BUILD */
1219     }; // if
1220 
1221     #ifdef KMP_DEBUG
1222     {
1223         const char * buff;
1224         // create format specifiers before the debug output
1225         buff = __kmp_str_format(
1226             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1227             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1228             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1229             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1230             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1231             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1232             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1233         KD_TRACE(10, ( buff,
1234             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1235             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1236             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1237             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1238         __kmp_str_free( &buff );
1239     }
1240     #endif
1241     #if ( KMP_STATIC_STEAL_ENABLED )
1242       // It cannot be guaranteed that after execution of a loop with some other schedule kind
1243       // all the parm3 variables will contain the same value.
      // Even if all the parm3 values happened to be the same, a bad case could still exist,
      // e.g. values alternating between 0 and 1 rather than incrementing over the program's
      // lifetime. So a dedicated variable is required; 'static_steal_counter' is used.
1247       if( schedule == kmp_sch_static_steal ) {
1248         // Other threads will inspect this variable when searching for a victim.
        // This is a flag showing that other threads may steal from this thread from now on.
1250         volatile T * p = &pr->u.p.static_steal_counter;
1251         *p = *p + 1;
1252       }
1253     #endif // ( KMP_STATIC_STEAL_ENABLED )
1254 
1255 #if OMPT_SUPPORT && OMPT_TRACE
1256     if (ompt_enabled &&
1257         ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1258         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1259         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1260         ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1261             team_info->parallel_id, task_info->task_id, team_info->microtask);
1262     }
1263 #endif
1264 }
1265 
1266 /*
1267  * For ordered loops, either __kmp_dispatch_finish() should be called after
1268  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1269  * every chunk of iterations.  If the ordered section(s) were not executed
1270  * for this iteration (or every iteration in this chunk), we need to set the
1271  * ordered iteration counters so that the next thread can proceed.
1272  */
1273 template< typename UT >
1274 static void
1275 __kmp_dispatch_finish( int gtid, ident_t *loc )
1276 {
1277     typedef typename traits_t< UT >::signed_t ST;
1278     kmp_info_t *th = __kmp_threads[ gtid ];
1279 
1280     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1281     if ( ! th -> th.th_team -> t.t_serialized ) {
1282 
1283         dispatch_private_info_template< UT > * pr =
1284             reinterpret_cast< dispatch_private_info_template< UT >* >
1285             ( th->th.th_dispatch->th_dispatch_pr_current );
1286         dispatch_shared_info_template< UT > volatile * sh =
1287             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1288             ( th->th.th_dispatch->th_dispatch_sh_current );
1289         KMP_DEBUG_ASSERT( pr );
1290         KMP_DEBUG_ASSERT( sh );
1291         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1292                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1293 
1294         if ( pr->ordered_bumped ) {
1295             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1296                             gtid ) );
1297             pr->ordered_bumped = 0;
1298         } else {
1299             UT lower = pr->u.p.ordered_lower;
1300 
1301             #ifdef KMP_DEBUG
1302             {
1303                 const char * buff;
1304                 // create format specifiers before the debug output
1305                 buff = __kmp_str_format(
1306                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1307                     traits_t< UT >::spec, traits_t< UT >::spec );
1308                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1309                 __kmp_str_free( &buff );
1310             }
1311             #endif
1312 
1313             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1314                                    USE_ITT_BUILD_ARG(NULL)
1315                                    );
1316             KMP_MB();  /* is this necessary? */
1317             #ifdef KMP_DEBUG
1318             {
1319                 const char * buff;
1320                 // create format specifiers before the debug output
1321                 buff = __kmp_str_format(
1322                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1323                     traits_t< UT >::spec, traits_t< UT >::spec );
1324                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1325                 __kmp_str_free( &buff );
1326             }
1327             #endif
1328 
1329             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1330         } // if
1331     } // if
1332     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1333 }
1334 
1335 #ifdef KMP_GOMP_COMPAT
1336 
1337 template< typename UT >
1338 static void
1339 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1340 {
1341     typedef typename traits_t< UT >::signed_t ST;
1342     kmp_info_t *th = __kmp_threads[ gtid ];
1343 
1344     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1345     if ( ! th -> th.th_team -> t.t_serialized ) {
1346 //        int cid;
1347         dispatch_private_info_template< UT > * pr =
1348             reinterpret_cast< dispatch_private_info_template< UT >* >
1349             ( th->th.th_dispatch->th_dispatch_pr_current );
1350         dispatch_shared_info_template< UT > volatile * sh =
1351             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1352             ( th->th.th_dispatch->th_dispatch_sh_current );
1353         KMP_DEBUG_ASSERT( pr );
1354         KMP_DEBUG_ASSERT( sh );
1355         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1356                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1357 
1358 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1359             UT lower = pr->u.p.ordered_lower;
1360             UT upper = pr->u.p.ordered_upper;
1361             UT inc = upper - lower + 1;
1362 
1363             if ( pr->ordered_bumped == inc ) {
1364                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1365                   gtid ) );
1366                 pr->ordered_bumped = 0;
1367             } else {
1368                 inc -= pr->ordered_bumped;
1369 
1370                 #ifdef KMP_DEBUG
1371                 {
1372                     const char * buff;
1373                     // create format specifiers before the debug output
1374                     buff = __kmp_str_format(
1375                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1376                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1377                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1378                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1379                     __kmp_str_free( &buff );
1380                 }
1381                 #endif
1382 
1383                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1384                                        USE_ITT_BUILD_ARG(NULL)
1385                                        );
1386 
1387                 KMP_MB();  /* is this necessary? */
1388                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1389                   gtid ) );
1390                 pr->ordered_bumped = 0;
// TODO: check whether 'inc' should be unsigned or signed
1392                 #ifdef KMP_DEBUG
1393                 {
1394                     const char * buff;
1395                     // create format specifiers before the debug output
1396                     buff = __kmp_str_format(
1397                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1398                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1399                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1400                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1401                     __kmp_str_free( &buff );
1402                 }
1403                 #endif
1404 
1405                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1406             }
1407 //        }
1408     }
1409     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1410 }
1411 
1412 #endif /* KMP_GOMP_COMPAT */
1413 
1414 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1415  * (no more work), then tell OMPT the loop is over. In some cases
1416  * kmp_dispatch_fini() is not called. */
1417 #if OMPT_SUPPORT && OMPT_TRACE
1418 #define OMPT_LOOP_END                                                          \
1419     if (status == 0) {                                                         \
1420         if (ompt_enabled &&                     \
1421             ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
1422             ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
1423             ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
1424             ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
1425                 team_info->parallel_id, task_info->task_id);                   \
1426         }                                                                      \
1427     }
1428 #else
1429 #define OMPT_LOOP_END // no-op
1430 #endif
1431 
1432 template< typename T >
1433 static int
1434 __kmp_dispatch_next(
1435     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1436 ) {
1437 
1438     typedef typename traits_t< T >::unsigned_t  UT;
1439     typedef typename traits_t< T >::signed_t    ST;
1440     typedef typename traits_t< T >::floating_t  DBL;
1441 #if ( KMP_STATIC_STEAL_ENABLED )
1442     static const int ___kmp_size_type = sizeof( UT );
1443 #endif
1444 
    // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
    // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
    // more than a compile time choice to use static scheduling would.)
1448     KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1449 
1450     int                                   status;
1451     dispatch_private_info_template< T > * pr;
1452     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1453     kmp_team_t                          * team = th -> th.th_team;
1454 
1455     KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1456     #ifdef KMP_DEBUG
1457     {
1458         const char * buff;
1459         // create format specifiers before the debug output
1460         buff = __kmp_str_format(
1461             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1462             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1463         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1464         __kmp_str_free( &buff );
1465     }
1466     #endif
1467 
1468     if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1470         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1471             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1472         KMP_DEBUG_ASSERT( pr );
1473 
1474         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1475             *p_lb = 0;
1476             *p_ub = 0;
1477 //            if ( p_last != NULL )
1478 //                *p_last = 0;
1479             if ( p_st != NULL )
1480                 *p_st = 0;
1481             if ( __kmp_env_consistency_check ) {
1482                 if ( pr->pushed_ws != ct_none ) {
1483                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1484                 }
1485             }
1486         } else if ( pr->nomerge ) {
1487             kmp_int32 last;
1488             T         start;
1489             UT        limit, trip, init;
1490             ST        incr;
1491             T         chunk = pr->u.p.parm1;
1492 
1493             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1494 
1495             init = chunk * pr->u.p.count++;
1496             trip = pr->u.p.tc - 1;
1497 
1498             if ( (status = (init <= trip)) == 0 ) {
1499                 *p_lb = 0;
1500                 *p_ub = 0;
1501 //                if ( p_last != NULL )
1502 //                    *p_last = 0;
1503                 if ( p_st != NULL )
1504                     *p_st = 0;
1505                 if ( __kmp_env_consistency_check ) {
1506                     if ( pr->pushed_ws != ct_none ) {
1507                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1508                     }
1509                 }
1510             } else {
1511                 start = pr->u.p.lb;
1512                 limit = chunk + init - 1;
1513                 incr  = pr->u.p.st;
1514 
1515                 if ( (last = (limit >= trip)) != 0 ) {
1516                     limit = trip;
1517                     #if KMP_OS_WINDOWS
1518                     pr->u.p.last_upper = pr->u.p.ub;
1519                     #endif /* KMP_OS_WINDOWS */
1520                 }
1521                 if ( p_last != NULL )
1522                     *p_last = last;
1523                 if ( p_st != NULL )
1524                     *p_st = incr;
1525                 if ( incr == 1 ) {
1526                     *p_lb = start + init;
1527                     *p_ub = start + limit;
1528                 } else {
1529                     *p_lb = start + init * incr;
1530                     *p_ub = start + limit * incr;
1531                 }
1532 
1533                 if ( pr->ordered ) {
1534                     pr->u.p.ordered_lower = init;
1535                     pr->u.p.ordered_upper = limit;
1536                     #ifdef KMP_DEBUG
1537                     {
1538                         const char * buff;
1539                         // create format specifiers before the debug output
1540                         buff = __kmp_str_format(
1541                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1542                             traits_t< UT >::spec, traits_t< UT >::spec );
1543                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1544                         __kmp_str_free( &buff );
1545                     }
1546                     #endif
1547                 } // if
1548             } // if
1549         } else {
1550             pr->u.p.tc = 0;
1551             *p_lb = pr->u.p.lb;
1552             *p_ub = pr->u.p.ub;
1553             #if KMP_OS_WINDOWS
1554             pr->u.p.last_upper = *p_ub;
1555             #endif /* KMP_OS_WINDOWS */
1556             if ( p_last != NULL )
1557                 *p_last = TRUE;
1558             if ( p_st != NULL )
1559                 *p_st = pr->u.p.st;
1560         } // if
1561         #ifdef KMP_DEBUG
1562         {
1563             const char * buff;
1564             // create format specifiers before the debug output
1565             buff = __kmp_str_format(
1566                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1567                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1568                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1569             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1570             __kmp_str_free( &buff );
1571         }
1572         #endif
1573 #if INCLUDE_SSC_MARKS
1574         SSC_MARK_DISPATCH_NEXT();
1575 #endif
1576         OMPT_LOOP_END;
1577         return status;
1578     } else {
1579         kmp_int32 last = 0;
1580         dispatch_shared_info_template< UT > *sh;
1581         T         start;
1582         ST        incr;
1583         UT        limit, trip, init;
1584 
1585         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1586                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1587 
1588         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1589             ( th->th.th_dispatch->th_dispatch_pr_current );
1590         KMP_DEBUG_ASSERT( pr );
1591         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1592             ( th->th.th_dispatch->th_dispatch_sh_current );
1593         KMP_DEBUG_ASSERT( sh );
1594 
1595         if ( pr->u.p.tc == 0 ) {
1596             // zero trip count
1597             status = 0;
1598         } else {
1599             switch (pr->schedule) {
1600             #if ( KMP_STATIC_STEAL_ENABLED )
1601             case kmp_sch_static_steal:
1602                 {
1603                     T chunk = pr->u.p.parm1;
1604                     int nproc = th->th.th_team_nproc;
1605 
1606                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1607 
1608                     trip = pr->u.p.tc - 1;
1609 
1610                     if ( ___kmp_size_type > 4 ) {
1611                         // use lock for 8-byte and CAS for 4-byte induction
1612                         // variable. TODO (optional): check and use 16-byte CAS
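                        // Rationale (illustrative, not normative): for 8-byte
                        // induction types the (count, ub) pair spans 16 bytes,
                        // which a single 8-byte CAS cannot update atomically,
                        // so a per-thread lock guards the pair; for 4-byte
                        // types the pair fits in 8 bytes and the lock-free
                        // CAS path in the else branch below applies.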
1613                         kmp_lock_t * lck = th->th.th_dispatch->th_steal_lock;
1614                         KMP_DEBUG_ASSERT(lck != NULL);
1615                         if( pr->u.p.count < (UT)pr->u.p.ub ) {
1616                             __kmp_acquire_lock(lck, gtid);
1617                             // try to get own chunk of iterations
1618                             init   = ( pr->u.p.count )++;
1619                             status = ( init < (UT)pr->u.p.ub );
1620                             __kmp_release_lock(lck, gtid);
1621                         } else {
1622                             status = 0; // no own chunks
1623                         }
1624                         if( !status ) { // try to steal
1625                             kmp_info_t   **other_threads = team->t.t_threads;
1626                             int          while_limit = nproc; // nproc attempts to find a victim
1627                             int          while_index = 0;
                            // TODO: the victim-search algorithm should be
                            // cleaned up and measured
1630                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1631                                 T remaining;
1632                                 T victimIdx    = pr->u.p.parm4;
1633                                 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1634                                 dispatch_private_info_template< T > * victim =
1635                                     reinterpret_cast< dispatch_private_info_template< T >* >
1636                                     (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1637                                 while( ( victim == NULL || victim == pr ||
1638                                     ( *(volatile T*)&victim->u.p.static_steal_counter !=
1639                                     *(volatile T*)&pr->u.p.static_steal_counter ) ) &&
1640                                     oldVictimIdx != victimIdx )
1641                                 {
1642                                     victimIdx = (victimIdx + 1) % nproc;
1643                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1644                                         (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1645                                 };
1646                                 if( !victim ||
1647                                     ( *(volatile T *)&victim->u.p.static_steal_counter !=
1648                                     *(volatile T *)&pr->u.p.static_steal_counter ) )
1649                                 {
                                    // no victim is ready yet to participate in stealing
                                    // because all victims are still in kmp_init_dispatch
                                    continue; // try once more (nproc attempts in total)
1653                                 }
1654                                 if( victim->u.p.count + 2 > (UT)victim->u.p.ub ) {
1655                                     pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1656                                     continue; // not enough chunks to steal, goto next victim
1657                                 }
1658 
1659                                 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1660                                 KMP_ASSERT(lck != NULL);
1661                                 __kmp_acquire_lock(lck, gtid);
1662                                 limit = victim->u.p.ub; // keep initial ub
1663                                 if( victim->u.p.count >= limit ||
1664                                     (remaining = limit - victim->u.p.count) < 2 )
1665                                 {
1666                                     __kmp_release_lock(lck, gtid);
1667                                     pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1668                                     continue; // not enough chunks to steal
1669                                 }
                                // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or by 1
1671                                 if( remaining > 3 ) {
1672                                     init = ( victim->u.p.ub -= (remaining>>2) ); // steal 1/4 of remaining
1673                                 } else {
1674                                     init = ( victim->u.p.ub -= 1 ); // steal 1 chunk of 2 or 3 remaining
1675                                 }
1676                                 __kmp_release_lock(lck, gtid);
1677 
1678                                 KMP_DEBUG_ASSERT(init + 1 <= limit);
1679                                 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1680                                 status = 1;
1681                                 while_index = 0;
                                // now update own count and ub with the stolen range,
                                // excluding the init chunk claimed above
1683                                 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1684                                 pr->u.p.count = init + 1;
1685                                 pr->u.p.ub = limit;
1686                                 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1687                             } // while (search for victim)
1688                         } // if (try to find victim and steal)
1689                     } else {
1690                         // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1691                         typedef union {
1692                             struct {
1693                                 UT count;
1694                                 T  ub;
1695                             } p;
1696                             kmp_int64 b;
1697                         } union_i4;
                        // All operations on 'count' or 'ub' must be combined atomically as a pair.
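                        // Illustrative layout (assuming little-endian, T ==
                        // kmp_int32): union_i4 overlays p.count (4 bytes) and
                        // p.ub (4 bytes) onto the single kmp_int64 'b', so one
                        // 8-byte CAS on 'b' publishes a consistent (count, ub)
                        // snapshot. E.g. claiming chunk 5 with ub == 8 means
                        // CAS-ing b:{count=5,ub=8} -> b:{count=6,ub=8}; a thief
                        // concurrently shrinking ub changes 'b' and forces the
                        // CAS to retry.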
1699                         {
1700                             union_i4 vold, vnew;
1701                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1702                             vnew = vold;
1703                             vnew.p.count++;
1704                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1705                                         ( volatile kmp_int64* )&pr->u.p.count,
1706                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1707                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1708                                 KMP_CPU_PAUSE();
1709                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1710                                 vnew = vold;
1711                                 vnew.p.count++;
1712                             }
1713                             vnew = vold;
1714                             init   = vnew.p.count;
1715                             status = ( init < (UT)vnew.p.ub ) ;
1716                         }
1717 
1718                         if( !status ) {
1719                             kmp_info_t   **other_threads = team->t.t_threads;
1720                             int          while_limit = nproc; // nproc attempts to find a victim
1721                             int          while_index = 0;
1722 
                            // TODO: the victim-search algorithm should be
                            // cleaned up and measured
1725                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1726                                 union_i4  vold, vnew;
1727                                 kmp_int32 remaining;
1728                                 T         victimIdx    = pr->u.p.parm4;
1729                                 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1730                                 dispatch_private_info_template< T > * victim =
1731                                     reinterpret_cast< dispatch_private_info_template< T >* >
1732                                     (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1733                                 while( (victim == NULL || victim == pr ||
1734                                     (*(volatile T*)&victim->u.p.static_steal_counter !=
1735                                     *(volatile T*)&pr->u.p.static_steal_counter)) &&
1736                                     oldVictimIdx != victimIdx )
1737                                 {
1738                                     victimIdx = (victimIdx + 1) % nproc;
1739                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1740                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1741                                 };
1742                                 if( !victim ||
1743                                     ( *(volatile T *)&victim->u.p.static_steal_counter !=
1744                                     *(volatile T *)&pr->u.p.static_steal_counter ) )
1745                                 {
                                    // no victim is ready yet to participate in stealing
                                    // because all victims are still in kmp_init_dispatch
                                    continue; // try once more (nproc attempts in total)
1749                                 }
1750                                 pr->u.p.parm4 = victimIdx; // new victim found
1751                                 while( 1 ) { // CAS loop if victim has enough chunks to steal
1752                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1753                                     vnew = vold;
1754 
1755                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1756                                     if ( vnew.p.count >= (UT)vnew.p.ub ||
1757                                         (remaining = vnew.p.ub - vnew.p.count) < 2 )
1758                                     {
1759                                         pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1760                                         break; // not enough chunks to steal, goto next victim
1761                                     }
1762                                     if( remaining > 3 ) {
1763                                         vnew.p.ub -= (remaining>>2); // try to steal 1/4 of remaining
1764                                     } else {
1765                                         vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1766                                     }
1767                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1768                                     // TODO: Should this be acquire or release?
1769                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1770                                             ( volatile kmp_int64 * )&victim->u.p.count,
1771                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1772                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
                                        // stealing succeeded
1774                                         status = 1;
1775                                         while_index = 0;
1776                                         // now update own count and ub
1777                                         init = vnew.p.ub;
1778                                         vold.p.count = init + 1;
1779                                         #if KMP_ARCH_X86
1780                                         KMP_XCHG_FIXED64(( volatile kmp_int64 * )(&pr->u.p.count), vold.b);
1781                                         #else
1782                                         *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1783                                         #endif
1784                                         break;
1785                                     } // if (check CAS result)
                                    KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1787                                 } // while (try to steal from particular victim)
1788                             } // while (search for victim)
1789                         } // if (try to find victim and steal)
1790                     } // if (4-byte induction variable)
1791                     if ( !status ) {
1792                         *p_lb = 0;
1793                         *p_ub = 0;
1794                         if ( p_st != NULL ) *p_st = 0;
1795                     } else {
1796                         start = pr->u.p.parm2;
1797                         init *= chunk;
1798                         limit = chunk + init - 1;
1799                         incr  = pr->u.p.st;
1800 
1801                         KMP_DEBUG_ASSERT(init <= trip);
1802                         if ( (last = (limit >= trip)) != 0 )
1803                             limit = trip;
1804                         if ( p_st != NULL ) *p_st = incr;
1805 
1806                         if ( incr == 1 ) {
1807                             *p_lb = start + init;
1808                             *p_ub = start + limit;
1809                         } else {
1810                             *p_lb = start + init * incr;
1811                             *p_ub = start + limit * incr;
1812                         }
1813 
1814                         if ( pr->ordered ) {
1815                             pr->u.p.ordered_lower = init;
1816                             pr->u.p.ordered_upper = limit;
1817                             #ifdef KMP_DEBUG
1818                             {
1819                                 const char * buff;
1820                                 // create format specifiers before the debug output
1821                                 buff = __kmp_str_format(
1822                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1823                                     traits_t< UT >::spec, traits_t< UT >::spec );
1824                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1825                                 __kmp_str_free( &buff );
1826                             }
1827                             #endif
1828                         } // if
1829                     } // if
1830                     break;
1831                 } // case
1832             #endif // ( KMP_STATIC_STEAL_ENABLED )
1833             case kmp_sch_static_balanced:
1834                 {
1835                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1836                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1837                         pr->u.p.count = 1;
1838                         *p_lb = pr->u.p.lb;
1839                         *p_ub = pr->u.p.ub;
1840                         last = pr->u.p.parm1;
1841                         if ( p_st != NULL )
1842                             *p_st = pr->u.p.st;
1843                     } else {  /* no iterations to do */
1844                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1845                     }
1846                     if ( pr->ordered ) {
1847                         #ifdef KMP_DEBUG
1848                         {
1849                             const char * buff;
1850                             // create format specifiers before the debug output
1851                             buff = __kmp_str_format(
1852                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1853                                 traits_t< UT >::spec, traits_t< UT >::spec );
1854                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1855                             __kmp_str_free( &buff );
1856                         }
1857                         #endif
1858                     } // if
1859                 } // case
1860                 break;
1861             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1862             case kmp_sch_static_chunked:
1863                 {
1864                     T parm1;
1865 
1866                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1867                                    gtid ) );
1868                     parm1 = pr->u.p.parm1;
1869 
1870                     trip  = pr->u.p.tc - 1;
1871                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1872 
1873                     if ( (status = (init <= trip)) != 0 ) {
1874                         start = pr->u.p.lb;
1875                         incr  = pr->u.p.st;
1876                         limit = parm1 + init - 1;
1877 
1878                         if ( (last = (limit >= trip)) != 0 )
1879                             limit = trip;
1880 
1881                         if ( p_st != NULL ) *p_st = incr;
1882 
1883                         pr->u.p.count += th->th.th_team_nproc;
1884 
1885                         if ( incr == 1 ) {
1886                             *p_lb = start + init;
1887                             *p_ub = start + limit;
1888                         }
1889                         else {
1890                             *p_lb = start + init * incr;
1891                             *p_ub = start + limit * incr;
1892                         }
1893 
1894                         if ( pr->ordered ) {
1895                             pr->u.p.ordered_lower = init;
1896                             pr->u.p.ordered_upper = limit;
1897                             #ifdef KMP_DEBUG
1898                             {
1899                                 const char * buff;
1900                                 // create format specifiers before the debug output
1901                                 buff = __kmp_str_format(
1902                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1903                                     traits_t< UT >::spec, traits_t< UT >::spec );
1904                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1905                                 __kmp_str_free( &buff );
1906                             }
1907                             #endif
1908                         } // if
1909                     } // if
1910                 } // case
1911                 break;
1912 
1913             case kmp_sch_dynamic_chunked:
1914                 {
1915                     T chunk = pr->u.p.parm1;
1916 
1917                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1918                                    gtid ) );
1919 
1920                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1921                     trip = pr->u.p.tc - 1;
1922 
1923                     if ( (status = (init <= trip)) == 0 ) {
1924                         *p_lb = 0;
1925                         *p_ub = 0;
1926                         if ( p_st != NULL ) *p_st = 0;
1927                     } else {
1928                         start = pr->u.p.lb;
1929                         limit = chunk + init - 1;
1930                         incr  = pr->u.p.st;
1931 
1932                         if ( (last = (limit >= trip)) != 0 )
1933                             limit = trip;
1934 
1935                         if ( p_st != NULL ) *p_st = incr;
1936 
1937                         if ( incr == 1 ) {
1938                             *p_lb = start + init;
1939                             *p_ub = start + limit;
1940                         } else {
1941                             *p_lb = start + init * incr;
1942                             *p_ub = start + limit * incr;
1943                         }
1944 
1945                         if ( pr->ordered ) {
1946                             pr->u.p.ordered_lower = init;
1947                             pr->u.p.ordered_upper = limit;
1948                             #ifdef KMP_DEBUG
1949                             {
1950                                 const char * buff;
1951                                 // create format specifiers before the debug output
1952                                 buff = __kmp_str_format(
1953                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1954                                     traits_t< UT >::spec, traits_t< UT >::spec );
1955                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1956                                 __kmp_str_free( &buff );
1957                             }
1958                             #endif
1959                         } // if
1960                     } // if
1961                 } // case
1962                 break;
1963 
1964             case kmp_sch_guided_iterative_chunked:
1965                 {
1966                     T  chunkspec = pr->u.p.parm1;
1967                     KD_TRACE(100,
1968                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1969                     trip  = pr->u.p.tc;
1970                     // Start atomic part of calculations
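                    // Sketch of the carving below (numbers are illustrative):
                    // each successful CAS advances sh->u.s.iteration from
                    // 'init' to 'limit', claiming [init, limit) for this
                    // thread. With parm3 holding a factor of roughly
                    // 1/(K*nproc), K == 2 by default, e.g. trip = 1000 and
                    // nproc = 4: remaining = 1000 -> chunk ~ 125, then
                    // remaining = 875 -> chunk ~ 109, and so on, shrinking
                    // geometrically until remaining drops below parm2, where
                    // the code falls back to plain dynamic chunks of size
                    // chunkspec.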
1971                     while(1) {
1972                         ST  remaining;             // signed, because can be < 0
1973                         init = sh->u.s.iteration;  // shared value
1974                         remaining = trip - init;
1975                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1976                             // nothing to do, don't try atomic op
1977                             status = 0;
1978                             break;
1979                         }
                        if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                            // use dynamic-style schedule
                            // atomically increment iterations, get old value
1983                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1984                             remaining = trip - init;
1985                             if (remaining <= 0) {
1986                                 status = 0;    // all iterations got by other threads
1987                             } else {
1988                                 // got some iterations to work on
1989                                 status = 1;
1990                                 if ( (T)remaining > chunkspec ) {
1991                                     limit = init + chunkspec - 1;
1992                                 } else {
1993                                     last = 1;   // the last chunk
1994                                     limit = init + remaining - 1;
1995                                 } // if
1996                             } // if
1997                             break;
1998                         } // if
1999                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
2000                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
2001                             // CAS was successful, chunk obtained
2002                             status = 1;
2003                             --limit;
2004                             break;
2005                         } // if
2006                     } // while
2007                     if ( status != 0 ) {
2008                         start = pr->u.p.lb;
2009                         incr = pr->u.p.st;
2010                         if ( p_st != NULL )
2011                             *p_st = incr;
2012                         *p_lb = start + init * incr;
2013                         *p_ub = start + limit * incr;
2014                         if ( pr->ordered ) {
2015                             pr->u.p.ordered_lower = init;
2016                             pr->u.p.ordered_upper = limit;
2017                             #ifdef KMP_DEBUG
2018                             {
2019                                 const char * buff;
2020                                 // create format specifiers before the debug output
2021                                 buff = __kmp_str_format(
2022                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2023                                     traits_t< UT >::spec, traits_t< UT >::spec );
2024                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2025                                 __kmp_str_free( &buff );
2026                             }
2027                             #endif
2028                         } // if
2029                     } else {
2030                         *p_lb = 0;
2031                         *p_ub = 0;
2032                         if ( p_st != NULL )
2033                             *p_st = 0;
2034                     } // if
2035                 } // case
2036                 break;
2037 
2038             case kmp_sch_guided_analytical_chunked:
2039                 {
2040                     T   chunkspec = pr->u.p.parm1;
2041                     UT chunkIdx;
2042     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* for storing the original FPCW value for Windows* OS on
                       IA-32 architecture (8-byte version) */
2045                     unsigned int oldFpcw;
2046                     unsigned int fpcwSet = 0;
2047     #endif
2048                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
2049                                    gtid ) );
2050 
2051                     trip  = pr->u.p.tc;
2052 
2053                     KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2054                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < trip);
2055 
2056                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
2057                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
2058                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
2059                             --trip;
2060                             /* use dynamic-style scheduling */
2061                             init = chunkIdx * chunkspec + pr->u.p.count;
2062                             /* need to verify init > 0 in case of overflow in the above calculation */
2063                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
2064                                 limit = init + chunkspec -1;
2065 
2066                                 if ( (last = (limit >= trip)) != 0 )
2067                                     limit = trip;
2068                             }
2069                             break;
2070                         } else {
2071                             /* use exponential-style scheduling */
                            /* The following check works around the lack of long double precision on Windows* OS,
                               which could otherwise cause init != 0 for chunkIdx == 0.
                             */
2075     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                            /* If we haven't already done so, save original
                               FPCW and set precision to 64-bit, as Windows* OS
                               on IA-32 architecture defaults to 53-bit */
2079                             if ( !fpcwSet ) {
2080                                 oldFpcw = _control87(0,0);
2081                                 _control87(_PC_64,_MCW_PC);
2082                                 fpcwSet = 0x30000;
2083                             }
2084     #endif
2085                             if ( chunkIdx ) {
2086                                 init = __kmp_dispatch_guided_remaining< T >(
2087                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
2088                                 KMP_DEBUG_ASSERT(init);
2089                                 init = trip - init;
2090                             } else
2091                                 init = 0;
2092                             limit = trip - __kmp_dispatch_guided_remaining< T >(
2093                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
2094                             KMP_ASSERT(init <= limit);
2095                             if ( init < limit ) {
2096                                 KMP_DEBUG_ASSERT(limit <= trip);
2097                                 --limit;
2098                                 status = 1;
2099                                 break;
2100                             } // if
2101                         } // if
2102                     } // while (1)
2103     #if KMP_OS_WINDOWS && KMP_ARCH_X86
2104                     /* restore FPCW if necessary
2105                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2106                     */
2107                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2108                         _control87(oldFpcw,_MCW_PC);
2109     #endif
2110                     if ( status != 0 ) {
2111                         start = pr->u.p.lb;
2112                         incr = pr->u.p.st;
2113                         if ( p_st != NULL )
2114                             *p_st = incr;
2115                         *p_lb = start + init * incr;
2116                         *p_ub = start + limit * incr;
2117                         if ( pr->ordered ) {
2118                             pr->u.p.ordered_lower = init;
2119                             pr->u.p.ordered_upper = limit;
2120                             #ifdef KMP_DEBUG
2121                             {
2122                                 const char * buff;
2123                                 // create format specifiers before the debug output
2124                                 buff = __kmp_str_format(
2125                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2126                                     traits_t< UT >::spec, traits_t< UT >::spec );
2127                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2128                                 __kmp_str_free( &buff );
2129                             }
2130                             #endif
2131                         }
2132                     } else {
2133                         *p_lb = 0;
2134                         *p_ub = 0;
2135                         if ( p_st != NULL )
2136                             *p_st = 0;
2137                     }
2138                 } // case
2139                 break;
2140 
2141             case kmp_sch_trapezoidal:
2142                 {
2143                     UT   index;
2144                     T    parm2 = pr->u.p.parm2;
2145                     T    parm3 = pr->u.p.parm3;
2146                     T    parm4 = pr->u.p.parm4;
2147                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2148                                    gtid ) );
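                    // Derivation (illustrative): chunk sizes shrink linearly,
                    // chunk k having (parm2 - k*parm4) iterations, so the
                    // first 'index' chunks cover
                    //     sum_{k=0..index-1} (parm2 - k*parm4)
                    //         = ( index * (2*parm2 - (index-1)*parm4) ) / 2
                    // iterations -- exactly the 'init' computed below; 'limit'
                    // is the same sum taken through chunk 'index', minus one.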
2149 
2150                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2151 
2152                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2153                     trip = pr->u.p.tc - 1;
2154 
2155                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2156                         *p_lb = 0;
2157                         *p_ub = 0;
2158                         if ( p_st != NULL ) *p_st = 0;
2159                     } else {
2160                         start = pr->u.p.lb;
2161                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2162                         incr  = pr->u.p.st;
2163 
2164                         if ( (last = (limit >= trip)) != 0 )
2165                             limit = trip;
2166 
2167                         if ( p_st != NULL ) *p_st = incr;
2168 
2169                         if ( incr == 1 ) {
2170                             *p_lb = start + init;
2171                             *p_ub = start + limit;
2172                         } else {
2173                             *p_lb = start + init * incr;
2174                             *p_ub = start + limit * incr;
2175                         }
2176 
2177                         if ( pr->ordered ) {
2178                             pr->u.p.ordered_lower = init;
2179                             pr->u.p.ordered_upper = limit;
2180                             #ifdef KMP_DEBUG
2181                             {
2182                                 const char * buff;
2183                                 // create format specifiers before the debug output
2184                                 buff = __kmp_str_format(
2185                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2186                                     traits_t< UT >::spec, traits_t< UT >::spec );
2187                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2188                                 __kmp_str_free( &buff );
2189                             }
2190                             #endif
2191                         } // if
2192                     } // if
2193                 } // case
2194                 break;
2195             default:
2196                 {
2197                     status = 0; // to avoid complaints on uninitialized variable use
2198                     __kmp_msg(
2199                         kmp_ms_fatal,                        // Severity
2200                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2201                         KMP_HNT( GetNewerLibrary ),          // Hint
2202                         __kmp_msg_null                       // Variadic argument list terminator
2203                     );
2204                 }
2205                 break;
2206             } // switch
2207         } // if tc == 0;
2208 
2209         if ( status == 0 ) {
2210             UT   num_done;
2211 
2212             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2213             #ifdef KMP_DEBUG
2214             {
2215                 const char * buff;
2216                 // create format specifiers before the debug output
2217                 buff = __kmp_str_format(
2218                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2219                     traits_t< UT >::spec );
2220                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2221                 __kmp_str_free( &buff );
2222             }
2223             #endif
2224 
2225             if ( (ST)num_done == th->th.th_team_nproc - 1 ) {
2226                 #if ( KMP_STATIC_STEAL_ENABLED )
2227                 if( pr->schedule == kmp_sch_static_steal && ___kmp_size_type > 4 ) {
2228                     int i;
2229                     kmp_info_t **other_threads = team->t.t_threads;
2230                     // loop complete, safe to destroy locks used for stealing
2231                     for( i = 0; i < th->th.th_team_nproc; ++i ) {
2232                         kmp_lock_t * lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2233                         KMP_ASSERT(lck != NULL);
2234                         __kmp_destroy_lock( lck );
2235                         __kmp_free( lck );
2236                         other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2237                     }
2238                 }
2239                 #endif
2240                 /* NOTE: release this buffer to be reused */
2241 
2242                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2243 
2244                 sh->u.s.num_done = 0;
2245                 sh->u.s.iteration = 0;
2246 
2247                 /* TODO replace with general release procedure? */
2248                 if ( pr->ordered ) {
2249                     sh->u.s.ordered_iteration = 0;
2250                 }
2251 
2252                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2253 
2254                 sh -> buffer_index += __kmp_dispatch_num_buffers;
2255                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2256                                 gtid, sh->buffer_index) );
2257 
2258                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2259 
2260             } // if
2261             if ( __kmp_env_consistency_check ) {
2262                 if ( pr->pushed_ws != ct_none ) {
2263                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2264                 }
2265             }
2266 
2267             th -> th.th_dispatch -> th_deo_fcn = NULL;
2268             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2269             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2270             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2271         } // if (status == 0)
2272 #if KMP_OS_WINDOWS
2273         else if ( last ) {
2274             pr->u.p.last_upper = pr->u.p.ub;
2275         }
2276 #endif /* KMP_OS_WINDOWS */
2277         if ( p_last != NULL && status != 0 )
2278             *p_last = last;
2279     } // if
2280 
2281     #ifdef KMP_DEBUG
2282     {
2283         const char * buff;
2284         // create format specifiers before the debug output
2285         buff = __kmp_str_format(
2286             "__kmp_dispatch_next: T#%%d normal case: " \
2287             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2288             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2289         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2290         __kmp_str_free( &buff );
2291     }
2292     #endif
2293 #if INCLUDE_SSC_MARKS
2294     SSC_MARK_DISPATCH_NEXT();
2295 #endif
2296     OMPT_LOOP_END;
2297     return status;
2298 }
2299 
2300 template< typename T >
2301 static void
2302 __kmp_dist_get_bounds(
2303     ident_t                          *loc,
2304     kmp_int32                         gtid,
2305     kmp_int32                        *plastiter,
2306     T                                *plower,
2307     T                                *pupper,
2308     typename traits_t< T >::signed_t  incr
2309 ) {
2310     typedef typename traits_t< T >::unsigned_t  UT;
2311     typedef typename traits_t< T >::signed_t    ST;
2312     register kmp_uint32  team_id;
2313     register kmp_uint32  nteams;
2314     register UT          trip_count;
2315     register kmp_team_t *team;
2316     kmp_info_t * th;
2317 
2318     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2319     KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2320     #ifdef KMP_DEBUG
2321     {
2322         const char * buff;
2323         // create format specifiers before the debug output
2324         buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2325             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2326             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2327             traits_t< T >::spec );
2328         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2329         __kmp_str_free( &buff );
2330     }
2331     #endif
2332 
2333     if( __kmp_env_consistency_check ) {
2334         if( incr == 0 ) {
2335             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2336         }
2337         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
            // The loop is illegal.
            // Some zero-trip loops are handled by the compiler, e.g.:
2340             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2341             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2342             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2343             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2344             // Compiler does not check the following illegal loops:
2345             //   for(i=0;i<10;i+=incr) // where incr<0
2346             //   for(i=10;i>0;i-=incr) // where incr<0
2347             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2348         }
2349     }
2350     th = __kmp_threads[gtid];
2351     team = th->th.th_team;
2352     #if OMP_40_ENABLED
2353     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2354     nteams = th->th.th_teams_size.nteams;
2355     #endif
2356     team_id = team->t.t_master_tid;
2357     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2358 
2359     // compute global trip count
2360     if( incr == 1 ) {
2361         trip_count = *pupper - *plower + 1;
2362     } else if(incr == -1) {
2363         trip_count = *plower - *pupper + 1;
2364     } else if ( incr > 0 ) {
2365         // upper-lower can exceed the limit of signed type
2366         trip_count = (UT)(*pupper - *plower) / incr + 1;
2367     } else {
2368         trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1;
2369     }
2370 
2371     if( trip_count <= nteams ) {
2372         KMP_DEBUG_ASSERT(
2373             __kmp_static == kmp_sch_static_greedy || \
2374             __kmp_static == kmp_sch_static_balanced
2375         ); // Unknown static scheduling type.
        // only some teams get a single iteration, others get nothing
2377         if( team_id < trip_count ) {
2378             *pupper = *plower = *plower + team_id * incr;
2379         } else {
2380             *plower = *pupper + incr; // zero-trip loop
2381         }
2382         if( plastiter != NULL )
2383             *plastiter = ( team_id == trip_count - 1 );
2384     } else {
2385         if( __kmp_static == kmp_sch_static_balanced ) {
2386             register UT chunk = trip_count / nteams;
2387             register UT extras = trip_count % nteams;
2388             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2389             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2390             if( plastiter != NULL )
2391                 *plastiter = ( team_id == nteams - 1 );
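            // Worked example (illustrative): trip_count = 10, nteams = 3
            // gives chunk = 3, extras = 1, so team 0 covers 4 iterations and
            // teams 1-2 cover 3 each; the (team_id < extras ? ...) terms
            // shift each lower bound past the extra iterations already
            // handed out.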
2392         } else {
2393             register T chunk_inc_count =
2394                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2395             register T upper = *pupper;
            KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); // Unknown static scheduling type.
2398             *plower += team_id * chunk_inc_count;
2399             *pupper = *plower + chunk_inc_count - incr;
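            // Worked example (illustrative): trip_count = 10, nteams = 3
            // gives chunk_inc_count = ceil(10/3) * incr = 4 * incr, so teams
            // 0 and 1 take 4 iterations each and team 2 is left with 2; the
            // correction below trims team 2's upper bound (and catches the
            // case where *plower + chunk_inc_count overflows the type).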
2400             // Check/correct bounds if needed
2401             if( incr > 0 ) {
2402                 if( *pupper < *plower )
2403                     *pupper = i_maxmin< T >::mx;
2404                 if( plastiter != NULL )
2405                     *plastiter = *plower <= upper && *pupper > upper - incr;
2406                 if( *pupper > upper )
2407                     *pupper = upper; // tracker C73258
2408             } else {
2409                 if( *pupper > *plower )
2410                     *pupper = i_maxmin< T >::mn;
2411                 if( plastiter != NULL )
2412                     *plastiter = *plower >= upper && *pupper < upper - incr;
2413                 if( *pupper < upper )
2414                     *pupper = upper; // tracker C73258
2415             }
2416         }
2417     }
2418 }

//-----------------------------------------------------------------------------------------
// Dispatch routines
//    Transfer call to template< typename T >
//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                         T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (increment) of the loop
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void
__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
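
/*
 * Illustrative lowering (a sketch of typical compiler-generated code, not
 * taken from this source; exact codegen differs by compiler and loop
 * shape): a loop such as
 *
 *     #pragma omp for schedule(dynamic, 4)
 *     for ( kmp_int32 i = lo; i <= hi; ++i ) body( i );
 *
 * is driven through this interface roughly as
 *
 *     __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, lo, hi, 1, 4 );
 *     kmp_int32 last, lb, ub, st;
 *     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
 *         for ( kmp_int32 i = lb; i <= ub; i += st )
 *             body( i );
 *     }
 */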
/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                        kmp_int64 lb, kmp_int64 ub,
                        kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void
__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                         kmp_uint64 lb, kmp_uint64 ub,
                         kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, the per-team iteration space must be
computed.

These functions are all identical apart from the types of the arguments.
*/
void
__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
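
/*
 * Illustrative use (a sketch, not from this source): for a composite
 *     #pragma omp distribute parallel for schedule(dynamic, C)
 * the compiler calls an entry point like the one above; the runtime first
 * narrows [lb, ub] to the calling team's share of the distribute iteration
 * space via __kmp_dist_get_bounds, then starts ordinary dynamic dispatch
 * on that per-team sub-range.
 */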
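/*!
See @ref __kmpc_dist_dispatch_init_4
*/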
void
__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

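/*!
See @ref __kmpc_dist_dispatch_init_4
*/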
void
__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

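/*!
See @ref __kmpc_dist_dispatch_init_4
*/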
void
__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk, or zero otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
*/
int
__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void
__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}
/*! @} */

//-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
    return value <= checker;
}

kmp_uint32
__kmp_wait_yield_4( volatile kmp_uint32 * spinner,
                    kmp_uint32            checker,
                    kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ),
                    void                * obj    // Higher-level synchronization object, or NULL.
                    )
{
    // note: we may not belong to a team at this point
    volatile kmp_uint32 * spin  = spinner;
    kmp_uint32            check = checker;
    kmp_uint32            spins;
    kmp_uint32 (*f)( kmp_uint32, kmp_uint32 ) = pred;
    kmp_uint32            r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = TCR_4(*spin), check)) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - removed this check since it was accidentally introduced when kmp_wait was split.
           It caused problems with infinite recursion because of the exit lock. */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
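
/*
 * Example (illustrative, not from this source): spin until the 32-bit
 * flag reaches at least the value 5, using one of the predicate helpers
 * above; another thread is assumed to eventually store a value >= 5:
 *
 *     volatile kmp_uint32 flag = 0;
 *     kmp_uint32 seen = __kmp_wait_yield_4( &flag, 5, __kmp_ge_4, NULL );
 *     // seen holds the first observed value satisfying the predicate
 */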

void
__kmp_wait_yield_4_ptr( void       * spinner,
                        kmp_uint32   checker,
                        kmp_uint32 (* pred)( void *, kmp_uint32 ),
                        void       * obj    // Higher-level synchronization object, or NULL.
                        )
{
    // note: we may not belong to a team at this point
    void       * spin  = spinner;
    kmp_uint32   check = checker;
    kmp_uint32   spins;
    kmp_uint32 (*f)( void *, kmp_uint32 ) = pred;

    KMP_FSYNC_SPIN_INIT( obj, spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while ( !f( spin, check ) ) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

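/*
 * GOMP-compatibility wrappers: thin shims that forward to the templated
 * dispatch entry points. Unlike the __kmpc_ entry points above, which
 * always pass true, these forward the caller-supplied push_ws flag as the
 * final argument of __kmp_dispatch_init.
 */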
void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
