1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
 * NOTE: __kmp_nth is a constant inside any dispatch loop, however
 *       it may change between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, and 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 // Need to raise Win version from XP to Vista here for support of InterlockedExchange64
29 #if defined(_WIN32_WINNT) && defined(_M_IX86)
30 #undef _WIN32_WINNT
31 #define _WIN32_WINNT 0x0502
32 #endif
33 
34 #include "kmp.h"
35 #include "kmp_i18n.h"
36 #include "kmp_itt.h"
37 #include "kmp_str.h"
38 #include "kmp_error.h"
39 #include "kmp_stats.h"
40 #if KMP_OS_WINDOWS && KMP_ARCH_X86
41     #include <float.h>
42 #endif
43 
44 #if OMPT_SUPPORT
45 #include "ompt-internal.h"
46 #include "ompt-specific.h"
47 #endif
48 
49 /* ------------------------------------------------------------------------ */
50 /* ------------------------------------------------------------------------ */
51 
52 #if KMP_STATIC_STEAL_ENABLED
53 
54     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
55     template< typename T >
56     struct dispatch_private_infoXX_template {
57         typedef typename traits_t< T >::unsigned_t  UT;
58         typedef typename traits_t< T >::signed_t    ST;
59         UT count;                // unsigned
60         T  ub;
61         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
62         T  lb;
63         ST st;                   // signed
64         UT tc;                   // unsigned
65         T  static_steal_counter; // for static_steal only; maybe better to put after ub
66 
67         /* parm[1-4] are used in different ways by different scheduling algorithms */
68 
69         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
70         //    a) parm3 is properly aligned and
71         //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // if they are in the same cache line (not measured though).
74 
75         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
76             T  parm1;
77             T  parm2;
78             T  parm3;
79             T  parm4;
80         };
81 
82         UT ordered_lower; // unsigned
83         UT ordered_upper; // unsigned
84         #if KMP_OS_WINDOWS
85         T  last_upper;
86         #endif /* KMP_OS_WINDOWS */
87     };
88 
89 #else /* KMP_STATIC_STEAL_ENABLED */
90 
91     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
92     template< typename T >
93     struct dispatch_private_infoXX_template {
94         typedef typename traits_t< T >::unsigned_t  UT;
95         typedef typename traits_t< T >::signed_t    ST;
96         T  lb;
97         T  ub;
98         ST st;            // signed
99         UT tc;            // unsigned
100 
101         T  parm1;
102         T  parm2;
103         T  parm3;
104         T  parm4;
105 
106         UT count;         // unsigned
107 
108         UT ordered_lower; // unsigned
109         UT ordered_upper; // unsigned
110         #if KMP_OS_WINDOWS
        T  last_upper;
112         #endif /* KMP_OS_WINDOWS */
113     };
114 
115 #endif /* KMP_STATIC_STEAL_ENABLED */
116 
117 // replaces dispatch_private_info structure and dispatch_private_info_t type
118 template< typename T >
119 struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate the alignment here, otherwise the structure size is not computed correctly by our compiler
121     union KMP_ALIGN_CACHE private_info_tmpl {
122         dispatch_private_infoXX_template< T > p;
123         dispatch_private_info64_t             p64;
124     } u;
125     enum sched_type schedule;  /* scheduling algorithm */
126     kmp_uint32      ordered;   /* ordered clause specified */
127     kmp_uint32      ordered_bumped;
    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
129     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
130     kmp_uint32      nomerge;   /* don't merge iters if serialized */
131     kmp_uint32      type_size;
132     enum cons_type  pushed_ws;
133 };
134 
135 
136 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
137 template< typename UT >
138 struct dispatch_shared_infoXX_template {
139     /* chunk index under dynamic, number of idle threads under static-steal;
140        iteration index otherwise */
141     volatile UT     iteration;
142     volatile UT     num_done;
143     volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
145 };
146 
147 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
148 template< typename UT >
149 struct dispatch_shared_info_template {
150     // we need union here to keep the structure size
151     union shared_info_tmpl {
152         dispatch_shared_infoXX_template< UT >  s;
153         dispatch_shared_info64_t               s64;
154     } u;
155     volatile kmp_uint32     buffer_index;
156 #if OMP_45_ENABLED
157     volatile kmp_int32      doacross_buf_idx;  // teamwise index
158     kmp_uint32             *doacross_flags;    // array of iteration flags (0/1)
159     kmp_int32               doacross_num_done; // count finished threads
160 #endif
161 #if KMP_USE_HWLOC
    // When linking with libhwloc, the ORDERED EPCC test slows down on big
163     // machines (> 48 cores). Performance analysis showed that a cache thrash
164     // was occurring and this padding helps alleviate the problem.
165     char padding[64];
166 #endif
167 };
168 
169 /* ------------------------------------------------------------------------ */
170 /* ------------------------------------------------------------------------ */
171 
172 #undef USE_TEST_LOCKS
173 
174 // test_then_add template (general template should NOT be used)
175 template< typename T >
176 static __forceinline T
177 test_then_add( volatile T *p, T d );
178 
179 template<>
180 __forceinline kmp_int32
181 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
182 {
183     kmp_int32 r;
184     r = KMP_TEST_THEN_ADD32( p, d );
185     return r;
186 }
187 
188 template<>
189 __forceinline kmp_int64
190 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
191 {
192     kmp_int64 r;
193     r = KMP_TEST_THEN_ADD64( p, d );
194     return r;
195 }
196 
197 // test_then_inc_acq template (general template should NOT be used)
198 template< typename T >
199 static __forceinline T
200 test_then_inc_acq( volatile T *p );
201 
202 template<>
203 __forceinline kmp_int32
204 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
205 {
206     kmp_int32 r;
207     r = KMP_TEST_THEN_INC_ACQ32( p );
208     return r;
209 }
210 
211 template<>
212 __forceinline kmp_int64
213 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
214 {
215     kmp_int64 r;
216     r = KMP_TEST_THEN_INC_ACQ64( p );
217     return r;
218 }
219 
220 // test_then_inc template (general template should NOT be used)
221 template< typename T >
222 static __forceinline T
223 test_then_inc( volatile T *p );
224 
225 template<>
226 __forceinline kmp_int32
227 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
228 {
229     kmp_int32 r;
230     r = KMP_TEST_THEN_INC32( p );
231     return r;
232 }
233 
234 template<>
235 __forceinline kmp_int64
236 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
237 {
238     kmp_int64 r;
239     r = KMP_TEST_THEN_INC64( p );
240     return r;
241 }
242 
243 // compare_and_swap template (general template should NOT be used)
244 template< typename T >
245 static __forceinline kmp_int32
246 compare_and_swap( volatile T *p, T c, T s );
247 
248 template<>
249 __forceinline kmp_int32
250 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
251 {
252     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
253 }
254 
255 template<>
256 __forceinline kmp_int32
257 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
258 {
259     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
260 }
261 
/*
    Spin wait loop that first does pause, then yield.
    Waits until the predicate returns non-zero when called with *spinner and checker.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
            address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
275 template< typename UT >
276 // ToDo: make inline function (move to header file for icl)
277 static UT  // unsigned 4- or 8-byte type
278 __kmp_wait_yield( volatile UT * spinner,
279                   UT            checker,
280                   kmp_uint32 (* pred)( UT, UT )
281                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
282                   )
283 {
284     // note: we may not belong to a team at this point
285     register volatile UT         * spin          = spinner;
286     register          UT           check         = checker;
287     register          kmp_uint32   spins;
288     register          kmp_uint32 (*f) ( UT, UT ) = pred;
289     register          UT           r;
290 
291     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
292     KMP_INIT_YIELD( spins );
293     // main wait spin loop
294     while(!f(r = *spin, check))
295     {
296         KMP_FSYNC_SPIN_PREPARE( obj );
297         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
298            It causes problems with infinite recursion because of exit lock */
299         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
300             __kmp_abort_thread(); */
301 
        // If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput),
        // then yield. The pause is in the following code.
305         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
306         KMP_YIELD_SPIN( spins );
307     }
308     KMP_FSYNC_SPIN_ACQUIRED( obj );
309     return r;
310 }
311 
312 template< typename UT >
313 static kmp_uint32 __kmp_eq( UT value, UT checker) {
314     return value == checker;
315 }
316 
317 template< typename UT >
318 static kmp_uint32 __kmp_neq( UT value, UT checker) {
319     return value != checker;
320 }
321 
322 template< typename UT >
323 static kmp_uint32 __kmp_lt( UT value, UT checker) {
324     return value < checker;
325 }
326 
327 template< typename UT >
328 static kmp_uint32 __kmp_ge( UT value, UT checker) {
329     return value >= checker;
330 }
331 
332 template< typename UT >
333 static kmp_uint32 __kmp_le( UT value, UT checker) {
334     return value <= checker;
335 }
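
// Typical usage of __kmp_wait_yield with one of the predicates above (as in
// __kmp_dispatch_init and __kmp_dispatch_deo below):
//   __kmp_wait_yield< kmp_uint32 >( &sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
//                                   USE_ITT_BUILD_ARG( NULL ) );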
336 
337 
338 /* ------------------------------------------------------------------------ */
339 /* ------------------------------------------------------------------------ */
340 
341 static void
342 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
343 {
344     kmp_info_t *th;
345 
346     KMP_DEBUG_ASSERT( gtid_ref );
347 
348     if ( __kmp_env_consistency_check ) {
349         th = __kmp_threads[*gtid_ref];
350         if ( th -> th.th_root -> r.r_active
351           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
352 #if KMP_USE_DYNAMIC_LOCK
353             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
354 #else
355             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
356 #endif
357         }
358     }
359 }
360 
361 template< typename UT >
362 static void
363 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
364 {
365     typedef typename traits_t< UT >::signed_t    ST;
366     dispatch_private_info_template< UT > * pr;
367 
368     int gtid = *gtid_ref;
369 //    int  cid = *cid_ref;
370     kmp_info_t *th = __kmp_threads[ gtid ];
371     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
372 
373     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
374     if ( __kmp_env_consistency_check ) {
375         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
376             ( th -> th.th_dispatch -> th_dispatch_pr_current );
377         if ( pr -> pushed_ws != ct_none ) {
378 #if KMP_USE_DYNAMIC_LOCK
379             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
380 #else
381             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
382 #endif
383         }
384     }
385 
386     if ( ! th -> th.th_team -> t.t_serialized ) {
387         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
388             ( th -> th.th_dispatch -> th_dispatch_sh_current );
389         UT  lower;
390 
391         if ( ! __kmp_env_consistency_check ) {
392                 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
393                     ( th -> th.th_dispatch -> th_dispatch_pr_current );
394         }
395         lower = pr->u.p.ordered_lower;
396 
397         #if ! defined( KMP_GOMP_COMPAT )
398             if ( __kmp_env_consistency_check ) {
399                 if ( pr->ordered_bumped ) {
400                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
401                     __kmp_error_construct2(
402                         kmp_i18n_msg_CnsMultipleNesting,
403                         ct_ordered_in_pdo, loc_ref,
404                         & p->stack_data[ p->w_top ]
405                     );
406                 }
407             }
408         #endif /* !defined(KMP_GOMP_COMPAT) */
409 
410         KMP_MB();
411         #ifdef KMP_DEBUG
412         {
413             const char * buff;
414             // create format specifiers before the debug output
415             buff = __kmp_str_format(
416                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
417                 traits_t< UT >::spec, traits_t< UT >::spec );
418             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
419             __kmp_str_free( &buff );
420         }
421         #endif
422 
423         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
424                                 USE_ITT_BUILD_ARG( NULL )
425                                 );
426         KMP_MB();  /* is this necessary? */
427         #ifdef KMP_DEBUG
428         {
429             const char * buff;
430             // create format specifiers before the debug output
431             buff = __kmp_str_format(
432                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
433                 traits_t< UT >::spec, traits_t< UT >::spec );
434             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
435             __kmp_str_free( &buff );
436         }
437         #endif
438     }
439     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
440 }
441 
442 static void
443 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
444 {
445     kmp_info_t *th;
446 
447     if ( __kmp_env_consistency_check ) {
448         th = __kmp_threads[*gtid_ref];
449         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
450             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
451         }
452     }
453 }
454 
455 template< typename UT >
456 static void
457 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
458 {
459     typedef typename traits_t< UT >::signed_t    ST;
460     dispatch_private_info_template< UT > * pr;
461 
462     int gtid = *gtid_ref;
463 //    int  cid = *cid_ref;
464     kmp_info_t *th = __kmp_threads[ gtid ];
465     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
466 
467     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
468     if ( __kmp_env_consistency_check ) {
469         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
470             ( th -> th.th_dispatch -> th_dispatch_pr_current );
471         if ( pr -> pushed_ws != ct_none ) {
472             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
473         }
474     }
475 
476     if ( ! th -> th.th_team -> t.t_serialized ) {
477         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
478             ( th -> th.th_dispatch -> th_dispatch_sh_current );
479 
480         if ( ! __kmp_env_consistency_check ) {
481             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
482                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
483         }
484 
485         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
486         #if ! defined( KMP_GOMP_COMPAT )
487             if ( __kmp_env_consistency_check ) {
488                 if ( pr->ordered_bumped != 0 ) {
489                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
490                     /* How to test it? - OM */
491                     __kmp_error_construct2(
492                         kmp_i18n_msg_CnsMultipleNesting,
493                         ct_ordered_in_pdo, loc_ref,
494                         & p->stack_data[ p->w_top ]
495                     );
496                 }
497             }
498         #endif /* !defined(KMP_GOMP_COMPAT) */
499 
500         KMP_MB();       /* Flush all pending memory write invalidates.  */
501 
502         pr->ordered_bumped += 1;
503 
504         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
505                         gtid, pr->ordered_bumped ) );
506 
507         KMP_MB();       /* Flush all pending memory write invalidates.  */
508 
509         /* TODO use general release procedure? */
510         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
511 
512         KMP_MB();       /* Flush all pending memory write invalidates.  */
513     }
514     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
515 }
516 
/* Computes and returns x to the power of y, where y must be a non-negative integer */
518 template< typename UT >
519 static __forceinline long double
520 __kmp_pow(long double x, UT y) {
521     long double s=1.0L;
522 
523     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
524     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
525     while(y) {
526         if ( y & 1 )
527             s *= x;
528         x *= x;
529         y >>= 1;
530     }
531     return s;
532 }
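// Example (illustrative): for y = 5 (binary 101), the loop multiplies s by x at bits 0 and 2,
// yielding s = x * x^4 = x^5, squaring x once per bit of y (binary exponentiation).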
533 
534 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
535    (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken here: if this function is __forceinline'd, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
538 */
539 template< typename T >
540 static __inline typename traits_t< T >::unsigned_t
541 __kmp_dispatch_guided_remaining(
542     T                                  tc,
543     typename traits_t< T >::floating_t base,
544     typename traits_t< T >::unsigned_t idx
545 ) {
546     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
547        least for ICL 8.1, long double arithmetic may not really have
548        long double precision, even with /Qlong_double.  Currently, we
549        workaround that in the caller code, by manipulating the FPCW for
550        Windows* OS on IA-32 architecture.  The lack of precision is not
551        expected to be a correctness issue, though.
552     */
553     typedef typename traits_t< T >::unsigned_t  UT;
554 
555     long double x = tc * __kmp_pow< UT >(base, idx);
556     UT r = (UT) x;
557     if ( x == r )
558         return r;
559     return r + 1;
560 }
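// Example (illustrative): with tc = 1000, base = 0.875 and idx = 2,
// x = 1000 * 0.875^2 = 765.625, so the function returns 766 (the ceiling of x).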
561 
562 // Parameters of the guided-iterative algorithm:
563 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
564 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5; // = 1.0 / guided_int_param
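// Worked example (illustrative): with nproc = 4, chunk = 7 and the default n = 2,
//   p2 = 2 * 4 * (7 + 1) = 64    -- switch to dynamic once fewer than 64 iterations remain
//   p3 = 0.5 / 4 = 0.125         -- each guided chunk takes ~1/8 of the remaining iterations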
569 
570 // UT - unsigned flavor of T, ST - signed flavor of T,
571 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
572 template< typename T >
573 static void
574 __kmp_dispatch_init(
575     ident_t                        * loc,
576     int                              gtid,
577     enum sched_type                  schedule,
578     T                                lb,
579     T                                ub,
580     typename traits_t< T >::signed_t st,
581     typename traits_t< T >::signed_t chunk,
582     int                              push_ws
583 ) {
584     typedef typename traits_t< T >::unsigned_t  UT;
585     typedef typename traits_t< T >::signed_t    ST;
586     typedef typename traits_t< T >::floating_t  DBL;
587 
588     int                                            active;
589     T                                              tc;
590     kmp_info_t *                                   th;
591     kmp_team_t *                                   team;
592     kmp_uint32                                     my_buffer_index;
593     dispatch_private_info_template< T >          * pr;
594     dispatch_shared_info_template< UT > volatile * sh;
595 
596     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
597     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
598 
599     if ( ! TCR_4( __kmp_init_parallel ) )
600         __kmp_parallel_initialize();
601 
602 #if INCLUDE_SSC_MARKS
603     SSC_MARK_DISPATCH_INIT();
604 #endif
605     #ifdef KMP_DEBUG
606     {
607         const char * buff;
608         // create format specifiers before the debug output
609         buff = __kmp_str_format(
610             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
611             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
612         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
613         __kmp_str_free( &buff );
614     }
615     #endif
616     /* setup data */
617     th     = __kmp_threads[ gtid ];
618     team   = th -> th.th_team;
619     active = ! team -> t.t_serialized;
620     th->th.th_ident = loc;
621 
622 #if USE_ITT_BUILD
623     kmp_uint64 cur_chunk = chunk;
624     int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
625         KMP_MASTER_GTID(gtid) &&
626 #if OMP_40_ENABLED
627         th->th.th_teams_microtask == NULL &&
628 #endif
629         team->t.t_active_level == 1;
630 #endif
631     if ( ! active ) {
632         pr = reinterpret_cast< dispatch_private_info_template< T >* >
633             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
634     } else {
635         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
636                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
637 
638         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
639 
        /* What happens when the number of threads changes? Do we need to resize the buffer? */
641         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
642             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
643         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
644             ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
645     }
646 
647     #if  ( KMP_STATIC_STEAL_ENABLED )
648     if ( SCHEDULE_HAS_NONMONOTONIC(schedule) )
649         // AC: we now have only one implementation of stealing, so use it
650         schedule = kmp_sch_static_steal;
651     else
652     #endif
653         schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
654 
655     /* Pick up the nomerge/ordered bits from the scheduling type */
656     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
657         pr->nomerge = TRUE;
658         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
659     } else {
660         pr->nomerge = FALSE;
661     }
662     pr->type_size = traits_t<T>::type_size; // remember the size of variables
663     if ( kmp_ord_lower & schedule ) {
664         pr->ordered = TRUE;
665         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
666     } else {
667         pr->ordered = FALSE;
668     }
669 
670     if ( schedule == kmp_sch_static ) {
671         schedule = __kmp_static;
672     } else {
673         if ( schedule == kmp_sch_runtime ) {
674             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
675             schedule = team -> t.t_sched.r_sched_type;
676             // Detail the schedule if needed (global controls are differentiated appropriately)
677             if ( schedule == kmp_sch_guided_chunked ) {
678                 schedule = __kmp_guided;
679             } else if ( schedule == kmp_sch_static ) {
680                 schedule = __kmp_static;
681             }
682             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
683             chunk = team -> t.t_sched.chunk;
684 #if USE_ITT_BUILD
685             cur_chunk = chunk;
686 #endif
687             #ifdef KMP_DEBUG
688             {
689                 const char * buff;
690                 // create format specifiers before the debug output
691                 buff = __kmp_str_format(
692                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
693                     traits_t< ST >::spec );
694                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
695                 __kmp_str_free( &buff );
696             }
697             #endif
698         } else {
699             if ( schedule == kmp_sch_guided_chunked ) {
700                 schedule = __kmp_guided;
701             }
702             if ( chunk <= 0 ) {
703                 chunk = KMP_DEFAULT_CHUNK;
704             }
705         }
706 
707         if ( schedule == kmp_sch_auto ) {
708             // mapping and differentiation: in the __kmp_do_serial_initialize()
709             schedule = __kmp_auto;
710             #ifdef KMP_DEBUG
711             {
712                 const char * buff;
713                 // create format specifiers before the debug output
714                 buff = __kmp_str_format(
715                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
716                     traits_t< ST >::spec );
717                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
718                 __kmp_str_free( &buff );
719             }
720             #endif
721         }
722 
        /* guided analytical is not safe for too many threads */
724         if ( schedule == kmp_sch_guided_analytical_chunked && th->th.th_team_nproc > 1<<20 ) {
725             schedule = kmp_sch_guided_iterative_chunked;
726             KMP_WARNING( DispatchManyThreads );
727         }
728         pr->u.p.parm1 = chunk;
729     }
730     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
731                 "unknown scheduling type" );
732 
733     pr->u.p.count = 0;
734 
735     if ( __kmp_env_consistency_check ) {
736         if ( st == 0 ) {
737             __kmp_error_construct(
738                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
739                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
740             );
741         }
742     }
743     // compute trip count
744     if ( st == 1 ) {   // most common case
745         if ( ub >= lb ) {
746             tc = ub - lb + 1;
747         } else {   // ub < lb
748             tc = 0;            // zero-trip
749         }
750     } else if ( st < 0 ) {
751         if ( lb >= ub ) {
752             // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
753             //     where the division needs to be unsigned regardless of the result type
754             tc = (UT)(lb - ub) / (-st) + 1;
755         } else {   // lb < ub
756             tc = 0;            // zero-trip
757         }
758     } else {       // st > 0
759         if ( ub >= lb ) {
760             // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
761             //     where the division needs to be unsigned regardless of the result type
762             tc = (UT)(ub - lb) / st + 1;
763         } else {   // ub < lb
764             tc = 0;            // zero-trip
765         }
766     }
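    // Worked examples (illustrative):
    //   lb = 0,  ub = 10, st =  3  ->  tc = (10 - 0) / 3 + 1 = 4  (iterations 0, 3, 6, 9)
    //   lb = 10, ub = 1,  st = -2  ->  tc = (10 - 1) / 2 + 1 = 5  (iterations 10, 8, 6, 4, 2)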
767 
768     // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
769     // when statistics are disabled.
770     if (schedule == __kmp_static)
771     {
772         KMP_COUNT_BLOCK(OMP_FOR_static);
773         KMP_COUNT_VALUE(FOR_static_iterations, tc);
774     }
775     else
776     {
777         KMP_COUNT_BLOCK(OMP_FOR_dynamic);
778         KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
779     }
780 
781     pr->u.p.lb = lb;
782     pr->u.p.ub = ub;
783     pr->u.p.st = st;
784     pr->u.p.tc = tc;
785 
786     #if KMP_OS_WINDOWS
787     pr->u.p.last_upper = ub + st;
788     #endif /* KMP_OS_WINDOWS */
789 
    /* NOTE: only the active parallel region(s) have active ordered sections */
791 
792     if ( active ) {
793         if ( pr->ordered == 0 ) {
794             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
795             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
796         } else {
797             pr->ordered_bumped = 0;
798 
799             pr->u.p.ordered_lower = 1;
800             pr->u.p.ordered_upper = 0;
801 
802             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
803             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
804         }
805     }
806 
807     if ( __kmp_env_consistency_check ) {
808         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
809         if ( push_ws ) {
810             __kmp_push_workshare( gtid, ws, loc );
811             pr->pushed_ws = ws;
812         } else {
813             __kmp_check_workshare( gtid, ws, loc );
814             pr->pushed_ws = ct_none;
815         }
816     }
817 
818     switch ( schedule ) {
819     #if  ( KMP_STATIC_STEAL_ENABLED )
820     case kmp_sch_static_steal:
821         {
822             T nproc = th->th.th_team_nproc;
823             T ntc, init;
824 
825             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
826 
827             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
828             if ( nproc > 1 && ntc >= nproc ) {
829                 KMP_COUNT_BLOCK(OMP_FOR_static_steal);
830                 T id = __kmp_tid_from_gtid(gtid);
831                 T small_chunk, extras;
832 
833                 small_chunk = ntc / nproc;
834                 extras = ntc % nproc;
835 
836                 init = id * small_chunk + ( id < extras ? id : extras );
837                 pr->u.p.count = init;
838                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
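                // Worked example (illustrative): ntc = 10 chunks, nproc = 4 threads gives
                // small_chunk = 2, extras = 2, so threads 0..3 initially own the chunk ranges
                // [0,3), [3,6), [6,8), [8,10) per the init/ub formulas above.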
839 
840                 pr->u.p.parm2 = lb;
841                 //pr->pfields.parm3 = 0; // it's not used in static_steal
842                 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
843                 pr->u.p.st = st;
844                 if ( traits_t<T>::type_size > 4 ) {
845                     // AC: TODO: check if 16-byte CAS available and use it to
846                     // improve performance (probably wait for explicit request
847                     // before spending time on this).
848                     // For now use dynamically allocated per-thread lock,
849                     // free memory in __kmp_dispatch_next when status==0.
850                     KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
851                     th->th.th_dispatch->th_steal_lock =
852                         (kmp_lock_t*)__kmp_allocate(sizeof(kmp_lock_t));
853                     __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
854                 }
855                 break;
856             } else {
857                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
858                                gtid ) );
859                 schedule = kmp_sch_static_balanced;
860                 /* too few iterations: fall-through to kmp_sch_static_balanced */
861             } // if
862             /* FALL-THROUGH to static balanced */
863         } // case
864     #endif
865     case kmp_sch_static_balanced:
866         {
867             T nproc = th->th.th_team_nproc;
868             T init, limit;
869 
870             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
871                             gtid ) );
872 
873             if ( nproc > 1 ) {
874                 T id = __kmp_tid_from_gtid(gtid);
875 
876                 if ( tc < nproc ) {
877                     if ( id < tc ) {
878                         init = id;
879                         limit = id;
880                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
881                     } else {
882                         pr->u.p.count = 1;  /* means no more chunks to execute */
883                         pr->u.p.parm1 = FALSE;
884                         break;
885                     }
886                 } else {
887                     T small_chunk = tc / nproc;
888                     T extras = tc % nproc;
889                     init = id * small_chunk + (id < extras ? id : extras);
890                     limit = init + small_chunk - (id < extras ? 0 : 1);
891                     pr->u.p.parm1 = (id == nproc - 1);
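                    // Worked example (illustrative): tc = 10, nproc = 4 gives small_chunk = 2,
                    // extras = 2, so threads 0..3 get the iteration ranges
                    // [0,2], [3,5], [6,7], [8,9] per the init/limit formulas above.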
892                 }
893             } else {
894                 if ( tc > 0 ) {
895                     init = 0;
896                     limit = tc - 1;
897                     pr->u.p.parm1 = TRUE;
898                 } else {
899                     // zero trip count
900                     pr->u.p.count = 1;  /* means no more chunks to execute */
901                     pr->u.p.parm1 = FALSE;
902                     break;
903                 }
904             }
905 #if USE_ITT_BUILD
906             // Calculate chunk for metadata report
907             if ( itt_need_metadata_reporting )
908                 cur_chunk = limit - init + 1;
909 #endif
910             if ( st == 1 ) {
911                 pr->u.p.lb = lb + init;
912                 pr->u.p.ub = lb + limit;
913             } else {
914                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
915                 pr->u.p.lb = lb + init * st;
916                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
917                 if ( st > 0 ) {
918                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
919                 } else {
920                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
921                 }
922             }
923             if ( pr->ordered ) {
924                 pr->u.p.ordered_lower = init;
925                 pr->u.p.ordered_upper = limit;
926             }
927             break;
928         } // case
929     case kmp_sch_guided_iterative_chunked :
930         {
931             T nproc = th->th.th_team_nproc;
932             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
933 
934             if ( nproc > 1 ) {
935                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
936                     /* chunk size too large, switch to dynamic */
937                     schedule = kmp_sch_dynamic_chunked;
938                 } else {
939                     // when remaining iters become less than parm2 - switch to dynamic
940                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
941                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
942                 }
943             } else {
944                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
945                 schedule = kmp_sch_static_greedy;
946                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
947                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
948                 pr->u.p.parm1 = tc;
949             } // if
950         } // case
951         break;
952     case kmp_sch_guided_analytical_chunked:
953         {
954             T nproc = th->th.th_team_nproc;
955             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
956 
957             if ( nproc > 1 ) {
958                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
959                     /* chunk size too large, switch to dynamic */
960                     schedule = kmp_sch_dynamic_chunked;
961                 } else {
962                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
963                     DBL x;
964 
965                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
966                     /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
975                     */
976                     // save original FPCW and set precision to 64-bit, as
977                     // Windows* OS on IA-32 architecture defaults to 53-bit
978                     unsigned int oldFpcw = _control87(0,0);
979                     _control87(_PC_64,_MCW_PC); // 0,0x30000
980                     #endif
981                     /* value used for comparison in solver for cross-over point */
982                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
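                    // Informal reading (sketch): tc * x^i estimates the iterations remaining after i
                    // guided chunks, so x^i <= target roughly means remaining / (2 * nproc) <= chunk,
                    // i.e. the guided chunk size has shrunk to the user-specified chunk.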
983 
                    /* crossover point--chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
986                     UT   cross;
987 
988                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
989                     x = (long double)1.0 - (long double)0.5 / nproc;
990 
991                     #ifdef KMP_DEBUG
992                     { // test natural alignment
993                         struct _test_a {
994                             char a;
995                             union {
996                                 char b;
997                                 DBL  d;
998                             };
999                         } t;
1000                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
1001                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
1002                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
1003                     }
1004                     #endif // KMP_DEBUG
1005 
1006                     /* save the term in thread private dispatch structure */
1007                     *(DBL*)&pr->u.p.parm3 = x;
1008 
1009                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
1010                     {
1011                         UT          left, right, mid;
1012                         long double p;
1013 
1014                         /* estimate initial upper and lower bound */
1015 
                        /* it doesn't matter what value right starts with, as long as it is
                           positive, but it affects the performance of the solver
                        */
1019                         right = 229;
1020                         p = __kmp_pow< UT >(x,right);
1021                         if ( p > target ) {
1022                             do{
1023                                 p *= p;
1024                                 right <<= 1;
1025                             } while(p>target && right < (1<<27));
1026                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1027                         } else {
1028                             left = 0;
1029                         }
1030 
1031                         /* bisection root-finding method */
1032                         while ( left + 1 < right ) {
1033                             mid = (left + right) / 2;
1034                             if ( __kmp_pow< UT >(x,mid) > target ) {
1035                                 left = mid;
1036                             } else {
1037                                 right = mid;
1038                             }
1039                         } // while
1040                         cross = right;
1041                     }
1042                     /* assert sanity of computed crossover point */
1043                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1044 
1045                     /* save the crossover point in thread private dispatch structure */
1046                     pr->u.p.parm2 = cross;
1047 
1048                     // C75803
1049                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1050                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1051                     #else
1052                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1053                     #endif
1054                     /* dynamic-style scheduling offset */
1055                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1056                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1057                         // restore FPCW
1058                         _control87(oldFpcw,_MCW_PC);
1059                     #endif
1060                 } // if
1061             } else {
1062                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1063                                gtid ) );
1064                 schedule = kmp_sch_static_greedy;
1065                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1066                 pr->u.p.parm1 = tc;
1067             } // if
1068         } // case
1069         break;
1070     case kmp_sch_static_greedy:
1071         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1072             pr->u.p.parm1 = ( th->th.th_team_nproc > 1 ) ?
1073                 ( tc + th->th.th_team_nproc - 1 ) / th->th.th_team_nproc :
1074                 tc;
1075         break;
1076     case kmp_sch_static_chunked :
1077     case kmp_sch_dynamic_chunked :
1078         if ( pr->u.p.parm1 <= 0 ) {
1079             pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1080         }
1081         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1082         break;
1083     case kmp_sch_trapezoidal :
1084         {
1085             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1086 
1087             T parm1, parm2, parm3, parm4;
1088             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1089 
1090             parm1 = chunk;
1091 
1092             /* F : size of the first cycle */
1093             parm2 = ( tc / (2 * th->th.th_team_nproc) );
1094 
1095             if ( parm2 < 1 ) {
1096                 parm2 = 1;
1097             }
1098 
1099             /* L : size of the last cycle.  Make sure the last cycle
1100              *     is not larger than the first cycle.
1101              */
1102             if ( parm1 < 1 ) {
1103                 parm1 = 1;
1104             } else if ( parm1 > parm2 ) {
1105                 parm1 = parm2;
1106             }
1107 
1108             /* N : number of cycles */
1109             parm3 = ( parm2 + parm1 );
1110             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1111 
1112             if ( parm3 < 2 ) {
1113                 parm3 = 2;
1114             }
1115 
1116             /* sigma : decreasing incr of the trapezoid */
1117             parm4 = ( parm3 - 1 );
1118             parm4 = ( parm2 - parm1 ) / parm4;
1119 
1120             // pointless check, because parm4 >= 0 always
1121             //if ( parm4 < 0 ) {
1122             //    parm4 = 0;
1123             //}
1124 
1125             pr->u.p.parm1 = parm1;
1126             pr->u.p.parm2 = parm2;
1127             pr->u.p.parm3 = parm3;
1128             pr->u.p.parm4 = parm4;
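            // Worked example (illustrative): tc = 1000, nproc = 4, chunk = 1 gives
            // parm1 = 1 (last-cycle size), parm2 = 125 (first-cycle size),
            // parm3 = 16 (number of cycles), parm4 = 8 (per-cycle size decrement).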
1129         } // case
1130         break;
1131 
1132     default:
1133         {
1134             __kmp_msg(
1135                 kmp_ms_fatal,                        // Severity
1136                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1137                 KMP_HNT( GetNewerLibrary ),          // Hint
1138                 __kmp_msg_null                       // Variadic argument list terminator
1139             );
1140         }
1141         break;
1142     } // switch
1143     pr->schedule = schedule;
1144     if ( active ) {
        /* This buffer (named by my_buffer_index) is free to use only once sh->buffer_index reaches my_buffer_index */
1146 
1147         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1148                         gtid, my_buffer_index, sh->buffer_index) );
1149         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1150                                         USE_ITT_BUILD_ARG( NULL )
1151                                         );
            // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and my_buffer_index are
            // *always* 32-bit integers.
1154         KMP_MB();  /* is this necessary? */
1155         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1156                         gtid, my_buffer_index, sh->buffer_index) );
1157 
1158         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1159         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1160 #if USE_ITT_BUILD
1161         if ( pr->ordered ) {
1162             __kmp_itt_ordered_init( gtid );
        } // if
1164         // Report loop metadata
1165         if ( itt_need_metadata_reporting ) {
1166             // Only report metadata by master of active team at level 1
1167             kmp_uint64 schedtype = 0;
1168             switch ( schedule ) {
1169             case kmp_sch_static_chunked:
1170             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1171                 break;
1172             case kmp_sch_static_greedy:
1173                 cur_chunk = pr->u.p.parm1;
1174                 break;
1175             case kmp_sch_dynamic_chunked:
1176                 schedtype = 1;
1177                 break;
1178             case kmp_sch_guided_iterative_chunked:
1179             case kmp_sch_guided_analytical_chunked:
1180                 schedtype = 2;
1181                 break;
1182             default:
1183 //            Should we put this case under "static"?
1184 //            case kmp_sch_static_steal:
1185                 schedtype = 3;
1186                 break;
1187             }
1188             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1189         }
1190 #endif /* USE_ITT_BUILD */
    } // if
1192 
1193     #ifdef KMP_DEBUG
1194     {
1195         const char * buff;
1196         // create format specifiers before the debug output
1197         buff = __kmp_str_format(
1198             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1199             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1200             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1201             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1202             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1203             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1204             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1205         KD_TRACE(10, ( buff,
1206             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1207             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1208             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1209             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1210         __kmp_str_free( &buff );
1211     }
1212     #endif
1213     #if ( KMP_STATIC_STEAL_ENABLED )
      // It cannot be guaranteed that after execution of a loop with some other schedule kind
      // all the parm3 variables will contain the same value.
      // Even if all parm3 values happened to be the same, there would still be a bad case, such as
      // the flag toggling between 0 and 1 instead of being incremented over the program's lifetime.
      // So a dedicated variable is required; 'static_steal_counter' is used for this.
1219       if( schedule == kmp_sch_static_steal ) {
        // Other threads will inspect this variable when searching for a victim.
        // This is a flag showing that other threads may steal from this thread from now on.
1222         volatile T * p = &pr->u.p.static_steal_counter;
1223         *p = *p + 1;
1224       }
1225     #endif // ( KMP_STATIC_STEAL_ENABLED )
1226 
1227 #if OMPT_SUPPORT && OMPT_TRACE
1228     if (ompt_enabled &&
1229         ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1230         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1231         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1232         ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1233             team_info->parallel_id, task_info->task_id, team_info->microtask);
1234     }
1235 #endif
1236 }
1237 
1238 /*
1239  * For ordered loops, either __kmp_dispatch_finish() should be called after
1240  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1241  * every chunk of iterations.  If the ordered section(s) were not executed
1242  * for this iteration (or every iteration in this chunk), we need to set the
1243  * ordered iteration counters so that the next thread can proceed.
1244  */
1245 template< typename UT >
1246 static void
1247 __kmp_dispatch_finish( int gtid, ident_t *loc )
1248 {
1249     typedef typename traits_t< UT >::signed_t ST;
1250     kmp_info_t *th = __kmp_threads[ gtid ];
1251 
1252     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1253     if ( ! th -> th.th_team -> t.t_serialized ) {
1254 
1255         dispatch_private_info_template< UT > * pr =
1256             reinterpret_cast< dispatch_private_info_template< UT >* >
1257             ( th->th.th_dispatch->th_dispatch_pr_current );
1258         dispatch_shared_info_template< UT > volatile * sh =
1259             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1260             ( th->th.th_dispatch->th_dispatch_sh_current );
1261         KMP_DEBUG_ASSERT( pr );
1262         KMP_DEBUG_ASSERT( sh );
1263         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1264                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1265 
1266         if ( pr->ordered_bumped ) {
1267             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1268                             gtid ) );
1269             pr->ordered_bumped = 0;
1270         } else {
1271             UT lower = pr->u.p.ordered_lower;
1272 
1273             #ifdef KMP_DEBUG
1274             {
1275                 const char * buff;
1276                 // create format specifiers before the debug output
1277                 buff = __kmp_str_format(
1278                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1279                     traits_t< UT >::spec, traits_t< UT >::spec );
1280                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1281                 __kmp_str_free( &buff );
1282             }
1283             #endif
1284 
1285             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1286                                    USE_ITT_BUILD_ARG(NULL)
1287                                    );
1288             KMP_MB();  /* is this necessary? */
1289             #ifdef KMP_DEBUG
1290             {
1291                 const char * buff;
1292                 // create format specifiers before the debug output
1293                 buff = __kmp_str_format(
1294                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1295                     traits_t< UT >::spec, traits_t< UT >::spec );
1296                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1297                 __kmp_str_free( &buff );
1298             }
1299             #endif
1300 
1301             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1302         } // if
1303     } // if
1304     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1305 }
1306 
1307 #ifdef KMP_GOMP_COMPAT
1308 
1309 template< typename UT >
1310 static void
1311 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1312 {
1313     typedef typename traits_t< UT >::signed_t ST;
1314     kmp_info_t *th = __kmp_threads[ gtid ];
1315 
1316     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1317     if ( ! th -> th.th_team -> t.t_serialized ) {
1318 //        int cid;
1319         dispatch_private_info_template< UT > * pr =
1320             reinterpret_cast< dispatch_private_info_template< UT >* >
1321             ( th->th.th_dispatch->th_dispatch_pr_current );
1322         dispatch_shared_info_template< UT > volatile * sh =
1323             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1324             ( th->th.th_dispatch->th_dispatch_sh_current );
1325         KMP_DEBUG_ASSERT( pr );
1326         KMP_DEBUG_ASSERT( sh );
1327         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1328                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1329 
1330 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1331             UT lower = pr->u.p.ordered_lower;
1332             UT upper = pr->u.p.ordered_upper;
1333             UT inc = upper - lower + 1;
1334 
1335             if ( pr->ordered_bumped == inc ) {
1336                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1337                   gtid ) );
1338                 pr->ordered_bumped = 0;
1339             } else {
1340                 inc -= pr->ordered_bumped;
1341 
1342                 #ifdef KMP_DEBUG
1343                 {
1344                     const char * buff;
1345                     // create format specifiers before the debug output
1346                     buff = __kmp_str_format(
1347                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1348                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1349                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1350                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1351                     __kmp_str_free( &buff );
1352                 }
1353                 #endif
1354 
1355                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1356                                        USE_ITT_BUILD_ARG(NULL)
1357                                        );
1358 
1359                 KMP_MB();  /* is this necessary? */
1360                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1361                   gtid ) );
1362                 pr->ordered_bumped = 0;
// TODO: check whether inc should be unsigned or signed
1364                 #ifdef KMP_DEBUG
1365                 {
1366                     const char * buff;
1367                     // create format specifiers before the debug output
1368                     buff = __kmp_str_format(
1369                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1370                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1371                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1372                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1373                     __kmp_str_free( &buff );
1374                 }
1375                 #endif
1376 
1377                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1378             }
1379 //        }
1380     }
1381     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1382 }
1383 
1384 #endif /* KMP_GOMP_COMPAT */
1385 
1386 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1387  * (no more work), then tell OMPT the loop is over. In some cases
1388  * kmp_dispatch_fini() is not called. */
1389 #if OMPT_SUPPORT && OMPT_TRACE
1390 #define OMPT_LOOP_END                                                          \
1391     if (status == 0) {                                                         \
1392         if (ompt_enabled &&                     \
1393             ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
1394             ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
1395             ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
1396             ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
1397                 team_info->parallel_id, task_info->task_id);                   \
1398         }                                                                      \
1399     }
1400 #else
1401 #define OMPT_LOOP_END // no-op
1402 #endif
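// __kmp_dispatch_next: hand the calling thread its next chunk of the current
// workshare. On success it returns nonzero and fills *p_lb/*p_ub with the
// inclusive chunk bounds, *p_st with the stride, and (when meaningful) *p_last
// with the last-chunk flag; it returns zero once the loop is exhausted, at which
// point the shared buffer is recycled and OMPT_LOOP_END is reported.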
1403 
1404 template< typename T >
1405 static int
1406 __kmp_dispatch_next(
1407     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1408 ) {
1409 
1410     typedef typename traits_t< T >::unsigned_t  UT;
1411     typedef typename traits_t< T >::signed_t    ST;
1412     typedef typename traits_t< T >::floating_t  DBL;
1413 
    // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
    // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used, it costs
    // more than a compile-time choice to use static scheduling would.)
1417     KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1418 
1419     int                                   status;
1420     dispatch_private_info_template< T > * pr;
1421     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1422     kmp_team_t                          * team = th -> th.th_team;
1423 
1424     KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1425     #ifdef KMP_DEBUG
1426     {
1427         const char * buff;
1428         // create format specifiers before the debug output
1429         buff = __kmp_str_format(
1430             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1431             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1432         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1433         __kmp_str_free( &buff );
1434     }
1435     #endif
1436 
1437     if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1439         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1440             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1441         KMP_DEBUG_ASSERT( pr );
1442 
1443         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1444             *p_lb = 0;
1445             *p_ub = 0;
1446 //            if ( p_last != NULL )
1447 //                *p_last = 0;
1448             if ( p_st != NULL )
1449                 *p_st = 0;
1450             if ( __kmp_env_consistency_check ) {
1451                 if ( pr->pushed_ws != ct_none ) {
1452                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1453                 }
1454             }
1455         } else if ( pr->nomerge ) {
1456             kmp_int32 last;
1457             T         start;
1458             UT        limit, trip, init;
1459             ST        incr;
1460             T         chunk = pr->u.p.parm1;
1461 
1462             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1463 
1464             init = chunk * pr->u.p.count++;
1465             trip = pr->u.p.tc - 1;
1466 
1467             if ( (status = (init <= trip)) == 0 ) {
1468                 *p_lb = 0;
1469                 *p_ub = 0;
1470 //                if ( p_last != NULL )
1471 //                    *p_last = 0;
1472                 if ( p_st != NULL )
1473                     *p_st = 0;
1474                 if ( __kmp_env_consistency_check ) {
1475                     if ( pr->pushed_ws != ct_none ) {
1476                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1477                     }
1478                 }
1479             } else {
1480                 start = pr->u.p.lb;
1481                 limit = chunk + init - 1;
1482                 incr  = pr->u.p.st;
1483 
1484                 if ( (last = (limit >= trip)) != 0 ) {
1485                     limit = trip;
1486                     #if KMP_OS_WINDOWS
1487                     pr->u.p.last_upper = pr->u.p.ub;
1488                     #endif /* KMP_OS_WINDOWS */
1489                 }
1490                 if ( p_last != NULL )
1491                     *p_last = last;
1492                 if ( p_st != NULL )
1493                     *p_st = incr;
1494                 if ( incr == 1 ) {
1495                     *p_lb = start + init;
1496                     *p_ub = start + limit;
1497                 } else {
1498                     *p_lb = start + init * incr;
1499                     *p_ub = start + limit * incr;
1500                 }
1501 
1502                 if ( pr->ordered ) {
1503                     pr->u.p.ordered_lower = init;
1504                     pr->u.p.ordered_upper = limit;
1505                     #ifdef KMP_DEBUG
1506                     {
1507                         const char * buff;
1508                         // create format specifiers before the debug output
1509                         buff = __kmp_str_format(
1510                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1511                             traits_t< UT >::spec, traits_t< UT >::spec );
1512                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1513                         __kmp_str_free( &buff );
1514                     }
1515                     #endif
1516                 } // if
1517             } // if
1518         } else {
1519             pr->u.p.tc = 0;
1520             *p_lb = pr->u.p.lb;
1521             *p_ub = pr->u.p.ub;
1522             #if KMP_OS_WINDOWS
1523             pr->u.p.last_upper = *p_ub;
1524             #endif /* KMP_OS_WINDOWS */
1525             if ( p_last != NULL )
1526                 *p_last = TRUE;
1527             if ( p_st != NULL )
1528                 *p_st = pr->u.p.st;
1529         } // if
1530         #ifdef KMP_DEBUG
1531         {
1532             const char * buff;
1533             // create format specifiers before the debug output
1534             buff = __kmp_str_format(
1535                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1536                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1537                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1538             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1539             __kmp_str_free( &buff );
1540         }
1541         #endif
1542 #if INCLUDE_SSC_MARKS
1543         SSC_MARK_DISPATCH_NEXT();
1544 #endif
1545         OMPT_LOOP_END;
1546         return status;
1547     } else {
1548         kmp_int32 last = 0;
1549         dispatch_shared_info_template< UT > *sh;
1550         T         start;
1551         ST        incr;
1552         UT        limit, trip, init;
1553 
1554         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1555                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1556 
1557         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1558             ( th->th.th_dispatch->th_dispatch_pr_current );
1559         KMP_DEBUG_ASSERT( pr );
1560         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1561             ( th->th.th_dispatch->th_dispatch_sh_current );
1562         KMP_DEBUG_ASSERT( sh );
1563 
1564         if ( pr->u.p.tc == 0 ) {
1565             // zero trip count
1566             status = 0;
1567         } else {
1568             switch (pr->schedule) {
1569             #if ( KMP_STATIC_STEAL_ENABLED )
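            // Static steal: at init time the iteration space was split into
            // per-thread ranges of chunk indices [count, ub). A thread first
            // consumes chunks from its own range; once that is empty it makes up
            // to nproc attempts to steal roughly 1/4 of the remaining chunks from
            // a victim. For induction variables wider than 4 bytes the (count, ub)
            // pair is protected by a per-thread lock; for 4-byte types the pair is
            // packed into one 64-bit word and updated with a single CAS.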
1570             case kmp_sch_static_steal:
1571                 {
1572                     T chunk = pr->u.p.parm1;
1573                     int nproc = th->th.th_team_nproc;
1574 
1575                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1576 
1577                     trip = pr->u.p.tc - 1;
1578 
1579                     if ( traits_t<T>::type_size > 4 ) {
1580                         // use lock for 8-byte and CAS for 4-byte induction
1581                         // variable. TODO (optional): check and use 16-byte CAS
1582                         kmp_lock_t * lck = th->th.th_dispatch->th_steal_lock;
1583                         KMP_DEBUG_ASSERT(lck != NULL);
1584                         if( pr->u.p.count < (UT)pr->u.p.ub ) {
1585                             __kmp_acquire_lock(lck, gtid);
1586                             // try to get own chunk of iterations
1587                             init   = ( pr->u.p.count )++;
1588                             status = ( init < (UT)pr->u.p.ub );
1589                             __kmp_release_lock(lck, gtid);
1590                         } else {
1591                             status = 0; // no own chunks
1592                         }
1593                         if( !status ) { // try to steal
1594                             kmp_info_t   **other_threads = team->t.t_threads;
1595                             int          while_limit = nproc; // nproc attempts to find a victim
1596                             int          while_index = 0;
1597                             // TODO: algorithm of searching for a victim
1598                             // should be cleaned up and measured
1599                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1600                                 T remaining;
1601                                 T victimIdx    = pr->u.p.parm4;
1602                                 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1603                                 dispatch_private_info_template< T > * victim =
1604                                     reinterpret_cast< dispatch_private_info_template< T >* >
1605                                     (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1606                                 while( ( victim == NULL || victim == pr ||
1607                                     ( *(volatile T*)&victim->u.p.static_steal_counter !=
1608                                     *(volatile T*)&pr->u.p.static_steal_counter ) ) &&
1609                                     oldVictimIdx != victimIdx )
1610                                 {
1611                                     victimIdx = (victimIdx + 1) % nproc;
1612                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1613                                         (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1614                                 };
1615                                 if( !victim ||
1616                                     ( *(volatile T *)&victim->u.p.static_steal_counter !=
1617                                     *(volatile T *)&pr->u.p.static_steal_counter ) )
1618                                 {
1619                                     continue; // try once more (nproc attempts in total)
1620                                     // no victim is ready yet to participate in stealing
1621                                     // because all victims are still in kmp_init_dispatch
1622                                 }
1623                                 if( victim->u.p.count + 2 > (UT)victim->u.p.ub ) {
1624                                     pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1625                                     continue; // not enough chunks to steal, goto next victim
1626                                 }
1627 
1628                                 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1629                                 KMP_ASSERT(lck != NULL);
1630                                 __kmp_acquire_lock(lck, gtid);
1631                                 limit = victim->u.p.ub; // keep initial ub
1632                                 if( victim->u.p.count >= limit ||
1633                                     (remaining = limit - victim->u.p.count) < 2 )
1634                                 {
1635                                     __kmp_release_lock(lck, gtid);
1636                                     pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1637                                     continue; // not enough chunks to steal
1638                                 }
                                // stealing succeeded; reduce victim's ub by 1/4 of the remaining chunks, or by 1
1640                                 if( remaining > 3 ) {
1641                                     KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining>>2);
1642                                     init = ( victim->u.p.ub -= (remaining>>2) ); // steal 1/4 of remaining
1643                                 } else {
1644                                     KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1645                                     init = ( victim->u.p.ub -= 1 ); // steal 1 chunk of 2 or 3 remaining
1646                                 }
1647                                 __kmp_release_lock(lck, gtid);
1648 
1649                                 KMP_DEBUG_ASSERT(init + 1 <= limit);
1650                                 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1651                                 status = 1;
1652                                 while_index = 0;
                                // now take ownership of the stolen range; count starts just past the init chunk claimed by this call
1654                                 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1655                                 pr->u.p.count = init + 1;
1656                                 pr->u.p.ub = limit;
1657                                 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1658                             } // while (search for victim)
1659                         } // if (try to find victim and steal)
1660                     } else {
1661                         // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1662                         typedef union {
1663                             struct {
1664                                 UT count;
1665                                 T  ub;
1666                             } p;
1667                             kmp_int64 b;
1668                         } union_i4;
1669                         // All operations on 'count' or 'ub' must be combined atomically together.
1670                         {
1671                             union_i4 vold, vnew;
1672                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1673                             vnew = vold;
1674                             vnew.p.count++;
1675                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1676                                         ( volatile kmp_int64* )&pr->u.p.count,
1677                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1678                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1679                                 KMP_CPU_PAUSE();
1680                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1681                                 vnew = vold;
1682                                 vnew.p.count++;
1683                             }
1684                             vnew = vold;
1685                             init   = vnew.p.count;
1686                             status = ( init < (UT)vnew.p.ub ) ;
1687                         }
1688 
1689                         if( !status ) {
1690                             kmp_info_t   **other_threads = team->t.t_threads;
1691                             int          while_limit = nproc; // nproc attempts to find a victim
1692                             int          while_index = 0;
1693 
1694                             // TODO: algorithm of searching for a victim
1695                             // should be cleaned up and measured
1696                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1697                                 union_i4  vold, vnew;
1698                                 kmp_int32 remaining;
1699                                 T         victimIdx    = pr->u.p.parm4;
1700                                 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1701                                 dispatch_private_info_template< T > * victim =
1702                                     reinterpret_cast< dispatch_private_info_template< T >* >
1703                                     (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1704                                 while( (victim == NULL || victim == pr ||
1705                                     (*(volatile T*)&victim->u.p.static_steal_counter !=
1706                                     *(volatile T*)&pr->u.p.static_steal_counter)) &&
1707                                     oldVictimIdx != victimIdx )
1708                                 {
1709                                     victimIdx = (victimIdx + 1) % nproc;
1710                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1711                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1712                                 };
1713                                 if( !victim ||
1714                                     ( *(volatile T *)&victim->u.p.static_steal_counter !=
1715                                     *(volatile T *)&pr->u.p.static_steal_counter ) )
1716                                 {
1717                                     continue; // try once more (nproc attempts in total)
1718                                     // no victim is ready yet to participate in stealing
1719                                     // because all victims are still in kmp_init_dispatch
1720                                 }
1721                                 pr->u.p.parm4 = victimIdx; // new victim found
1722                                 while( 1 ) { // CAS loop if victim has enough chunks to steal
1723                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1724                                     vnew = vold;
1725 
1726                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1727                                     if ( vnew.p.count >= (UT)vnew.p.ub ||
1728                                         (remaining = vnew.p.ub - vnew.p.count) < 2 )
1729                                     {
1730                                         pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1731                                         break; // not enough chunks to steal, goto next victim
1732                                     }
1733                                     if( remaining > 3 ) {
1734                                         vnew.p.ub -= (remaining>>2); // try to steal 1/4 of remaining
1735                                     } else {
1736                                         vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1737                                     }
1738                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1739                                     // TODO: Should this be acquire or release?
1740                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1741                                             ( volatile kmp_int64 * )&victim->u.p.count,
1742                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1743                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
                                        // stealing succeeded
1745                                         KMP_COUNT_VALUE(FOR_static_steal_stolen, vold.p.ub-vnew.p.ub);
1746                                         status = 1;
1747                                         while_index = 0;
1748                                         // now update own count and ub
1749                                         init = vnew.p.ub;
1750                                         vold.p.count = init + 1;
1751                                         #if KMP_ARCH_X86
1752                                         KMP_XCHG_FIXED64(( volatile kmp_int64 * )(&pr->u.p.count), vold.b);
1753                                         #else
1754                                         *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1755                                         #endif
1756                                         break;
1757                                     } // if (check CAS result)
                                    KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1759                                 } // while (try to steal from particular victim)
1760                             } // while (search for victim)
1761                         } // if (try to find victim and steal)
1762                     } // if (4-byte induction variable)
1763                     if ( !status ) {
1764                         *p_lb = 0;
1765                         *p_ub = 0;
1766                         if ( p_st != NULL ) *p_st = 0;
1767                     } else {
1768                         start = pr->u.p.parm2;
1769                         init *= chunk;
1770                         limit = chunk + init - 1;
1771                         incr  = pr->u.p.st;
1772                         KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1773 
1774                         KMP_DEBUG_ASSERT(init <= trip);
1775                         if ( (last = (limit >= trip)) != 0 )
1776                             limit = trip;
1777                         if ( p_st != NULL ) *p_st = incr;
1778 
1779                         if ( incr == 1 ) {
1780                             *p_lb = start + init;
1781                             *p_ub = start + limit;
1782                         } else {
1783                             *p_lb = start + init * incr;
1784                             *p_ub = start + limit * incr;
1785                         }
1786 
1787                         if ( pr->ordered ) {
1788                             pr->u.p.ordered_lower = init;
1789                             pr->u.p.ordered_upper = limit;
1790                             #ifdef KMP_DEBUG
1791                             {
1792                                 const char * buff;
1793                                 // create format specifiers before the debug output
1794                                 buff = __kmp_str_format(
1795                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1796                                     traits_t< UT >::spec, traits_t< UT >::spec );
1797                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1798                                 __kmp_str_free( &buff );
1799                             }
1800                             #endif
1801                         } // if
1802                     } // if
1803                     break;
1804                 } // case
1805             #endif // ( KMP_STATIC_STEAL_ENABLED )
1806             case kmp_sch_static_balanced:
1807                 {
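                    // The single chunk for this thread was precomputed at init
                    // time: u.p.count doubles as a "chunk already returned" flag,
                    // and u.p.parm1 caches whether this thread owns the last
                    // iteration.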
1808                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1809                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1810                         pr->u.p.count = 1;
1811                         *p_lb = pr->u.p.lb;
1812                         *p_ub = pr->u.p.ub;
1813                         last = pr->u.p.parm1;
1814                         if ( p_st != NULL )
1815                             *p_st = pr->u.p.st;
1816                     } else {  /* no iterations to do */
1817                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1818                     }
1819                     if ( pr->ordered ) {
1820                         #ifdef KMP_DEBUG
1821                         {
1822                             const char * buff;
1823                             // create format specifiers before the debug output
1824                             buff = __kmp_str_format(
1825                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1826                                 traits_t< UT >::spec, traits_t< UT >::spec );
1827                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1828                             __kmp_str_free( &buff );
1829                         }
1830                         #endif
1831                     } // if
1832                 } // case
1833                 break;
1834             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1835             case kmp_sch_static_chunked:
1836                 {
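                    // Block-cyclic assignment: parm1 is the chunk size and the
                    // chunk index claimed here is (count + tid); count then
                    // advances by nproc, so successive calls walk this thread's
                    // chunks round-robin.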
1837                     T parm1;
1838 
1839                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1840                                    gtid ) );
1841                     parm1 = pr->u.p.parm1;
1842 
1843                     trip  = pr->u.p.tc - 1;
1844                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1845 
1846                     if ( (status = (init <= trip)) != 0 ) {
1847                         start = pr->u.p.lb;
1848                         incr  = pr->u.p.st;
1849                         limit = parm1 + init - 1;
1850 
1851                         if ( (last = (limit >= trip)) != 0 )
1852                             limit = trip;
1853 
1854                         if ( p_st != NULL ) *p_st = incr;
1855 
1856                         pr->u.p.count += th->th.th_team_nproc;
1857 
1858                         if ( incr == 1 ) {
1859                             *p_lb = start + init;
1860                             *p_ub = start + limit;
1861                         }
1862                         else {
1863                             *p_lb = start + init * incr;
1864                             *p_ub = start + limit * incr;
1865                         }
1866 
1867                         if ( pr->ordered ) {
1868                             pr->u.p.ordered_lower = init;
1869                             pr->u.p.ordered_upper = limit;
1870                             #ifdef KMP_DEBUG
1871                             {
1872                                 const char * buff;
1873                                 // create format specifiers before the debug output
1874                                 buff = __kmp_str_format(
1875                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1876                                     traits_t< UT >::spec, traits_t< UT >::spec );
1877                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1878                                 __kmp_str_free( &buff );
1879                             }
1880                             #endif
1881                         } // if
1882                     } // if
1883                 } // case
1884                 break;
1885 
1886             case kmp_sch_dynamic_chunked:
1887                 {
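                    // Plain dynamic scheduling: sh->u.s.iteration is a shared
                    // chunk counter; each atomic increment claims the next chunk
                    // of `chunk` iterations on a first-come, first-served basis.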
1888                     T chunk = pr->u.p.parm1;
1889 
1890                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1891                                    gtid ) );
1892 
1893                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1894                     trip = pr->u.p.tc - 1;
1895 
1896                     if ( (status = (init <= trip)) == 0 ) {
1897                         *p_lb = 0;
1898                         *p_ub = 0;
1899                         if ( p_st != NULL ) *p_st = 0;
1900                     } else {
1901                         start = pr->u.p.lb;
1902                         limit = chunk + init - 1;
1903                         incr  = pr->u.p.st;
1904 
1905                         if ( (last = (limit >= trip)) != 0 )
1906                             limit = trip;
1907 
1908                         if ( p_st != NULL ) *p_st = incr;
1909 
1910                         if ( incr == 1 ) {
1911                             *p_lb = start + init;
1912                             *p_ub = start + limit;
1913                         } else {
1914                             *p_lb = start + init * incr;
1915                             *p_ub = start + limit * incr;
1916                         }
1917 
1918                         if ( pr->ordered ) {
1919                             pr->u.p.ordered_lower = init;
1920                             pr->u.p.ordered_upper = limit;
1921                             #ifdef KMP_DEBUG
1922                             {
1923                                 const char * buff;
1924                                 // create format specifiers before the debug output
1925                                 buff = __kmp_str_format(
1926                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1927                                     traits_t< UT >::spec, traits_t< UT >::spec );
1928                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1929                                 __kmp_str_free( &buff );
1930                             }
1931                             #endif
1932                         } // if
1933                     } // if
1934                 } // case
1935                 break;
1936 
1937             case kmp_sch_guided_iterative_chunked:
1938                 {
1939                     T  chunkspec = pr->u.p.parm1;
1940                     KD_TRACE(100,
1941                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1942                     trip  = pr->u.p.tc;
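                    // Guided, iterative form: each pass proposes a chunk equal to
                    // a fixed fraction of the remaining iterations (parm3 holds
                    // the multiplier, roughly 1/(K*nproc), as a double) and claims
                    // it with a CAS on the shared iteration counter. Once fewer
                    // than parm2 iterations remain, it falls back to dynamic-style
                    // chunks of size chunkspec.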
1943                     // Start atomic part of calculations
1944                     while(1) {
1945                         ST  remaining;             // signed, because can be < 0
1946                         init = sh->u.s.iteration;  // shared value
1947                         remaining = trip - init;
1948                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1949                             // nothing to do, don't try atomic op
1950                             status = 0;
1951                             break;
1952                         }
1953                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                            // use dynamic-style schedule
                            // atomically increment iterations, get old value
1956                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1957                             remaining = trip - init;
1958                             if (remaining <= 0) {
1959                                 status = 0;    // all iterations got by other threads
1960                             } else {
1961                                 // got some iterations to work on
1962                                 status = 1;
1963                                 if ( (T)remaining > chunkspec ) {
1964                                     limit = init + chunkspec - 1;
1965                                 } else {
1966                                     last = 1;   // the last chunk
1967                                     limit = init + remaining - 1;
1968                                 } // if
1969                             } // if
1970                             break;
1971                         } // if
1972                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1973                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1974                             // CAS was successful, chunk obtained
1975                             status = 1;
1976                             --limit;
1977                             break;
1978                         } // if
1979                     } // while
1980                     if ( status != 0 ) {
1981                         start = pr->u.p.lb;
1982                         incr = pr->u.p.st;
1983                         if ( p_st != NULL )
1984                             *p_st = incr;
1985                         *p_lb = start + init * incr;
1986                         *p_ub = start + limit * incr;
1987                         if ( pr->ordered ) {
1988                             pr->u.p.ordered_lower = init;
1989                             pr->u.p.ordered_upper = limit;
1990                             #ifdef KMP_DEBUG
1991                             {
1992                                 const char * buff;
1993                                 // create format specifiers before the debug output
1994                                 buff = __kmp_str_format(
1995                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1996                                     traits_t< UT >::spec, traits_t< UT >::spec );
1997                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1998                                 __kmp_str_free( &buff );
1999                             }
2000                             #endif
2001                         } // if
2002                     } else {
2003                         *p_lb = 0;
2004                         *p_ub = 0;
2005                         if ( p_st != NULL )
2006                             *p_st = 0;
2007                     } // if
2008                 } // case
2009                 break;
2010 
2011             case kmp_sch_guided_analytical_chunked:
2012                 {
2013                     T   chunkspec = pr->u.p.parm1;
2014                     UT chunkIdx;
2015     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* for storing original FPCW value for Windows* OS on
                       IA-32 architecture 8-byte version */
2018                     unsigned int oldFpcw;
2019                     unsigned int fpcwSet = 0;
2020     #endif
2021                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
2022                                    gtid ) );
2023 
2024                     trip  = pr->u.p.tc;
2025 
2026                     KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2027                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < trip);
2028 
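                    // Guided, analytical form: in the exponential phase the chunk
                    // boundaries are computed from the closed-form remaining-
                    // iterations function (__kmp_dispatch_guided_remaining with
                    // the factor stored in parm3). After parm2 such chunks have
                    // been handed out, the tail is dispatched dynamic-style in
                    // chunks of chunkspec.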
2029                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
2030                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
2031                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
2032                             --trip;
2033                             /* use dynamic-style scheduling */
2034                             init = chunkIdx * chunkspec + pr->u.p.count;
2035                             /* need to verify init > 0 in case of overflow in the above calculation */
2036                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
2037                                 limit = init + chunkspec -1;
2038 
2039                                 if ( (last = (limit >= trip)) != 0 )
2040                                     limit = trip;
2041                             }
2042                             break;
2043                         } else {
2044                             /* use exponential-style scheduling */
                            /* The following check works around the lack of long double precision on Windows* OS,
                               which could otherwise result in init != 0 for chunkIdx == 0.
                             */
2048     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                            /* If we haven't already done so, save original
                               FPCW and set precision to 64-bit, as Windows* OS
                               on IA-32 architecture defaults to 53-bit */
2052                             if ( !fpcwSet ) {
2053                                 oldFpcw = _control87(0,0);
2054                                 _control87(_PC_64,_MCW_PC);
2055                                 fpcwSet = 0x30000;
2056                             }
2057     #endif
2058                             if ( chunkIdx ) {
2059                                 init = __kmp_dispatch_guided_remaining< T >(
2060                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
2061                                 KMP_DEBUG_ASSERT(init);
2062                                 init = trip - init;
2063                             } else
2064                                 init = 0;
2065                             limit = trip - __kmp_dispatch_guided_remaining< T >(
2066                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
2067                             KMP_ASSERT(init <= limit);
2068                             if ( init < limit ) {
2069                                 KMP_DEBUG_ASSERT(limit <= trip);
2070                                 --limit;
2071                                 status = 1;
2072                                 break;
2073                             } // if
2074                         } // if
2075                     } // while (1)
2076     #if KMP_OS_WINDOWS && KMP_ARCH_X86
2077                     /* restore FPCW if necessary
2078                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2079                     */
2080                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2081                         _control87(oldFpcw,_MCW_PC);
2082     #endif
2083                     if ( status != 0 ) {
2084                         start = pr->u.p.lb;
2085                         incr = pr->u.p.st;
2086                         if ( p_st != NULL )
2087                             *p_st = incr;
2088                         *p_lb = start + init * incr;
2089                         *p_ub = start + limit * incr;
2090                         if ( pr->ordered ) {
2091                             pr->u.p.ordered_lower = init;
2092                             pr->u.p.ordered_upper = limit;
2093                             #ifdef KMP_DEBUG
2094                             {
2095                                 const char * buff;
2096                                 // create format specifiers before the debug output
2097                                 buff = __kmp_str_format(
2098                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2099                                     traits_t< UT >::spec, traits_t< UT >::spec );
2100                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2101                                 __kmp_str_free( &buff );
2102                             }
2103                             #endif
2104                         }
2105                     } else {
2106                         *p_lb = 0;
2107                         *p_ub = 0;
2108                         if ( p_st != NULL )
2109                             *p_st = 0;
2110                     }
2111                 } // case
2112                 break;
2113 
2114             case kmp_sch_trapezoidal:
2115                 {
2116                     UT   index;
2117                     T    parm2 = pr->u.p.parm2;
2118                     T    parm3 = pr->u.p.parm3;
2119                     T    parm4 = pr->u.p.parm4;
2120                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2121                                    gtid ) );
2122 
2123                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2124 
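                    // Trapezoid self-scheduling: chunk sizes form a decreasing
                    // arithmetic sequence starting at parm2 and shrinking by parm4
                    // per chunk, with parm3 chunks in total. init is therefore the
                    // arithmetic-series sum of the first `index` chunk sizes:
                    //     init = index * (2*parm2 - (index-1)*parm4) / 2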
2125                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2126                     trip = pr->u.p.tc - 1;
2127 
2128                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2129                         *p_lb = 0;
2130                         *p_ub = 0;
2131                         if ( p_st != NULL ) *p_st = 0;
2132                     } else {
2133                         start = pr->u.p.lb;
2134                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2135                         incr  = pr->u.p.st;
2136 
2137                         if ( (last = (limit >= trip)) != 0 )
2138                             limit = trip;
2139 
2140                         if ( p_st != NULL ) *p_st = incr;
2141 
2142                         if ( incr == 1 ) {
2143                             *p_lb = start + init;
2144                             *p_ub = start + limit;
2145                         } else {
2146                             *p_lb = start + init * incr;
2147                             *p_ub = start + limit * incr;
2148                         }
2149 
2150                         if ( pr->ordered ) {
2151                             pr->u.p.ordered_lower = init;
2152                             pr->u.p.ordered_upper = limit;
2153                             #ifdef KMP_DEBUG
2154                             {
2155                                 const char * buff;
2156                                 // create format specifiers before the debug output
2157                                 buff = __kmp_str_format(
2158                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2159                                     traits_t< UT >::spec, traits_t< UT >::spec );
2160                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2161                                 __kmp_str_free( &buff );
2162                             }
2163                             #endif
2164                         } // if
2165                     } // if
2166                 } // case
2167                 break;
2168             default:
2169                 {
2170                     status = 0; // to avoid complaints on uninitialized variable use
2171                     __kmp_msg(
2172                         kmp_ms_fatal,                        // Severity
2173                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2174                         KMP_HNT( GetNewerLibrary ),          // Hint
2175                         __kmp_msg_null                       // Variadic argument list terminator
2176                     );
2177                 }
2178                 break;
2179             } // switch
2180         } // if tc == 0;
2181 
2182         if ( status == 0 ) {
2183             UT   num_done;
2184 
2185             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2186             #ifdef KMP_DEBUG
2187             {
2188                 const char * buff;
2189                 // create format specifiers before the debug output
2190                 buff = __kmp_str_format(
2191                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2192                     traits_t< UT >::spec );
2193                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2194                 __kmp_str_free( &buff );
2195             }
2196             #endif
2197 
2198             if ( (ST)num_done == th->th.th_team_nproc - 1 ) {
2199                 #if ( KMP_STATIC_STEAL_ENABLED )
2200                 if( pr->schedule == kmp_sch_static_steal && traits_t<T>::type_size > 4 ) {
2201                     int i;
2202                     kmp_info_t **other_threads = team->t.t_threads;
2203                     // loop complete, safe to destroy locks used for stealing
2204                     for( i = 0; i < th->th.th_team_nproc; ++i ) {
2205                         kmp_lock_t * lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2206                         KMP_ASSERT(lck != NULL);
2207                         __kmp_destroy_lock( lck );
2208                         __kmp_free( lck );
2209                         other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2210                     }
2211                 }
2212                 #endif
2213                 /* NOTE: release this buffer to be reused */
2214 
2215                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2216 
2217                 sh->u.s.num_done = 0;
2218                 sh->u.s.iteration = 0;
2219 
2220                 /* TODO replace with general release procedure? */
2221                 if ( pr->ordered ) {
2222                     sh->u.s.ordered_iteration = 0;
2223                 }
2224 
2225                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2226 
2227                 sh -> buffer_index += __kmp_dispatch_num_buffers;
2228                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2229                                 gtid, sh->buffer_index) );
2230 
2231                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2232 
2233             } // if
2234             if ( __kmp_env_consistency_check ) {
2235                 if ( pr->pushed_ws != ct_none ) {
2236                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2237                 }
2238             }
2239 
2240             th -> th.th_dispatch -> th_deo_fcn = NULL;
2241             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2242             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2243             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2244         } // if (status == 0)
2245 #if KMP_OS_WINDOWS
2246         else if ( last ) {
2247             pr->u.p.last_upper = pr->u.p.ub;
2248         }
2249 #endif /* KMP_OS_WINDOWS */
2250         if ( p_last != NULL && status != 0 )
2251             *p_last = last;
2252     } // if
2253 
2254     #ifdef KMP_DEBUG
2255     {
2256         const char * buff;
2257         // create format specifiers before the debug output
2258         buff = __kmp_str_format(
2259             "__kmp_dispatch_next: T#%%d normal case: " \
2260             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2261             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2262         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2263         __kmp_str_free( &buff );
2264     }
2265     #endif
2266 #if INCLUDE_SSC_MARKS
2267     SSC_MARK_DISPATCH_NEXT();
2268 #endif
2269     OMPT_LOOP_END;
2270     return status;
2271 }
2272 
2273 template< typename T >
2274 static void
2275 __kmp_dist_get_bounds(
2276     ident_t                          *loc,
2277     kmp_int32                         gtid,
2278     kmp_int32                        *plastiter,
2279     T                                *plower,
2280     T                                *pupper,
2281     typename traits_t< T >::signed_t  incr
2282 ) {
2283     typedef typename traits_t< T >::unsigned_t  UT;
2284     typedef typename traits_t< T >::signed_t    ST;
2285     register kmp_uint32  team_id;
2286     register kmp_uint32  nteams;
2287     register UT          trip_count;
2288     register kmp_team_t *team;
2289     kmp_info_t * th;
2290 
2291     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2292     KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2293     #ifdef KMP_DEBUG
2294     {
2295         const char * buff;
2296         // create format specifiers before the debug output
2297         buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2298             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2299             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2300             traits_t< T >::spec );
2301         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2302         __kmp_str_free( &buff );
2303     }
2304     #endif
2305 
2306     if( __kmp_env_consistency_check ) {
2307         if( incr == 0 ) {
2308             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2309         }
2310         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2311             // The loop is illegal.
2312             // Some zero-trip loops maintained by compiler, e.g.:
2313             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2314             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2315             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2316             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2317             // Compiler does not check the following illegal loops:
2318             //   for(i=0;i<10;i+=incr) // where incr<0
2319             //   for(i=10;i>0;i-=incr) // where incr<0
2320             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2321         }
2322     }
2323     th = __kmp_threads[gtid];
2324     team = th->th.th_team;
2325     #if OMP_40_ENABLED
2326     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2327     nteams = th->th.th_teams_size.nteams;
2328     #endif
2329     team_id = team->t.t_master_tid;
2330     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2331 
2332     // compute global trip count
2333     if( incr == 1 ) {
2334         trip_count = *pupper - *plower + 1;
2335     } else if(incr == -1) {
2336         trip_count = *plower - *pupper + 1;
2337     } else if ( incr > 0 ) {
2338         // upper-lower can exceed the limit of signed type
2339         trip_count = (UT)(*pupper - *plower) / incr + 1;
2340     } else {
2341         trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1;
2342     }
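    // Illustrative example (values are placeholders): lb=0, ub=9, incr=2 gives
    // trip_count = (9-0)/2 + 1 = 5. With nteams=2 under kmp_sch_static_balanced,
    // chunk=2 and extras=1, so team 0 gets bounds [0,4] (iterations 0,2,4) and
    // team 1 gets [6,8] (iterations 6,8) per the formulas below.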
2343 
2344     if( trip_count <= nteams ) {
2345         KMP_DEBUG_ASSERT(
2346             __kmp_static == kmp_sch_static_greedy || \
2347             __kmp_static == kmp_sch_static_balanced
2348         ); // Unknown static scheduling type.
        // only some teams get a single iteration, others get nothing
2350         if( team_id < trip_count ) {
2351             *pupper = *plower = *plower + team_id * incr;
2352         } else {
2353             *plower = *pupper + incr; // zero-trip loop
2354         }
2355         if( plastiter != NULL )
2356             *plastiter = ( team_id == trip_count - 1 );
2357     } else {
2358         if( __kmp_static == kmp_sch_static_balanced ) {
2359             register UT chunk = trip_count / nteams;
2360             register UT extras = trip_count % nteams;
2361             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2362             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2363             if( plastiter != NULL )
2364                 *plastiter = ( team_id == nteams - 1 );
2365         } else {
2366             register T chunk_inc_count =
2367                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2368             register T upper = *pupper;
2369             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2370                 // Unknown static scheduling type.
2371             *plower += team_id * chunk_inc_count;
2372             *pupper = *plower + chunk_inc_count - incr;
2373             // Check/correct bounds if needed
2374             if( incr > 0 ) {
2375                 if( *pupper < *plower )
2376                     *pupper = traits_t<T>::max_value;
2377                 if( plastiter != NULL )
2378                     *plastiter = *plower <= upper && *pupper > upper - incr;
2379                 if( *pupper > upper )
2380                     *pupper = upper; // tracker C73258
2381             } else {
2382                 if( *pupper > *plower )
2383                     *pupper = traits_t<T>::min_value;
2384                 if( plastiter != NULL )
2385                     *plastiter = *plower >= upper && *pupper < upper - incr;
2386                 if( *pupper < upper )
2387                     *pupper = upper; // tracker C73258
2388             }
2389         }
2390     }
2391 }
2392 
2393 //-----------------------------------------------------------------------------------------
2394 // Dispatch routines
2395 //    Transfer call to template< type T >
2396 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2397 //                         T lb, T ub, ST st, ST chunk )
2398 extern "C" {
2399 
2400 /*!
2401 @ingroup WORK_SHARING
2402 @{
2403 @param loc Source location
2404 @param gtid Global thread id
2405 @param schedule Schedule type
2406 @param lb  Lower bound
2407 @param ub  Upper bound
2408 @param st  Step (or increment if you prefer)
2409 @param chunk The chunk size to block with
2410 
2411 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2412 These functions are all identical apart from the types of the arguments.
2413 */
2414 
2415 void
2416 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2417                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2418 {
2419     KMP_DEBUG_ASSERT( __kmp_init_serial );
2420     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2421 }
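/*
 * Illustrative usage sketch (not part of the library): the compiler typically
 * pairs a dispatch_init call with a dispatch_next loop; `n`, `chunk` and `body`
 * below are placeholders.
 *
 *     kmp_int32 last, lb, ub, st;
 *     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, chunk);
 *     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
 *         for (kmp_int32 i = lb; i <= ub; i += st)
 *             body(i);
 *     }
 */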
2422 /*!
2423 See @ref __kmpc_dispatch_init_4
2424 */
2425 void
2426 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2427                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2428 {
2429     KMP_DEBUG_ASSERT( __kmp_init_serial );
2430     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2431 }
2432 
2433 /*!
2434 See @ref __kmpc_dispatch_init_4
2435 */
2436 void
2437 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2438                         kmp_int64 lb, kmp_int64 ub,
2439                         kmp_int64 st, kmp_int64 chunk )
2440 {
2441     KMP_DEBUG_ASSERT( __kmp_init_serial );
2442     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2443 }
2444 
2445 /*!
2446 See @ref __kmpc_dispatch_init_4
2447 */
2448 void
2449 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2450                          kmp_uint64 lb, kmp_uint64 ub,
2451                          kmp_int64 st, kmp_int64 chunk )
2452 {
2453     KMP_DEBUG_ASSERT( __kmp_init_serial );
2454     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2455 }
2456 
2457 /*!
2458 See @ref __kmpc_dispatch_init_4
2459 
These differ from the __kmpc_dispatch_init set of functions in that they are
called for the composite distribute parallel for construct. Thus, before the
regular iterations are dispatched, the per-team iteration space must be computed.
2463 
2464 These functions are all identical apart from the types of the arguments.
2465 */
2466 void
2467 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2468     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2469 {
2470     KMP_DEBUG_ASSERT( __kmp_init_serial );
2471     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2472     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2473 }
2474 
2475 void
2476 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2477     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2478 {
2479     KMP_DEBUG_ASSERT( __kmp_init_serial );
2480     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2481     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2482 }
2483 
2484 void
2485 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2486     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2487 {
2488     KMP_DEBUG_ASSERT( __kmp_init_serial );
2489     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2490     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2491 }
2492 
2493 void
2494 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2495     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2496 {
2497     KMP_DEBUG_ASSERT( __kmp_init_serial );
2498     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2499     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2500 }
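/*
   Illustrative sketch (not part of the runtime): for a composite
   "distribute parallel for" the compiler lets the runtime first narrow the
   bounds to the current team and then enters the usual dispatch loop, e.g.

       kmp_int32 last = 0;
       __kmpc_dist_dispatch_init_4( &loc_ref, gtid, kmp_sch_dynamic_chunked,
                                    &last, lb, ub, st, chunk );
       // ... followed by the __kmpc_dispatch_next_4 loop sketched below.

   The per-team bounds adjustment is done by __kmp_dist_get_bounds before the
   regular __kmp_dispatch_init; loc_ref, gtid, lb, ub, st and chunk are
   placeholders here.
*/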
2501 
2502 /*!
2503 @param loc Source code location
2504 @param gtid Global thread id
2505 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2506 @param p_lb   Pointer to the lower bound for the next chunk of work
2507 @param p_ub   Pointer to the upper bound for the next chunk of work
2508 @param p_st   Pointer to the stride for the next chunk of work
2509 @return one if there is work to be done, zero otherwise
2510 
2511 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2513 */
2514 int
2515 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2516                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2517 {
2518     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2519 }
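/*
   Minimal consumer-loop sketch (illustrative only, not part of the runtime),
   assuming the loop was set up with __kmpc_dispatch_init_4 as shown earlier;
   loc_ref and gtid are placeholders:

       kmp_int32 last, lb, ub, st;
       while ( __kmpc_dispatch_next_4( &loc_ref, gtid, &last, &lb, &ub, &st ) ) {
           for ( kmp_int32 i = lb; i <= ub; i += st ) {
               // ... loop body for iteration i ...
           }
           // For loops with an ordered clause the compiler also emits
           // __kmpc_dispatch_fini_4 calls; their exact placement is a
           // code-generation detail.
       }

   A zero return value means there is no more work, and lb/ub/st should not be
   inspected in that case.
*/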
2520 
2521 /*!
2522 See @ref __kmpc_dispatch_next_4
2523 */
2524 int
2525 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2526                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2527 {
2528     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2529 }
2530 
2531 /*!
2532 See @ref __kmpc_dispatch_next_4
2533 */
2534 int
2535 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2536                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2537 {
2538     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2539 }
2540 
2541 /*!
2542 See @ref __kmpc_dispatch_next_4
2543 */
2544 int
2545 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2546                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2547 {
2548     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2549 }
2550 
2551 /*!
2552 @param loc Source code location
2553 @param gtid Global thread id
2554 
2555 Mark the end of a dynamic loop.
2556 */
2557 void
2558 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2559 {
2560     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2561 }
2562 
2563 /*!
2564 See @ref __kmpc_dispatch_fini_4
2565 */
2566 void
2567 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2568 {
2569     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2570 }
2571 
2572 /*!
2573 See @ref __kmpc_dispatch_fini_4
2574 */
2575 void
2576 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2577 {
2578     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2579 }
2580 
2581 /*!
2582 See @ref __kmpc_dispatch_fini_4
2583 */
2584 void
2585 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2586 {
2587     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2588 }
2589 /*! @} */
2590 
2591 //-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources
2593 
2594 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2595     return value == checker;
2596 }
2597 
2598 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2599     return value != checker;
2600 }
2601 
2602 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2603     return value < checker;
2604 }
2605 
2606 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2607     return value >= checker;
2608 }
2609 
2610 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2611     return value <= checker;
2612 }
2613 
2614 kmp_uint32
2615 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2616                    kmp_uint32            checker,
                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ),
                   void        * obj    // Higher-level synchronization object, or NULL.
                   )
2620 {
2621     // note: we may not belong to a team at this point
2622     register volatile kmp_uint32         * spin          = spinner;
2623     register          kmp_uint32           check         = checker;
2624     register          kmp_uint32   spins;
2625     register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2626     register          kmp_uint32           r;
2627 
2628     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2629     KMP_INIT_YIELD( spins );
2630     // main wait spin loop
2631     while(!f(r = TCR_4(*spin), check)) {
2632         KMP_FSYNC_SPIN_PREPARE( obj );
2633         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2634            It causes problems with infinite recursion because of exit lock */
2635         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2636             __kmp_abort_thread(); */
2637 
2638         /* if we have waited a bit, or are oversubscribed, yield */
2639         /* pause is in the following code */
2640         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2641         KMP_YIELD_SPIN( spins );
2642     }
2643     KMP_FSYNC_SPIN_ACQUIRED( obj );
2644     return r;
2645 }
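/*
   Usage sketch (illustrative only): spin, with yielding, until a shared flag
   reaches an expected value. "flag" is a hypothetical variable; any of the
   __kmp_{eq,neq,lt,ge,le}_4 predicates above may be passed.

       volatile kmp_uint32 flag = 0;
       // ... another thread eventually stores 1 to flag ...
       kmp_uint32 seen = __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );
       // seen holds the last value read from the spin location (here, 1)

   The obj argument is only passed to the KMP_FSYNC_* instrumentation macros
   and may be NULL.
*/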
2646 
2647 void
2648 __kmp_wait_yield_4_ptr(void *spinner,
2649                    kmp_uint32 checker,
2650                    kmp_uint32 (*pred)( void *, kmp_uint32 ),
2651                    void        *obj    // Higher-level synchronization object, or NULL.
2652                    )
2653 {
2654     // note: we may not belong to a team at this point
2655     register void                *spin          = spinner;
2656     register kmp_uint32           check         = checker;
2657     register kmp_uint32           spins;
2658     register kmp_uint32 (*f) ( void *, kmp_uint32 ) = pred;
2659 
2660     KMP_FSYNC_SPIN_INIT( obj, spin );
2661     KMP_INIT_YIELD( spins );
2662     // main wait spin loop
2663     while ( !f( spin, check ) ) {
2664         KMP_FSYNC_SPIN_PREPARE( obj );
2665         /* if we have waited a bit, or are oversubscribed, yield */
2666         /* pause is in the following code */
2667         KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc );
2668         KMP_YIELD_SPIN( spins );
2669     }
2670     KMP_FSYNC_SPIN_ACQUIRED( obj );
2671 }
2672 
2673 } // extern "C"
2674 
2675 #ifdef KMP_GOMP_COMPAT
2676 
2677 void
2678 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2679                            kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2680                            kmp_int32 chunk, int push_ws )
2681 {
2682     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2683                                       push_ws );
2684 }
2685 
2686 void
2687 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2688                             kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2689                             kmp_int32 chunk, int push_ws )
2690 {
2691     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2692                                        push_ws );
2693 }
2694 
2695 void
2696 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2697                            kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2698                            kmp_int64 chunk, int push_ws )
2699 {
2700     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2701                                       push_ws );
2702 }
2703 
2704 void
2705 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2706                             kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2707                             kmp_int64 chunk, int push_ws )
2708 {
2709     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2710                                        push_ws );
2711 }
2712 
2713 void
2714 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2715 {
2716     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2717 }
2718 
2719 void
2720 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2721 {
2722     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2723 }
2724 
2725 void
2726 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2727 {
2728     __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2729 }
2730 
2731 void
2732 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2733 {
2734     __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2735 }
2736 
2737 #endif /* KMP_GOMP_COMPAT */
2738 
2739 /* ------------------------------------------------------------------------ */
2740 /* ------------------------------------------------------------------------ */
2741 
2742