1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
 * NOTE: __kmp_nth is constant within any single dispatch loop; however,
 *       it may change between parallel regions.  __kmp_max_nth is the
 *       largest value __kmp_nth may take, 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 // Need to raise Win version from XP to Vista here for support of InterlockedExchange64
29 #if defined(_WIN32_WINNT) && defined(_M_IX86)
30 #undef _WIN32_WINNT
31 #define _WIN32_WINNT 0x0502
32 #endif
33 
34 #include "kmp.h"
35 #include "kmp_i18n.h"
36 #include "kmp_itt.h"
37 #include "kmp_str.h"
38 #include "kmp_error.h"
39 #include "kmp_stats.h"
40 #if KMP_OS_WINDOWS && KMP_ARCH_X86
41     #include <float.h>
42 #endif
43 
44 #if OMPT_SUPPORT
45 #include "ompt-internal.h"
46 #include "ompt-specific.h"
47 #endif
48 
49 /* ------------------------------------------------------------------------ */
50 /* ------------------------------------------------------------------------ */
51 
52 #if KMP_STATIC_STEAL_ENABLED
53 
54     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
55     template< typename T >
56     struct dispatch_private_infoXX_template {
57         typedef typename traits_t< T >::unsigned_t  UT;
58         typedef typename traits_t< T >::signed_t    ST;
59         UT count;                // unsigned
60         T  ub;
61         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
62         T  lb;
63         ST st;                   // signed
64         UT tc;                   // unsigned
65         T  static_steal_counter; // for static_steal only; maybe better to put after ub
66 
67         /* parm[1-4] are used in different ways by different scheduling algorithms */
68 
69         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
70         //    a) parm3 is properly aligned and
71         //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // when they are in the same cache line (not measured, though).
74 
75         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
76             T  parm1;
77             T  parm2;
78             T  parm3;
79             T  parm4;
80         };
81 
82         UT ordered_lower; // unsigned
83         UT ordered_upper; // unsigned
84         #if KMP_OS_WINDOWS
85         T  last_upper;
86         #endif /* KMP_OS_WINDOWS */
87     };
88 
89 #else /* KMP_STATIC_STEAL_ENABLED */
90 
91     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
92     template< typename T >
93     struct dispatch_private_infoXX_template {
94         typedef typename traits_t< T >::unsigned_t  UT;
95         typedef typename traits_t< T >::signed_t    ST;
96         T  lb;
97         T  ub;
98         ST st;            // signed
99         UT tc;            // unsigned
100 
101         T  parm1;
102         T  parm2;
103         T  parm3;
104         T  parm4;
105 
106         UT count;         // unsigned
107 
108         UT ordered_lower; // unsigned
109         UT ordered_upper; // unsigned
110         #if KMP_OS_WINDOWS
        T  last_upper;
112         #endif /* KMP_OS_WINDOWS */
113     };
114 
115 #endif /* KMP_STATIC_STEAL_ENABLED */
116 
117 // replaces dispatch_private_info structure and dispatch_private_info_t type
118 template< typename T >
119 struct KMP_ALIGN_CACHE dispatch_private_info_template {
120     // duplicate alignment here, otherwise size of structure is not correct in our compiler
121     union KMP_ALIGN_CACHE private_info_tmpl {
122         dispatch_private_infoXX_template< T > p;
123         dispatch_private_info64_t             p64;
124     } u;
125     enum sched_type schedule;  /* scheduling algorithm */
126     kmp_uint32      ordered;   /* ordered clause specified */
127     kmp_uint32      ordered_bumped;
    kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
129     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
130     kmp_uint32      nomerge;   /* don't merge iters if serialized */
131     kmp_uint32      type_size;
132     enum cons_type  pushed_ws;
133 };
134 
135 
136 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
137 template< typename UT >
138 struct dispatch_shared_infoXX_template {
139     /* chunk index under dynamic, number of idle threads under static-steal;
140        iteration index otherwise */
141     volatile UT     iteration;
142     volatile UT     num_done;
143     volatile UT     ordered_iteration;
144     UT   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
145 };
146 
147 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
148 template< typename UT >
149 struct dispatch_shared_info_template {
150     // we need union here to keep the structure size
151     union shared_info_tmpl {
152         dispatch_shared_infoXX_template< UT >  s;
153         dispatch_shared_info64_t               s64;
154     } u;
155     volatile kmp_uint32     buffer_index;
156 #if OMP_45_ENABLED
157     volatile kmp_int32      doacross_buf_idx;  // teamwise index
158     kmp_uint32             *doacross_flags;    // array of iteration flags (0/1)
159     kmp_int32               doacross_num_done; // count finished threads
160 #endif
161 #if KMP_USE_HWLOC
    // When linking with libhwloc, the ORDERED EPCC test slows down on big
163     // machines (> 48 cores). Performance analysis showed that a cache thrash
164     // was occurring and this padding helps alleviate the problem.
165     char padding[64];
166 #endif
167 };
168 
169 /* ------------------------------------------------------------------------ */
170 /* ------------------------------------------------------------------------ */
171 
172 #undef USE_TEST_LOCKS
173 
174 // test_then_add template (general template should NOT be used)
175 template< typename T >
176 static __forceinline T
177 test_then_add( volatile T *p, T d );
178 
179 template<>
180 __forceinline kmp_int32
181 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
182 {
183     kmp_int32 r;
184     r = KMP_TEST_THEN_ADD32( p, d );
185     return r;
186 }
187 
188 template<>
189 __forceinline kmp_int64
190 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
191 {
192     kmp_int64 r;
193     r = KMP_TEST_THEN_ADD64( p, d );
194     return r;
195 }
196 
197 // test_then_inc_acq template (general template should NOT be used)
198 template< typename T >
199 static __forceinline T
200 test_then_inc_acq( volatile T *p );
201 
202 template<>
203 __forceinline kmp_int32
204 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
205 {
206     kmp_int32 r;
207     r = KMP_TEST_THEN_INC_ACQ32( p );
208     return r;
209 }
210 
211 template<>
212 __forceinline kmp_int64
213 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
214 {
215     kmp_int64 r;
216     r = KMP_TEST_THEN_INC_ACQ64( p );
217     return r;
218 }
219 
220 // test_then_inc template (general template should NOT be used)
221 template< typename T >
222 static __forceinline T
223 test_then_inc( volatile T *p );
224 
225 template<>
226 __forceinline kmp_int32
227 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
228 {
229     kmp_int32 r;
230     r = KMP_TEST_THEN_INC32( p );
231     return r;
232 }
233 
234 template<>
235 __forceinline kmp_int64
236 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
237 {
238     kmp_int64 r;
239     r = KMP_TEST_THEN_INC64( p );
240     return r;
241 }
242 
243 // compare_and_swap template (general template should NOT be used)
244 template< typename T >
245 static __forceinline kmp_int32
246 compare_and_swap( volatile T *p, T c, T s );
247 
248 template<>
249 __forceinline kmp_int32
250 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
251 {
252     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
253 }
254 
255 template<>
256 __forceinline kmp_int32
257 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
258 {
259     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
260 }
261 
262 /*
263     Spin wait loop that first does pause, then yield.
    Waits until the predicate function returns non-zero when called with *spinner and check.
265     Does NOT put threads to sleep.
266 #if USE_ITT_BUILD
267     Arguments:
        obj -- the higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
            same address, not the address of the low-level spinner.
273 #endif // USE_ITT_BUILD
274 */
275 template< typename UT >
276 // ToDo: make inline function (move to header file for icl)
277 static UT  // unsigned 4- or 8-byte type
278 __kmp_wait_yield( volatile UT * spinner,
279                   UT            checker,
280                   kmp_uint32 (* pred)( UT, UT )
281                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
282                   )
283 {
284     // note: we may not belong to a team at this point
285     register volatile UT         * spin          = spinner;
286     register          UT           check         = checker;
287     register          kmp_uint32   spins;
288     register          kmp_uint32 (*f) ( UT, UT ) = pred;
289     register          UT           r;
290 
291     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
292     KMP_INIT_YIELD( spins );
293     // main wait spin loop
294     while(!f(r = *spin, check))
295     {
296         KMP_FSYNC_SPIN_PREPARE( obj );
297         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
298            It causes problems with infinite recursion because of exit lock */
299         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
300             __kmp_abort_thread(); */
301 
        // If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput),
        // then yield; the pause is in the following code.
305         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
306         KMP_YIELD_SPIN( spins );
307     }
308     KMP_FSYNC_SPIN_ACQUIRED( obj );
309     return r;
310 }
311 
312 template< typename UT >
313 static kmp_uint32 __kmp_eq( UT value, UT checker) {
314     return value == checker;
315 }
316 
317 template< typename UT >
318 static kmp_uint32 __kmp_neq( UT value, UT checker) {
319     return value != checker;
320 }
321 
322 template< typename UT >
323 static kmp_uint32 __kmp_lt( UT value, UT checker) {
324     return value < checker;
325 }
326 
327 template< typename UT >
328 static kmp_uint32 __kmp_ge( UT value, UT checker) {
329     return value >= checker;
330 }
331 
332 template< typename UT >
333 static kmp_uint32 __kmp_le( UT value, UT checker) {
334     return value <= checker;
335 }
336 
337 
338 /* ------------------------------------------------------------------------ */
339 /* ------------------------------------------------------------------------ */
340 
341 static void
342 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
343 {
344     kmp_info_t *th;
345 
346     KMP_DEBUG_ASSERT( gtid_ref );
347 
348     if ( __kmp_env_consistency_check ) {
349         th = __kmp_threads[*gtid_ref];
350         if ( th -> th.th_root -> r.r_active
351           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
352 #if KMP_USE_DYNAMIC_LOCK
353             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
354 #else
355             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
356 #endif
357         }
358     }
359 }
360 
361 template< typename UT >
362 static void
363 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
364 {
365     typedef typename traits_t< UT >::signed_t    ST;
366     dispatch_private_info_template< UT > * pr;
367 
368     int gtid = *gtid_ref;
369 //    int  cid = *cid_ref;
370     kmp_info_t *th = __kmp_threads[ gtid ];
371     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
372 
373     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
374     if ( __kmp_env_consistency_check ) {
375         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
376             ( th -> th.th_dispatch -> th_dispatch_pr_current );
377         if ( pr -> pushed_ws != ct_none ) {
378 #if KMP_USE_DYNAMIC_LOCK
379             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
380 #else
381             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
382 #endif
383         }
384     }
385 
386     if ( ! th -> th.th_team -> t.t_serialized ) {
387         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
388             ( th -> th.th_dispatch -> th_dispatch_sh_current );
389         UT  lower;
390 
391         if ( ! __kmp_env_consistency_check ) {
392                 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
393                     ( th -> th.th_dispatch -> th_dispatch_pr_current );
394         }
395         lower = pr->u.p.ordered_lower;
396 
397         #if ! defined( KMP_GOMP_COMPAT )
398             if ( __kmp_env_consistency_check ) {
399                 if ( pr->ordered_bumped ) {
400                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
401                     __kmp_error_construct2(
402                         kmp_i18n_msg_CnsMultipleNesting,
403                         ct_ordered_in_pdo, loc_ref,
404                         & p->stack_data[ p->w_top ]
405                     );
406                 }
407             }
408         #endif /* !defined(KMP_GOMP_COMPAT) */
409 
410         KMP_MB();
411         #ifdef KMP_DEBUG
412         {
413             const char * buff;
414             // create format specifiers before the debug output
415             buff = __kmp_str_format(
416                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
417                 traits_t< UT >::spec, traits_t< UT >::spec );
418             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
419             __kmp_str_free( &buff );
420         }
421         #endif
422 
423         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
424                                 USE_ITT_BUILD_ARG( NULL )
425                                 );
426         KMP_MB();  /* is this necessary? */
427         #ifdef KMP_DEBUG
428         {
429             const char * buff;
430             // create format specifiers before the debug output
431             buff = __kmp_str_format(
432                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
433                 traits_t< UT >::spec, traits_t< UT >::spec );
434             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
435             __kmp_str_free( &buff );
436         }
437         #endif
438     }
439     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
440 }
441 
442 static void
443 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
444 {
445     kmp_info_t *th;
446 
447     if ( __kmp_env_consistency_check ) {
448         th = __kmp_threads[*gtid_ref];
449         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
450             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
451         }
452     }
453 }
454 
455 template< typename UT >
456 static void
457 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
458 {
459     typedef typename traits_t< UT >::signed_t    ST;
460     dispatch_private_info_template< UT > * pr;
461 
462     int gtid = *gtid_ref;
463 //    int  cid = *cid_ref;
464     kmp_info_t *th = __kmp_threads[ gtid ];
465     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
466 
467     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
468     if ( __kmp_env_consistency_check ) {
469         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
470             ( th -> th.th_dispatch -> th_dispatch_pr_current );
471         if ( pr -> pushed_ws != ct_none ) {
472             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
473         }
474     }
475 
476     if ( ! th -> th.th_team -> t.t_serialized ) {
477         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
478             ( th -> th.th_dispatch -> th_dispatch_sh_current );
479 
480         if ( ! __kmp_env_consistency_check ) {
481             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
482                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
483         }
484 
485         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
486         #if ! defined( KMP_GOMP_COMPAT )
487             if ( __kmp_env_consistency_check ) {
488                 if ( pr->ordered_bumped != 0 ) {
489                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
490                     /* How to test it? - OM */
491                     __kmp_error_construct2(
492                         kmp_i18n_msg_CnsMultipleNesting,
493                         ct_ordered_in_pdo, loc_ref,
494                         & p->stack_data[ p->w_top ]
495                     );
496                 }
497             }
498         #endif /* !defined(KMP_GOMP_COMPAT) */
499 
500         KMP_MB();       /* Flush all pending memory write invalidates.  */
501 
502         pr->ordered_bumped += 1;
503 
504         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
505                         gtid, pr->ordered_bumped ) );
506 
507         KMP_MB();       /* Flush all pending memory write invalidates.  */
508 
509         /* TODO use general release procedure? */
510         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
511 
512         KMP_MB();       /* Flush all pending memory write invalidates.  */
513     }
514     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
515 }
516 
/* Computes and returns x to the power of y, where y must be a non-negative integer */
518 template< typename UT >
519 static __forceinline long double
520 __kmp_pow(long double x, UT y) {
521     long double s=1.0L;
522 
523     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
524     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
525     while(y) {
526         if ( y & 1 )
527             s *= x;
528         x *= x;
529         y >>= 1;
530     }
531     return s;
532 }
533 
534 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
535    (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken: if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
538 */
539 template< typename T >
540 static __inline typename traits_t< T >::unsigned_t
541 __kmp_dispatch_guided_remaining(
542     T                                  tc,
543     typename traits_t< T >::floating_t base,
544     typename traits_t< T >::unsigned_t idx
545 ) {
546     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
547        least for ICL 8.1, long double arithmetic may not really have
548        long double precision, even with /Qlong_double.  Currently, we
549        workaround that in the caller code, by manipulating the FPCW for
550        Windows* OS on IA-32 architecture.  The lack of precision is not
551        expected to be a correctness issue, though.
552     */
553     typedef typename traits_t< T >::unsigned_t  UT;
554 
555     long double x = tc * __kmp_pow< UT >(base, idx);
556     UT r = (UT) x;
557     if ( x == r )
558         return r;
559     return r + 1;
560 }
561 
562 // Parameters of the guided-iterative algorithm:
563 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
564 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
567 static int guided_int_param = 2;
568 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
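// Worked example (illustration only): with the default n = 2, nproc = 4 and chunk = 7,
//   p2 = 2 * 4 * (7 + 1) = 64      // switch to dynamic once fewer than 64 iterations remain
//   p3 = 1 / (2 * 4)     = 0.125
// These values are stored in parm2 and parm3 of the per-thread dispatch buffer below.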
569 
570 // UT - unsigned flavor of T, ST - signed flavor of T,
571 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
572 template< typename T >
573 static void
574 __kmp_dispatch_init(
575     ident_t                        * loc,
576     int                              gtid,
577     enum sched_type                  schedule,
578     T                                lb,
579     T                                ub,
580     typename traits_t< T >::signed_t st,
581     typename traits_t< T >::signed_t chunk,
582     int                              push_ws
583 ) {
584     typedef typename traits_t< T >::unsigned_t  UT;
585     typedef typename traits_t< T >::signed_t    ST;
586     typedef typename traits_t< T >::floating_t  DBL;
587 
588     int                                            active;
589     T                                              tc;
590     kmp_info_t *                                   th;
591     kmp_team_t *                                   team;
592     kmp_uint32                                     my_buffer_index;
593     dispatch_private_info_template< T >          * pr;
594     dispatch_shared_info_template< UT > volatile * sh;
595 
596     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
597     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
598 
599     if ( ! TCR_4( __kmp_init_parallel ) )
600         __kmp_parallel_initialize();
601 
602 #if INCLUDE_SSC_MARKS
603     SSC_MARK_DISPATCH_INIT();
604 #endif
605     #ifdef KMP_DEBUG
606     {
607         const char * buff;
608         // create format specifiers before the debug output
609         buff = __kmp_str_format(
610             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
611             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
612         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
613         __kmp_str_free( &buff );
614     }
615     #endif
616     /* setup data */
617     th     = __kmp_threads[ gtid ];
618     team   = th -> th.th_team;
619     active = ! team -> t.t_serialized;
620     th->th.th_ident = loc;
621 
622 #if USE_ITT_BUILD
623     kmp_uint64 cur_chunk = chunk;
624     int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
625         KMP_MASTER_GTID(gtid) &&
626 #if OMP_40_ENABLED
627         th->th.th_teams_microtask == NULL &&
628 #endif
629         team->t.t_active_level == 1;
630 #endif
631     if ( ! active ) {
632         pr = reinterpret_cast< dispatch_private_info_template< T >* >
633             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
634     } else {
635         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
636                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
637 
638         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
639 
        /* What happens when the number of threads changes? Do we need to resize the buffer? */
641         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
642             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
643         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
644             ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
645     }
646 
647     #if  ( KMP_STATIC_STEAL_ENABLED )
648     if ( SCHEDULE_HAS_NONMONOTONIC(schedule) )
649         // AC: we now have only one implementation of stealing, so use it
650         schedule = kmp_sch_static_steal;
651     else
652     #endif
653         schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
654 
655     /* Pick up the nomerge/ordered bits from the scheduling type */
656     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
657         pr->nomerge = TRUE;
658         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
659     } else {
660         pr->nomerge = FALSE;
661     }
662     pr->type_size = traits_t<T>::type_size; // remember the size of variables
663     if ( kmp_ord_lower & schedule ) {
664         pr->ordered = TRUE;
665         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
666     } else {
667         pr->ordered = FALSE;
668     }
669 
670     if ( schedule == kmp_sch_static ) {
671         schedule = __kmp_static;
672     } else {
673         if ( schedule == kmp_sch_runtime ) {
674             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
675             schedule = team -> t.t_sched.r_sched_type;
676             // Detail the schedule if needed (global controls are differentiated appropriately)
677             if ( schedule == kmp_sch_guided_chunked ) {
678                 schedule = __kmp_guided;
679             } else if ( schedule == kmp_sch_static ) {
680                 schedule = __kmp_static;
681             }
682             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
683             chunk = team -> t.t_sched.chunk;
684 #if USE_ITT_BUILD
685             cur_chunk = chunk;
686 #endif
687             #ifdef KMP_DEBUG
688             {
689                 const char * buff;
690                 // create format specifiers before the debug output
691                 buff = __kmp_str_format(
692                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
693                     traits_t< ST >::spec );
694                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
695                 __kmp_str_free( &buff );
696             }
697             #endif
698         } else {
699             if ( schedule == kmp_sch_guided_chunked ) {
700                 schedule = __kmp_guided;
701             }
702             if ( chunk <= 0 ) {
703                 chunk = KMP_DEFAULT_CHUNK;
704             }
705         }
706 
707         if ( schedule == kmp_sch_auto ) {
708             // mapping and differentiation: in the __kmp_do_serial_initialize()
709             schedule = __kmp_auto;
710             #ifdef KMP_DEBUG
711             {
712                 const char * buff;
713                 // create format specifiers before the debug output
714                 buff = __kmp_str_format(
715                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
716                     traits_t< ST >::spec );
717                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
718                 __kmp_str_free( &buff );
719             }
720             #endif
721         }
722 
723         /* guided analytical not safe for too many threads */
724         if ( schedule == kmp_sch_guided_analytical_chunked && th->th.th_team_nproc > 1<<20 ) {
725             schedule = kmp_sch_guided_iterative_chunked;
726             KMP_WARNING( DispatchManyThreads );
727         }
728         pr->u.p.parm1 = chunk;
729     }
730     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
731                 "unknown scheduling type" );
732 
733     pr->u.p.count = 0;
734 
735     if ( __kmp_env_consistency_check ) {
736         if ( st == 0 ) {
737             __kmp_error_construct(
738                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
739                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
740             );
741         }
742     }
743     // compute trip count
744     if ( st == 1 ) {   // most common case
745         if ( ub >= lb ) {
746             tc = ub - lb + 1;
747         } else {   // ub < lb
748             tc = 0;            // zero-trip
749         }
750     } else if ( st < 0 ) {
751         if ( lb >= ub ) {
752             // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
753             //     where the division needs to be unsigned regardless of the result type
754             tc = (UT)(lb - ub) / (-st) + 1;
755         } else {   // lb < ub
756             tc = 0;            // zero-trip
757         }
758     } else {       // st > 0
759         if ( ub >= lb ) {
760             // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
761             //     where the division needs to be unsigned regardless of the result type
762             tc = (UT)(ub - lb) / st + 1;
763         } else {   // ub < lb
764             tc = 0;            // zero-trip
765         }
766     }
767 
768     // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
769     // when statistics are disabled.
770     if (schedule == __kmp_static)
771     {
772         KMP_COUNT_BLOCK(OMP_FOR_static);
773         KMP_COUNT_VALUE(FOR_static_iterations, tc);
774     }
775     else
776     {
777         KMP_COUNT_BLOCK(OMP_FOR_dynamic);
778         KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
779     }
780 
781     pr->u.p.lb = lb;
782     pr->u.p.ub = ub;
783     pr->u.p.st = st;
784     pr->u.p.tc = tc;
785 
786     #if KMP_OS_WINDOWS
787     pr->u.p.last_upper = ub + st;
788     #endif /* KMP_OS_WINDOWS */
789 
    /* NOTE: only the active parallel region(s) have active ordered sections */
791 
792     if ( active ) {
793         if ( pr->ordered == 0 ) {
794             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
795             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
796         } else {
797             pr->ordered_bumped = 0;
798 
799             pr->u.p.ordered_lower = 1;
800             pr->u.p.ordered_upper = 0;
801 
802             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
803             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
804         }
805     }
806 
807     if ( __kmp_env_consistency_check ) {
808         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
809         if ( push_ws ) {
810             __kmp_push_workshare( gtid, ws, loc );
811             pr->pushed_ws = ws;
812         } else {
813             __kmp_check_workshare( gtid, ws, loc );
814             pr->pushed_ws = ct_none;
815         }
816     }
817 
818     switch ( schedule ) {
819     #if  ( KMP_STATIC_STEAL_ENABLED )
820     case kmp_sch_static_steal:
821         {
822             T nproc = th->th.th_team_nproc;
823             T ntc, init;
824 
825             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
826 
827             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
828             if ( nproc > 1 && ntc >= nproc ) {
829                 T id = __kmp_tid_from_gtid(gtid);
830                 T small_chunk, extras;
831 
832                 small_chunk = ntc / nproc;
833                 extras = ntc % nproc;
834 
835                 init = id * small_chunk + ( id < extras ? id : extras );
836                 pr->u.p.count = init;
837                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
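                // Illustration of the split above: with tc=100 and chunk=10, ntc=10 chunks;
                // for nproc=4, small_chunk=2 and extras=2, so threads 0..3 initially own
                // chunk ranges [0,3), [3,6), [6,8) and [8,10) respectively (stored in count and ub).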
838 
839                 pr->u.p.parm2 = lb;
840                 //pr->pfields.parm3 = 0; // it's not used in static_steal
841                 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
842                 pr->u.p.st = st;
843                 if ( traits_t<T>::type_size > 4 ) {
844                     // AC: TODO: check if 16-byte CAS available and use it to
845                     // improve performance (probably wait for explicit request
846                     // before spending time on this).
847                     // For now use dynamically allocated per-thread lock,
848                     // free memory in __kmp_dispatch_next when status==0.
849                     KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
850                     th->th.th_dispatch->th_steal_lock =
851                         (kmp_lock_t*)__kmp_allocate(sizeof(kmp_lock_t));
852                     __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
853                 }
854                 break;
855             } else {
856                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
857                                gtid ) );
858                 schedule = kmp_sch_static_balanced;
859                 /* too few iterations: fall-through to kmp_sch_static_balanced */
860             } // if
861             /* FALL-THROUGH to static balanced */
862         } // case
863     #endif
864     case kmp_sch_static_balanced:
865         {
866             T nproc = th->th.th_team_nproc;
867             T init, limit;
868 
869             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
870                             gtid ) );
871 
872             if ( nproc > 1 ) {
873                 T id = __kmp_tid_from_gtid(gtid);
874 
875                 if ( tc < nproc ) {
876                     if ( id < tc ) {
877                         init = id;
878                         limit = id;
879                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
880                     } else {
881                         pr->u.p.count = 1;  /* means no more chunks to execute */
882                         pr->u.p.parm1 = FALSE;
883                         break;
884                     }
885                 } else {
886                     T small_chunk = tc / nproc;
887                     T extras = tc % nproc;
888                     init = id * small_chunk + (id < extras ? id : extras);
889                     limit = init + small_chunk - (id < extras ? 0 : 1);
890                     pr->u.p.parm1 = (id == nproc - 1);
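                    // Illustration: with tc=10 and nproc=4, small_chunk=2 and extras=2,
                    // so threads 0..3 get iterations 0..2, 3..5, 6..7 and 8..9; parm1
                    // (the lastprivate flag) is TRUE only for thread nproc-1.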
891                 }
892             } else {
893                 if ( tc > 0 ) {
894                     init = 0;
895                     limit = tc - 1;
896                     pr->u.p.parm1 = TRUE;
897                 } else {
898                     // zero trip count
899                     pr->u.p.count = 1;  /* means no more chunks to execute */
900                     pr->u.p.parm1 = FALSE;
901                     break;
902                 }
903             }
904 #if USE_ITT_BUILD
905             // Calculate chunk for metadata report
906             if ( itt_need_metadata_reporting )
907                 cur_chunk = limit - init + 1;
908 #endif
909             if ( st == 1 ) {
910                 pr->u.p.lb = lb + init;
911                 pr->u.p.ub = lb + limit;
912             } else {
913                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
914                 pr->u.p.lb = lb + init * st;
915                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
916                 if ( st > 0 ) {
917                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
918                 } else {
919                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
920                 }
921             }
922             if ( pr->ordered ) {
923                 pr->u.p.ordered_lower = init;
924                 pr->u.p.ordered_upper = limit;
925             }
926             break;
927         } // case
928     case kmp_sch_guided_iterative_chunked :
929         {
930             T nproc = th->th.th_team_nproc;
931             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
932 
933             if ( nproc > 1 ) {
934                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
935                     /* chunk size too large, switch to dynamic */
936                     schedule = kmp_sch_dynamic_chunked;
937                 } else {
                    // when remaining iterations become fewer than parm2 - switch to dynamic
939                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
940                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
941                 }
942             } else {
943                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
944                 schedule = kmp_sch_static_greedy;
945                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
946                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
947                 pr->u.p.parm1 = tc;
948             } // if
949         } // case
950         break;
951     case kmp_sch_guided_analytical_chunked:
952         {
953             T nproc = th->th.th_team_nproc;
954             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
955 
956             if ( nproc > 1 ) {
957                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
958                     /* chunk size too large, switch to dynamic */
959                     schedule = kmp_sch_dynamic_chunked;
960                 } else {
961                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
962                     DBL x;
963 
964                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
965                     /* Linux* OS already has 64-bit computation by default for
966 		       long double, and on Windows* OS on Intel(R) 64,
967 		       /Qlong_double doesn't work.  On Windows* OS
968 		       on IA-32 architecture, we need to set precision to
969 		       64-bit instead of the default 53-bit. Even though long
970 		       double doesn't work on Windows* OS on Intel(R) 64, the
971 		       resulting lack of precision is not expected to impact
972 		       the correctness of the algorithm, but this has not been
973 		       mathematically proven.
974                     */
975                     // save original FPCW and set precision to 64-bit, as
976                     // Windows* OS on IA-32 architecture defaults to 53-bit
977                     unsigned int oldFpcw = _control87(0,0);
978                     _control87(_PC_64,_MCW_PC); // 0,0x30000
979                     #endif
980                     /* value used for comparison in solver for cross-over point */
981                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
982 
983                     /* crossover point--chunk indexes equal to or greater than
984 		       this point switch to dynamic-style scheduling */
985                     UT   cross;
986 
987                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
988                     x = (long double)1.0 - (long double)0.5 / nproc;
989 
990                     #ifdef KMP_DEBUG
991                     { // test natural alignment
992                         struct _test_a {
993                             char a;
994                             union {
995                                 char b;
996                                 DBL  d;
997                             };
998                         } t;
999                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
1000                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
1001                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
1002                     }
1003                     #endif // KMP_DEBUG
1004 
1005                     /* save the term in thread private dispatch structure */
1006                     *(DBL*)&pr->u.p.parm3 = x;
1007 
1008                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
1009                     {
1010                         UT          left, right, mid;
1011                         long double p;
1012 
1013                         /* estimate initial upper and lower bound */
1014 
                        /* It doesn't matter what value right starts with, as long as it is
                           positive; it only affects the performance of the solver.
                        */
1018                         right = 229;
1019                         p = __kmp_pow< UT >(x,right);
1020                         if ( p > target ) {
1021                             do{
1022                                 p *= p;
1023                                 right <<= 1;
1024                             } while(p>target && right < (1<<27));
1025                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1026                         } else {
1027                             left = 0;
1028                         }
1029 
1030                         /* bisection root-finding method */
1031                         while ( left + 1 < right ) {
1032                             mid = (left + right) / 2;
1033                             if ( __kmp_pow< UT >(x,mid) > target ) {
1034                                 left = mid;
1035                             } else {
1036                                 right = mid;
1037                             }
1038                         } // while
1039                         cross = right;
1040                     }
1041                     /* assert sanity of computed crossover point */
1042                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
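                    // In other words, cross is the smallest chunk index for which
                    // x^cross <= target, i.e. the first chunk at which the estimated
                    // remaining iteration count tc*x^cross drops to at most
                    // (2*chunk+1)*nproc, the point of switching to dynamic-style chunks.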
1043 
1044                     /* save the crossover point in thread private dispatch structure */
1045                     pr->u.p.parm2 = cross;
1046 
1047                     // C75803
1048                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1049                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1050                     #else
1051                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1052                     #endif
1053                     /* dynamic-style scheduling offset */
1054                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1055                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1056                         // restore FPCW
1057                         _control87(oldFpcw,_MCW_PC);
1058                     #endif
1059                 } // if
1060             } else {
1061                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1062                                gtid ) );
1063                 schedule = kmp_sch_static_greedy;
1064                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1065                 pr->u.p.parm1 = tc;
1066             } // if
1067         } // case
1068         break;
1069     case kmp_sch_static_greedy:
1070         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1071             pr->u.p.parm1 = ( th->th.th_team_nproc > 1 ) ?
1072                 ( tc + th->th.th_team_nproc - 1 ) / th->th.th_team_nproc :
1073                 tc;
1074         break;
1075     case kmp_sch_static_chunked :
1076     case kmp_sch_dynamic_chunked :
1077         if ( pr->u.p.parm1 <= 0 ) {
1078             pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1079         }
1080         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1081         break;
1082     case kmp_sch_trapezoidal :
1083         {
1084             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1085 
1086             T parm1, parm2, parm3, parm4;
1087             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1088 
1089             parm1 = chunk;
1090 
1091             /* F : size of the first cycle */
1092             parm2 = ( tc / (2 * th->th.th_team_nproc) );
1093 
1094             if ( parm2 < 1 ) {
1095                 parm2 = 1;
1096             }
1097 
1098             /* L : size of the last cycle.  Make sure the last cycle
1099              *     is not larger than the first cycle.
1100              */
1101             if ( parm1 < 1 ) {
1102                 parm1 = 1;
1103             } else if ( parm1 > parm2 ) {
1104                 parm1 = parm2;
1105             }
1106 
1107             /* N : number of cycles */
1108             parm3 = ( parm2 + parm1 );
1109             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1110 
1111             if ( parm3 < 2 ) {
1112                 parm3 = 2;
1113             }
1114 
1115             /* sigma : decreasing incr of the trapezoid */
1116             parm4 = ( parm3 - 1 );
1117             parm4 = ( parm2 - parm1 ) / parm4;
1118 
1119             // pointless check, because parm4 >= 0 always
1120             //if ( parm4 < 0 ) {
1121             //    parm4 = 0;
1122             //}
1123 
1124             pr->u.p.parm1 = parm1;
1125             pr->u.p.parm2 = parm2;
1126             pr->u.p.parm3 = parm3;
1127             pr->u.p.parm4 = parm4;
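            // Worked example (illustration only): tc=1000, nproc=4, chunk=1 gives
            //   parm2 (first cycle size) = 1000 / 8           = 125
            //   parm3 (number of cycles) = (2000 + 125) / 126 = 16
            //   parm4 (decrement)        = (125 - 1) / 15     = 8
            // so successive cycles hand out roughly 125, 117, 109, ... iterations,
            // decreasing toward the minimum chunk size parm1.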
1128         } // case
1129         break;
1130 
1131     default:
1132         {
1133             __kmp_msg(
1134                 kmp_ms_fatal,                        // Severity
1135                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1136                 KMP_HNT( GetNewerLibrary ),          // Hint
1137                 __kmp_msg_null                       // Variadic argument list terminator
1138             );
1139         }
1140         break;
1141     } // switch
1142     pr->schedule = schedule;
1143     if ( active ) {
        /* This buffer is free to use once sh->buffer_index reaches my_buffer_index */
1145 
1146         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1147                         gtid, my_buffer_index, sh->buffer_index) );
1148         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1149                                         USE_ITT_BUILD_ARG( NULL )
1150                                         );
1151             // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1152             // *always* 32-bit integers.
1153         KMP_MB();  /* is this necessary? */
1154         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1155                         gtid, my_buffer_index, sh->buffer_index) );
1156 
1157         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1158         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1159 #if USE_ITT_BUILD
1160         if ( pr->ordered ) {
1161             __kmp_itt_ordered_init( gtid );
1162         }; // if
1163         // Report loop metadata
1164         if ( itt_need_metadata_reporting ) {
1165             // Only report metadata by master of active team at level 1
1166             kmp_uint64 schedtype = 0;
1167             switch ( schedule ) {
1168             case kmp_sch_static_chunked:
1169             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1170                 break;
1171             case kmp_sch_static_greedy:
1172                 cur_chunk = pr->u.p.parm1;
1173                 break;
1174             case kmp_sch_dynamic_chunked:
1175                 schedtype = 1;
1176                 break;
1177             case kmp_sch_guided_iterative_chunked:
1178             case kmp_sch_guided_analytical_chunked:
1179                 schedtype = 2;
1180                 break;
1181             default:
1182 //            Should we put this case under "static"?
1183 //            case kmp_sch_static_steal:
1184                 schedtype = 3;
1185                 break;
1186             }
1187             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1188         }
1189 #endif /* USE_ITT_BUILD */
1190     }; // if
1191 
1192     #ifdef KMP_DEBUG
1193     {
1194         const char * buff;
1195         // create format specifiers before the debug output
1196         buff = __kmp_str_format(
1197             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1198             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1199             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1200             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1201             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1202             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1203             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1204         KD_TRACE(10, ( buff,
1205             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1206             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1207             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1208             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1209         __kmp_str_free( &buff );
1210     }
1211     #endif
1212     #if ( KMP_STATIC_STEAL_ENABLED )
1213       // It cannot be guaranteed that after execution of a loop with some other schedule kind
1214       // all the parm3 variables will contain the same value.
      // Even if all parm3 values were the same, a bad case could still exist, such as
      // using 0 and 1 rather than a program-lifetime increment.
      // So a dedicated variable is required; 'static_steal_counter' is used.
1218       if( schedule == kmp_sch_static_steal ) {
        // Other threads will inspect this variable when searching for a victim.
        // It is a flag showing that, from now on, other threads may steal from this thread.
1221         volatile T * p = &pr->u.p.static_steal_counter;
1222         *p = *p + 1;
1223       }
1224     #endif // ( KMP_STATIC_STEAL_ENABLED )
1225 
1226 #if OMPT_SUPPORT && OMPT_TRACE
1227     if (ompt_enabled &&
1228         ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1229         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1230         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1231         ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1232             team_info->parallel_id, task_info->task_id, team_info->microtask);
1233     }
1234 #endif
1235 }
1236 
1237 /*
1238  * For ordered loops, either __kmp_dispatch_finish() should be called after
1239  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1240  * every chunk of iterations.  If the ordered section(s) were not executed
1241  * for this iteration (or every iteration in this chunk), we need to set the
1242  * ordered iteration counters so that the next thread can proceed.
1243  */
1244 template< typename UT >
1245 static void
1246 __kmp_dispatch_finish( int gtid, ident_t *loc )
1247 {
1248     typedef typename traits_t< UT >::signed_t ST;
1249     kmp_info_t *th = __kmp_threads[ gtid ];
1250 
1251     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1252     if ( ! th -> th.th_team -> t.t_serialized ) {
1253 
1254         dispatch_private_info_template< UT > * pr =
1255             reinterpret_cast< dispatch_private_info_template< UT >* >
1256             ( th->th.th_dispatch->th_dispatch_pr_current );
1257         dispatch_shared_info_template< UT > volatile * sh =
1258             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1259             ( th->th.th_dispatch->th_dispatch_sh_current );
1260         KMP_DEBUG_ASSERT( pr );
1261         KMP_DEBUG_ASSERT( sh );
1262         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1263                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1264 
1265         if ( pr->ordered_bumped ) {
1266             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1267                             gtid ) );
1268             pr->ordered_bumped = 0;
1269         } else {
1270             UT lower = pr->u.p.ordered_lower;
1271 
1272             #ifdef KMP_DEBUG
1273             {
1274                 const char * buff;
1275                 // create format specifiers before the debug output
1276                 buff = __kmp_str_format(
1277                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1278                     traits_t< UT >::spec, traits_t< UT >::spec );
1279                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1280                 __kmp_str_free( &buff );
1281             }
1282             #endif
1283 
1284             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1285                                    USE_ITT_BUILD_ARG(NULL)
1286                                    );
1287             KMP_MB();  /* is this necessary? */
1288             #ifdef KMP_DEBUG
1289             {
1290                 const char * buff;
1291                 // create format specifiers before the debug output
1292                 buff = __kmp_str_format(
1293                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1294                     traits_t< UT >::spec, traits_t< UT >::spec );
1295                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1296                 __kmp_str_free( &buff );
1297             }
1298             #endif
1299 
1300             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1301         } // if
1302     } // if
1303     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1304 }
1305 
1306 #ifdef KMP_GOMP_COMPAT
1307 
1308 template< typename UT >
1309 static void
1310 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1311 {
1312     typedef typename traits_t< UT >::signed_t ST;
1313     kmp_info_t *th = __kmp_threads[ gtid ];
1314 
1315     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1316     if ( ! th -> th.th_team -> t.t_serialized ) {
1317 //        int cid;
1318         dispatch_private_info_template< UT > * pr =
1319             reinterpret_cast< dispatch_private_info_template< UT >* >
1320             ( th->th.th_dispatch->th_dispatch_pr_current );
1321         dispatch_shared_info_template< UT > volatile * sh =
1322             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1323             ( th->th.th_dispatch->th_dispatch_sh_current );
1324         KMP_DEBUG_ASSERT( pr );
1325         KMP_DEBUG_ASSERT( sh );
1326         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1327                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1328 
1329 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1330             UT lower = pr->u.p.ordered_lower;
1331             UT upper = pr->u.p.ordered_upper;
1332             UT inc = upper - lower + 1;
1333 
1334             if ( pr->ordered_bumped == inc ) {
1335                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1336                   gtid ) );
1337                 pr->ordered_bumped = 0;
1338             } else {
1339                 inc -= pr->ordered_bumped;
1340 
1341                 #ifdef KMP_DEBUG
1342                 {
1343                     const char * buff;
1344                     // create format specifiers before the debug output
1345                     buff = __kmp_str_format(
1346                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1347                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1348                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1349                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1350                     __kmp_str_free( &buff );
1351                 }
1352                 #endif
1353 
1354                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1355                                        USE_ITT_BUILD_ARG(NULL)
1356                                        );
1357 
1358                 KMP_MB();  /* is this necessary? */
1359                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1360                   gtid ) );
1361                 pr->ordered_bumped = 0;
1362 // TODO: check whether inc should be unsigned or signed
1363                 #ifdef KMP_DEBUG
1364                 {
1365                     const char * buff;
1366                     // create format specifiers before the debug output
1367                     buff = __kmp_str_format(
1368                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1369                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1370                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1371                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1372                     __kmp_str_free( &buff );
1373                 }
1374                 #endif
1375 
1376                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1377             }
1378 //        }
1379     }
1380     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1381 }
1382 
1383 #endif /* KMP_GOMP_COMPAT */
1384 
1385 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1386  * (no more work), then tell OMPT the loop is over. In some cases
1387  * kmp_dispatch_fini() is not called. */
1388 #if OMPT_SUPPORT && OMPT_TRACE
1389 #define OMPT_LOOP_END                                                          \
1390     if (status == 0) {                                                         \
1391         if (ompt_enabled &&                     \
1392             ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
1393             ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
1394             ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
1395             ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
1396                 team_info->parallel_id, task_info->task_id);                   \
1397         }                                                                      \
1398     }
1399 #else
1400 #define OMPT_LOOP_END // no-op
1401 #endif
1402 
1403 template< typename T >
1404 static int
1405 __kmp_dispatch_next(
1406     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1407 ) {
1408 
1409     typedef typename traits_t< T >::unsigned_t  UT;
1410     typedef typename traits_t< T >::signed_t    ST;
1411     typedef typename traits_t< T >::floating_t  DBL;
1412 
1413     // This is potentially slightly misleading: schedule(runtime) will appear here even if the actual runtime schedule
1414     // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
1415     // more than a compile-time choice to use static scheduling would.)
1416     KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1417 
1418     int                                   status;
1419     dispatch_private_info_template< T > * pr;
1420     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1421     kmp_team_t                          * team = th -> th.th_team;
1422 
1423     KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1424     #ifdef KMP_DEBUG
1425     {
1426         const char * buff;
1427         // create format specifiers before the debug output
1428         buff = __kmp_str_format(
1429             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1430             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1431         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1432         __kmp_str_free( &buff );
1433     }
1434     #endif
1435 
1436     if ( team -> t.t_serialized ) {
1437         /* NOTE: serialize this dispatch because we are not at the active level */
1438         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1439             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1440         KMP_DEBUG_ASSERT( pr );
1441 
1442         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1443             *p_lb = 0;
1444             *p_ub = 0;
1445 //            if ( p_last != NULL )
1446 //                *p_last = 0;
1447             if ( p_st != NULL )
1448                 *p_st = 0;
1449             if ( __kmp_env_consistency_check ) {
1450                 if ( pr->pushed_ws != ct_none ) {
1451                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1452                 }
1453             }
1454         } else if ( pr->nomerge ) {
1455             kmp_int32 last;
1456             T         start;
1457             UT        limit, trip, init;
1458             ST        incr;
1459             T         chunk = pr->u.p.parm1;
1460 
1461             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
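            // Serialized team: this thread owns every iteration, so chunks are handed
            // out in order simply by advancing the private chunk count.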
1462 
1463             init = chunk * pr->u.p.count++;
1464             trip = pr->u.p.tc - 1;
1465 
1466             if ( (status = (init <= trip)) == 0 ) {
1467                 *p_lb = 0;
1468                 *p_ub = 0;
1469 //                if ( p_last != NULL )
1470 //                    *p_last = 0;
1471                 if ( p_st != NULL )
1472                     *p_st = 0;
1473                 if ( __kmp_env_consistency_check ) {
1474                     if ( pr->pushed_ws != ct_none ) {
1475                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1476                     }
1477                 }
1478             } else {
1479                 start = pr->u.p.lb;
1480                 limit = chunk + init - 1;
1481                 incr  = pr->u.p.st;
1482 
1483                 if ( (last = (limit >= trip)) != 0 ) {
1484                     limit = trip;
1485                     #if KMP_OS_WINDOWS
1486                     pr->u.p.last_upper = pr->u.p.ub;
1487                     #endif /* KMP_OS_WINDOWS */
1488                 }
1489                 if ( p_last != NULL )
1490                     *p_last = last;
1491                 if ( p_st != NULL )
1492                     *p_st = incr;
1493                 if ( incr == 1 ) {
1494                     *p_lb = start + init;
1495                     *p_ub = start + limit;
1496                 } else {
1497                     *p_lb = start + init * incr;
1498                     *p_ub = start + limit * incr;
1499                 }
1500 
1501                 if ( pr->ordered ) {
1502                     pr->u.p.ordered_lower = init;
1503                     pr->u.p.ordered_upper = limit;
1504                     #ifdef KMP_DEBUG
1505                     {
1506                         const char * buff;
1507                         // create format specifiers before the debug output
1508                         buff = __kmp_str_format(
1509                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1510                             traits_t< UT >::spec, traits_t< UT >::spec );
1511                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1512                         __kmp_str_free( &buff );
1513                     }
1514                     #endif
1515                 } // if
1516             } // if
1517         } else {
1518             pr->u.p.tc = 0;
1519             *p_lb = pr->u.p.lb;
1520             *p_ub = pr->u.p.ub;
1521             #if KMP_OS_WINDOWS
1522             pr->u.p.last_upper = *p_ub;
1523             #endif /* KMP_OS_WINDOWS */
1524             if ( p_last != NULL )
1525                 *p_last = TRUE;
1526             if ( p_st != NULL )
1527                 *p_st = pr->u.p.st;
1528         } // if
1529         #ifdef KMP_DEBUG
1530         {
1531             const char * buff;
1532             // create format specifiers before the debug output
1533             buff = __kmp_str_format(
1534                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1535                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1536                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1537             KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1538             __kmp_str_free( &buff );
1539         }
1540         #endif
1541 #if INCLUDE_SSC_MARKS
1542         SSC_MARK_DISPATCH_NEXT();
1543 #endif
1544         OMPT_LOOP_END;
1545         return status;
1546     } else {
1547         kmp_int32 last = 0;
1548         dispatch_shared_info_template< UT > *sh;
1549         T         start;
1550         ST        incr;
1551         UT        limit, trip, init;
1552 
1553         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1554                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1555 
1556         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1557             ( th->th.th_dispatch->th_dispatch_pr_current );
1558         KMP_DEBUG_ASSERT( pr );
1559         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1560             ( th->th.th_dispatch->th_dispatch_sh_current );
1561         KMP_DEBUG_ASSERT( sh );
1562 
1563         if ( pr->u.p.tc == 0 ) {
1564             // zero trip count
1565             status = 0;
1566         } else {
1567             switch (pr->schedule) {
1568             #if ( KMP_STATIC_STEAL_ENABLED )
1569             case kmp_sch_static_steal:
1570                 {
1571                     T chunk = pr->u.p.parm1;
1572                     int nproc = th->th.th_team_nproc;
1573 
1574                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1575 
1576                     trip = pr->u.p.tc - 1;
1577 
1578                     if ( traits_t<T>::type_size > 4 ) {
1579                         // use lock for 8-byte and CAS for 4-byte induction
1580                         // variable. TODO (optional): check and use 16-byte CAS
1581                         kmp_lock_t * lck = th->th.th_dispatch->th_steal_lock;
1582                         KMP_DEBUG_ASSERT(lck != NULL);
1583                         if( pr->u.p.count < (UT)pr->u.p.ub ) {
1584                             __kmp_acquire_lock(lck, gtid);
1585                             // try to get own chunk of iterations
1586                             init   = ( pr->u.p.count )++;
1587                             status = ( init < (UT)pr->u.p.ub );
1588                             __kmp_release_lock(lck, gtid);
1589                         } else {
1590                             status = 0; // no own chunks
1591                         }
1592                         if( !status ) { // try to steal
1593                             kmp_info_t   **other_threads = team->t.t_threads;
1594                             int          while_limit = nproc; // nproc attempts to find a victim
1595                             int          while_index = 0;
1596                             // TODO: the algorithm for searching for a victim
1597                             // should be cleaned up and measured
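                            // Walk the threads in a ring starting from parm4, looking for
                            // a victim whose static_steal_counter matches ours, i.e. one
                            // that has initialized the same static-steal loop.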
1598                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1599                                 T remaining;
1600                                 T victimIdx    = pr->u.p.parm4;
1601                                 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1602                                 dispatch_private_info_template< T > * victim =
1603                                     reinterpret_cast< dispatch_private_info_template< T >* >
1604                                     (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1605                                 while( ( victim == NULL || victim == pr ||
1606                                     ( *(volatile T*)&victim->u.p.static_steal_counter !=
1607                                     *(volatile T*)&pr->u.p.static_steal_counter ) ) &&
1608                                     oldVictimIdx != victimIdx )
1609                                 {
1610                                     victimIdx = (victimIdx + 1) % nproc;
1611                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1612                                         (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1613                                 };
1614                                 if( !victim ||
1615                                     ( *(volatile T *)&victim->u.p.static_steal_counter !=
1616                                     *(volatile T *)&pr->u.p.static_steal_counter ) )
1617                                 {
1618                                     continue; // try once more (nproc attempts in total)
1619                                     // no victim is ready yet to participate in stealing
1620                                     // because all victims are still in kmp_init_dispatch
1621                                 }
1622                                 if( victim->u.p.count + 2 > (UT)victim->u.p.ub ) {
1623                                     pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1624                                     continue; // not enough chunks to steal, goto next victim
1625                                 }
1626 
1627                                 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1628                                 KMP_ASSERT(lck != NULL);
1629                                 __kmp_acquire_lock(lck, gtid);
1630                                 limit = victim->u.p.ub; // keep initial ub
1631                                 if( victim->u.p.count >= limit ||
1632                                     (remaining = limit - victim->u.p.count) < 2 )
1633                                 {
1634                                     __kmp_release_lock(lck, gtid);
1635                                     pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1636                                     continue; // not enough chunks to steal
1637                                 }
1638                                 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or by 1
1639                                 if( remaining > 3 ) {
1640                                     init = ( victim->u.p.ub -= (remaining>>2) ); // steal 1/4 of remaining
1641                                 } else {
1642                                     init = ( victim->u.p.ub -= 1 ); // steal 1 chunk of 2 or 3 remaining
1643                                 }
1644                                 __kmp_release_lock(lck, gtid);
1645 
1646                                 KMP_DEBUG_ASSERT(init + 1 <= limit);
1647                                 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1648                                 status = 1;
1649                                 while_index = 0;
1650                                 // now update own count and ub with the stolen range, excluding the init chunk just claimed
1651                                 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1652                                 pr->u.p.count = init + 1;
1653                                 pr->u.p.ub = limit;
1654                                 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1655                             } // while (search for victim)
1656                         } // if (try to find victim and steal)
1657                     } else {
1658                         // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1659                         typedef union {
1660                             struct {
1661                                 UT count;
1662                                 T  ub;
1663                             } p;
1664                             kmp_int64 b;
1665                         } union_i4;
1666                         // All operations on 'count' or 'ub' must be combined atomically together.
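                        // Packing the (count, ub) pair into one 64-bit word lets a single
                        // CAS claim a chunk while observing the current upper bound.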
1667                         {
1668                             union_i4 vold, vnew;
1669                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1670                             vnew = vold;
1671                             vnew.p.count++;
1672                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1673                                         ( volatile kmp_int64* )&pr->u.p.count,
1674                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1675                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1676                                 KMP_CPU_PAUSE();
1677                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1678                                 vnew = vold;
1679                                 vnew.p.count++;
1680                             }
1681                             vnew = vold;
1682                             init   = vnew.p.count;
1683                             status = ( init < (UT)vnew.p.ub ) ;
1684                         }
1685 
1686                         if( !status ) {
1687                             kmp_info_t   **other_threads = team->t.t_threads;
1688                             int          while_limit = nproc; // nproc attempts to find a victim
1689                             int          while_index = 0;
1690 
1691                             // TODO: the algorithm for searching for a victim
1692                             // should be cleaned up and measured
1693                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1694                                 union_i4  vold, vnew;
1695                                 kmp_int32 remaining;
1696                                 T         victimIdx    = pr->u.p.parm4;
1697                                 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1698                                 dispatch_private_info_template< T > * victim =
1699                                     reinterpret_cast< dispatch_private_info_template< T >* >
1700                                     (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1701                                 while( (victim == NULL || victim == pr ||
1702                                     (*(volatile T*)&victim->u.p.static_steal_counter !=
1703                                     *(volatile T*)&pr->u.p.static_steal_counter)) &&
1704                                     oldVictimIdx != victimIdx )
1705                                 {
1706                                     victimIdx = (victimIdx + 1) % nproc;
1707                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1708                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1709                                 };
1710                                 if( !victim ||
1711                                     ( *(volatile T *)&victim->u.p.static_steal_counter !=
1712                                     *(volatile T *)&pr->u.p.static_steal_counter ) )
1713                                 {
1714                                     continue; // try once more (nproc attempts in total)
1715                                     // no victim is ready yet to participate in stealing
1716                                     // because all victims are still in kmp_init_dispatch
1717                                 }
1718                                 pr->u.p.parm4 = victimIdx; // new victim found
1719                                 while( 1 ) { // CAS loop if victim has enough chunks to steal
1720                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1721                                     vnew = vold;
1722 
1723                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1724                                     if ( vnew.p.count >= (UT)vnew.p.ub ||
1725                                         (remaining = vnew.p.ub - vnew.p.count) < 2 )
1726                                     {
1727                                         pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1728                                         break; // not enough chunks to steal, goto next victim
1729                                     }
1730                                     if( remaining > 3 ) {
1731                                         vnew.p.ub -= (remaining>>2); // try to steal 1/4 of remaining
1732                                     } else {
1733                                         vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1734                                     }
1735                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1736                                     // TODO: Should this be acquire or release?
1737                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1738                                             ( volatile kmp_int64 * )&victim->u.p.count,
1739                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1740                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1741                                         // stealing succeeded
1742                                         status = 1;
1743                                         while_index = 0;
1744                                         // now update own count and ub
1745                                         init = vnew.p.ub;
1746                                         vold.p.count = init + 1;
1747                                         #if KMP_ARCH_X86
1748                                         KMP_XCHG_FIXED64(( volatile kmp_int64 * )(&pr->u.p.count), vold.b);
1749                                         #else
1750                                         *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1751                                         #endif
1752                                         break;
1753                                     } // if (check CAS result)
1754                                     KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1755                                 } // while (try to steal from particular victim)
1756                             } // while (search for victim)
1757                         } // if (try to find victim and steal)
1758                     } // if (4-byte induction variable)
1759                     if ( !status ) {
1760                         *p_lb = 0;
1761                         *p_ub = 0;
1762                         if ( p_st != NULL ) *p_st = 0;
1763                     } else {
1764                         start = pr->u.p.parm2;
1765                         init *= chunk;
1766                         limit = chunk + init - 1;
1767                         incr  = pr->u.p.st;
1768 
1769                         KMP_DEBUG_ASSERT(init <= trip);
1770                         if ( (last = (limit >= trip)) != 0 )
1771                             limit = trip;
1772                         if ( p_st != NULL ) *p_st = incr;
1773 
1774                         if ( incr == 1 ) {
1775                             *p_lb = start + init;
1776                             *p_ub = start + limit;
1777                         } else {
1778                             *p_lb = start + init * incr;
1779                             *p_ub = start + limit * incr;
1780                         }
1781 
1782                         if ( pr->ordered ) {
1783                             pr->u.p.ordered_lower = init;
1784                             pr->u.p.ordered_upper = limit;
1785                             #ifdef KMP_DEBUG
1786                             {
1787                                 const char * buff;
1788                                 // create format specifiers before the debug output
1789                                 buff = __kmp_str_format(
1790                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1791                                     traits_t< UT >::spec, traits_t< UT >::spec );
1792                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1793                                 __kmp_str_free( &buff );
1794                             }
1795                             #endif
1796                         } // if
1797                     } // if
1798                     break;
1799                 } // case
1800             #endif // ( KMP_STATIC_STEAL_ENABLED )
1801             case kmp_sch_static_balanced:
1802                 {
1803                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1804                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1805                         pr->u.p.count = 1;
1806                         *p_lb = pr->u.p.lb;
1807                         *p_ub = pr->u.p.ub;
1808                         last = pr->u.p.parm1;
1809                         if ( p_st != NULL )
1810                             *p_st = pr->u.p.st;
1811                     } else {  /* no iterations to do */
1812                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1813                     }
1814                     if ( pr->ordered ) {
1815                         #ifdef KMP_DEBUG
1816                         {
1817                             const char * buff;
1818                             // create format specifiers before the debug output
1819                             buff = __kmp_str_format(
1820                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1821                                 traits_t< UT >::spec, traits_t< UT >::spec );
1822                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1823                             __kmp_str_free( &buff );
1824                         }
1825                         #endif
1826                     } // if
1827                 } // case
1828                 break;
1829             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1830             case kmp_sch_static_chunked:
1831                 {
1832                     T parm1;
1833 
1834                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
1835                                    gtid ) );
1836                     parm1 = pr->u.p.parm1;
1837 
1838                     trip  = pr->u.p.tc - 1;
1839                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
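                    // Chunks are assigned cyclically: thread tid takes chunks tid,
                    // tid+nproc, tid+2*nproc, ... (count advances by nproc below).
                    // Illustration: with chunk size 4 and nproc 3, thread 1 would get
                    // iterations [4..7], then [16..19], and so on.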
1840 
1841                     if ( (status = (init <= trip)) != 0 ) {
1842                         start = pr->u.p.lb;
1843                         incr  = pr->u.p.st;
1844                         limit = parm1 + init - 1;
1845 
1846                         if ( (last = (limit >= trip)) != 0 )
1847                             limit = trip;
1848 
1849                         if ( p_st != NULL ) *p_st = incr;
1850 
1851                         pr->u.p.count += th->th.th_team_nproc;
1852 
1853                         if ( incr == 1 ) {
1854                             *p_lb = start + init;
1855                             *p_ub = start + limit;
1856                         }
1857                         else {
1858                             *p_lb = start + init * incr;
1859                             *p_ub = start + limit * incr;
1860                         }
1861 
1862                         if ( pr->ordered ) {
1863                             pr->u.p.ordered_lower = init;
1864                             pr->u.p.ordered_upper = limit;
1865                             #ifdef KMP_DEBUG
1866                             {
1867                                 const char * buff;
1868                                 // create format specifiers before the debug output
1869                                 buff = __kmp_str_format(
1870                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1871                                     traits_t< UT >::spec, traits_t< UT >::spec );
1872                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1873                                 __kmp_str_free( &buff );
1874                             }
1875                             #endif
1876                         } // if
1877                     } // if
1878                 } // case
1879                 break;
1880 
1881             case kmp_sch_dynamic_chunked:
1882                 {
1883                     T chunk = pr->u.p.parm1;
1884 
1885                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1886                                    gtid ) );
1887 
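                    // Atomically claim the next chunk index from the shared iteration
                    // counter; the starting iteration is index*chunk.  Illustration:
                    // with chunk 5, the claim that returns index 2 covers iterations
                    // [10..14] (provided that is within the trip count).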
1888                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1889                     trip = pr->u.p.tc - 1;
1890 
1891                     if ( (status = (init <= trip)) == 0 ) {
1892                         *p_lb = 0;
1893                         *p_ub = 0;
1894                         if ( p_st != NULL ) *p_st = 0;
1895                     } else {
1896                         start = pr->u.p.lb;
1897                         limit = chunk + init - 1;
1898                         incr  = pr->u.p.st;
1899 
1900                         if ( (last = (limit >= trip)) != 0 )
1901                             limit = trip;
1902 
1903                         if ( p_st != NULL ) *p_st = incr;
1904 
1905                         if ( incr == 1 ) {
1906                             *p_lb = start + init;
1907                             *p_ub = start + limit;
1908                         } else {
1909                             *p_lb = start + init * incr;
1910                             *p_ub = start + limit * incr;
1911                         }
1912 
1913                         if ( pr->ordered ) {
1914                             pr->u.p.ordered_lower = init;
1915                             pr->u.p.ordered_upper = limit;
1916                             #ifdef KMP_DEBUG
1917                             {
1918                                 const char * buff;
1919                                 // create format specifiers before the debug output
1920                                 buff = __kmp_str_format(
1921                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1922                                     traits_t< UT >::spec, traits_t< UT >::spec );
1923                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1924                                 __kmp_str_free( &buff );
1925                             }
1926                             #endif
1927                         } // if
1928                     } // if
1929                 } // case
1930                 break;
1931 
1932             case kmp_sch_guided_iterative_chunked:
1933                 {
1934                     T  chunkspec = pr->u.p.parm1;
1935                     KD_TRACE(100,
1936                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1937                     trip  = pr->u.p.tc;
1938                     // Start atomic part of calculations
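                    // The loop below exits in one of three ways: no iterations remain
                    // (status 0); few enough remain that a dynamic-style chunk of at most
                    // chunkspec iterations is taken via test_then_add; or a guided-sized
                    // chunk is claimed by CAS-ing the shared iteration counter forward.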
1939                     while(1) {
1940                         ST  remaining;             // signed, because can be < 0
1941                         init = sh->u.s.iteration;  // shared value
1942                         remaining = trip - init;
1943                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1944                             // nothing to do, don't try atomic op
1945                             status = 0;
1946                             break;
1947                         }
1948                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1949                             // use dynamic-style schedule
1950                             // atomically increment iterations, get old value
1951                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1952                             remaining = trip - init;
1953                             if (remaining <= 0) {
1954                                 status = 0;    // all iterations got by other threads
1955                             } else {
1956                                 // got some iterations to work on
1957                                 status = 1;
1958                                 if ( (T)remaining > chunkspec ) {
1959                                     limit = init + chunkspec - 1;
1960                                 } else {
1961                                     last = 1;   // the last chunk
1962                                     limit = init + remaining - 1;
1963                                 } // if
1964                             } // if
1965                             break;
1966                         } // if
1967                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1968                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1969                             // CAS was successful, chunk obtained
1970                             status = 1;
1971                             --limit;
1972                             break;
1973                         } // if
1974                     } // while
1975                     if ( status != 0 ) {
1976                         start = pr->u.p.lb;
1977                         incr = pr->u.p.st;
1978                         if ( p_st != NULL )
1979                             *p_st = incr;
1980                         *p_lb = start + init * incr;
1981                         *p_ub = start + limit * incr;
1982                         if ( pr->ordered ) {
1983                             pr->u.p.ordered_lower = init;
1984                             pr->u.p.ordered_upper = limit;
1985                             #ifdef KMP_DEBUG
1986                             {
1987                                 const char * buff;
1988                                 // create format specifiers before the debug output
1989                                 buff = __kmp_str_format(
1990                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1991                                     traits_t< UT >::spec, traits_t< UT >::spec );
1992                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1993                                 __kmp_str_free( &buff );
1994                             }
1995                             #endif
1996                         } // if
1997                     } else {
1998                         *p_lb = 0;
1999                         *p_ub = 0;
2000                         if ( p_st != NULL )
2001                             *p_st = 0;
2002                     } // if
2003                 } // case
2004                 break;
2005 
2006             case kmp_sch_guided_analytical_chunked:
2007                 {
2008                     T   chunkspec = pr->u.p.parm1;
2009                     UT chunkIdx;
2010     #if KMP_OS_WINDOWS && KMP_ARCH_X86
2011                     /* for storing the original FPCW value for Windows* OS on
2012                        IA-32 architecture, 8-byte version */
2013                     unsigned int oldFpcw;
2014                     unsigned int fpcwSet = 0;
2015     #endif
2016                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
2017                                    gtid ) );
2018 
2019                     trip  = pr->u.p.tc;
2020 
2021                     KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2022                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < trip);
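                    // parm2 is the cross-over chunk index: below it, chunk boundaries come
                    // from the analytically computed remaining-iteration estimate (using the
                    // factor stored in parm3); at or above it, the schedule falls back to
                    // plain chunks of chunkspec iterations.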
2023 
2024                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
2025                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
2026                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
2027                             --trip;
2028                             /* use dynamic-style scheduling */
2029                             init = chunkIdx * chunkspec + pr->u.p.count;
2030                             /* need to verify init > 0 in case of overflow in the above calculation */
2031                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
2032                                 limit = init + chunkspec -1;
2033 
2034                                 if ( (last = (limit >= trip)) != 0 )
2035                                     limit = trip;
2036                             }
2037                             break;
2038                         } else {
2039                             /* use exponential-style scheduling */
2040                             /* The following check is to work around the lack of long double precision on Windows* OS.
2041                                It handles the possible effect that init != 0 for chunkIdx == 0.
2042                              */
2043     #if KMP_OS_WINDOWS && KMP_ARCH_X86
2044                             /* If we haven't already done so, save the original
2045                                FPCW and set precision to 64-bit, as Windows* OS
2046                                on IA-32 architecture defaults to 53-bit */
2047                             if ( !fpcwSet ) {
2048                                 oldFpcw = _control87(0,0);
2049                                 _control87(_PC_64,_MCW_PC);
2050                                 fpcwSet = 0x30000;
2051                             }
2052     #endif
2053                             if ( chunkIdx ) {
2054                                 init = __kmp_dispatch_guided_remaining< T >(
2055                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
2056                                 KMP_DEBUG_ASSERT(init);
2057                                 init = trip - init;
2058                             } else
2059                                 init = 0;
2060                             limit = trip - __kmp_dispatch_guided_remaining< T >(
2061                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
2062                             KMP_ASSERT(init <= limit);
2063                             if ( init < limit ) {
2064                                 KMP_DEBUG_ASSERT(limit <= trip);
2065                                 --limit;
2066                                 status = 1;
2067                                 break;
2068                             } // if
2069                         } // if
2070                     } // while (1)
2071     #if KMP_OS_WINDOWS && KMP_ARCH_X86
2072                     /* restore FPCW if necessary
2073                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2074                     */
2075                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2076                         _control87(oldFpcw,_MCW_PC);
2077     #endif
2078                     if ( status != 0 ) {
2079                         start = pr->u.p.lb;
2080                         incr = pr->u.p.st;
2081                         if ( p_st != NULL )
2082                             *p_st = incr;
2083                         *p_lb = start + init * incr;
2084                         *p_ub = start + limit * incr;
2085                         if ( pr->ordered ) {
2086                             pr->u.p.ordered_lower = init;
2087                             pr->u.p.ordered_upper = limit;
2088                             #ifdef KMP_DEBUG
2089                             {
2090                                 const char * buff;
2091                                 // create format specifiers before the debug output
2092                                 buff = __kmp_str_format(
2093                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2094                                     traits_t< UT >::spec, traits_t< UT >::spec );
2095                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2096                                 __kmp_str_free( &buff );
2097                             }
2098                             #endif
2099                         }
2100                     } else {
2101                         *p_lb = 0;
2102                         *p_ub = 0;
2103                         if ( p_st != NULL )
2104                             *p_st = 0;
2105                     }
2106                 } // case
2107                 break;
2108 
2109             case kmp_sch_trapezoidal:
2110                 {
2111                     UT   index;
2112                     T    parm2 = pr->u.p.parm2;
2113                     T    parm3 = pr->u.p.parm3;
2114                     T    parm4 = pr->u.p.parm4;
2115                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2116                                    gtid ) );
2117 
2118                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2119 
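                    // Chunk sizes form a decreasing arithmetic series (first chunk parm2,
                    // shrinking by parm4 each time, at most parm3 chunks, as implied by the
                    // formulas below); init and limit are partial sums of that series.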
2120                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2121                     trip = pr->u.p.tc - 1;
2122 
2123                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2124                         *p_lb = 0;
2125                         *p_ub = 0;
2126                         if ( p_st != NULL ) *p_st = 0;
2127                     } else {
2128                         start = pr->u.p.lb;
2129                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2130                         incr  = pr->u.p.st;
2131 
2132                         if ( (last = (limit >= trip)) != 0 )
2133                             limit = trip;
2134 
2135                         if ( p_st != NULL ) *p_st = incr;
2136 
2137                         if ( incr == 1 ) {
2138                             *p_lb = start + init;
2139                             *p_ub = start + limit;
2140                         } else {
2141                             *p_lb = start + init * incr;
2142                             *p_ub = start + limit * incr;
2143                         }
2144 
2145                         if ( pr->ordered ) {
2146                             pr->u.p.ordered_lower = init;
2147                             pr->u.p.ordered_upper = limit;
2148                             #ifdef KMP_DEBUG
2149                             {
2150                                 const char * buff;
2151                                 // create format specifiers before the debug output
2152                                 buff = __kmp_str_format(
2153                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2154                                     traits_t< UT >::spec, traits_t< UT >::spec );
2155                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2156                                 __kmp_str_free( &buff );
2157                             }
2158                             #endif
2159                         } // if
2160                     } // if
2161                 } // case
2162                 break;
2163             default:
2164                 {
2165                     status = 0; // to avoid complaints on uninitialized variable use
2166                     __kmp_msg(
2167                         kmp_ms_fatal,                        // Severity
2168                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2169                         KMP_HNT( GetNewerLibrary ),          // Hint
2170                         __kmp_msg_null                       // Variadic argument list terminator
2171                     );
2172                 }
2173                 break;
2174             } // switch
2175         } // if tc == 0;
2176 
2177         if ( status == 0 ) {
2178             UT   num_done;
2179 
2180             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2181             #ifdef KMP_DEBUG
2182             {
2183                 const char * buff;
2184                 // create format specifiers before the debug output
2185                 buff = __kmp_str_format(
2186                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2187                     traits_t< UT >::spec );
2188                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2189                 __kmp_str_free( &buff );
2190             }
2191             #endif
2192 
2193             if ( (ST)num_done == th->th.th_team_nproc - 1 ) {
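                // This thread was the last one to finish the current loop, so it is
                // responsible for resetting the shared buffer for reuse.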
2194                 #if ( KMP_STATIC_STEAL_ENABLED )
2195                 if( pr->schedule == kmp_sch_static_steal && traits_t<T>::type_size > 4 ) {
2196                     int i;
2197                     kmp_info_t **other_threads = team->t.t_threads;
2198                     // loop complete, safe to destroy locks used for stealing
2199                     for( i = 0; i < th->th.th_team_nproc; ++i ) {
2200                         kmp_lock_t * lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2201                         KMP_ASSERT(lck != NULL);
2202                         __kmp_destroy_lock( lck );
2203                         __kmp_free( lck );
2204                         other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2205                     }
2206                 }
2207                 #endif
2208                 /* NOTE: release this buffer to be reused */
2209 
2210                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2211 
2212                 sh->u.s.num_done = 0;
2213                 sh->u.s.iteration = 0;
2214 
2215                 /* TODO replace with general release procedure? */
2216                 if ( pr->ordered ) {
2217                     sh->u.s.ordered_iteration = 0;
2218                 }
2219 
2220                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2221 
2222                 sh -> buffer_index += __kmp_dispatch_num_buffers;
2223                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2224                                 gtid, sh->buffer_index) );
2225 
2226                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2227 
2228             } // if
2229             if ( __kmp_env_consistency_check ) {
2230                 if ( pr->pushed_ws != ct_none ) {
2231                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2232                 }
2233             }
2234 
2235             th -> th.th_dispatch -> th_deo_fcn = NULL;
2236             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2237             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2238             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2239         } // if (status == 0)
2240 #if KMP_OS_WINDOWS
2241         else if ( last ) {
2242             pr->u.p.last_upper = pr->u.p.ub;
2243         }
2244 #endif /* KMP_OS_WINDOWS */
2245         if ( p_last != NULL && status != 0 )
2246             *p_last = last;
2247     } // if
2248 
2249     #ifdef KMP_DEBUG
2250     {
2251         const char * buff;
2252         // create format specifiers before the debug output
2253         buff = __kmp_str_format(
2254             "__kmp_dispatch_next: T#%%d normal case: " \
2255             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2256             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2257         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2258         __kmp_str_free( &buff );
2259     }
2260     #endif
2261 #if INCLUDE_SSC_MARKS
2262     SSC_MARK_DISPATCH_NEXT();
2263 #endif
2264     OMPT_LOOP_END;
2265     return status;
2266 }
2267 
2268 template< typename T >
2269 static void
2270 __kmp_dist_get_bounds(
2271     ident_t                          *loc,
2272     kmp_int32                         gtid,
2273     kmp_int32                        *plastiter,
2274     T                                *plower,
2275     T                                *pupper,
2276     typename traits_t< T >::signed_t  incr
2277 ) {
2278     typedef typename traits_t< T >::unsigned_t  UT;
2279     typedef typename traits_t< T >::signed_t    ST;
2280     register kmp_uint32  team_id;
2281     register kmp_uint32  nteams;
2282     register UT          trip_count;
2283     register kmp_team_t *team;
2284     kmp_info_t * th;
2285 
2286     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2287     KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2288     #ifdef KMP_DEBUG
2289     {
2290         const char * buff;
2291         // create format specifiers before the debug output
2292         buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2293             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2294             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2295             traits_t< T >::spec );
2296         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2297         __kmp_str_free( &buff );
2298     }
2299     #endif
2300 
2301     if( __kmp_env_consistency_check ) {
2302         if( incr == 0 ) {
2303             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2304         }
2305         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2306             // The loop is illegal.
2307             // Some zero-trip loops maintained by compiler, e.g.:
2308             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2309             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2310             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2311             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2312             // Compiler does not check the following illegal loops:
2313             //   for(i=0;i<10;i+=incr) // where incr<0
2314             //   for(i=10;i>0;i-=incr) // where incr<0
2315             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2316         }
2317     }
2318     th = __kmp_threads[gtid];
2319     team = th->th.th_team;
2320     #if OMP_40_ENABLED
2321     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2322     nteams = th->th.th_teams_size.nteams;
2323     #endif
2324     team_id = team->t.t_master_tid;
2325     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2326 
2327     // compute global trip count
2328     if( incr == 1 ) {
2329         trip_count = *pupper - *plower + 1;
2330     } else if(incr == -1) {
2331         trip_count = *plower - *pupper + 1;
2332     } else if ( incr > 0 ) {
2333         // upper-lower can exceed the limit of signed type
2334         trip_count = (UT)(*pupper - *plower) / incr + 1;
2335     } else {
2336         trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1;
2337     }
2338 
2339     if( trip_count <= nteams ) {
2340         KMP_DEBUG_ASSERT(
2341             __kmp_static == kmp_sch_static_greedy || \
2342             __kmp_static == kmp_sch_static_balanced
2343         ); // Unknown static scheduling type.
2344         // only some teams get single iteration, others get nothing
2345         if( team_id < trip_count ) {
2346             *pupper = *plower = *plower + team_id * incr;
2347         } else {
2348             *plower = *pupper + incr; // zero-trip loop
2349         }
2350         if( plastiter != NULL )
2351             *plastiter = ( team_id == trip_count - 1 );
2352     } else {
2353         if( __kmp_static == kmp_sch_static_balanced ) {
2354             register UT chunk = trip_count / nteams;
2355             register UT extras = trip_count % nteams;
2356             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2357             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
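            // Illustration: trip_count=10, nteams=4, incr=1 gives chunk=2, extras=2;
            // teams 0 and 1 get 3 iterations each, teams 2 and 3 get 2 each.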
2358             if( plastiter != NULL )
2359                 *plastiter = ( team_id == nteams - 1 );
2360         } else {
2361             register T chunk_inc_count =
2362                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2363             register T upper = *pupper;
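            // Each team gets ceil(trip_count/nteams) iterations, scaled by incr; the
            // bounds are clamped below so the last team does not run past the original upper bound.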
2364             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2365                 // Unknown static scheduling type.
2366             *plower += team_id * chunk_inc_count;
2367             *pupper = *plower + chunk_inc_count - incr;
2368             // Check/correct bounds if needed
2369             if( incr > 0 ) {
2370                 if( *pupper < *plower )
2371                     *pupper = traits_t<T>::max_value;
2372                 if( plastiter != NULL )
2373                     *plastiter = *plower <= upper && *pupper > upper - incr;
2374                 if( *pupper > upper )
2375                     *pupper = upper; // tracker C73258
2376             } else {
2377                 if( *pupper > *plower )
2378                     *pupper = traits_t<T>::min_value;
2379                 if( plastiter != NULL )
2380                     *plastiter = *plower >= upper && *pupper < upper - incr;
2381                 if( *pupper < upper )
2382                     *pupper = upper; // tracker C73258
2383             }
2384         }
2385     }
2386 }
2387 
2388 //-----------------------------------------------------------------------------------------
2389 // Dispatch routines
2390 //    Transfer call to template< type T >
2391 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2392 //                         T lb, T ub, ST st, ST chunk )
2393 extern "C" {
2394 
2395 /*!
2396 @ingroup WORK_SHARING
2397 @{
2398 @param loc Source location
2399 @param gtid Global thread id
2400 @param schedule Schedule type
2401 @param lb  Lower bound
2402 @param ub  Upper bound
2403 @param st  Step (or increment if you prefer)
2404 @param chunk The chunk size to block with
2405 
2406 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2407 These functions are all identical apart from the types of the arguments.
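
A typical compiler-generated usage pattern looks roughly like the following sketch
(illustrative only; n, chunk and body() are hypothetical names):
@code
    kmp_int32 lb, ub, st, last;
    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, chunk );
    while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st )
            body( i );
    }
@endcode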
2408 */
2409 
2410 void
2411 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2412                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2413 {
2414     KMP_DEBUG_ASSERT( __kmp_init_serial );
2415     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2416 }
2417 /*!
2418 See @ref __kmpc_dispatch_init_4
2419 */
2420 void
2421 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2422                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2423 {
2424     KMP_DEBUG_ASSERT( __kmp_init_serial );
2425     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2426 }
2427 
2428 /*!
2429 See @ref __kmpc_dispatch_init_4
2430 */
2431 void
2432 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2433                         kmp_int64 lb, kmp_int64 ub,
2434                         kmp_int64 st, kmp_int64 chunk )
2435 {
2436     KMP_DEBUG_ASSERT( __kmp_init_serial );
2437     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2438 }
2439 
2440 /*!
2441 See @ref __kmpc_dispatch_init_4
2442 */
2443 void
2444 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2445                          kmp_uint64 lb, kmp_uint64 ub,
2446                          kmp_int64 st, kmp_int64 chunk )
2447 {
2448     KMP_DEBUG_ASSERT( __kmp_init_serial );
2449     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2450 }
2451 
/*!
See @ref __kmpc_dispatch_init_4

These functions differ from the __kmpc_dispatch_init set in that they are called
for the composite distribute parallel for construct. Before the regular iterations
are dispatched, the per-team iteration space therefore has to be computed.

These functions are all identical apart from the types of the arguments.
*/
void
__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}
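
// Illustrative note (not part of the runtime API itself): these entry points serve
// a source-level composite construct such as
//     #pragma omp distribute parallel for schedule(dynamic[, chunk])
// __kmp_dist_get_bounds() first narrows [lb, ub] to the calling team's share of the
// iteration space, and only then is the regular dynamic dispatch machinery
// initialized with the narrowed bounds via __kmp_dispatch_init().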

void
__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

void
__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
    kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
{
    KMP_DEBUG_ASSERT( __kmp_init_serial );
    __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
}

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
*/
int
__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}
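
/*
Illustrative sketch only (not part of the runtime): roughly how a compiler might
lower "#pragma omp for schedule(dynamic, 4)" around
"for (kmp_int32 i = 0; i < n; ++i) body(i);" using the entry points above.
It assumes inclusive bounds, the kmp_sch_dynamic_chunked schedule value, and a
hypothetical body() function; the exact code emitted is up to the compiler.

    kmp_int32 lb, ub, st, last;
    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4 );
    while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; ++i )
            body( i );                       // hypothetical loop body
    }
    // For ordered loops the compiler is also expected to call
    // __kmpc_dispatch_fini_4() at the end of each chunk.
*/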

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
{
    return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int
__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                        kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
{
    return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void
__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void
__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
}
/*! @} */

//-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other source files

kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value == checker;
}

kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
    return value != checker;
}

kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
    return value < checker;
}

kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
    return value >= checker;
}

kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
    return value <= checker;
}

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
                   kmp_uint32            checker,
                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 ),
                   void                * obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint32         * spin          = spinner;
    register          kmp_uint32           check         = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
    register          kmp_uint32           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = TCR_4(*spin), check)) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
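
/*
Illustrative sketch only: a typical use of __kmp_wait_yield_4() pairs it with one
of the small predicates above, e.g. spinning until a (hypothetical) release flag
becomes 1:

    volatile kmp_uint32 go_flag = 0;   // set to 1 by another thread
    ...
    (void) __kmp_wait_yield_4( &go_flag, 1, __kmp_eq_4, NULL );

The predicate is invoked as pred( *spinner, checker ) on each iteration, and the
loop yields (KMP_YIELD / KMP_YIELD_SPIN) when oversubscribed or after spinning
for a while, so waiting threads do not monopolize a core.
*/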

void
__kmp_wait_yield_4_ptr(void *spinner,
                   kmp_uint32 checker,
                   kmp_uint32 (*pred)( void *, kmp_uint32 ),
                   void        *obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register void                *spin          = spinner;
    register kmp_uint32           check         = checker;
    register kmp_uint32           spins;
    register kmp_uint32 (*f) ( void *, kmp_uint32 ) = pred;

    KMP_FSYNC_SPIN_INIT( obj, spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while ( !f( spin, check ) ) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
