1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
19  * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
20  *       it may change between parallel regions.  __kmp_max_nth
21  *       is the largest value __kmp_nth may take; 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 #include "kmp.h"
29 #include "kmp_i18n.h"
30 #include "kmp_itt.h"
31 #include "kmp_str.h"
32 #include "kmp_error.h"
33 #include "kmp_stats.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35     #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-internal.h"
40 #include "ompt-specific.h"
41 #endif
42 
43 /* ------------------------------------------------------------------------ */
44 /* ------------------------------------------------------------------------ */
45 
46 // template for type limits
47 template< typename T >
48 struct i_maxmin {
49     static const T mx;
50     static const T mn;
51 };
52 template<>
53 struct i_maxmin< int > {
54     static const int mx = 0x7fffffff;
55     static const int mn = 0x80000000;
56 };
57 template<>
58 struct i_maxmin< unsigned int > {
59     static const unsigned int mx = 0xffffffff;
60     static const unsigned int mn = 0x00000000;
61 };
62 template<>
63 struct i_maxmin< long long > {
64     static const long long mx = 0x7fffffffffffffffLL;
65     static const long long mn = 0x8000000000000000LL;
66 };
67 template<>
68 struct i_maxmin< unsigned long long > {
69     static const unsigned long long mx = 0xffffffffffffffffULL;
70     static const unsigned long long mn = 0x0000000000000000ULL;
71 };
72 //-------------------------------------------------------------------------
73 
74 #ifdef KMP_STATIC_STEAL_ENABLED
75 
76     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77     template< typename T >
78     struct dispatch_private_infoXX_template {
79         typedef typename traits_t< T >::unsigned_t  UT;
80         typedef typename traits_t< T >::signed_t    ST;
81         UT count;                // unsigned
82         T  ub;
83         /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84         T  lb;
85         ST st;                   // signed
86         UT tc;                   // unsigned
87         T  static_steal_counter; // for static_steal only; maybe better to put after ub
88 
89         /* parm[1-4] are used in different ways by different scheduling algorithms */
90 
91         // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92         //    a) parm3 is properly aligned and
93         //    b) all parm1-4 are in the same cache line.
94         // Because parm1-4 are used together, performance seems to be better
95         // if they are in the same cache line (not measured though).
96 
97         struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98             T  parm1;
99             T  parm2;
100             T  parm3;
101             T  parm4;
102         };
103 
104         UT ordered_lower; // unsigned
105         UT ordered_upper; // unsigned
106         #if KMP_OS_WINDOWS
107         T  last_upper;
108         #endif /* KMP_OS_WINDOWS */
109     };
110 
111 #else /* KMP_STATIC_STEAL_ENABLED */
112 
113     // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114     template< typename T >
115     struct dispatch_private_infoXX_template {
116         typedef typename traits_t< T >::unsigned_t  UT;
117         typedef typename traits_t< T >::signed_t    ST;
118         T  lb;
119         T  ub;
120         ST st;            // signed
121         UT tc;            // unsigned
122 
123         T  parm1;
124         T  parm2;
125         T  parm3;
126         T  parm4;
127 
128         UT count;         // unsigned
129 
130         UT ordered_lower; // unsigned
131         UT ordered_upper; // unsigned
132         #if KMP_OS_WINDOWS
133         T  last_upper;
134         #endif /* KMP_OS_WINDOWS */
135     };
136 
137 #endif /* KMP_STATIC_STEAL_ENABLED */
138 
139 // replaces dispatch_private_info structure and dispatch_private_info_t type
140 template< typename T >
141 struct KMP_ALIGN_CACHE dispatch_private_info_template {
142     // duplicate alignment here, otherwise the size of the structure is not computed correctly by our compiler
143     union KMP_ALIGN_CACHE private_info_tmpl {
144         dispatch_private_infoXX_template< T > p;
145         dispatch_private_info64_t             p64;
146     } u;
147     enum sched_type schedule;  /* scheduling algorithm */
148     kmp_uint32      ordered;   /* ordered clause specified */
149     kmp_uint32      ordered_bumped;
150     kmp_int32   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size
151     dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152     kmp_uint32      nomerge;   /* don't merge iters if serialized */
153     kmp_uint32      type_size;
154     enum cons_type  pushed_ws;
155 };
156 
157 
158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159 template< typename UT >
160 struct dispatch_shared_infoXX_template {
161     /* chunk index under dynamic, number of idle threads under static-steal;
162        iteration index otherwise */
163     volatile UT     iteration;
164     volatile UT     num_done;
165     volatile UT     ordered_iteration;
166     UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
167 };
168 
169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
170 template< typename UT >
171 struct dispatch_shared_info_template {
172     // we need a union here to keep the structure size
173     union shared_info_tmpl {
174         dispatch_shared_infoXX_template< UT >  s;
175         dispatch_shared_info64_t               s64;
176     } u;
177     volatile kmp_uint32     buffer_index;
178 };
179 
180 /* ------------------------------------------------------------------------ */
181 /* ------------------------------------------------------------------------ */
182 
183 #undef USE_TEST_LOCKS
184 
185 // test_then_add template (general template should NOT be used)
186 template< typename T >
187 static __forceinline T
188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
189 
190 template<>
191 __forceinline kmp_int32
192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193 {
194     kmp_int32 r;
195     r = KMP_TEST_THEN_ADD32( p, d );
196     return r;
197 }
198 
199 template<>
200 __forceinline kmp_int64
201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202 {
203     kmp_int64 r;
204     r = KMP_TEST_THEN_ADD64( p, d );
205     return r;
206 }
207 
208 // test_then_inc_acq template (general template should NOT be used)
209 template< typename T >
210 static __forceinline T
211 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
212 
213 template<>
214 __forceinline kmp_int32
215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216 {
217     kmp_int32 r;
218     r = KMP_TEST_THEN_INC_ACQ32( p );
219     return r;
220 }
221 
222 template<>
223 __forceinline kmp_int64
224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225 {
226     kmp_int64 r;
227     r = KMP_TEST_THEN_INC_ACQ64( p );
228     return r;
229 }
230 
231 // test_then_inc template (general template should NOT be used)
232 template< typename T >
233 static __forceinline T
234 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
235 
236 template<>
237 __forceinline kmp_int32
238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239 {
240     kmp_int32 r;
241     r = KMP_TEST_THEN_INC32( p );
242     return r;
243 }
244 
245 template<>
246 __forceinline kmp_int64
247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248 {
249     kmp_int64 r;
250     r = KMP_TEST_THEN_INC64( p );
251     return r;
252 }
253 
254 // compare_and_swap template (general template should NOT be used)
255 template< typename T >
256 static __forceinline kmp_int32
257 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
258 
259 template<>
260 __forceinline kmp_int32
261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262 {
263     return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264 }
265 
266 template<>
267 __forceinline kmp_int32
268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269 {
270     return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271 }
272 
273 /*
274     Spin wait loop that first does pause, then yield.
275     Waits until function returns non-zero when called with *spinner and check.
276     Does NOT put threads to sleep.
277 #if USE_ITT_BUILD
278     Arguments:
279         obj -- the higher-level synchronization object to report to ittnotify. It is used to report
280             locks consistently. For example, if a lock is acquired immediately, its address is
281             reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
282             immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
283             address, not the address of the low-level spinner.
284 #endif // USE_ITT_BUILD
285 */
286 template< typename UT >
287 // ToDo: make inline function (move to header file for icl)
288 static UT  // unsigned 4- or 8-byte type
289 __kmp_wait_yield( volatile UT * spinner,
290                   UT            checker,
291                   kmp_uint32 (* pred)( UT, UT )
292                   USE_ITT_BUILD_ARG(void        * obj)    // Higher-level synchronization object, or NULL.
293                   )
294 {
295     // note: we may not belong to a team at this point
296     register volatile UT         * spin          = spinner;
297     register          UT           check         = checker;
298     register          kmp_uint32   spins;
299     register          kmp_uint32 (*f) ( UT, UT ) = pred;
300     register          UT           r;
301 
302     KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303     KMP_INIT_YIELD( spins );
304     // main wait spin loop
305     while(!f(r = *spin, check))
306     {
307         KMP_FSYNC_SPIN_PREPARE( obj );
308         /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309            It causes problems with infinite recursion because of exit lock */
310         /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311             __kmp_abort_thread(); */
312 
313         // if we are oversubscribed, or have waited a bit
314         // (and KMP_LIBRARY=throughput), then yield;
315         // the pause is in the following code
316         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317         KMP_YIELD_SPIN( spins );
318     }
319     KMP_FSYNC_SPIN_ACQUIRED( obj );
320     return r;
321 }
322 
323 template< typename UT >
324 static kmp_uint32 __kmp_eq( UT value, UT checker) {
325     return value == checker;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_neq( UT value, UT checker) {
330     return value != checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_lt( UT value, UT checker) {
335     return value < checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_ge( UT value, UT checker) {
340     return value >= checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_le( UT value, UT checker) {
345     return value <= checker;
346 }
347 
348 
349 /* ------------------------------------------------------------------------ */
350 /* ------------------------------------------------------------------------ */
351 
352 static void
353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354 {
355     kmp_info_t *th;
356 
357     KMP_DEBUG_ASSERT( gtid_ref );
358 
359     if ( __kmp_env_consistency_check ) {
360         th = __kmp_threads[*gtid_ref];
361         if ( th -> th.th_root -> r.r_active
362           && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
363 #if KMP_USE_DYNAMIC_LOCK
364             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365 #else
366             __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
367 #endif
368         }
369     }
370 }
371 
372 template< typename UT >
373 static void
374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375 {
376     typedef typename traits_t< UT >::signed_t    ST;
377     dispatch_private_info_template< UT > * pr;
378 
379     int gtid = *gtid_ref;
380 //    int  cid = *cid_ref;
381     kmp_info_t *th = __kmp_threads[ gtid ];
382     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383 
384     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385     if ( __kmp_env_consistency_check ) {
386         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387             ( th -> th.th_dispatch -> th_dispatch_pr_current );
388         if ( pr -> pushed_ws != ct_none ) {
389 #if KMP_USE_DYNAMIC_LOCK
390             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391 #else
392             __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
393 #endif
394         }
395     }
396 
397     if ( ! th -> th.th_team -> t.t_serialized ) {
398         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399             ( th -> th.th_dispatch -> th_dispatch_sh_current );
400         UT  lower;
401 
402         if ( ! __kmp_env_consistency_check ) {
403             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
405         }
406         lower = pr->u.p.ordered_lower;
407 
408         #if ! defined( KMP_GOMP_COMPAT )
409             if ( __kmp_env_consistency_check ) {
410                 if ( pr->ordered_bumped ) {
411                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412                     __kmp_error_construct2(
413                         kmp_i18n_msg_CnsMultipleNesting,
414                         ct_ordered_in_pdo, loc_ref,
415                         & p->stack_data[ p->w_top ]
416                     );
417                 }
418             }
419         #endif /* !defined(KMP_GOMP_COMPAT) */
420 
421         KMP_MB();
422         #ifdef KMP_DEBUG
423         {
424             const char * buff;
425             // create format specifiers before the debug output
426             buff = __kmp_str_format(
427                 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428                 traits_t< UT >::spec, traits_t< UT >::spec );
429             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430             __kmp_str_free( &buff );
431         }
432         #endif
433 
434         __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435                                 USE_ITT_BUILD_ARG( NULL )
436                                 );
437         KMP_MB();  /* is this necessary? */
438         #ifdef KMP_DEBUG
439         {
440             const char * buff;
441             // create format specifiers before the debug output
442             buff = __kmp_str_format(
443                 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444                 traits_t< UT >::spec, traits_t< UT >::spec );
445             KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446             __kmp_str_free( &buff );
447         }
448         #endif
449     }
450     KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451 }
452 
453 static void
454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455 {
456     kmp_info_t *th;
457 
458     if ( __kmp_env_consistency_check ) {
459         th = __kmp_threads[*gtid_ref];
460         if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461             __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462         }
463     }
464 }
465 
466 template< typename UT >
467 static void
468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469 {
470     typedef typename traits_t< UT >::signed_t    ST;
471     dispatch_private_info_template< UT > * pr;
472 
473     int gtid = *gtid_ref;
474 //    int  cid = *cid_ref;
475     kmp_info_t *th = __kmp_threads[ gtid ];
476     KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477 
478     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479     if ( __kmp_env_consistency_check ) {
480         pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481             ( th -> th.th_dispatch -> th_dispatch_pr_current );
482         if ( pr -> pushed_ws != ct_none ) {
483             __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484         }
485     }
486 
487     if ( ! th -> th.th_team -> t.t_serialized ) {
488         dispatch_shared_info_template< UT >  * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489             ( th -> th.th_dispatch -> th_dispatch_sh_current );
490 
491         if ( ! __kmp_env_consistency_check ) {
492             pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493                 ( th -> th.th_dispatch -> th_dispatch_pr_current );
494         }
495 
496         KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497         #if ! defined( KMP_GOMP_COMPAT )
498             if ( __kmp_env_consistency_check ) {
499                 if ( pr->ordered_bumped != 0 ) {
500                     struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501                     /* How to test it? - OM */
502                     __kmp_error_construct2(
503                         kmp_i18n_msg_CnsMultipleNesting,
504                         ct_ordered_in_pdo, loc_ref,
505                         & p->stack_data[ p->w_top ]
506                     );
507                 }
508             }
509         #endif /* !defined(KMP_GOMP_COMPAT) */
510 
511         KMP_MB();       /* Flush all pending memory write invalidates.  */
512 
513         pr->ordered_bumped += 1;
514 
515         KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516                         gtid, pr->ordered_bumped ) );
517 
518         KMP_MB();       /* Flush all pending memory write invalidates.  */
519 
520         /* TODO use general release procedure? */
521         test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522 
523         KMP_MB();       /* Flush all pending memory write invalidates.  */
524     }
525     KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526 }
527 
528 /* Computes and returns x to the power of y, where y must be a non-negative integer */
529 template< typename UT >
530 static __forceinline long double
531 __kmp_pow(long double x, UT y) {
532     long double s=1.0L;
533 
534     KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535     //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536     while(y) {
537         if ( y & 1 )
538             s *= x;
539         x *= x;
540         y >>= 1;
541     }
542     return s;
543 }
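// Illustrative note (not part of the algorithm): the loop above is binary
// exponentiation. For example, with x = 0.5 and y = 5 (binary 101) it performs
// s = 0.5 (bit 0 set), x -> 0.25, then x -> 0.0625 (bit 1 clear), then
// s = 0.5 * 0.0625 = 0.03125 (bit 2 set), i.e. 0.5^5 in three iterations.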
544 
545 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
546    (the total number of unassigned iterations in chunks with index greater than or equal to idx).
547    __forceinline seems to be broken; if we __forceinline this function, the behavior is wrong
548    (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
549 */
550 template< typename T >
551 static __inline typename traits_t< T >::unsigned_t
552 __kmp_dispatch_guided_remaining(
553     T                                  tc,
554     typename traits_t< T >::floating_t base,
555     typename traits_t< T >::unsigned_t idx
556 ) {
557     /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558        least for ICL 8.1, long double arithmetic may not really have
559        long double precision, even with /Qlong_double.  Currently, we
560        workaround that in the caller code, by manipulating the FPCW for
561        Windows* OS on IA-32 architecture.  The lack of precision is not
562        expected to be a correctness issue, though.
563     */
564     typedef typename traits_t< T >::unsigned_t  UT;
565 
566     long double x = tc * __kmp_pow< UT >(base, idx);
567     UT r = (UT) x;
568     if ( x == r )
569         return r;
570     return r + 1;
571 }
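// Illustrative example (values assumed only for exposition): with tc = 1000,
// base = 0.875 (i.e. 1 - 0.5/nproc for nproc = 4) and idx = 10, the remaining
// work is 1000 * 0.875^10 ~= 263.1, so the function returns 264; it rounds up
// whenever tc * base^idx is not an exact integer.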
572 
573 // Parameters of the guided-iterative algorithm:
574 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
575 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
576 // By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
577 // With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
578 static int guided_int_param = 2;
579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
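// Illustrative example (nproc and chunk assumed only for exposition): with the
// default n = 2, nproc = 4 and chunk = 7, the guided-iterative case below sets
//   parm2 = guided_int_param * nproc * (chunk + 1) = 2 * 4 * 8 = 64      (p2)
//   parm3 = guided_flt_param / nproc = 0.5 / 4 = 0.125 = 1 / (2 * 4)     (p3)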
580 
581 // UT - unsigned flavor of T, ST - signed flavor of T,
582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583 template< typename T >
584 static void
585 __kmp_dispatch_init(
586     ident_t                        * loc,
587     int                              gtid,
588     enum sched_type                  schedule,
589     T                                lb,
590     T                                ub,
591     typename traits_t< T >::signed_t st,
592     typename traits_t< T >::signed_t chunk,
593     int                              push_ws
594 ) {
595     typedef typename traits_t< T >::unsigned_t  UT;
596     typedef typename traits_t< T >::signed_t    ST;
597     typedef typename traits_t< T >::floating_t  DBL;
598     static const int ___kmp_size_type = sizeof( UT );
599 
600     int                                            active;
601     T                                              tc;
602     kmp_info_t *                                   th;
603     kmp_team_t *                                   team;
604     kmp_uint32                                     my_buffer_index;
605     dispatch_private_info_template< T >          * pr;
606     dispatch_shared_info_template< UT > volatile * sh;
607 
608     KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609     KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610 
611     if ( ! TCR_4( __kmp_init_parallel ) )
612         __kmp_parallel_initialize();
613 
614 #if INCLUDE_SSC_MARKS
615     SSC_MARK_DISPATCH_INIT();
616 #endif
617     #ifdef KMP_DEBUG
618     {
619         const char * buff;
620         // create format specifiers before the debug output
621         buff = __kmp_str_format(
622             "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623             traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624         KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625         __kmp_str_free( &buff );
626     }
627     #endif
628     /* setup data */
629     th     = __kmp_threads[ gtid ];
630     team   = th -> th.th_team;
631     active = ! team -> t.t_serialized;
632     th->th.th_ident = loc;
633 
634 #if USE_ITT_BUILD
635     kmp_uint64 cur_chunk = chunk;
636     int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
637         KMP_MASTER_GTID(gtid) &&
638 #if OMP_40_ENABLED
639         th->th.th_teams_microtask == NULL &&
640 #endif
641         team->t.t_active_level == 1;
642 #endif
643     if ( ! active ) {
644         pr = reinterpret_cast< dispatch_private_info_template< T >* >
645             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
646     } else {
647         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
648                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
649 
650         my_buffer_index = th->th.th_dispatch->th_disp_index ++;
651 
652         /* What happens when the number of threads changes? Do we need to resize the buffer? */
653         pr = reinterpret_cast< dispatch_private_info_template< T >  * >
654             ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
655         sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
656             ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
657     }
658 
659     /* Pick up the nomerge/ordered bits from the scheduling type */
660     if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
661         pr->nomerge = TRUE;
662         schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
663     } else {
664         pr->nomerge = FALSE;
665     }
666     pr->type_size = ___kmp_size_type; // remember the size of variables
667     if ( kmp_ord_lower & schedule ) {
668         pr->ordered = TRUE;
669         schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
670     } else {
671         pr->ordered = FALSE;
672     }
673     if ( schedule == kmp_sch_static ) {
674         schedule = __kmp_static;
675     } else {
676         if ( schedule == kmp_sch_runtime ) {
677             // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
678             schedule = team -> t.t_sched.r_sched_type;
679             // Detail the schedule if needed (global controls are differentiated appropriately)
680             if ( schedule == kmp_sch_guided_chunked ) {
681                 schedule = __kmp_guided;
682             } else if ( schedule == kmp_sch_static ) {
683                 schedule = __kmp_static;
684             }
685             // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
686             chunk = team -> t.t_sched.chunk;
687 
688             #ifdef KMP_DEBUG
689             {
690                 const char * buff;
691                 // create format specifiers before the debug output
692                 buff = __kmp_str_format(
693                     "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
694                     traits_t< ST >::spec );
695                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
696                 __kmp_str_free( &buff );
697             }
698             #endif
699         } else {
700             if ( schedule == kmp_sch_guided_chunked ) {
701                 schedule = __kmp_guided;
702             }
703             if ( chunk <= 0 ) {
704                 chunk = KMP_DEFAULT_CHUNK;
705             }
706         }
707 
708         if ( schedule == kmp_sch_auto ) {
709             // mapping and differentiation: done in __kmp_do_serial_initialize()
710             schedule = __kmp_auto;
711             #ifdef KMP_DEBUG
712             {
713                 const char * buff;
714                 // create format specifiers before the debug output
715                 buff = __kmp_str_format(
716                     "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
717                     traits_t< ST >::spec );
718                 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
719                 __kmp_str_free( &buff );
720             }
721             #endif
722         }
723 
724         /* guided analytical not safe for too many threads */
725         if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
726             schedule = kmp_sch_guided_iterative_chunked;
727             KMP_WARNING( DispatchManyThreads );
728         }
729         pr->u.p.parm1 = chunk;
730     }
731     KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
732                 "unknown scheduling type" );
733 
734     pr->u.p.count = 0;
735 
736     if ( __kmp_env_consistency_check ) {
737         if ( st == 0 ) {
738             __kmp_error_construct(
739                 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
740                 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
741             );
742         }
743     }
744 
745     tc = ( ub - lb + st );
746     if ( st != 1 ) {
747         if ( st < 0 ) {
748             if ( lb < ub ) {
749                 tc = 0;            // zero-trip
750             } else {   // lb >= ub
751                 tc = (ST)tc / st;  // convert to signed division
752             }
753         } else {       // st > 0
754             if ( ub < lb ) {
755                 tc = 0;            // zero-trip
756             } else {   // ub >= lb
757                 tc /= st;
758             }
759         }
760     } else if ( ub < lb ) {        // st == 1
761         tc = 0;                    // zero-trip
762     }
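    // Illustrative example (bounds assumed only for exposition): lb = 0, ub = 9,
    // st = 2 gives tc = (9 - 0 + 2) / 2 = 5, i.e. the iterations 0,2,4,6,8; and
    // lb = 9, ub = 0, st = -2 gives tc = (0 - 9 - 2) / -2 = 5 as well.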
763 
764     pr->u.p.lb = lb;
765     pr->u.p.ub = ub;
766     pr->u.p.st = st;
767     pr->u.p.tc = tc;
768 
769     #if KMP_OS_WINDOWS
770     pr->u.p.last_upper = ub + st;
771     #endif /* KMP_OS_WINDOWS */
772 
773     /* NOTE: only the active parallel region(s) have active ordered sections */
774 
775     if ( active ) {
776         if ( pr->ordered == 0 ) {
777             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
778             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
779         } else {
780             pr->ordered_bumped = 0;
781 
782             pr->u.p.ordered_lower = 1;
783             pr->u.p.ordered_upper = 0;
784 
785             th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
786             th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
787         }
788     }
789 
790     if ( __kmp_env_consistency_check ) {
791         enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
792         if ( push_ws ) {
793             __kmp_push_workshare( gtid, ws, loc );
794             pr->pushed_ws = ws;
795         } else {
796             __kmp_check_workshare( gtid, ws, loc );
797             pr->pushed_ws = ct_none;
798         }
799     }
800 
801     switch ( schedule ) {
802     #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
803     case kmp_sch_static_steal:
804         {
805             T nproc = team->t.t_nproc;
806             T ntc, init;
807 
808             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
809 
810             ntc = (tc % chunk ? 1 : 0) + tc / chunk;
811             if ( nproc > 1 && ntc >= nproc ) {
812                 T id = __kmp_tid_from_gtid(gtid);
813                 T small_chunk, extras;
814 
815                 small_chunk = ntc / nproc;
816                 extras = ntc % nproc;
817 
818                 init = id * small_chunk + ( id < extras ? id : extras );
819                 pr->u.p.count = init;
820                 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
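                // Illustrative example (values assumed only for exposition): with
                // ntc = 10 chunks and nproc = 4, small_chunk = 2 and extras = 2, so
                // threads 0..3 initially own chunks [0,3), [3,6), [6,8) and [8,10).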
821 
822                 pr->u.p.parm2 = lb;
823                 //pr->pfields.parm3 = 0; // it's not used in static_steal
824                 pr->u.p.parm4 = id;
825                 pr->u.p.st = st;
826                 break;
827             } else {
828                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
829                                gtid ) );
830                 schedule = kmp_sch_static_balanced;
831                 /* too few iterations: fall-through to kmp_sch_static_balanced */
832             } // if
833             /* FALL-THROUGH to static balanced */
834         } // case
835     #endif
836     case kmp_sch_static_balanced:
837         {
838             T nproc = team->t.t_nproc;
839             T init, limit;
840 
841             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
842                             gtid ) );
843 
844             if ( nproc > 1 ) {
845                 T id = __kmp_tid_from_gtid(gtid);
846 
847                 if ( tc < nproc ) {
848                     if ( id < tc ) {
849                         init = id;
850                         limit = id;
851                         pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
852                     } else {
853                         pr->u.p.count = 1;  /* means no more chunks to execute */
854                         pr->u.p.parm1 = FALSE;
855                         break;
856                     }
857                 } else {
858                     T small_chunk = tc / nproc;
859                     T extras = tc % nproc;
860                     init = id * small_chunk + (id < extras ? id : extras);
861                     limit = init + small_chunk - (id < extras ? 0 : 1);
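                    // Illustrative example (values assumed only for exposition):
                    // tc = 10, nproc = 4 gives small_chunk = 2, extras = 2, so
                    // threads 0..3 get iterations [0,2], [3,5], [6,7] and [8,9].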
862                     pr->u.p.parm1 = (id == nproc - 1);
863                 }
864             } else {
865                 if ( tc > 0 ) {
866                     init = 0;
867                     limit = tc - 1;
868                     pr->u.p.parm1 = TRUE;
869                 } else {
870                     // zero trip count
871                     pr->u.p.count = 1;  /* means no more chunks to execute */
872                     pr->u.p.parm1 = FALSE;
873                     break;
874                 }
875             }
876 #if USE_ITT_BUILD
877             // Calculate chunk for metadata report
878             if ( itt_need_metadata_reporting )
879                 cur_chunk = limit - init + 1;
880 #endif
881             if ( st == 1 ) {
882                 pr->u.p.lb = lb + init;
883                 pr->u.p.ub = lb + limit;
884             } else {
885                 T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
886                 pr->u.p.lb = lb + init * st;
887                 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
888                 if ( st > 0 ) {
889                     pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
890                 } else {
891                     pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
892                 }
893             }
894             if ( pr->ordered ) {
895                 pr->u.p.ordered_lower = init;
896                 pr->u.p.ordered_upper = limit;
897             }
898             break;
899         } // case
900     case kmp_sch_guided_iterative_chunked :
901         {
902             T nproc = team->t.t_nproc;
903             KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
904 
905             if ( nproc > 1 ) {
906                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
907                     /* chunk size too large, switch to dynamic */
908                     schedule = kmp_sch_dynamic_chunked;
909                 } else {
910                     // when remaining iters become less than parm2 - switch to dynamic
911                     pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
912                     *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
913                 }
914             } else {
915                 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
916                 schedule = kmp_sch_static_greedy;
917                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
918                 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
919                 pr->u.p.parm1 = tc;
920             } // if
921         } // case
922         break;
923     case kmp_sch_guided_analytical_chunked:
924         {
925             T nproc = team->t.t_nproc;
926             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
927 
928             if ( nproc > 1 ) {
929                 if ( (2L * chunk + 1 ) * nproc >= tc ) {
930                     /* chunk size too large, switch to dynamic */
931                     schedule = kmp_sch_dynamic_chunked;
932                 } else {
933                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
934                     DBL x;
935 
936                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
937                     /* Linux* OS already has 64-bit computation by default for
938                        long double, and on Windows* OS on Intel(R) 64,
939                        /Qlong_double doesn't work.  On Windows* OS
940                        on IA-32 architecture, we need to set precision to
941                        64-bit instead of the default 53-bit. Even though long
942                        double doesn't work on Windows* OS on Intel(R) 64, the
943                        resulting lack of precision is not expected to impact
944                        the correctness of the algorithm, but this has not been
945                        mathematically proven.
946                     */
947                     // save original FPCW and set precision to 64-bit, as
948                     // Windows* OS on IA-32 architecture defaults to 53-bit
949                     unsigned int oldFpcw = _control87(0,0);
950                     _control87(_PC_64,_MCW_PC); // 0,0x30000
951                     #endif
952                     /* value used for comparison in solver for cross-over point */
953                     long double target = ((long double)chunk * 2 + 1) * nproc / tc;
954 
955                     /* crossover point--chunk indexes equal to or greater than
956                        this point switch to dynamic-style scheduling */
957                     UT   cross;
958 
959                     /* commonly used term: (2 nproc - 1)/(2 nproc) */
960                     x = (long double)1.0 - (long double)0.5 / nproc;
961 
962                     #ifdef KMP_DEBUG
963                     { // test natural alignment
964                         struct _test_a {
965                             char a;
966                             union {
967                                 char b;
968                                 DBL  d;
969                             };
970                         } t;
971                         ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
972                         //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
973                         KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
974                     }
975                     #endif // KMP_DEBUG
976 
977                     /* save the term in thread private dispatch structure */
978                     *(DBL*)&pr->u.p.parm3 = x;
979 
980                     /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
981                     {
982                         UT          left, right, mid;
983                         long double p;
984 
985                         /* estimate initial upper and lower bound */
986 
987                         /* it doesn't matter what value "right" has as long as it is positive, but
988                            it affects the performance of the solver
989                         */
990                         right = 229;
991                         p = __kmp_pow< UT >(x,right);
992                         if ( p > target ) {
993                             do{
994                                 p *= p;
995                                 right <<= 1;
996                             } while(p>target && right < (1<<27));
997                             left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
998                         } else {
999                             left = 0;
1000                         }
1001 
1002                         /* bisection root-finding method */
1003                         while ( left + 1 < right ) {
1004                             mid = (left + right) / 2;
1005                             if ( __kmp_pow< UT >(x,mid) > target ) {
1006                                 left = mid;
1007                             } else {
1008                                 right = mid;
1009                             }
1010                         } // while
1011                         cross = right;
1012                     }
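                    /* Illustrative example (values assumed only for exposition):
                       nproc = 2, chunk = 1, tc = 1000 give x = 0.75 and
                       target = 3 * 2 / 1000 = 0.006; the bisection finds cross = 18,
                       since 0.75^17 ~= 0.0075 > 0.006 while 0.75^18 ~= 0.0056 <= 0.006. */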
1013                     /* assert sanity of computed crossover point */
1014                     KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1015 
1016                     /* save the crossover point in thread private dispatch structure */
1017                     pr->u.p.parm2 = cross;
1018 
1019                     // C75803
1020                     #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1021                         #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1022                     #else
1023                         #define GUIDED_ANALYTICAL_WORKAROUND (x)
1024                     #endif
1025                     /* dynamic-style scheduling offset */
1026                     pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1027                     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1028                         // restore FPCW
1029                         _control87(oldFpcw,_MCW_PC);
1030                     #endif
1031                 } // if
1032             } else {
1033                 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1034                                gtid ) );
1035                 schedule = kmp_sch_static_greedy;
1036                 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1037                 pr->u.p.parm1 = tc;
1038             } // if
1039         } // case
1040         break;
1041     case kmp_sch_static_greedy:
1042         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1043         pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1044             ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1045             tc;
1046         break;
1047     case kmp_sch_static_chunked :
1048     case kmp_sch_dynamic_chunked :
1049         KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1050         break;
1051     case kmp_sch_trapezoidal :
1052         {
1053             /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1054 
1055             T parm1, parm2, parm3, parm4;
1056             KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1057 
1058             parm1 = chunk;
1059 
1060             /* F : size of the first cycle */
1061             parm2 = ( tc / (2 * team->t.t_nproc) );
1062 
1063             if ( parm2 < 1 ) {
1064                 parm2 = 1;
1065             }
1066 
1067             /* L : size of the last cycle.  Make sure the last cycle
1068              *     is not larger than the first cycle.
1069              */
1070             if ( parm1 < 1 ) {
1071                 parm1 = 1;
1072             } else if ( parm1 > parm2 ) {
1073                 parm1 = parm2;
1074             }
1075 
1076             /* N : number of cycles */
1077             parm3 = ( parm2 + parm1 );
1078             parm3 = ( 2 * tc + parm3 - 1) / parm3;
1079 
1080             if ( parm3 < 2 ) {
1081                 parm3 = 2;
1082             }
1083 
1084             /* sigma : decreasing incr of the trapezoid */
1085             parm4 = ( parm3 - 1 );
1086             parm4 = ( parm2 - parm1 ) / parm4;
1087 
1088             // pointless check, because parm4 >= 0 always
1089             //if ( parm4 < 0 ) {
1090             //    parm4 = 0;
1091             //}
1092 
1093             pr->u.p.parm1 = parm1;
1094             pr->u.p.parm2 = parm2;
1095             pr->u.p.parm3 = parm3;
1096             pr->u.p.parm4 = parm4;
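            // Illustrative example (values assumed only for exposition): tc = 100,
            // nproc = 2, chunk = 1 give parm2 = 100/4 = 25 (first chunk size),
            // parm1 = 1 (minimum chunk size), parm3 = (200 + 26 - 1)/26 = 8 (number
            // of chunks) and parm4 = 24/7 = 3 (per-chunk decrement), i.e. chunk
            // sizes of roughly 25, 22, 19, ... never dropping below parm1.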
1097         } // case
1098         break;
1099 
1100     default:
1101         {
1102             __kmp_msg(
1103                 kmp_ms_fatal,                        // Severity
1104                 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1105                 KMP_HNT( GetNewerLibrary ),          // Hint
1106                 __kmp_msg_null                       // Variadic argument list terminator
1107             );
1108         }
1109         break;
1110     } // switch
1111     pr->schedule = schedule;
1112     if ( active ) {
1113         /* Wait for the buffer identified by my_buffer_index to become free for this dispatch */
1114 
1115         KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1116                         gtid, my_buffer_index, sh->buffer_index) );
1117         __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1118                                         USE_ITT_BUILD_ARG( NULL )
1119                                         );
1120             // Note: KMP_WAIT_YIELD() cannot be used here: buffer index and my_buffer_index are
1121             // *always* 32-bit integers.
1122         KMP_MB();  /* is this necessary? */
1123         KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1124                         gtid, my_buffer_index, sh->buffer_index) );
1125 
1126         th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1127         th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
1128 #if USE_ITT_BUILD
1129         if ( pr->ordered ) {
1130             __kmp_itt_ordered_init( gtid );
1131         }; // if
1132         // Report loop metadata
1133         if ( itt_need_metadata_reporting ) {
1134             // Only report metadata by master of active team at level 1
1135             kmp_uint64 schedtype = 0;
1136             switch ( schedule ) {
1137             case kmp_sch_static_chunked:
1138             case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1139                 break;
1140             case kmp_sch_static_greedy:
1141                 cur_chunk = pr->u.p.parm1;
1142                 break;
1143             case kmp_sch_dynamic_chunked:
1144                 schedtype = 1;
1145                 break;
1146             case kmp_sch_guided_iterative_chunked:
1147             case kmp_sch_guided_analytical_chunked:
1148                 schedtype = 2;
1149                 break;
1150             default:
1151 //            Should we put this case under "static"?
1152 //            case kmp_sch_static_steal:
1153                 schedtype = 3;
1154                 break;
1155             }
1156             __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1157         }
1158 #endif /* USE_ITT_BUILD */
1159     }; // if
1160 
1161     #ifdef KMP_DEBUG
1162     {
1163         const char * buff;
1164         // create format specifiers before the debug output
1165         buff = __kmp_str_format(
1166             "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1167             " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1168             " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1169             traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1170             traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1171             traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1172             traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1173         KD_TRACE(10, ( buff,
1174             gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1175             pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1176             pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1177             pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1178         __kmp_str_free( &buff );
1179     }
1180     #endif
1181     #if ( KMP_STATIC_STEAL_ENABLED )
1182     if ( ___kmp_size_type < 8 ) {
1183       // It cannot be guaranteed that after execution of a loop with some other schedule kind
1184       // all the parm3 variables will contain the same value.
1185       // Even if all parm3 values were the same, there would still be a bad case, such as using
1186       // 0 and 1 rather than a program life-time increment.
1187       // So a dedicated variable is required; the 'static_steal_counter' is used.
1188       if( schedule == kmp_sch_static_steal ) {
1189         // Other threads will inspect this variable when searching for a victim.
1190         // This flag indicates that, from now on, other threads may steal from this thread.
1191         volatile T * p = &pr->u.p.static_steal_counter;
1192         *p = *p + 1;
1193       }
1194     }
1195     #endif // ( KMP_STATIC_STEAL_ENABLED )
1196 
1197 #if OMPT_SUPPORT && OMPT_TRACE
1198     if ((ompt_status == ompt_status_track_callback) &&
1199         ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1200         ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1201         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1202         ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1203             team_info->parallel_id, task_info->task_id, team_info->microtask);
1204     }
1205 #endif
1206 }
1207 
1208 /*
1209  * For ordered loops, either __kmp_dispatch_finish() should be called after
1210  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1211  * every chunk of iterations.  If the ordered section(s) were not executed
1212  * for this iteration (or every iteration in this chunk), we need to set the
1213  * ordered iteration counters so that the next thread can proceed.
1214  */
1215 template< typename UT >
1216 static void
1217 __kmp_dispatch_finish( int gtid, ident_t *loc )
1218 {
1219     typedef typename traits_t< UT >::signed_t ST;
1220     kmp_info_t *th = __kmp_threads[ gtid ];
1221 
1222     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1223     if ( ! th -> th.th_team -> t.t_serialized ) {
1224 
1225         dispatch_private_info_template< UT > * pr =
1226             reinterpret_cast< dispatch_private_info_template< UT >* >
1227             ( th->th.th_dispatch->th_dispatch_pr_current );
1228         dispatch_shared_info_template< UT > volatile * sh =
1229             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1230             ( th->th.th_dispatch->th_dispatch_sh_current );
1231         KMP_DEBUG_ASSERT( pr );
1232         KMP_DEBUG_ASSERT( sh );
1233         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1234                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1235 
1236         if ( pr->ordered_bumped ) {
1237             KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1238                             gtid ) );
1239             pr->ordered_bumped = 0;
1240         } else {
1241             UT lower = pr->u.p.ordered_lower;
1242 
1243             #ifdef KMP_DEBUG
1244             {
1245                 const char * buff;
1246                 // create format specifiers before the debug output
1247                 buff = __kmp_str_format(
1248                     "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1249                     traits_t< UT >::spec, traits_t< UT >::spec );
1250                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1251                 __kmp_str_free( &buff );
1252             }
1253             #endif
1254 
1255             __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1256                                    USE_ITT_BUILD_ARG(NULL)
1257                                    );
1258             KMP_MB();  /* is this necessary? */
1259             #ifdef KMP_DEBUG
1260             {
1261                 const char * buff;
1262                 // create format specifiers before the debug output
1263                 buff = __kmp_str_format(
1264                     "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1265                     traits_t< UT >::spec, traits_t< UT >::spec );
1266                 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1267                 __kmp_str_free( &buff );
1268             }
1269             #endif
1270 
1271             test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1272         } // if
1273     } // if
1274     KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1275 }
1276 
1277 #ifdef KMP_GOMP_COMPAT
1278 
1279 template< typename UT >
1280 static void
1281 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1282 {
1283     typedef typename traits_t< UT >::signed_t ST;
1284     kmp_info_t *th = __kmp_threads[ gtid ];
1285 
1286     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1287     if ( ! th -> th.th_team -> t.t_serialized ) {
1288 //        int cid;
1289         dispatch_private_info_template< UT > * pr =
1290             reinterpret_cast< dispatch_private_info_template< UT >* >
1291             ( th->th.th_dispatch->th_dispatch_pr_current );
1292         dispatch_shared_info_template< UT > volatile * sh =
1293             reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1294             ( th->th.th_dispatch->th_dispatch_sh_current );
1295         KMP_DEBUG_ASSERT( pr );
1296         KMP_DEBUG_ASSERT( sh );
1297         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1298                  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1299 
1300 //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1301             UT lower = pr->u.p.ordered_lower;
1302             UT upper = pr->u.p.ordered_upper;
1303             UT inc = upper - lower + 1;
1304 
1305             if ( pr->ordered_bumped == inc ) {
1306                 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1307                   gtid ) );
1308                 pr->ordered_bumped = 0;
1309             } else {
1310                 inc -= pr->ordered_bumped;
1311 
1312                 #ifdef KMP_DEBUG
1313                 {
1314                     const char * buff;
1315                     // create format specifiers before the debug output
1316                     buff = __kmp_str_format(
1317                         "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1318                         "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1319                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1320                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1321                     __kmp_str_free( &buff );
1322                 }
1323                 #endif
1324 
1325                 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1326                                        USE_ITT_BUILD_ARG(NULL)
1327                                        );
1328 
1329                 KMP_MB();  /* is this necessary? */
1330                 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1331                   gtid ) );
1332                 pr->ordered_bumped = 0;
1333 //!!!!! TODO check if the inc should be unsigned, or signed???
1334                 #ifdef KMP_DEBUG
1335                 {
1336                     const char * buff;
1337                     // create format specifiers before the debug output
1338                     buff = __kmp_str_format(
1339                         "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1340                         "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1341                         traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1342                     KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1343                     __kmp_str_free( &buff );
1344                 }
1345                 #endif
1346 
1347                 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1348             }
1349 //        }
1350     }
1351     KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1352 }
1353 
1354 #endif /* KMP_GOMP_COMPAT */
1355 
1356 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1357  * (no more work), then tell OMPT the loop is over. In some cases
1358  * kmp_dispatch_fini() is not called. */
1359 #if OMPT_SUPPORT && OMPT_TRACE
1360 #define OMPT_LOOP_END                                                          \
1361     if (status == 0) {                                                         \
1362         if ((ompt_status == ompt_status_track_callback) &&                     \
1363             ompt_callbacks.ompt_callback(ompt_event_loop_end)) {               \
1364             ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);        \
1365             ompt_task_info_t *task_info = __ompt_get_taskinfo(0);              \
1366             ompt_callbacks.ompt_callback(ompt_event_loop_end)(                 \
1367                 team_info->parallel_id, task_info->task_id);                   \
1368         }                                                                      \
1369     }
1370 #else
1371 #define OMPT_LOOP_END // no-op
1372 #endif
1373 
1374 template< typename T >
1375 static int
1376 __kmp_dispatch_next(
1377     ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1378 ) {
1379 
1380     typedef typename traits_t< T >::unsigned_t  UT;
1381     typedef typename traits_t< T >::signed_t    ST;
1382     typedef typename traits_t< T >::floating_t  DBL;
1383 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1384     static const int ___kmp_size_type = sizeof( UT );
1385 #endif
1386 
1387     int                                   status;
1388     dispatch_private_info_template< T > * pr;
1389     kmp_info_t                          * th   = __kmp_threads[ gtid ];
1390     kmp_team_t                          * team = th -> th.th_team;
1391 
1392     KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1393     #ifdef KMP_DEBUG
1394     {
1395         const char * buff;
1396         // create format specifiers before the debug output
1397         buff = __kmp_str_format(
1398             "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1399             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1400         KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1401         __kmp_str_free( &buff );
1402     }
1403     #endif
1404 
1405     if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1407         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1408             ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1409         KMP_DEBUG_ASSERT( pr );
1410 
1411         if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1412             *p_lb = 0;
1413             *p_ub = 0;
1414 //            if ( p_last != NULL )
1415 //                *p_last = 0;
1416             if ( p_st != NULL )
1417                 *p_st = 0;
1418             if ( __kmp_env_consistency_check ) {
1419                 if ( pr->pushed_ws != ct_none ) {
1420                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1421                 }
1422             }
1423         } else if ( pr->nomerge ) {
1424             kmp_int32 last;
1425             T         start;
1426             UT        limit, trip, init;
1427             ST        incr;
1428             T         chunk = pr->u.p.parm1;
1429 
1430             KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
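            // Serialized case: this thread owns every iteration, so the private
            // count alone is enough to hand out chunks of size parm1 in order.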
1431 
1432             init = chunk * pr->u.p.count++;
1433             trip = pr->u.p.tc - 1;
1434 
1435             if ( (status = (init <= trip)) == 0 ) {
1436                 *p_lb = 0;
1437                 *p_ub = 0;
1438 //                if ( p_last != NULL )
1439 //                    *p_last = 0;
1440                 if ( p_st != NULL )
1441                     *p_st = 0;
1442                 if ( __kmp_env_consistency_check ) {
1443                     if ( pr->pushed_ws != ct_none ) {
1444                         pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1445                     }
1446                 }
1447             } else {
1448                 start = pr->u.p.lb;
1449                 limit = chunk + init - 1;
1450                 incr  = pr->u.p.st;
1451 
1452                 if ( (last = (limit >= trip)) != 0 ) {
1453                     limit = trip;
1454                     #if KMP_OS_WINDOWS
1455                     pr->u.p.last_upper = pr->u.p.ub;
1456                     #endif /* KMP_OS_WINDOWS */
1457                 }
1458                 if ( p_last != NULL )
1459                     *p_last = last;
1460                 if ( p_st != NULL )
1461                     *p_st = incr;
1462                 if ( incr == 1 ) {
1463                     *p_lb = start + init;
1464                     *p_ub = start + limit;
1465                 } else {
1466                     *p_lb = start + init * incr;
1467                     *p_ub = start + limit * incr;
1468                 }
1469 
1470                 if ( pr->ordered ) {
1471                     pr->u.p.ordered_lower = init;
1472                     pr->u.p.ordered_upper = limit;
1473                     #ifdef KMP_DEBUG
1474                     {
1475                         const char * buff;
1476                         // create format specifiers before the debug output
1477                         buff = __kmp_str_format(
1478                             "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1479                             traits_t< UT >::spec, traits_t< UT >::spec );
1480                         KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1481                         __kmp_str_free( &buff );
1482                     }
1483                     #endif
1484                 } // if
1485             } // if
1486         } else {
1487             pr->u.p.tc = 0;
1488             *p_lb = pr->u.p.lb;
1489             *p_ub = pr->u.p.ub;
1490             #if KMP_OS_WINDOWS
1491             pr->u.p.last_upper = *p_ub;
1492             #endif /* KMP_OS_WINDOWS */
1493             if ( p_last != NULL )
1494                 *p_last = TRUE;
1495             if ( p_st != NULL )
1496                 *p_st = pr->u.p.st;
1497         } // if
1498         #ifdef KMP_DEBUG
1499         {
1500             const char * buff;
1501             // create format specifiers before the debug output
1502             buff = __kmp_str_format(
1503                 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1504                 "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1505                 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
            KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, p_last ? *p_last : 0, status) );
1507             __kmp_str_free( &buff );
1508         }
1509         #endif
1510 #if INCLUDE_SSC_MARKS
1511         SSC_MARK_DISPATCH_NEXT();
1512 #endif
1513         OMPT_LOOP_END;
1514         return status;
1515     } else {
1516         kmp_int32 last = 0;
1517         dispatch_shared_info_template< UT > *sh;
1518         T         start;
1519         ST        incr;
1520         UT        limit, trip, init;
1521 
1522         KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1523                 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1524 
1525         pr = reinterpret_cast< dispatch_private_info_template< T >* >
1526             ( th->th.th_dispatch->th_dispatch_pr_current );
1527         KMP_DEBUG_ASSERT( pr );
1528         sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1529             ( th->th.th_dispatch->th_dispatch_sh_current );
1530         KMP_DEBUG_ASSERT( sh );
1531 
1532         if ( pr->u.p.tc == 0 ) {
1533             // zero trip count
1534             status = 0;
1535         } else {
1536             switch (pr->schedule) {
1537             #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1538             case kmp_sch_static_steal:
1539                 {
1540                     T chunk = pr->u.p.parm1;
1541 
1542                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1543 
1544                     trip = pr->u.p.tc - 1;
1545 
1546                     if ( ___kmp_size_type > 4 ) {
                        // Other threads do not look into this thread's data,
                        // so no volatile cast is necessary.
1549                         init   = ( pr->u.p.count )++;
1550                         status = ( init < (UT)pr->u.p.ub );
1551                     } else {
1552                         typedef union {
1553                             struct {
1554                                 UT count;
1555                                 T  ub;
1556                             } p;
1557                             kmp_int64 b;
1558                         } union_i4;
                        // All updates of 'count' and 'ub' must be performed as a single atomic operation.
                        // Stealing is implemented only for 4-byte indexes.
1561                         {
1562                             union_i4 vold, vnew;
1563                             vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1564                             vnew = vold;
1565                             vnew.p.count++;
1566                             while( ! KMP_COMPARE_AND_STORE_ACQ64(
1567                                         ( volatile kmp_int64* )&pr->u.p.count,
1568                                         *VOLATILE_CAST(kmp_int64 *)&vold.b,
1569                                         *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1570                                 KMP_CPU_PAUSE();
1571                                 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1572                                 vnew = vold;
1573                                 vnew.p.count++;
1574                             }
1575                             vnew = vold;
1576                             init   = vnew.p.count;
1577                             status = ( init < (UT)vnew.p.ub ) ;
1578                         }
1579 
1580                         if( !status ) {
1581                             kmp_info_t   **other_threads = team->t.t_threads;
1582                             int          while_limit = 10;
1583                             int          while_index = 0;
1584 
                            // TODO: the victim-search algorithm should be
                            // cleaned up and measured.
1587                             while ( ( !status ) && ( while_limit != ++while_index ) ) {
1588                                 union_i4  vold, vnew;
1589                                 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1590                                 T         victimIdx    = pr->u.p.parm4;
1591                                 T         oldVictimIdx = victimIdx;
1592                                 dispatch_private_info_template< T > * victim;
1593 
1594                                 do {
1595                                     if( !victimIdx ) {
1596                                         victimIdx = team->t.t_nproc - 1;
1597                                     } else {
1598                                         --victimIdx;
1599                                     }
1600                                     victim = reinterpret_cast< dispatch_private_info_template< T >* >
1601                                         ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1602                                 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1603                                 // TODO: think about a proper place of this test
1604                                 if ( ( !victim ) ||
1605                                    ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1606                                      (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1607                                     // TODO: delay would be nice
1608                                     continue;
1609                                     // the victim is not ready yet to participate in stealing
1610                                     // because the victim is still in kmp_init_dispatch
1611                                 }
1612                                 if ( oldVictimIdx == victimIdx ) {
1613                                     break;
1614                                 }
1615                                 pr->u.p.parm4 = victimIdx;
1616 
1617                                 while( 1 ) {
1618                                     vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1619                                     vnew = vold;
1620 
1621                                     KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1622                                     if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1623                                         break;
1624                                     }
1625                                     vnew.p.ub -= (remaining >> 2);
1626                                     KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1627                                     #pragma warning( push )
1628                                     // disable warning on pointless comparison of unsigned with 0
1629                                     #pragma warning( disable: 186 )
1630                                         KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1631                                     #pragma warning( pop )
1632                                     // TODO: Should this be acquire or release?
1633                                     if ( KMP_COMPARE_AND_STORE_ACQ64(
1634                                             ( volatile kmp_int64 * )&victim->u.p.count,
1635                                             *VOLATILE_CAST(kmp_int64 *)&vold.b,
1636                                             *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1637                                         status = 1;
1638                                         while_index = 0;
1639                                         // now update own count and ub
1640                                         #if KMP_ARCH_X86
1641                                         // stealing executed on non-KMP_ARCH_X86 only
1642                                             // Atomic 64-bit write on ia32 is
1643                                             // unavailable, so we do this in steps.
1644                                             //     This code is not tested.
1645                                             init = vold.p.count;
1646                                             pr->u.p.ub = 0;
1647                                             pr->u.p.count = init + 1;
1648                                             pr->u.p.ub = vnew.p.count;
1649                                         #else
1650                                             init = vnew.p.ub;
1651                                             vold.p.count = init + 1;
1652                                             // TODO: is it safe and enough?
1653                                             *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1654                                         #endif // KMP_ARCH_X86
1655                                         break;
1656                                     } // if
1657                                 KMP_CPU_PAUSE();
1658                                 } // while (1)
1659                             } // while
1660                         } // if
1661                     } // if
1662                     if ( !status ) {
1663                         *p_lb = 0;
1664                         *p_ub = 0;
1665                         if ( p_st != NULL ) *p_st = 0;
1666                     } else {
1667                         start = pr->u.p.parm2;
1668                         init *= chunk;
1669                         limit = chunk + init - 1;
1670                         incr  = pr->u.p.st;
1671 
1672                         KMP_DEBUG_ASSERT(init <= trip);
1673                         if ( (last = (limit >= trip)) != 0 )
1674                             limit = trip;
1675                         if ( p_st != NULL ) *p_st = incr;
1676 
1677                         if ( incr == 1 ) {
1678                             *p_lb = start + init;
1679                             *p_ub = start + limit;
1680                         } else {
1681                             *p_lb = start + init * incr;
1682                             *p_ub = start + limit * incr;
1683                         }
1684 
1685                         if ( pr->ordered ) {
1686                             pr->u.p.ordered_lower = init;
1687                             pr->u.p.ordered_upper = limit;
1688                             #ifdef KMP_DEBUG
1689                             {
1690                                 const char * buff;
1691                                 // create format specifiers before the debug output
1692                                 buff = __kmp_str_format(
1693                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1694                                     traits_t< UT >::spec, traits_t< UT >::spec );
1695                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1696                                 __kmp_str_free( &buff );
1697                             }
1698                             #endif
1699                         } // if
1700                     } // if
1701                     break;
1702                 } // case
1703             #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1704             case kmp_sch_static_balanced:
1705                 {
1706                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
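                    // u.p.count doubles as a "range already handed out" flag: each thread
                    // receives its whole precomputed [lb, ub] range on the first call and
                    // nothing afterwards; parm1 carries the precomputed "last chunk" flag.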
1707                     if ( (status = !pr->u.p.count) != 0 ) {  /* check if thread has any iteration to do */
1708                         pr->u.p.count = 1;
1709                         *p_lb = pr->u.p.lb;
1710                         *p_ub = pr->u.p.ub;
1711                         last = pr->u.p.parm1;
1712                         if ( p_st != NULL )
1713                             *p_st = pr->u.p.st;
1714                     } else {  /* no iterations to do */
1715                         pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1716                     }
1717                     if ( pr->ordered ) {
1718                         #ifdef KMP_DEBUG
1719                         {
1720                             const char * buff;
1721                             // create format specifiers before the debug output
1722                             buff = __kmp_str_format(
1723                                 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1724                                 traits_t< UT >::spec, traits_t< UT >::spec );
1725                             KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1726                             __kmp_str_free( &buff );
1727                         }
1728                         #endif
1729                     } // if
1730                 } // case
1731                 break;
1732             case kmp_sch_static_greedy:  /* original code for kmp_sch_static_greedy was merged here */
1733             case kmp_sch_static_chunked:
1734                 {
1735                     T parm1;
1736 
1737                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1738                                    gtid ) );
1739                     parm1 = pr->u.p.parm1;
1740 
1741                     trip  = pr->u.p.tc - 1;
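                    // Chunks of size parm1 are assigned round-robin: this thread takes
                    // chunk number (count + tid) and advances count by t_nproc below, so
                    // thread `tid` owns chunks tid, tid + nproc, tid + 2*nproc, ...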
1742                     init  = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1743 
1744                     if ( (status = (init <= trip)) != 0 ) {
1745                         start = pr->u.p.lb;
1746                         incr  = pr->u.p.st;
1747                         limit = parm1 + init - 1;
1748 
1749                         if ( (last = (limit >= trip)) != 0 )
1750                             limit = trip;
1751 
1752                         if ( p_st != NULL ) *p_st = incr;
1753 
1754                         pr->u.p.count += team->t.t_nproc;
1755 
1756                         if ( incr == 1 ) {
1757                             *p_lb = start + init;
1758                             *p_ub = start + limit;
1759                         }
1760                         else {
1761                             *p_lb = start + init * incr;
1762                             *p_ub = start + limit * incr;
1763                         }
1764 
1765                         if ( pr->ordered ) {
1766                             pr->u.p.ordered_lower = init;
1767                             pr->u.p.ordered_upper = limit;
1768                             #ifdef KMP_DEBUG
1769                             {
1770                                 const char * buff;
1771                                 // create format specifiers before the debug output
1772                                 buff = __kmp_str_format(
1773                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1774                                     traits_t< UT >::spec, traits_t< UT >::spec );
1775                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1776                                 __kmp_str_free( &buff );
1777                             }
1778                             #endif
1779                         } // if
1780                     } // if
1781                 } // case
1782                 break;
1783 
1784             case kmp_sch_dynamic_chunked:
1785                 {
1786                     T chunk = pr->u.p.parm1;
1787 
1788                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1789                                    gtid ) );
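                    // Atomically claim the next chunk index from the shared iteration
                    // counter; init is the first iteration of that chunk.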
1790 
1791                     init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1792                     trip = pr->u.p.tc - 1;
1793 
1794                     if ( (status = (init <= trip)) == 0 ) {
1795                         *p_lb = 0;
1796                         *p_ub = 0;
1797                         if ( p_st != NULL ) *p_st = 0;
1798                     } else {
1799                         start = pr->u.p.lb;
1800                         limit = chunk + init - 1;
1801                         incr  = pr->u.p.st;
1802 
1803                         if ( (last = (limit >= trip)) != 0 )
1804                             limit = trip;
1805 
1806                         if ( p_st != NULL ) *p_st = incr;
1807 
1808                         if ( incr == 1 ) {
1809                             *p_lb = start + init;
1810                             *p_ub = start + limit;
1811                         } else {
1812                             *p_lb = start + init * incr;
1813                             *p_ub = start + limit * incr;
1814                         }
1815 
1816                         if ( pr->ordered ) {
1817                             pr->u.p.ordered_lower = init;
1818                             pr->u.p.ordered_upper = limit;
1819                             #ifdef KMP_DEBUG
1820                             {
1821                                 const char * buff;
1822                                 // create format specifiers before the debug output
1823                                 buff = __kmp_str_format(
1824                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1825                                     traits_t< UT >::spec, traits_t< UT >::spec );
1826                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1827                                 __kmp_str_free( &buff );
1828                             }
1829                             #endif
1830                         } // if
1831                     } // if
1832                 } // case
1833                 break;
1834 
1835             case kmp_sch_guided_iterative_chunked:
1836                 {
1837                     T  chunkspec = pr->u.p.parm1;
1838                     KD_TRACE(100,
1839                         ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1840                     trip  = pr->u.p.tc;
1841                     // Start atomic part of calculations
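                    // Guided scheme: while plenty of work remains, each grab takes roughly
                    // remaining/(K*nproc) iterations via CAS on the shared counter (parm3
                    // caches that fraction as a double); once fewer than parm2 iterations
                    // remain, fall back to plain dynamic chunks of size chunkspec.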
1842                     while(1) {
1843                         ST  remaining;             // signed, because can be < 0
1844                         init = sh->u.s.iteration;  // shared value
1845                         remaining = trip - init;
1846                         if ( remaining <= 0 ) {    // AC: need to compare with 0 first
1847                             // nothing to do, don't try atomic op
1848                             status = 0;
1849                             break;
1850                         }
1851                         if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
                            // use dynamic-style schedule
                            // atomically increment iterations, get old value
1854                             init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1855                             remaining = trip - init;
1856                             if (remaining <= 0) {
1857                                 status = 0;    // all iterations got by other threads
1858                             } else {
1859                                 // got some iterations to work on
1860                                 status = 1;
1861                                 if ( (T)remaining > chunkspec ) {
1862                                     limit = init + chunkspec - 1;
1863                                 } else {
1864                                     last = 1;   // the last chunk
1865                                     limit = init + remaining - 1;
1866                                 } // if
1867                             } // if
1868                             break;
1869                         } // if
1870                         limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1871                         if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1872                             // CAS was successful, chunk obtained
1873                             status = 1;
1874                             --limit;
1875                             break;
1876                         } // if
1877                     } // while
1878                     if ( status != 0 ) {
1879                         start = pr->u.p.lb;
1880                         incr = pr->u.p.st;
1881                         if ( p_st != NULL )
1882                             *p_st = incr;
1883                         *p_lb = start + init * incr;
1884                         *p_ub = start + limit * incr;
1885                         if ( pr->ordered ) {
1886                             pr->u.p.ordered_lower = init;
1887                             pr->u.p.ordered_upper = limit;
1888                             #ifdef KMP_DEBUG
1889                             {
1890                                 const char * buff;
1891                                 // create format specifiers before the debug output
1892                                 buff = __kmp_str_format(
1893                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1894                                     traits_t< UT >::spec, traits_t< UT >::spec );
1895                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1896                                 __kmp_str_free( &buff );
1897                             }
1898                             #endif
1899                         } // if
1900                     } else {
1901                         *p_lb = 0;
1902                         *p_ub = 0;
1903                         if ( p_st != NULL )
1904                             *p_st = 0;
1905                     } // if
1906                 } // case
1907                 break;
1908 
1909             case kmp_sch_guided_analytical_chunked:
1910                 {
1911                     T   chunkspec = pr->u.p.parm1;
1912                     UT chunkIdx;
1913     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* for storing the original FPCW value for Windows* OS on
                       IA-32 architecture, 8-byte version */
1916                     unsigned int oldFpcw;
1917                     unsigned int fpcwSet = 0;
1918     #endif
1919                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1920                                    gtid ) );
1921 
1922                     trip  = pr->u.p.tc;
1923 
1924                     KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1925                     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1926 
1927                     while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1928                         chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1929                         if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1930                             --trip;
1931                             /* use dynamic-style scheduling */
1932                             init = chunkIdx * chunkspec + pr->u.p.count;
1933                             /* need to verify init > 0 in case of overflow in the above calculation */
1934                             if ( (status = (init > 0 && init <= trip)) != 0 ) {
1935                                 limit = init + chunkspec -1;
1936 
1937                                 if ( (last = (limit >= trip)) != 0 )
1938                                     limit = trip;
1939                             }
1940                             break;
1941                         } else {
1942                             /* use exponential-style scheduling */
                            /* The following check works around the lack of long double precision
                               on Windows* OS, which can cause init != 0 for chunkIdx == 0.
                             */
1946     #if KMP_OS_WINDOWS && KMP_ARCH_X86
                            /* If we haven't already done so, save original
                               FPCW and set precision to 64-bit, as Windows* OS
                               on IA-32 architecture defaults to 53-bit */
1950                             if ( !fpcwSet ) {
1951                                 oldFpcw = _control87(0,0);
1952                                 _control87(_PC_64,_MCW_PC);
1953                                 fpcwSet = 0x30000;
1954                             }
1955     #endif
1956                             if ( chunkIdx ) {
1957                                 init = __kmp_dispatch_guided_remaining< T >(
1958                                            trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1959                                 KMP_DEBUG_ASSERT(init);
1960                                 init = trip - init;
1961                             } else
1962                                 init = 0;
1963                             limit = trip - __kmp_dispatch_guided_remaining< T >(
1964                                                trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1965                             KMP_ASSERT(init <= limit);
1966                             if ( init < limit ) {
1967                                 KMP_DEBUG_ASSERT(limit <= trip);
1968                                 --limit;
1969                                 status = 1;
1970                                 break;
1971                             } // if
1972                         } // if
1973                     } // while (1)
1974     #if KMP_OS_WINDOWS && KMP_ARCH_X86
1975                     /* restore FPCW if necessary
1976                        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1977                     */
1978                     if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1979                         _control87(oldFpcw,_MCW_PC);
1980     #endif
1981                     if ( status != 0 ) {
1982                         start = pr->u.p.lb;
1983                         incr = pr->u.p.st;
1984                         if ( p_st != NULL )
1985                             *p_st = incr;
1986                         *p_lb = start + init * incr;
1987                         *p_ub = start + limit * incr;
1988                         if ( pr->ordered ) {
1989                             pr->u.p.ordered_lower = init;
1990                             pr->u.p.ordered_upper = limit;
1991                             #ifdef KMP_DEBUG
1992                             {
1993                                 const char * buff;
1994                                 // create format specifiers before the debug output
1995                                 buff = __kmp_str_format(
1996                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1997                                     traits_t< UT >::spec, traits_t< UT >::spec );
1998                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1999                                 __kmp_str_free( &buff );
2000                             }
2001                             #endif
2002                         }
2003                     } else {
2004                         *p_lb = 0;
2005                         *p_ub = 0;
2006                         if ( p_st != NULL )
2007                             *p_st = 0;
2008                     }
2009                 } // case
2010                 break;
2011 
2012             case kmp_sch_trapezoidal:
2013                 {
2014                     UT   index;
2015                     T    parm2 = pr->u.p.parm2;
2016                     T    parm3 = pr->u.p.parm3;
2017                     T    parm4 = pr->u.p.parm4;
2018                     KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2019                                    gtid ) );
2020 
2021                     index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
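                    // index is the chunk number just claimed; init below is its first
                    // iteration, i.e. the sum of the preceding chunk sizes, which form a
                    // decreasing arithmetic series with first term parm2 and step parm4
                    // (parm3 bounds the total number of chunks).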
2022 
2023                     init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2024                     trip = pr->u.p.tc - 1;
2025 
2026                     if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2027                         *p_lb = 0;
2028                         *p_ub = 0;
2029                         if ( p_st != NULL ) *p_st = 0;
2030                     } else {
2031                         start = pr->u.p.lb;
2032                         limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2033                         incr  = pr->u.p.st;
2034 
2035                         if ( (last = (limit >= trip)) != 0 )
2036                             limit = trip;
2037 
2038                         if ( p_st != NULL ) *p_st = incr;
2039 
2040                         if ( incr == 1 ) {
2041                             *p_lb = start + init;
2042                             *p_ub = start + limit;
2043                         } else {
2044                             *p_lb = start + init * incr;
2045                             *p_ub = start + limit * incr;
2046                         }
2047 
2048                         if ( pr->ordered ) {
2049                             pr->u.p.ordered_lower = init;
2050                             pr->u.p.ordered_upper = limit;
2051                             #ifdef KMP_DEBUG
2052                             {
2053                                 const char * buff;
2054                                 // create format specifiers before the debug output
2055                                 buff = __kmp_str_format(
2056                                     "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2057                                     traits_t< UT >::spec, traits_t< UT >::spec );
2058                                 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2059                                 __kmp_str_free( &buff );
2060                             }
2061                             #endif
2062                         } // if
2063                     } // if
2064                 } // case
2065                 break;
2066             default:
2067                 {
2068                     status = 0; // to avoid complaints on uninitialized variable use
2069                     __kmp_msg(
2070                         kmp_ms_fatal,                        // Severity
2071                         KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2072                         KMP_HNT( GetNewerLibrary ),          // Hint
2073                         __kmp_msg_null                       // Variadic argument list terminator
2074                     );
2075                 }
2076                 break;
2077             } // switch
2078         } // if tc == 0;
2079 
2080         if ( status == 0 ) {
2081             UT   num_done;
2082 
2083             num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2084             #ifdef KMP_DEBUG
2085             {
2086                 const char * buff;
2087                 // create format specifiers before the debug output
2088                 buff = __kmp_str_format(
2089                     "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2090                     traits_t< UT >::spec );
2091                 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2092                 __kmp_str_free( &buff );
2093             }
2094             #endif
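            // test_then_inc returned the old value, so num_done == t_nproc-1 means this
            // is the last thread of the team to finish; it resets the shared buffer and
            // advances buffer_index so the buffer can be reused by a later loop.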
2095 
2096             if ( (ST)num_done == team->t.t_nproc-1 ) {
2097                 /* NOTE: release this buffer to be reused */
2098 
2099                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2100 
2101                 sh->u.s.num_done = 0;
2102                 sh->u.s.iteration = 0;
2103 
2104                 /* TODO replace with general release procedure? */
2105                 if ( pr->ordered ) {
2106                     sh->u.s.ordered_iteration = 0;
2107                 }
2108 
2109                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2110 
2111                 sh -> buffer_index += KMP_MAX_DISP_BUF;
2112                 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2113                                 gtid, sh->buffer_index) );
2114 
2115                 KMP_MB();       /* Flush all pending memory write invalidates.  */
2116 
2117             } // if
2118             if ( __kmp_env_consistency_check ) {
2119                 if ( pr->pushed_ws != ct_none ) {
2120                     pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2121                 }
2122             }
2123 
2124             th -> th.th_dispatch -> th_deo_fcn = NULL;
2125             th -> th.th_dispatch -> th_dxo_fcn = NULL;
2126             th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2127             th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2128         } // if (status == 0)
2129 #if KMP_OS_WINDOWS
2130         else if ( last ) {
2131             pr->u.p.last_upper = pr->u.p.ub;
2132         }
2133 #endif /* KMP_OS_WINDOWS */
2134         if ( p_last != NULL && status != 0 )
2135             *p_last = last;
2136     } // if
2137 
2138     #ifdef KMP_DEBUG
2139     {
2140         const char * buff;
2141         // create format specifiers before the debug output
2142         buff = __kmp_str_format(
2143             "__kmp_dispatch_next: T#%%d normal case: " \
2144             "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2145             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2146         KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2147         __kmp_str_free( &buff );
2148     }
2149     #endif
2150 #if INCLUDE_SSC_MARKS
2151     SSC_MARK_DISPATCH_NEXT();
2152 #endif
2153     OMPT_LOOP_END;
2154     return status;
2155 }
2156 
2157 template< typename T >
2158 static void
2159 __kmp_dist_get_bounds(
2160     ident_t                          *loc,
2161     kmp_int32                         gtid,
2162     kmp_int32                        *plastiter,
2163     T                                *plower,
2164     T                                *pupper,
2165     typename traits_t< T >::signed_t  incr
2166 ) {
2167     KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2168     typedef typename traits_t< T >::unsigned_t  UT;
2169     typedef typename traits_t< T >::signed_t    ST;
2170     register kmp_uint32  team_id;
2171     register kmp_uint32  nteams;
2172     register UT          trip_count;
2173     register kmp_team_t *team;
2174     kmp_info_t * th;
2175 
2176     KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2177     KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2178     #ifdef KMP_DEBUG
2179     {
2180         const char * buff;
2181         // create format specifiers before the debug output
2182         buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2183             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2184             traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2185             traits_t< T >::spec );
2186         KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2187         __kmp_str_free( &buff );
2188     }
2189     #endif
2190 
2191     if( __kmp_env_consistency_check ) {
2192         if( incr == 0 ) {
2193             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2194         }
2195         if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2196             // The loop is illegal.
2197             // Some zero-trip loops maintained by compiler, e.g.:
2198             //   for(i=10;i<0;++i) // lower >= upper - run-time check
2199             //   for(i=0;i>10;--i) // lower <= upper - run-time check
2200             //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2201             //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2202             // Compiler does not check the following illegal loops:
2203             //   for(i=0;i<10;i+=incr) // where incr<0
2204             //   for(i=10;i>0;i-=incr) // where incr<0
2205             __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2206         }
2207     }
2208     th = __kmp_threads[gtid];
2209     KMP_DEBUG_ASSERT(th->th.th_teams_microtask);   // we are in the teams construct
2210     team = th->th.th_team;
2211     #if OMP_40_ENABLED
2212     nteams = th->th.th_teams_size.nteams;
2213     #endif
2214     team_id = team->t.t_master_tid;
2215     KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2216 
2217     // compute global trip count
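    // e.g. *plower=0, *pupper=9, incr=2  ->  trip_count = (9-0)/2 + 1 = 5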
2218     if( incr == 1 ) {
2219         trip_count = *pupper - *plower + 1;
2220     } else if(incr == -1) {
2221         trip_count = *plower - *pupper + 1;
2222     } else {
2223         trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2224     }
2225     if( trip_count <= nteams ) {
2226         KMP_DEBUG_ASSERT(
2227             __kmp_static == kmp_sch_static_greedy || \
2228             __kmp_static == kmp_sch_static_balanced
2229         ); // Unknown static scheduling type.
2230         // only some teams get single iteration, others get nothing
2231         if( team_id < trip_count ) {
2232             *pupper = *plower = *plower + team_id * incr;
2233         } else {
2234             *plower = *pupper + incr; // zero-trip loop
2235         }
2236         if( plastiter != NULL )
2237             *plastiter = ( team_id == trip_count - 1 );
2238     } else {
2239         if( __kmp_static == kmp_sch_static_balanced ) {
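            // e.g. trip_count=10, nteams=4: chunk=2, extras=2, so teams 0..3 get
            // 3, 3, 2 and 2 iterations respectively (the first `extras` teams get
            // one extra iteration each).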
2240             register UT chunk = trip_count / nteams;
2241             register UT extras = trip_count % nteams;
2242             *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2243             *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2244             if( plastiter != NULL )
2245                 *plastiter = ( team_id == nteams - 1 );
2246         } else {
2247             register T chunk_inc_count =
2248                 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2249             register T upper = *pupper;
2250             KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2251                 // Unknown static scheduling type.
2252             *plower += team_id * chunk_inc_count;
2253             *pupper = *plower + chunk_inc_count - incr;
2254             // Check/correct bounds if needed
2255             if( incr > 0 ) {
2256                 if( *pupper < *plower )
2257                     *pupper = i_maxmin< T >::mx;
2258                 if( plastiter != NULL )
2259                     *plastiter = *plower <= upper && *pupper > upper - incr;
2260                 if( *pupper > upper )
2261                     *pupper = upper; // tracker C73258
2262             } else {
2263                 if( *pupper > *plower )
2264                     *pupper = i_maxmin< T >::mn;
2265                 if( plastiter != NULL )
2266                     *plastiter = *plower >= upper && *pupper < upper - incr;
2267                 if( *pupper < upper )
2268                     *pupper = upper; // tracker C73258
2269             }
2270         }
2271     }
2272 }
2273 
2274 //-----------------------------------------------------------------------------------------
2275 // Dispatch routines
2276 //    Transfer call to template< type T >
2277 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2278 //                         T lb, T ub, ST st, ST chunk )
2279 extern "C" {
2280 
2281 /*!
2282 @ingroup WORK_SHARING
2283 @{
2284 @param loc Source location
2285 @param gtid Global thread id
2286 @param schedule Schedule type
2287 @param lb  Lower bound
2288 @param ub  Upper bound
2289 @param st  Step (or increment if you prefer)
2290 @param chunk The chunk size to block with
2291 
2292 This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2293 These functions are all identical apart from the types of the arguments.
2294 */
2295 
2296 void
2297 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2298                         kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2299 {
2300     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2301     KMP_DEBUG_ASSERT( __kmp_init_serial );
2302     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2303 }
2304 /*!
2305 See @ref __kmpc_dispatch_init_4
2306 */
2307 void
2308 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2309                         kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2310 {
2311     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2312     KMP_DEBUG_ASSERT( __kmp_init_serial );
2313     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2314 }
2315 
2316 /*!
2317 See @ref __kmpc_dispatch_init_4
2318 */
2319 void
2320 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2321                         kmp_int64 lb, kmp_int64 ub,
2322                         kmp_int64 st, kmp_int64 chunk )
2323 {
2324     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2325     KMP_DEBUG_ASSERT( __kmp_init_serial );
2326     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2327 }
2328 
2329 /*!
2330 See @ref __kmpc_dispatch_init_4
2331 */
2332 void
2333 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2334                          kmp_uint64 lb, kmp_uint64 ub,
2335                          kmp_int64 st, kmp_int64 chunk )
2336 {
2337     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2338     KMP_DEBUG_ASSERT( __kmp_init_serial );
2339     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2340 }
2341 
2342 /*!
2343 See @ref __kmpc_dispatch_init_4
2344 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, we need to compute the per-team iteration space.
2348 
2349 These functions are all identical apart from the types of the arguments.
2350 */
2351 void
2352 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2353     kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2354 {
2355     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2356     KMP_DEBUG_ASSERT( __kmp_init_serial );
2357     __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2358     __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2359 }
2360 
2361 void
2362 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2363     kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2364 {
2365     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2366     KMP_DEBUG_ASSERT( __kmp_init_serial );
2367     __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2368     __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2369 }
2370 
2371 void
2372 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2373     kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2374 {
2375     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2376     KMP_DEBUG_ASSERT( __kmp_init_serial );
2377     __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2378     __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2379 }
2380 
2381 void
2382 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2383     kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2384 {
2385     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2386     KMP_DEBUG_ASSERT( __kmp_init_serial );
2387     __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2388     __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2389 }
2390 
2391 /*!
2392 @param loc Source code location
2393 @param gtid Global thread id
2394 @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2395 @param p_lb   Pointer to the lower bound for the next chunk of work
2396 @param p_ub   Pointer to the upper bound for the next chunk of work
2397 @param p_st   Pointer to the stride for the next chunk of work
2398 @return one if there is work to be done, zero otherwise
2399 
2400 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2402 */
2403 int
2404 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2405                         kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2406 {
2407     return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2408 }
2409 
2410 /*!
2411 See @ref __kmpc_dispatch_next_4
2412 */
2413 int
2414 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2415                         kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2416 {
2417     return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2418 }
2419 
2420 /*!
2421 See @ref __kmpc_dispatch_next_4
2422 */
2423 int
2424 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2425                         kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2426 {
2427     return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2428 }
2429 
2430 /*!
2431 See @ref __kmpc_dispatch_next_4
2432 */
2433 int
2434 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2435                         kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2436 {
2437     return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2438 }
2439 
2440 /*!
2441 @param loc Source code location
2442 @param gtid Global thread id
2443 
2444 Mark the end of a dynamic loop.
2445 */
2446 void
2447 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2448 {
2449     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2450 }
2451 
2452 /*!
2453 See @ref __kmpc_dispatch_fini_4
2454 */
2455 void
2456 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2457 {
2458     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2459 }
2460 
2461 /*!
2462 See @ref __kmpc_dispatch_fini_4
2463 */
2464 void
2465 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2466 {
2467     __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2468 }
2469 
2470 /*!
2471 See @ref __kmpc_dispatch_fini_4
2472 */
2473 void
2474 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2475 {
2476     __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2477 }
2478 /*! @} */
2479 
2480 //-----------------------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources
2482 
2483 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2484     return value == checker;
2485 }
2486 
2487 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2488     return value != checker;
2489 }
2490 
2491 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2492     return value < checker;
2493 }
2494 
2495 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2496     return value >= checker;
2497 }
2498 
2499 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2500     return value <= checker;
2501 }
2502 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2503     return value == checker;
2504 }
2505 
2506 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2507     return value != checker;
2508 }
2509 
2510 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2511     return value < checker;
2512 }
2513 
2514 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2515     return value >= checker;
2516 }
2517 
2518 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2519     return value <= checker;
2520 }

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
                   kmp_uint32            checker,
                   kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
                   , void        * obj    // Higher-level synchronization object, or NULL.
                   )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint32         * spin          = spinner;
    register          kmp_uint32           check         = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
    register          kmp_uint32           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = TCR_4(*spin), check)) {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        /* if we have waited a bit, or are oversubscribed, yield */
        /* pause is in the following code */
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
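
/*
    Illustrative only (hypothetical caller; the flag name and release code are
    assumptions for the example): one thread spins on a shared flag with one of
    the predicates defined above, while another thread publishes the value with
    the runtime's TCW_4 store macro.

        volatile kmp_uint32 release_flag = 0;

        // waiting thread: spins (yielding when oversubscribed) until the
        // predicate holds, then returns the first value that satisfied it
        kmp_uint32 observed = __kmp_wait_yield_4( &release_flag, 1, __kmp_ge_4, NULL );

        // releasing thread:
        TCW_4( release_flag, 1 );
*/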

kmp_uint64
__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
                    kmp_uint64            checker,
                    kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
                    , void        * obj    // Higher-level synchronization object, or NULL.
                    )
{
    // note: we may not belong to a team at this point
    register volatile kmp_uint64         * spin          = spinner;
    register          kmp_uint64           check         = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
    register          kmp_uint64           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void
__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                           kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                            kmp_int32 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                           kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                           kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                      push_ws );
}

void
__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
                            kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                            kmp_int64 chunk, int push_ws )
{
    __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
                                       push_ws );
}

void
__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
}

void
__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
{
    __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
}

#endif /* KMP_GOMP_COMPAT */
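
/*
    Note: the __kmp_aux_dispatch_* wrappers above exist so that non-template
    callers (for example the GOMP compatibility entry points) can reach the
    internal C++ dispatch templates with explicit fixed-width types.  A
    hypothetical call from such a caller, with example values only:

        __kmp_aux_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
                                   0, n - 1, 1, 1, TRUE );   // lb, ub, st, chunk, push_ws
*/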

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
