1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside any dispatch loop, but it may change
 *       values between parallel regions.  __kmp_max_nth is the largest value
 *       __kmp_nth may take; 1 is the smallest.
 */
20 
21 // Need to raise Win version from XP to Vista here for support of
22 // InterlockedExchange64
23 #if defined(_WIN32_WINNT) && defined(_M_IX86)
24 #undef _WIN32_WINNT
25 #define _WIN32_WINNT 0x0502
26 #endif
27 
28 #include "kmp.h"
29 #include "kmp_error.h"
30 #include "kmp_i18n.h"
31 #include "kmp_itt.h"
32 #include "kmp_stats.h"
33 #include "kmp_str.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-internal.h"
40 #include "ompt-specific.h"
41 #endif
42 
43 /* ------------------------------------------------------------------------ */
44 
45 #if KMP_STATIC_STEAL_ENABLED
46 
47 // replaces dispatch_private_info{32,64} structures and
48 // dispatch_private_info{32,64}_t types
49 template <typename T> struct dispatch_private_infoXX_template {
50   typedef typename traits_t<T>::unsigned_t UT;
51   typedef typename traits_t<T>::signed_t ST;
52   UT count; // unsigned
53   T ub;
54   /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
55   T lb;
56   ST st; // signed
57   UT tc; // unsigned
58   T static_steal_counter; // for static_steal only; maybe better to put after ub
59 
60   /* parm[1-4] are used in different ways by different scheduling algorithms */
61 
62   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
63   //    a) parm3 is properly aligned and
64   //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same line (not measured, though).
67 
68   struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
69     T parm1;
70     T parm2;
71     T parm3;
72     T parm4;
73   };
74 
75   UT ordered_lower; // unsigned
76   UT ordered_upper; // unsigned
77 #if KMP_OS_WINDOWS
78   T last_upper;
79 #endif /* KMP_OS_WINDOWS */
80 };
81 
82 #else /* KMP_STATIC_STEAL_ENABLED */
83 
84 // replaces dispatch_private_info{32,64} structures and
85 // dispatch_private_info{32,64}_t types
86 template <typename T> struct dispatch_private_infoXX_template {
87   typedef typename traits_t<T>::unsigned_t UT;
88   typedef typename traits_t<T>::signed_t ST;
89   T lb;
90   T ub;
91   ST st; // signed
92   UT tc; // unsigned
93 
94   T parm1;
95   T parm2;
96   T parm3;
97   T parm4;
98 
99   UT count; // unsigned
100 
101   UT ordered_lower; // unsigned
102   UT ordered_upper; // unsigned
103 #if KMP_OS_WINDOWS
104   T last_upper;
105 #endif /* KMP_OS_WINDOWS */
106 };
107 
108 #endif /* KMP_STATIC_STEAL_ENABLED */
109 
110 // replaces dispatch_private_info structure and dispatch_private_info_t type
111 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise the size of the structure is not
  // correct in our compiler
114   union KMP_ALIGN_CACHE private_info_tmpl {
115     dispatch_private_infoXX_template<T> p;
116     dispatch_private_info64_t p64;
117   } u;
118   enum sched_type schedule; /* scheduling algorithm */
119   kmp_uint32 ordered; /* ordered clause specified */
120   kmp_uint32 ordered_bumped;
121   // To retain the structure size after making ordered_iteration scalar
122   kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
123   dispatch_private_info *next; /* stack of buffers for nest of serial regions */
124   kmp_uint32 nomerge; /* don't merge iters if serialized */
125   kmp_uint32 type_size;
126   enum cons_type pushed_ws;
127 };
128 
129 // replaces dispatch_shared_info{32,64} structures and
130 // dispatch_shared_info{32,64}_t types
131 template <typename UT> struct dispatch_shared_infoXX_template {
132   /* chunk index under dynamic, number of idle threads under static-steal;
133      iteration index otherwise */
134   volatile UT iteration;
135   volatile UT num_done;
136   volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration scalar
138   UT ordered_dummy[KMP_MAX_ORDERED - 3];
139 };
140 
141 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
142 template <typename UT> struct dispatch_shared_info_template {
143   // we need union here to keep the structure size
144   union shared_info_tmpl {
145     dispatch_shared_infoXX_template<UT> s;
146     dispatch_shared_info64_t s64;
147   } u;
148   volatile kmp_uint32 buffer_index;
149 #if OMP_45_ENABLED
150   volatile kmp_int32 doacross_buf_idx; // teamwise index
151   kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
152   kmp_int32 doacross_num_done; // count finished threads
153 #endif
154 #if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
156   // machines (> 48 cores). Performance analysis showed that a cache thrash
157   // was occurring and this padding helps alleviate the problem.
158   char padding[64];
159 #endif
160 };
161 
162 /* ------------------------------------------------------------------------ */
163 
164 #undef USE_TEST_LOCKS
165 
166 // test_then_add template (general template should NOT be used)
167 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
168 
169 template <>
170 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
171                                                  kmp_int32 d) {
172   kmp_int32 r;
173   r = KMP_TEST_THEN_ADD32(p, d);
174   return r;
175 }
176 
177 template <>
178 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
179                                                  kmp_int64 d) {
180   kmp_int64 r;
181   r = KMP_TEST_THEN_ADD64(p, d);
182   return r;
183 }
184 
185 // test_then_inc_acq template (general template should NOT be used)
186 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
187 
188 template <>
189 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
190   kmp_int32 r;
191   r = KMP_TEST_THEN_INC_ACQ32(p);
192   return r;
193 }
194 
195 template <>
196 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
197   kmp_int64 r;
198   r = KMP_TEST_THEN_INC_ACQ64(p);
199   return r;
200 }
201 
202 // test_then_inc template (general template should NOT be used)
203 template <typename T> static __forceinline T test_then_inc(volatile T *p);
204 
205 template <>
206 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
207   kmp_int32 r;
208   r = KMP_TEST_THEN_INC32(p);
209   return r;
210 }
211 
212 template <>
213 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
214   kmp_int64 r;
215   r = KMP_TEST_THEN_INC64(p);
216   return r;
217 }
218 
219 // compare_and_swap template (general template should NOT be used)
220 template <typename T>
221 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
222 
223 template <>
224 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
225                                                     kmp_int32 c, kmp_int32 s) {
226   return KMP_COMPARE_AND_STORE_REL32(p, c, s);
227 }
228 
229 template <>
230 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
231                                                     kmp_int64 c, kmp_int64 s) {
232   return KMP_COMPARE_AND_STORE_REL64(p, c, s);
233 }
234 
/* Spin wait loop that first does pause, then yield.
    Waits until the predicate function returns non-zero when called with
    *spinner and check. Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- the higher-level synchronization object to report to ittnotify.
        It is used to report locks consistently. For example, if a lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
        should report the same address, not the address of the low-level
        spinner.
#endif // USE_ITT_BUILD
*/
248 template <typename UT>
249 // ToDo: make inline function (move to header file for icl)
250 static UT // unsigned 4- or 8-byte type
251     __kmp_wait_yield(
252         volatile UT *spinner, UT checker,
253         kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
254             void *obj) // Higher-level synchronization object, or NULL.
255         ) {
256   // note: we may not belong to a team at this point
257   volatile UT *spin = spinner;
258   UT check = checker;
259   kmp_uint32 spins;
260   kmp_uint32 (*f)(UT, UT) = pred;
261   UT r;
262 
263   KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
264   KMP_INIT_YIELD(spins);
265   // main wait spin loop
266   while (!f(r = *spin, check)) {
267     KMP_FSYNC_SPIN_PREPARE(obj);
268     /* GEH - remove this since it was accidentally introduced when kmp_wait was
269        split. It causes problems with infinite recursion because of exit lock */
270     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
271         __kmp_abort_thread(); */
272 
    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. Pause is in the following code.
275     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
276     KMP_YIELD_SPIN(spins);
277   }
278   KMP_FSYNC_SPIN_ACQUIRED(obj);
279   return r;
280 }
281 
282 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
283   return value == checker;
284 }
285 
286 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
287   return value != checker;
288 }
289 
290 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
291   return value < checker;
292 }
293 
294 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
295   return value >= checker;
296 }
297 
298 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
299   return value <= checker;
300 }
301 
302 /* ------------------------------------------------------------------------ */
303 
304 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
305                                      ident_t *loc_ref) {
306   kmp_info_t *th;
307 
308   KMP_DEBUG_ASSERT(gtid_ref);
309 
310   if (__kmp_env_consistency_check) {
311     th = __kmp_threads[*gtid_ref];
312     if (th->th.th_root->r.r_active &&
313         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
314 #if KMP_USE_DYNAMIC_LOCK
315       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
316 #else
317       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
318 #endif
319     }
320   }
321 }
322 
323 template <typename UT>
324 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
325   typedef typename traits_t<UT>::signed_t ST;
326   dispatch_private_info_template<UT> *pr;
327 
328   int gtid = *gtid_ref;
329   //    int  cid = *cid_ref;
330   kmp_info_t *th = __kmp_threads[gtid];
331   KMP_DEBUG_ASSERT(th->th.th_dispatch);
332 
333   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
334   if (__kmp_env_consistency_check) {
335     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
336         th->th.th_dispatch->th_dispatch_pr_current);
337     if (pr->pushed_ws != ct_none) {
338 #if KMP_USE_DYNAMIC_LOCK
339       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
340 #else
341       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
342 #endif
343     }
344   }
345 
346   if (!th->th.th_team->t.t_serialized) {
347     dispatch_shared_info_template<UT> *sh =
348         reinterpret_cast<dispatch_shared_info_template<UT> *>(
349             th->th.th_dispatch->th_dispatch_sh_current);
350     UT lower;
351 
352     if (!__kmp_env_consistency_check) {
353       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
354           th->th.th_dispatch->th_dispatch_pr_current);
355     }
356     lower = pr->u.p.ordered_lower;
357 
358 #if !defined(KMP_GOMP_COMPAT)
359     if (__kmp_env_consistency_check) {
360       if (pr->ordered_bumped) {
361         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
362         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
363                                ct_ordered_in_pdo, loc_ref,
364                                &p->stack_data[p->w_top]);
365       }
366     }
367 #endif /* !defined(KMP_GOMP_COMPAT) */
368 
369     KMP_MB();
370 #ifdef KMP_DEBUG
371     {
372       const char *buff;
373       // create format specifiers before the debug output
374       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
375                               "ordered_iter:%%%s lower:%%%s\n",
376                               traits_t<UT>::spec, traits_t<UT>::spec);
377       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
378       __kmp_str_free(&buff);
379     }
380 #endif
381 
382     __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
383                          __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
384     KMP_MB(); /* is this necessary? */
385 #ifdef KMP_DEBUG
386     {
387       const char *buff;
388       // create format specifiers before the debug output
389       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
390                               "ordered_iter:%%%s lower:%%%s\n",
391                               traits_t<UT>::spec, traits_t<UT>::spec);
392       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
393       __kmp_str_free(&buff);
394     }
395 #endif
396   }
397   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
398 }
399 
400 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
401                                      ident_t *loc_ref) {
402   kmp_info_t *th;
403 
404   if (__kmp_env_consistency_check) {
405     th = __kmp_threads[*gtid_ref];
406     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
407       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
408     }
409   }
410 }
411 
412 template <typename UT>
413 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
414   typedef typename traits_t<UT>::signed_t ST;
415   dispatch_private_info_template<UT> *pr;
416 
417   int gtid = *gtid_ref;
418   //    int  cid = *cid_ref;
419   kmp_info_t *th = __kmp_threads[gtid];
420   KMP_DEBUG_ASSERT(th->th.th_dispatch);
421 
422   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
423   if (__kmp_env_consistency_check) {
424     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
425         th->th.th_dispatch->th_dispatch_pr_current);
426     if (pr->pushed_ws != ct_none) {
427       __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
428     }
429   }
430 
431   if (!th->th.th_team->t.t_serialized) {
432     dispatch_shared_info_template<UT> *sh =
433         reinterpret_cast<dispatch_shared_info_template<UT> *>(
434             th->th.th_dispatch->th_dispatch_sh_current);
435 
436     if (!__kmp_env_consistency_check) {
437       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
438           th->th.th_dispatch->th_dispatch_pr_current);
439     }
440 
441     KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
442 #if !defined(KMP_GOMP_COMPAT)
443     if (__kmp_env_consistency_check) {
444       if (pr->ordered_bumped != 0) {
445         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
446         /* How to test it? - OM */
447         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
448                                ct_ordered_in_pdo, loc_ref,
449                                &p->stack_data[p->w_top]);
450       }
451     }
452 #endif /* !defined(KMP_GOMP_COMPAT) */
453 
454     KMP_MB(); /* Flush all pending memory write invalidates.  */
455 
456     pr->ordered_bumped += 1;
457 
458     KD_TRACE(1000,
459              ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
460               gtid, pr->ordered_bumped));
461 
462     KMP_MB(); /* Flush all pending memory write invalidates.  */
463 
464     /* TODO use general release procedure? */
465     test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
466 
467     KMP_MB(); /* Flush all pending memory write invalidates.  */
468   }
469   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
470 }
471 
// Computes and returns x to the power of y, where y must be a non-negative
// integer.
473 template <typename UT>
474 static __forceinline long double __kmp_pow(long double x, UT y) {
475   long double s = 1.0L;
476 
477   KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
478   // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
479   while (y) {
480     if (y & 1)
481       s *= x;
482     x *= x;
483     y >>= 1;
484   }
485   return s;
486 }
487 
/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken, so
   if we __forceinline this function the behavior is wrong (one of the unit
   tests, sch_guided_analytical_basic.cpp, fails). */
493 template <typename T>
494 static __inline typename traits_t<T>::unsigned_t
495 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
496                                 typename traits_t<T>::unsigned_t idx) {
497   /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
498      ICL 8.1, long double arithmetic may not really have long double precision,
499      even with /Qlong_double.  Currently, we workaround that in the caller code,
500      by manipulating the FPCW for Windows* OS on IA-32 architecture.  The lack
501      of precision is not expected to be a correctness issue, though. */
502   typedef typename traits_t<T>::unsigned_t UT;
503 
504   long double x = tc * __kmp_pow<UT>(base, idx);
505   UT r = (UT)x;
506   if (x == r)
507     return r;
508   return r + 1;
509 }
510 
511 // Parameters of the guided-iterative algorithm:
512 //   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
513 //   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.,
// trip / nproc.
517 static int guided_int_param = 2;
518 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
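// Illustrative example (not part of the original source): with the default
// n = 2, nproc = 8 and chunk = 4, p2 = 2 * 8 * (4 + 1) = 80 (switch to
// dynamic once fewer than 80 iterations remain) and p3 = 0.5 / 8 = 0.0625,
// so each grab is on the order of 1/16 of the remaining iterations.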
519 
520 // UT - unsigned flavor of T, ST - signed flavor of T,
521 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
522 template <typename T>
523 static void
524 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
525                     T ub, typename traits_t<T>::signed_t st,
526                     typename traits_t<T>::signed_t chunk, int push_ws) {
527   typedef typename traits_t<T>::unsigned_t UT;
528   typedef typename traits_t<T>::signed_t ST;
529   typedef typename traits_t<T>::floating_t DBL;
530 
531   int active;
532   T tc;
533   kmp_info_t *th;
534   kmp_team_t *team;
535   kmp_uint32 my_buffer_index;
536   dispatch_private_info_template<T> *pr;
537   dispatch_shared_info_template<UT> volatile *sh;
538 
539   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
540                    sizeof(dispatch_private_info));
541   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
542                    sizeof(dispatch_shared_info));
543 
544   if (!TCR_4(__kmp_init_parallel))
545     __kmp_parallel_initialize();
546 
547 #if INCLUDE_SSC_MARKS
548   SSC_MARK_DISPATCH_INIT();
549 #endif
550 #ifdef KMP_DEBUG
551   {
552     const char *buff;
553     // create format specifiers before the debug output
554     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
555                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
556                             traits_t<ST>::spec, traits_t<T>::spec,
557                             traits_t<T>::spec, traits_t<ST>::spec);
558     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
559     __kmp_str_free(&buff);
560   }
561 #endif
562   /* setup data */
563   th = __kmp_threads[gtid];
564   team = th->th.th_team;
565   active = !team->t.t_serialized;
566   th->th.th_ident = loc;
567 
568 #if USE_ITT_BUILD
569   kmp_uint64 cur_chunk = chunk;
570   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
571                                     __kmp_forkjoin_frames_mode == 3 &&
572                                     KMP_MASTER_GTID(gtid) &&
573 #if OMP_40_ENABLED
574                                     th->th.th_teams_microtask == NULL &&
575 #endif
576                                     team->t.t_active_level == 1;
577 #endif
578   if (!active) {
579     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
580         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
581   } else {
582     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
583                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
584 
585     my_buffer_index = th->th.th_dispatch->th_disp_index++;
586 
    /* What happens when the number of threads changes? Do we need to resize
       the buffer? */
588     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
589         &th->th.th_dispatch
590              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
591     sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
592         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
593   }
594 
595 #if (KMP_STATIC_STEAL_ENABLED)
596   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
597     // AC: we now have only one implementation of stealing, so use it
598     schedule = kmp_sch_static_steal;
599   else
600 #endif
601     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
602 
603   /* Pick up the nomerge/ordered bits from the scheduling type */
604   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
605     pr->nomerge = TRUE;
606     schedule =
607         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
608   } else {
609     pr->nomerge = FALSE;
610   }
611   pr->type_size = traits_t<T>::type_size; // remember the size of variables
612   if (kmp_ord_lower & schedule) {
613     pr->ordered = TRUE;
614     schedule =
615         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
616   } else {
617     pr->ordered = FALSE;
618   }
619 
620   if (schedule == kmp_sch_static) {
621     schedule = __kmp_static;
622   } else {
623     if (schedule == kmp_sch_runtime) {
624       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
625       // not specified)
626       schedule = team->t.t_sched.r_sched_type;
627       // Detail the schedule if needed (global controls are differentiated
628       // appropriately)
629       if (schedule == kmp_sch_guided_chunked) {
630         schedule = __kmp_guided;
631       } else if (schedule == kmp_sch_static) {
632         schedule = __kmp_static;
633       }
634       // Use the chunk size specified by OMP_SCHEDULE (or default if not
635       // specified)
636       chunk = team->t.t_sched.chunk;
637 #if USE_ITT_BUILD
638       cur_chunk = chunk;
639 #endif
640 #ifdef KMP_DEBUG
641       {
642         const char *buff;
643         // create format specifiers before the debug output
644         buff = __kmp_str_format(
645             "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
646             traits_t<ST>::spec);
647         KD_TRACE(10, (buff, gtid, schedule, chunk));
648         __kmp_str_free(&buff);
649       }
650 #endif
651     } else {
652       if (schedule == kmp_sch_guided_chunked) {
653         schedule = __kmp_guided;
654       }
655       if (chunk <= 0) {
656         chunk = KMP_DEFAULT_CHUNK;
657       }
658     }
659 
660     if (schedule == kmp_sch_auto) {
      // mapping and differentiation are done in __kmp_do_serial_initialize()
662       schedule = __kmp_auto;
663 #ifdef KMP_DEBUG
664       {
665         const char *buff;
666         // create format specifiers before the debug output
667         buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
668                                 "schedule:%%d chunk:%%%s\n",
669                                 traits_t<ST>::spec);
670         KD_TRACE(10, (buff, gtid, schedule, chunk));
671         __kmp_str_free(&buff);
672       }
673 #endif
674     }
675 
676     /* guided analytical not safe for too many threads */
677     if (schedule == kmp_sch_guided_analytical_chunked &&
678         th->th.th_team_nproc > 1 << 20) {
679       schedule = kmp_sch_guided_iterative_chunked;
680       KMP_WARNING(DispatchManyThreads);
681     }
682     if (schedule == kmp_sch_runtime_simd) {
683       // compiler provides simd_width in the chunk parameter
684       schedule = team->t.t_sched.r_sched_type;
685       // Detail the schedule if needed (global controls are differentiated
686       // appropriately)
687       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
688           schedule == __kmp_static) {
689         schedule = kmp_sch_static_balanced_chunked;
690       } else {
691         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
692           schedule = kmp_sch_guided_simd;
693         }
694         chunk = team->t.t_sched.chunk * chunk;
695       }
696 #if USE_ITT_BUILD
697       cur_chunk = chunk;
698 #endif
699 #ifdef KMP_DEBUG
700       {
701         const char *buff;
702         // create format specifiers before the debug output
703         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
704                                 " chunk:%%%s\n",
705                                 traits_t<ST>::spec);
706         KD_TRACE(10, (buff, gtid, schedule, chunk));
707         __kmp_str_free(&buff);
708       }
709 #endif
710     }
711     pr->u.p.parm1 = chunk;
712   }
713   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
714               "unknown scheduling type");
715 
716   pr->u.p.count = 0;
717 
718   if (__kmp_env_consistency_check) {
719     if (st == 0) {
720       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
721                             (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
722     }
723   }
724   // compute trip count
725   if (st == 1) { // most common case
726     if (ub >= lb) {
727       tc = ub - lb + 1;
728     } else { // ub < lb
729       tc = 0; // zero-trip
730     }
731   } else if (st < 0) {
732     if (lb >= ub) {
733       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
734       // where the division needs to be unsigned regardless of the result type
735       tc = (UT)(lb - ub) / (-st) + 1;
736     } else { // lb < ub
737       tc = 0; // zero-trip
738     }
739   } else { // st > 0
740     if (ub >= lb) {
741       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
742       // where the division needs to be unsigned regardless of the result type
743       tc = (UT)(ub - lb) / st + 1;
744     } else { // ub < lb
745       tc = 0; // zero-trip
746     }
747   }
748 
749   // Any half-decent optimizer will remove this test when the blocks are empty
750   // since the macros expand to nothing when statistics are disabled.
751   if (schedule == __kmp_static) {
752     KMP_COUNT_BLOCK(OMP_FOR_static);
753     KMP_COUNT_VALUE(FOR_static_iterations, tc);
754   } else {
755     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
756     KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
757   }
758 
759   pr->u.p.lb = lb;
760   pr->u.p.ub = ub;
761   pr->u.p.st = st;
762   pr->u.p.tc = tc;
763 
764 #if KMP_OS_WINDOWS
765   pr->u.p.last_upper = ub + st;
766 #endif /* KMP_OS_WINDOWS */
767 
  /* NOTE: only the active parallel region(s) have active ordered sections */
769 
770   if (active) {
771     if (pr->ordered == 0) {
772       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
773       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
774     } else {
775       pr->ordered_bumped = 0;
776 
777       pr->u.p.ordered_lower = 1;
778       pr->u.p.ordered_upper = 0;
779 
780       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
781       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
782     }
783   }
784 
785   if (__kmp_env_consistency_check) {
786     enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
787     if (push_ws) {
788       __kmp_push_workshare(gtid, ws, loc);
789       pr->pushed_ws = ws;
790     } else {
791       __kmp_check_workshare(gtid, ws, loc);
792       pr->pushed_ws = ct_none;
793     }
794   }
795 
796   switch (schedule) {
797 #if (KMP_STATIC_STEAL_ENABLED)
798   case kmp_sch_static_steal: {
799     T nproc = th->th.th_team_nproc;
800     T ntc, init;
801 
802     KD_TRACE(100,
803              ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
804 
805     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
806     if (nproc > 1 && ntc >= nproc) {
807       KMP_COUNT_BLOCK(OMP_FOR_static_steal);
808       T id = __kmp_tid_from_gtid(gtid);
809       T small_chunk, extras;
810 
811       small_chunk = ntc / nproc;
812       extras = ntc % nproc;
813 
814       init = id * small_chunk + (id < extras ? id : extras);
815       pr->u.p.count = init;
816       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
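      // Illustrative example (not part of the original source): tc = 100 and
      // chunk = 10 give ntc = 10 chunks; with nproc = 4, small_chunk = 2 and
      // extras = 2, so threads 0..3 initially own chunk-index ranges [0,3),
      // [3,6), [6,8) and [8,10) (count and ub are in units of chunks here).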
817 
818       pr->u.p.parm2 = lb;
819       // pr->pfields.parm3 = 0; // it's not used in static_steal
820       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
821       pr->u.p.st = st;
822       if (traits_t<T>::type_size > 4) {
823         // AC: TODO: check if 16-byte CAS available and use it to
824         // improve performance (probably wait for explicit request
825         // before spending time on this).
826         // For now use dynamically allocated per-thread lock,
827         // free memory in __kmp_dispatch_next when status==0.
828         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
829         th->th.th_dispatch->th_steal_lock =
830             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
831         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
832       }
833       break;
834     } else {
835       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
836                      "kmp_sch_static_balanced\n",
837                      gtid));
838       schedule = kmp_sch_static_balanced;
839       /* too few iterations: fall-through to kmp_sch_static_balanced */
840     } // if
841     /* FALL-THROUGH to static balanced */
842   } // case
843 #endif
844   case kmp_sch_static_balanced: {
845     T nproc = th->th.th_team_nproc;
846     T init, limit;
847 
848     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
849                    gtid));
850 
851     if (nproc > 1) {
852       T id = __kmp_tid_from_gtid(gtid);
853 
854       if (tc < nproc) {
855         if (id < tc) {
856           init = id;
857           limit = id;
858           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
859         } else {
860           pr->u.p.count = 1; /* means no more chunks to execute */
861           pr->u.p.parm1 = FALSE;
862           break;
863         }
864       } else {
865         T small_chunk = tc / nproc;
866         T extras = tc % nproc;
867         init = id * small_chunk + (id < extras ? id : extras);
868         limit = init + small_chunk - (id < extras ? 0 : 1);
869         pr->u.p.parm1 = (id == nproc - 1);
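        // Illustrative example (not part of the original source): tc = 10,
        // nproc = 4 gives small_chunk = 2, extras = 2, so threads 0..3 get
        // iteration ranges [0,2], [3,5], [6,7] and [8,9]; only thread
        // nproc - 1 = 3 sets parm1 (the *plastiter flag).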
870       }
871     } else {
872       if (tc > 0) {
873         init = 0;
874         limit = tc - 1;
875         pr->u.p.parm1 = TRUE;
876       } else { // zero trip count
877         pr->u.p.count = 1; /* means no more chunks to execute */
878         pr->u.p.parm1 = FALSE;
879         break;
880       }
881     }
882 #if USE_ITT_BUILD
883     // Calculate chunk for metadata report
884     if (itt_need_metadata_reporting)
885       cur_chunk = limit - init + 1;
886 #endif
887     if (st == 1) {
888       pr->u.p.lb = lb + init;
889       pr->u.p.ub = lb + limit;
890     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined upper
      // bound
892       T ub_tmp = lb + limit * st;
893       pr->u.p.lb = lb + init * st;
894       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
895       // it exactly
896       if (st > 0) {
897         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
898       } else {
899         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
900       }
901     }
902     if (pr->ordered) {
903       pr->u.p.ordered_lower = init;
904       pr->u.p.ordered_upper = limit;
905     }
906     break;
907   } // case
908   case kmp_sch_static_balanced_chunked: {
909     // similar to balanced, but chunk adjusted to multiple of simd width
910     T nth = th->th.th_team_nproc;
911     KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
912                    " -> falling-through to static_greedy\n",
913                    gtid));
914     schedule = kmp_sch_static_greedy;
915     if (nth > 1)
916       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
917     else
918       pr->u.p.parm1 = tc;
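    // Illustrative example (assuming chunk, the simd width provided by the
    // compiler, is a power of two as the & ~(chunk - 1) rounding above
    // requires): tc = 100, nth = 8, chunk = 8 gives ceil(100 / 8) = 13,
    // rounded up to the next multiple of 8, so parm1 = 16.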
919     break;
920   } // case
921   case kmp_sch_guided_iterative_chunked:
922   case kmp_sch_guided_simd: {
923     T nproc = th->th.th_team_nproc;
924     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
925                    " case\n",
926                    gtid));
927 
928     if (nproc > 1) {
929       if ((2L * chunk + 1) * nproc >= tc) {
930         /* chunk size too large, switch to dynamic */
931         schedule = kmp_sch_dynamic_chunked;
932       } else {
        // when remaining iterations become less than parm2, switch to dynamic
934         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
935         *(double *)&pr->u.p.parm3 =
936             guided_flt_param / nproc; // may occupy parm3 and parm4
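        // Illustrative numbers (not part of the original source): nproc = 4,
        // chunk = 10, tc = 1000 stays guided since (2 * 10 + 1) * 4 = 84 < tc,
        // giving parm2 = 2 * 4 * 11 = 88 and parm3 = 0.5 / 4 = 0.125; had tc
        // been 84 or less, the test above would have switched this loop to
        // kmp_sch_dynamic_chunked.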
937       }
938     } else {
939       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
940                      "kmp_sch_static_greedy\n",
941                      gtid));
942       schedule = kmp_sch_static_greedy;
943       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
944       KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
945                      gtid));
946       pr->u.p.parm1 = tc;
947     } // if
948   } // case
949   break;
950   case kmp_sch_guided_analytical_chunked: {
951     T nproc = th->th.th_team_nproc;
952     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
953                    " case\n",
954                    gtid));
955     if (nproc > 1) {
956       if ((2L * chunk + 1) * nproc >= tc) {
957         /* chunk size too large, switch to dynamic */
958         schedule = kmp_sch_dynamic_chunked;
959       } else {
960         /* commonly used term: (2 nproc - 1)/(2 nproc) */
961         DBL x;
962 
963 #if KMP_OS_WINDOWS && KMP_ARCH_X86
964         /* Linux* OS already has 64-bit computation by default for long double,
965            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
966            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
967            instead of the default 53-bit. Even though long double doesn't work
968            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
969            expected to impact the correctness of the algorithm, but this has not
970            been mathematically proven. */
971         // save original FPCW and set precision to 64-bit, as
972         // Windows* OS on IA-32 architecture defaults to 53-bit
973         unsigned int oldFpcw = _control87(0, 0);
974         _control87(_PC_64, _MCW_PC); // 0,0x30000
975 #endif
976         /* value used for comparison in solver for cross-over point */
977         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
978 
979         /* crossover point--chunk indexes equal to or greater than
980            this point switch to dynamic-style scheduling */
981         UT cross;
982 
983         /* commonly used term: (2 nproc - 1)/(2 nproc) */
984         x = (long double)1.0 - (long double)0.5 / nproc;
985 
986 #ifdef KMP_DEBUG
987         { // test natural alignment
988           struct _test_a {
989             char a;
990             union {
991               char b;
992               DBL d;
993             };
994           } t;
995           ptrdiff_t natural_alignment =
996               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
997           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
998           // long)natural_alignment );
999           KMP_DEBUG_ASSERT(
1000               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
1001         }
1002 #endif // KMP_DEBUG
1003 
1004         /* save the term in thread private dispatch structure */
1005         *(DBL *)&pr->u.p.parm3 = x;
1006 
1007         /* solve for the crossover point to the nearest integer i for which C_i
1008            <= chunk */
1009         {
1010           UT left, right, mid;
1011           long double p;
1012 
1013           /* estimate initial upper and lower bound */
1014 
1015           /* doesn't matter what value right is as long as it is positive, but
1016              it affects performance of the solver */
1017           right = 229;
1018           p = __kmp_pow<UT>(x, right);
1019           if (p > target) {
1020             do {
1021               p *= p;
1022               right <<= 1;
1023             } while (p > target && right < (1 << 27));
1024             /* lower bound is previous (failed) estimate of upper bound */
1025             left = right >> 1;
1026           } else {
1027             left = 0;
1028           }
1029 
1030           /* bisection root-finding method */
1031           while (left + 1 < right) {
1032             mid = (left + right) / 2;
1033             if (__kmp_pow<UT>(x, mid) > target) {
1034               left = mid;
1035             } else {
1036               right = mid;
1037             }
1038           } // while
1039           cross = right;
1040         }
1041         /* assert sanity of computed crossover point */
1042         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
1043                    __kmp_pow<UT>(x, cross) <= target);
1044 
1045         /* save the crossover point in thread private dispatch structure */
1046         pr->u.p.parm2 = cross;
1047 
1048 // C75803
1049 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
1050 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
1051 #else
1052 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1053 #endif
1054         /* dynamic-style scheduling offset */
1055         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
1056                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
1057                         cross * chunk;
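        // Rough worked example (illustrative, values approximate due to long
        // double rounding): nproc = 4, chunk = 10, tc = 1000 give
        // x = 1 - 0.5 / 4 = 0.875 and target = (2 * 10 + 1) * 4 / 1000 =
        // 0.084; the solver above finds cross = 19, the first chunk index at
        // which scheduling switches to dynamic-style.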
1058 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1059         // restore FPCW
1060         _control87(oldFpcw, _MCW_PC);
1061 #endif
1062       } // if
1063     } else {
1064       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
1065                      "kmp_sch_static_greedy\n",
1066                      gtid));
1067       schedule = kmp_sch_static_greedy;
1068       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1069       pr->u.p.parm1 = tc;
1070     } // if
1071   } // case
1072   break;
1073   case kmp_sch_static_greedy:
1074     KD_TRACE(100,
1075              ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
1076     pr->u.p.parm1 = (th->th.th_team_nproc > 1)
1077                         ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
1078                         : tc;
1079     break;
1080   case kmp_sch_static_chunked:
1081   case kmp_sch_dynamic_chunked:
1082     if (pr->u.p.parm1 <= 0) {
1083       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1084     }
1085     KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
1086                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
1087                    gtid));
1088     break;
1089   case kmp_sch_trapezoidal: {
1090     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1091 
1092     T parm1, parm2, parm3, parm4;
1093     KD_TRACE(100,
1094              ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
1095 
1096     parm1 = chunk;
1097 
1098     /* F : size of the first cycle */
1099     parm2 = (tc / (2 * th->th.th_team_nproc));
1100 
1101     if (parm2 < 1) {
1102       parm2 = 1;
1103     }
1104 
1105     /* L : size of the last cycle.  Make sure the last cycle is not larger
1106        than the first cycle. */
1107     if (parm1 < 1) {
1108       parm1 = 1;
1109     } else if (parm1 > parm2) {
1110       parm1 = parm2;
1111     }
1112 
1113     /* N : number of cycles */
1114     parm3 = (parm2 + parm1);
1115     parm3 = (2 * tc + parm3 - 1) / parm3;
1116 
1117     if (parm3 < 2) {
1118       parm3 = 2;
1119     }
1120 
1121     /* sigma : decreasing incr of the trapezoid */
1122     parm4 = (parm3 - 1);
1123     parm4 = (parm2 - parm1) / parm4;
1124 
1125     // pointless check, because parm4 >= 0 always
1126     // if ( parm4 < 0 ) {
1127     //    parm4 = 0;
1128     //}
1129 
1130     pr->u.p.parm1 = parm1;
1131     pr->u.p.parm2 = parm2;
1132     pr->u.p.parm3 = parm3;
1133     pr->u.p.parm4 = parm4;
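    // Illustrative example (not part of the original source): tc = 1000,
    // nproc = 4, chunk = 10 give parm1 = 10 (last/minimum chunk),
    // parm2 = 1000 / 8 = 125 (first chunk), parm3 = (2000 + 134) / 135 = 15
    // cycles and parm4 = (125 - 10) / 14 = 8, so chunk sizes decrease
    // 125, 117, 109, ...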
1134   } // case
1135   break;
1136 
1137   default: {
1138     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1139                 KMP_HNT(GetNewerLibrary), // Hint
1140                 __kmp_msg_null // Variadic argument list terminator
1141                 );
1142   } break;
1143   } // switch
1144   pr->schedule = schedule;
1145   if (active) {
    /* This buffer becomes free to use once sh->buffer_index reaches
     * my_buffer_index */
1148 
1149     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
1150                    "sh->buffer_index:%d\n",
1151                    gtid, my_buffer_index, sh->buffer_index));
1152     __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1153                                  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
1156     KMP_MB(); /* is this necessary? */
1157     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1158                    "sh->buffer_index:%d\n",
1159                    gtid, my_buffer_index, sh->buffer_index));
1160 
1161     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1162     th->th.th_dispatch->th_dispatch_sh_current =
1163         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1164 #if USE_ITT_BUILD
1165     if (pr->ordered) {
1166       __kmp_itt_ordered_init(gtid);
1167     }
1168     // Report loop metadata
1169     if (itt_need_metadata_reporting) {
1170       // Only report metadata by master of active team at level 1
1171       kmp_uint64 schedtype = 0;
1172       switch (schedule) {
1173       case kmp_sch_static_chunked:
1174       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1175         break;
1176       case kmp_sch_static_greedy:
1177         cur_chunk = pr->u.p.parm1;
1178         break;
1179       case kmp_sch_dynamic_chunked:
1180         schedtype = 1;
1181         break;
1182       case kmp_sch_guided_iterative_chunked:
1183       case kmp_sch_guided_analytical_chunked:
1184       case kmp_sch_guided_simd:
1185         schedtype = 2;
1186         break;
1187       default:
1188         // Should we put this case under "static"?
1189         // case kmp_sch_static_steal:
1190         schedtype = 3;
1191         break;
1192       }
1193       __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1194     }
1195 #endif /* USE_ITT_BUILD */
1196   }
1197 
1198 #ifdef KMP_DEBUG
1199   {
1200     const char *buff;
1201     // create format specifiers before the debug output
1202     buff = __kmp_str_format(
1203         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1204         "lb:%%%s ub:%%%s"
1205         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1206         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1207         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1208         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1209         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1210         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1211     KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1212                   pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
1213                   pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
1214                   pr->u.p.parm3, pr->u.p.parm4));
1215     __kmp_str_free(&buff);
1216   }
1217 #endif
1218 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 were the same, there would still be a bad case, such as toggling
  // between 0 and 1 instead of a program-lifetime increment. So a dedicated
  // variable is required; 'static_steal_counter' is used for this.
1224   if (schedule == kmp_sch_static_steal) {
1225     // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that, from now on, other threads may steal from
    // this thread.
1228     volatile T *p = &pr->u.p.static_steal_counter;
1229     *p = *p + 1;
1230   }
1231 #endif // ( KMP_STATIC_STEAL_ENABLED )
1232 
1233 #if OMPT_SUPPORT && OMPT_TRACE
1234   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1235     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1236     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1237     ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1238         team_info->parallel_id, task_info->task_id, team_info->microtask);
1239   }
1240 #endif
1241 }
1242 
1243 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1244  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1245  * every chunk of iterations.  If the ordered section(s) were not executed
1246  * for this iteration (or every iteration in this chunk), we need to set the
1247  * ordered iteration counters so that the next thread can proceed. */
1248 template <typename UT>
1249 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1250   typedef typename traits_t<UT>::signed_t ST;
1251   kmp_info_t *th = __kmp_threads[gtid];
1252 
1253   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1254   if (!th->th.th_team->t.t_serialized) {
1255 
1256     dispatch_private_info_template<UT> *pr =
1257         reinterpret_cast<dispatch_private_info_template<UT> *>(
1258             th->th.th_dispatch->th_dispatch_pr_current);
1259     dispatch_shared_info_template<UT> volatile *sh =
1260         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1261             th->th.th_dispatch->th_dispatch_sh_current);
1262     KMP_DEBUG_ASSERT(pr);
1263     KMP_DEBUG_ASSERT(sh);
1264     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1265                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1266 
1267     if (pr->ordered_bumped) {
1268       KD_TRACE(
1269           1000,
1270           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1271            gtid));
1272       pr->ordered_bumped = 0;
1273     } else {
1274       UT lower = pr->u.p.ordered_lower;
1275 
1276 #ifdef KMP_DEBUG
1277       {
1278         const char *buff;
1279         // create format specifiers before the debug output
1280         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1281                                 "ordered_iteration:%%%s lower:%%%s\n",
1282                                 traits_t<UT>::spec, traits_t<UT>::spec);
1283         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1284         __kmp_str_free(&buff);
1285       }
1286 #endif
1287 
1288       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1289                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1290       KMP_MB(); /* is this necessary? */
1291 #ifdef KMP_DEBUG
1292       {
1293         const char *buff;
1294         // create format specifiers before the debug output
1295         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1296                                 "ordered_iteration:%%%s lower:%%%s\n",
1297                                 traits_t<UT>::spec, traits_t<UT>::spec);
1298         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1299         __kmp_str_free(&buff);
1300       }
1301 #endif
1302 
1303       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1304     } // if
1305   } // if
1306   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1307 }
1308 
1309 #ifdef KMP_GOMP_COMPAT
1310 
1311 template <typename UT>
1312 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1313   typedef typename traits_t<UT>::signed_t ST;
1314   kmp_info_t *th = __kmp_threads[gtid];
1315 
1316   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1317   if (!th->th.th_team->t.t_serialized) {
1318     //        int cid;
1319     dispatch_private_info_template<UT> *pr =
1320         reinterpret_cast<dispatch_private_info_template<UT> *>(
1321             th->th.th_dispatch->th_dispatch_pr_current);
1322     dispatch_shared_info_template<UT> volatile *sh =
1323         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1324             th->th.th_dispatch->th_dispatch_sh_current);
1325     KMP_DEBUG_ASSERT(pr);
1326     KMP_DEBUG_ASSERT(sh);
1327     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1328                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1329 
1330     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1331     UT lower = pr->u.p.ordered_lower;
1332     UT upper = pr->u.p.ordered_upper;
1333     UT inc = upper - lower + 1;
1334 
1335     if (pr->ordered_bumped == inc) {
1336       KD_TRACE(
1337           1000,
1338           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1339            gtid));
1340       pr->ordered_bumped = 0;
1341     } else {
1342       inc -= pr->ordered_bumped;
1343 
1344 #ifdef KMP_DEBUG
1345       {
1346         const char *buff;
1347         // create format specifiers before the debug output
1348         buff = __kmp_str_format(
1349             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1350             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1351             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1352         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1353         __kmp_str_free(&buff);
1354       }
1355 #endif
1356 
1357       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1358                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1359 
1360       KMP_MB(); /* is this necessary? */
1361       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1362                       "ordered_bumped to zero\n",
1363                       gtid));
1364       pr->ordered_bumped = 0;
1365 //!!!!! TODO check if the inc should be unsigned, or signed???
1366 #ifdef KMP_DEBUG
1367       {
1368         const char *buff;
1369         // create format specifiers before the debug output
1370         buff = __kmp_str_format(
1371             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1372             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1373             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1374             traits_t<UT>::spec);
1375         KD_TRACE(1000,
1376                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1377         __kmp_str_free(&buff);
1378       }
1379 #endif
1380 
1381       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1382     }
1383     //        }
1384   }
1385   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1386 }
1387 
1388 #endif /* KMP_GOMP_COMPAT */
1389 
1390 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1391    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1392    is not called. */
1393 #if OMPT_SUPPORT && OMPT_TRACE
1394 #define OMPT_LOOP_END                                                          \
1395   if (status == 0) {                                                           \
1396     if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {   \
1397       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1398       ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                    \
1399       ompt_callbacks.ompt_callback(ompt_event_loop_end)(                       \
1400           team_info->parallel_id, task_info->task_id);                         \
1401     }                                                                          \
1402   }
1403 #else
1404 #define OMPT_LOOP_END // no-op
1405 #endif
1406 
1407 template <typename T>
1408 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1409                                T *p_lb, T *p_ub,
1410                                typename traits_t<T>::signed_t *p_st) {
1411 
1412   typedef typename traits_t<T>::unsigned_t UT;
1413   typedef typename traits_t<T>::signed_t ST;
1414   typedef typename traits_t<T>::floating_t DBL;
1415 
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used, it
  // costs more than a compile-time choice to use static scheduling would.)
1420   KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1421 
1422   int status;
1423   dispatch_private_info_template<T> *pr;
1424   kmp_info_t *th = __kmp_threads[gtid];
1425   kmp_team_t *team = th->th.th_team;
1426 
1427   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1428 #ifdef KMP_DEBUG
1429   {
1430     const char *buff;
1431     // create format specifiers before the debug output
1432     buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1433                             "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1434                             traits_t<T>::spec, traits_t<T>::spec,
1435                             traits_t<ST>::spec);
1436     KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1437     __kmp_str_free(&buff);
1438   }
1439 #endif
1440 
1441   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1443     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1444         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1445     KMP_DEBUG_ASSERT(pr);
1446 
1447     if ((status = (pr->u.p.tc != 0)) == 0) {
1448       *p_lb = 0;
1449       *p_ub = 0;
1450       //            if ( p_last != NULL )
1451       //                *p_last = 0;
1452       if (p_st != NULL)
1453         *p_st = 0;
1454       if (__kmp_env_consistency_check) {
1455         if (pr->pushed_ws != ct_none) {
1456           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1457         }
1458       }
1459     } else if (pr->nomerge) {
1460       kmp_int32 last;
1461       T start;
1462       UT limit, trip, init;
1463       ST incr;
1464       T chunk = pr->u.p.parm1;
1465 
1466       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1467                      gtid));
1468 
1469       init = chunk * pr->u.p.count++;
1470       trip = pr->u.p.tc - 1;
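      // e.g. (illustrative): with chunk == 4 and count previously 2, this call
      // claims init == 8, i.e. logical iterations 8..11 (mapped through lb and
      // st below), clipped to trip if this turns out to be the last chunk.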
1471 
1472       if ((status = (init <= trip)) == 0) {
1473         *p_lb = 0;
1474         *p_ub = 0;
1475         //                if ( p_last != NULL )
1476         //                    *p_last = 0;
1477         if (p_st != NULL)
1478           *p_st = 0;
1479         if (__kmp_env_consistency_check) {
1480           if (pr->pushed_ws != ct_none) {
1481             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1482           }
1483         }
1484       } else {
1485         start = pr->u.p.lb;
1486         limit = chunk + init - 1;
1487         incr = pr->u.p.st;
1488 
1489         if ((last = (limit >= trip)) != 0) {
1490           limit = trip;
1491 #if KMP_OS_WINDOWS
1492           pr->u.p.last_upper = pr->u.p.ub;
1493 #endif /* KMP_OS_WINDOWS */
1494         }
1495         if (p_last != NULL)
1496           *p_last = last;
1497         if (p_st != NULL)
1498           *p_st = incr;
1499         if (incr == 1) {
1500           *p_lb = start + init;
1501           *p_ub = start + limit;
1502         } else {
1503           *p_lb = start + init * incr;
1504           *p_ub = start + limit * incr;
1505         }
1506 
1507         if (pr->ordered) {
1508           pr->u.p.ordered_lower = init;
1509           pr->u.p.ordered_upper = limit;
1510 #ifdef KMP_DEBUG
1511           {
1512             const char *buff;
1513             // create format specifiers before the debug output
1514             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1515                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1516                                     traits_t<UT>::spec, traits_t<UT>::spec);
1517             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1518                             pr->u.p.ordered_upper));
1519             __kmp_str_free(&buff);
1520           }
1521 #endif
1522         } // if
1523       } // if
1524     } else {
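      // non-nomerge serialized case: return the whole loop as a single chunk
      // and zero tc so the next call reports no more work.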
1525       pr->u.p.tc = 0;
1526       *p_lb = pr->u.p.lb;
1527       *p_ub = pr->u.p.ub;
1528 #if KMP_OS_WINDOWS
1529       pr->u.p.last_upper = *p_ub;
1530 #endif /* KMP_OS_WINDOWS */
1531       if (p_last != NULL)
1532         *p_last = TRUE;
1533       if (p_st != NULL)
1534         *p_st = pr->u.p.st;
1535     } // if
1536 #ifdef KMP_DEBUG
1537     {
1538       const char *buff;
1539       // create format specifiers before the debug output
1540       buff = __kmp_str_format(
1541           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1542           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1543           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1544       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1545       __kmp_str_free(&buff);
1546     }
1547 #endif
1548 #if INCLUDE_SSC_MARKS
1549     SSC_MARK_DISPATCH_NEXT();
1550 #endif
1551     OMPT_LOOP_END;
1552     return status;
1553   } else {
1554     kmp_int32 last = 0;
1555     dispatch_shared_info_template<UT> *sh;
1556     T start;
1557     ST incr;
1558     UT limit, trip, init;
1559 
1560     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1561                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1562 
1563     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1564         th->th.th_dispatch->th_dispatch_pr_current);
1565     KMP_DEBUG_ASSERT(pr);
1566     sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1567         th->th.th_dispatch->th_dispatch_sh_current);
1568     KMP_DEBUG_ASSERT(sh);
1569 
1570     if (pr->u.p.tc == 0) {
1571       // zero trip count
1572       status = 0;
1573     } else {
1574       switch (pr->schedule) {
1575 #if (KMP_STATIC_STEAL_ENABLED)
1576       case kmp_sch_static_steal: {
1577         T chunk = pr->u.p.parm1;
1578         int nproc = th->th.th_team_nproc;
1579 
1580         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1581                        gtid));
1582 
1583         trip = pr->u.p.tc - 1;
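        // Steal protocol sketch (both branches below): a thread first claims
        // chunks from its own [count, ub) range; once that is exhausted it
        // searches other threads (starting from parm4) for a victim with at
        // least 2 chunks left, then takes roughly a quarter of the victim's
        // remaining chunks (or 1 if only 2..3 remain) by lowering the victim's
        // ub and adopting the stolen sub-range as its new (count, ub).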
1584 
1585         if (traits_t<T>::type_size > 4) {
          // use a lock for 8-byte induction variables and CAS for 4-byte ones.
          // TODO (optional): check and use 16-byte CAS
1588           kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1589           KMP_DEBUG_ASSERT(lck != NULL);
1590           if (pr->u.p.count < (UT)pr->u.p.ub) {
1591             __kmp_acquire_lock(lck, gtid);
1592             // try to get own chunk of iterations
1593             init = (pr->u.p.count)++;
1594             status = (init < (UT)pr->u.p.ub);
1595             __kmp_release_lock(lck, gtid);
1596           } else {
1597             status = 0; // no own chunks
1598           }
1599           if (!status) { // try to steal
1600             kmp_info_t **other_threads = team->t.t_threads;
1601             int while_limit = nproc; // nproc attempts to find a victim
1602             int while_index = 0;
            // TODO: the victim-search algorithm should be cleaned up and
            // measured
1605             while ((!status) && (while_limit != ++while_index)) {
1606               T remaining;
1607               T victimIdx = pr->u.p.parm4;
1608               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1609               dispatch_private_info_template<T> *victim =
1610                   reinterpret_cast<dispatch_private_info_template<T> *>(
1611                       other_threads[victimIdx]
1612                           ->th.th_dispatch->th_dispatch_pr_current);
1613               while ((victim == NULL || victim == pr ||
1614                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1615                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1616                      oldVictimIdx != victimIdx) {
1617                 victimIdx = (victimIdx + 1) % nproc;
1618                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1619                     other_threads[victimIdx]
1620                         ->th.th_dispatch->th_dispatch_pr_current);
1621               }
1622               if (!victim ||
1623                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1624                    *(volatile T *)&pr->u.p.static_steal_counter)) {
1625                 continue; // try once more (nproc attempts in total)
1626                 // no victim is ready yet to participate in stealing
1627                 // because all victims are still in kmp_init_dispatch
1628               }
1629               if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1630                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1631                 continue; // not enough chunks to steal, goto next victim
1632               }
1633 
1634               lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1635               KMP_ASSERT(lck != NULL);
1636               __kmp_acquire_lock(lck, gtid);
1637               limit = victim->u.p.ub; // keep initial ub
1638               if (victim->u.p.count >= limit ||
1639                   (remaining = limit - victim->u.p.count) < 2) {
1640                 __kmp_release_lock(lck, gtid);
1641                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1642                 continue; // not enough chunks to steal
1643               }
              // stealing succeeded; reduce the victim's ub by 1/4 of the
              // remaining chunks, or by 1
1646               if (remaining > 3) {
1647                 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1648                 init = (victim->u.p.ub -=
1649                         (remaining >> 2)); // steal 1/4 of remaining
1650               } else {
1651                 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1652                 init =
1653                     (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1654               }
1655               __kmp_release_lock(lck, gtid);
1656 
1657               KMP_DEBUG_ASSERT(init + 1 <= limit);
1658               pr->u.p.parm4 = victimIdx; // remember victim to steal from
1659               status = 1;
1660               while_index = 0;
              // now update own count and ub with the stolen range, excluding
              // the init chunk just claimed
1662               __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1663               pr->u.p.count = init + 1;
1664               pr->u.p.ub = limit;
1665               __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1666             } // while (search for victim)
1667           } // if (try to find victim and steal)
1668         } else {
1669           // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1670           typedef union {
1671             struct {
1672               UT count;
1673               T ub;
1674             } p;
1675             kmp_int64 b;
1676           } union_i4;
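          // Packing (count, ub) into one 64-bit word lets a single CAS update
          // both fields consistently: claiming a chunk increments count while
          // ub is unchanged, and a steal lowers the victim's ub while its
          // count is unchanged; either CAS fails if the other half changed in
          // the meantime.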
          // All operations on 'count' or 'ub' must be performed atomically as
          // a combined (count, ub) update.
1679           {
1680             union_i4 vold, vnew;
1681             vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1682             vnew = vold;
1683             vnew.p.count++;
1684             while (!KMP_COMPARE_AND_STORE_ACQ64(
1685                 (volatile kmp_int64 *)&pr->u.p.count,
1686                 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1687                 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1688               KMP_CPU_PAUSE();
1689               vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1690               vnew = vold;
1691               vnew.p.count++;
1692             }
1693             vnew = vold;
1694             init = vnew.p.count;
1695             status = (init < (UT)vnew.p.ub);
1696           }
1697 
1698           if (!status) {
1699             kmp_info_t **other_threads = team->t.t_threads;
1700             int while_limit = nproc; // nproc attempts to find a victim
1701             int while_index = 0;
1702 
            // TODO: the victim-search algorithm should be cleaned up and
            // measured
1705             while ((!status) && (while_limit != ++while_index)) {
1706               union_i4 vold, vnew;
1707               kmp_int32 remaining;
1708               T victimIdx = pr->u.p.parm4;
1709               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1710               dispatch_private_info_template<T> *victim =
1711                   reinterpret_cast<dispatch_private_info_template<T> *>(
1712                       other_threads[victimIdx]
1713                           ->th.th_dispatch->th_dispatch_pr_current);
1714               while ((victim == NULL || victim == pr ||
1715                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1716                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1717                      oldVictimIdx != victimIdx) {
1718                 victimIdx = (victimIdx + 1) % nproc;
1719                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1720                     other_threads[victimIdx]
1721                         ->th.th_dispatch->th_dispatch_pr_current);
1722               }
1723               if (!victim ||
1724                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1725                    *(volatile T *)&pr->u.p.static_steal_counter)) {
1726                 continue; // try once more (nproc attempts in total)
1727                 // no victim is ready yet to participate in stealing
1728                 // because all victims are still in kmp_init_dispatch
1729               }
1730               pr->u.p.parm4 = victimIdx; // new victim found
1731               while (1) { // CAS loop if victim has enough chunks to steal
1732                 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1733                 vnew = vold;
1734 
1735                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1736                 if (vnew.p.count >= (UT)vnew.p.ub ||
1737                     (remaining = vnew.p.ub - vnew.p.count) < 2) {
1738                   pr->u.p.parm4 =
1739                       (victimIdx + 1) % nproc; // shift start victim id
1740                   break; // not enough chunks to steal, goto next victim
1741                 }
1742                 if (remaining > 3) {
1743                   vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1744                 } else {
1745                   vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1746                 }
1747                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1748                 // TODO: Should this be acquire or release?
1749                 if (KMP_COMPARE_AND_STORE_ACQ64(
1750                         (volatile kmp_int64 *)&victim->u.p.count,
1751                         *VOLATILE_CAST(kmp_int64 *) & vold.b,
1752                         *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1753                   // stealing succeeded
1754                   KMP_COUNT_VALUE(FOR_static_steal_stolen,
1755                                   vold.p.ub - vnew.p.ub);
1756                   status = 1;
1757                   while_index = 0;
1758                   // now update own count and ub
1759                   init = vnew.p.ub;
1760                   vold.p.count = init + 1;
1761 #if KMP_ARCH_X86
1762                   KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1763                                    vold.b);
1764 #else
1765                   *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1766 #endif
1767                   break;
1768                 } // if (check CAS result)
1769                 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1770               } // while (try to steal from particular victim)
1771             } // while (search for victim)
1772           } // if (try to find victim and steal)
1773         } // if (4-byte induction variable)
1774         if (!status) {
1775           *p_lb = 0;
1776           *p_ub = 0;
1777           if (p_st != NULL)
1778             *p_st = 0;
1779         } else {
1780           start = pr->u.p.parm2;
1781           init *= chunk;
1782           limit = chunk + init - 1;
1783           incr = pr->u.p.st;
1784           KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1785 
1786           KMP_DEBUG_ASSERT(init <= trip);
1787           if ((last = (limit >= trip)) != 0)
1788             limit = trip;
1789           if (p_st != NULL)
1790             *p_st = incr;
1791 
1792           if (incr == 1) {
1793             *p_lb = start + init;
1794             *p_ub = start + limit;
1795           } else {
1796             *p_lb = start + init * incr;
1797             *p_ub = start + limit * incr;
1798           }
1799 
1800           if (pr->ordered) {
1801             pr->u.p.ordered_lower = init;
1802             pr->u.p.ordered_upper = limit;
1803 #ifdef KMP_DEBUG
1804             {
1805               const char *buff;
1806               // create format specifiers before the debug output
1807               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1808                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1809                                       traits_t<UT>::spec, traits_t<UT>::spec);
1810               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1811                               pr->u.p.ordered_upper));
1812               __kmp_str_free(&buff);
1813             }
1814 #endif
1815           } // if
1816         } // if
1817         break;
1818       } // case
1819 #endif // ( KMP_STATIC_STEAL_ENABLED )
1820       case kmp_sch_static_balanced: {
1821         KD_TRACE(
1822             100,
1823             ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1824         if ((status = !pr->u.p.count) !=
1825             0) { /* check if thread has any iteration to do */
1826           pr->u.p.count = 1;
1827           *p_lb = pr->u.p.lb;
1828           *p_ub = pr->u.p.ub;
1829           last = pr->u.p.parm1;
1830           if (p_st != NULL)
1831             *p_st = pr->u.p.st;
1832         } else { /* no iterations to do */
1833           pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1834         }
1835         if (pr->ordered) {
1836 #ifdef KMP_DEBUG
1837           {
1838             const char *buff;
1839             // create format specifiers before the debug output
1840             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1841                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1842                                     traits_t<UT>::spec, traits_t<UT>::spec);
1843             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1844                             pr->u.p.ordered_upper));
1845             __kmp_str_free(&buff);
1846           }
1847 #endif
1848         } // if
1849       } // case
1850       break;
1851       case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1852                                      merged here */
1853       case kmp_sch_static_chunked: {
1854         T parm1;
1855 
1856         KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1857                        "kmp_sch_static_[affinity|chunked] case\n",
1858                        gtid));
1859         parm1 = pr->u.p.parm1;
1860 
1861         trip = pr->u.p.tc - 1;
1862         init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
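        // e.g. (illustrative): with parm1 == 4, nproc == 3 and tid == 1, the
        // first call claims init == 4 (iterations 4..7); count then grows by
        // nproc below, so the next call claims init == 16, and so on.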
1863 
1864         if ((status = (init <= trip)) != 0) {
1865           start = pr->u.p.lb;
1866           incr = pr->u.p.st;
1867           limit = parm1 + init - 1;
1868 
1869           if ((last = (limit >= trip)) != 0)
1870             limit = trip;
1871 
1872           if (p_st != NULL)
1873             *p_st = incr;
1874 
1875           pr->u.p.count += th->th.th_team_nproc;
1876 
1877           if (incr == 1) {
1878             *p_lb = start + init;
1879             *p_ub = start + limit;
1880           } else {
1881             *p_lb = start + init * incr;
1882             *p_ub = start + limit * incr;
1883           }
1884 
1885           if (pr->ordered) {
1886             pr->u.p.ordered_lower = init;
1887             pr->u.p.ordered_upper = limit;
1888 #ifdef KMP_DEBUG
1889             {
1890               const char *buff;
1891               // create format specifiers before the debug output
1892               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1893                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1894                                       traits_t<UT>::spec, traits_t<UT>::spec);
1895               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1896                               pr->u.p.ordered_upper));
1897               __kmp_str_free(&buff);
1898             }
1899 #endif
1900           } // if
1901         } // if
1902       } // case
1903       break;
1904 
1905       case kmp_sch_dynamic_chunked: {
1906         T chunk = pr->u.p.parm1;
1907 
1908         KD_TRACE(
1909             100,
1910             ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1911 
1912         init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1913         trip = pr->u.p.tc - 1;
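        // Each call atomically claims the next chunk index from the shared
        // counter; e.g. (illustrative) with chunk == 100, drawing index 3
        // yields init == 300, i.e. iterations 300..399, clipped to trip.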
1914 
1915         if ((status = (init <= trip)) == 0) {
1916           *p_lb = 0;
1917           *p_ub = 0;
1918           if (p_st != NULL)
1919             *p_st = 0;
1920         } else {
1921           start = pr->u.p.lb;
1922           limit = chunk + init - 1;
1923           incr = pr->u.p.st;
1924 
1925           if ((last = (limit >= trip)) != 0)
1926             limit = trip;
1927 
1928           if (p_st != NULL)
1929             *p_st = incr;
1930 
1931           if (incr == 1) {
1932             *p_lb = start + init;
1933             *p_ub = start + limit;
1934           } else {
1935             *p_lb = start + init * incr;
1936             *p_ub = start + limit * incr;
1937           }
1938 
1939           if (pr->ordered) {
1940             pr->u.p.ordered_lower = init;
1941             pr->u.p.ordered_upper = limit;
1942 #ifdef KMP_DEBUG
1943             {
1944               const char *buff;
1945               // create format specifiers before the debug output
1946               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1947                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1948                                       traits_t<UT>::spec, traits_t<UT>::spec);
1949               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1950                               pr->u.p.ordered_upper));
1951               __kmp_str_free(&buff);
1952             }
1953 #endif
1954           } // if
1955         } // if
1956       } // case
1957       break;
1958 
1959       case kmp_sch_guided_iterative_chunked: {
1960         T chunkspec = pr->u.p.parm1;
1961         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1962                        "iterative case\n",
1963                        gtid));
1964         trip = pr->u.p.tc;
1965         // Start atomic part of calculations
1966         while (1) {
1967           ST remaining; // signed, because can be < 0
1968           init = sh->u.s.iteration; // shared value
1969           remaining = trip - init;
1970           if (remaining <= 0) { // AC: need to compare with 0 first
1971             // nothing to do, don't try atomic op
1972             status = 0;
1973             break;
1974           }
1975           if ((T)remaining <
1976               pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
            // use dynamic-style schedule
            // atomically increment iterations, get old value
1979             init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1980                                      (ST)chunkspec);
1981             remaining = trip - init;
1982             if (remaining <= 0) {
1983               status = 0; // all iterations got by other threads
1984             } else { // got some iterations to work on
1985               status = 1;
1986               if ((T)remaining > chunkspec) {
1987                 limit = init + chunkspec - 1;
1988               } else {
1989                 last = 1; // the last chunk
1990                 limit = init + remaining - 1;
1991               } // if
1992             } // if
1993             break;
1994           } // if
1995           limit = init + (UT)(remaining *
1996                               *(double *)&pr->u.p.parm3); // divide by K*nproc
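          // e.g. (illustrative, assuming parm3 was set to about 1/(K*nproc)
          // at init time, not shown here): with nproc == 4 and K == 2,
          // remaining == 800 yields a claim of about 100 iterations; the CAS
          // below publishes the new shared iteration index and --limit makes
          // the upper bound inclusive.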
1997           if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1998                                    (ST)init, (ST)limit)) {
1999             // CAS was successful, chunk obtained
2000             status = 1;
2001             --limit;
2002             break;
2003           } // if
2004         } // while
2005         if (status != 0) {
2006           start = pr->u.p.lb;
2007           incr = pr->u.p.st;
2008           if (p_st != NULL)
2009             *p_st = incr;
2010           *p_lb = start + init * incr;
2011           *p_ub = start + limit * incr;
2012           if (pr->ordered) {
2013             pr->u.p.ordered_lower = init;
2014             pr->u.p.ordered_upper = limit;
2015 #ifdef KMP_DEBUG
2016             {
2017               const char *buff;
2018               // create format specifiers before the debug output
2019               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2020                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2021                                       traits_t<UT>::spec, traits_t<UT>::spec);
2022               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2023                               pr->u.p.ordered_upper));
2024               __kmp_str_free(&buff);
2025             }
2026 #endif
2027           } // if
2028         } else {
2029           *p_lb = 0;
2030           *p_ub = 0;
2031           if (p_st != NULL)
2032             *p_st = 0;
2033         } // if
2034       } // case
2035       break;
2036 
2037       case kmp_sch_guided_simd: {
        // same as the iterative case, but the current chunk is adjusted to be
        // a multiple of the given chunk
2040         T chunk = pr->u.p.parm1;
2041         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2042                        gtid));
2043         trip = pr->u.p.tc;
2044         // Start atomic part of calculations
2045         while (1) {
2046           ST remaining; // signed, because can be < 0
2047           init = sh->u.s.iteration; // shared value
2048           remaining = trip - init;
2049           if (remaining <= 0) { // AC: need to compare with 0 first
2050             status = 0; // nothing to do, don't try atomic op
2051             break;
2052           }
2053           KMP_DEBUG_ASSERT(init % chunk == 0);
2054           // compare with K*nproc*(chunk+1), K=2 by default
2055           if ((T)remaining < pr->u.p.parm2) {
            // use dynamic-style schedule
            // atomically increment iterations, get old value
2058             init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2059                                      (ST)chunk);
2060             remaining = trip - init;
2061             if (remaining <= 0) {
2062               status = 0; // all iterations got by other threads
2063             } else {
2064               // got some iterations to work on
2065               status = 1;
2066               if ((T)remaining > chunk) {
2067                 limit = init + chunk - 1;
2068               } else {
2069                 last = 1; // the last chunk
2070                 limit = init + remaining - 1;
2071               } // if
2072             } // if
2073             break;
2074           } // if
2075           // divide by K*nproc
2076           UT span = remaining * (*(double *)&pr->u.p.parm3);
2077           UT rem = span % chunk;
2078           if (rem) // adjust so that span%chunk == 0
2079             span += chunk - rem;
2080           limit = init + span;
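          // e.g. (illustrative): with chunk == 8 and a raw span of 21, rem ==
          // 5, so span becomes 24 and the claimed block stays a whole number
          // of simd chunks.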
2081           if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2082                                    (ST)init, (ST)limit)) {
2083             // CAS was successful, chunk obtained
2084             status = 1;
2085             --limit;
2086             break;
2087           } // if
2088         } // while
2089         if (status != 0) {
2090           start = pr->u.p.lb;
2091           incr = pr->u.p.st;
2092           if (p_st != NULL)
2093             *p_st = incr;
2094           *p_lb = start + init * incr;
2095           *p_ub = start + limit * incr;
2096           if (pr->ordered) {
2097             pr->u.p.ordered_lower = init;
2098             pr->u.p.ordered_upper = limit;
2099 #ifdef KMP_DEBUG
2100             {
2101               const char *buff;
2102               // create format specifiers before the debug output
2103               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2104                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2105                                       traits_t<UT>::spec, traits_t<UT>::spec);
2106               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2107                               pr->u.p.ordered_upper));
2108               __kmp_str_free(&buff);
2109             }
2110 #endif
2111           } // if
2112         } else {
2113           *p_lb = 0;
2114           *p_ub = 0;
2115           if (p_st != NULL)
2116             *p_st = 0;
2117         } // if
2118       } // case
2119       break;
2120 
2121       case kmp_sch_guided_analytical_chunked: {
2122         T chunkspec = pr->u.p.parm1;
2123         UT chunkIdx;
2124 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2125         /* for storing original FPCW value for Windows* OS on
2126            IA-32 architecture 8-byte version */
2127         unsigned int oldFpcw;
2128         unsigned int fpcwSet = 0;
2129 #endif
2130         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2131                        "analytical case\n",
2132                        gtid));
2133 
2134         trip = pr->u.p.tc;
2135 
2136         KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2137         KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2138                          trip);
2139 
2140         while (1) { /* this while loop is a safeguard against unexpected zero
2141                        chunk sizes */
2142           chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2143           if (chunkIdx >= (UT)pr->u.p.parm2) {
2144             --trip;
2145             /* use dynamic-style scheduling */
2146             init = chunkIdx * chunkspec + pr->u.p.count;
2147             /* need to verify init > 0 in case of overflow in the above
2148              * calculation */
2149             if ((status = (init > 0 && init <= trip)) != 0) {
2150               limit = init + chunkspec - 1;
2151 
2152               if ((last = (limit >= trip)) != 0)
2153                 limit = trip;
2154             }
2155             break;
2156           } else {
2157 /* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which can cause init != 0 for chunkIdx == 0. */
2162 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2163             /* If we haven't already done so, save original FPCW and set
2164                precision to 64-bit, as Windows* OS on IA-32 architecture
2165                defaults to 53-bit */
2166             if (!fpcwSet) {
2167               oldFpcw = _control87(0, 0);
2168               _control87(_PC_64, _MCW_PC);
2169               fpcwSet = 0x30000;
2170             }
2171 #endif
2172             if (chunkIdx) {
2173               init = __kmp_dispatch_guided_remaining<T>(
2174                   trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2175               KMP_DEBUG_ASSERT(init);
2176               init = trip - init;
2177             } else
2178               init = 0;
2179             limit = trip - __kmp_dispatch_guided_remaining<T>(
2180                                trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
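            // __kmp_dispatch_guided_remaining(trip, base, k) appears to give
            // the number of iterations still remaining after k chunks of the
            // guided schedule, so [init, limit) covers exactly chunk number
            // chunkIdx; --limit below converts to an inclusive upper bound.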
2181             KMP_ASSERT(init <= limit);
2182             if (init < limit) {
2183               KMP_DEBUG_ASSERT(limit <= trip);
2184               --limit;
2185               status = 1;
2186               break;
2187             } // if
2188           } // if
2189         } // while (1)
2190 #if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* restore FPCW if necessary.
           AC: check the fpcwSet flag first because oldFpcw can be
           uninitialized here */
2194         if (fpcwSet && (oldFpcw & fpcwSet))
2195           _control87(oldFpcw, _MCW_PC);
2196 #endif
2197         if (status != 0) {
2198           start = pr->u.p.lb;
2199           incr = pr->u.p.st;
2200           if (p_st != NULL)
2201             *p_st = incr;
2202           *p_lb = start + init * incr;
2203           *p_ub = start + limit * incr;
2204           if (pr->ordered) {
2205             pr->u.p.ordered_lower = init;
2206             pr->u.p.ordered_upper = limit;
2207 #ifdef KMP_DEBUG
2208             {
2209               const char *buff;
2210               // create format specifiers before the debug output
2211               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2212                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2213                                       traits_t<UT>::spec, traits_t<UT>::spec);
2214               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2215                               pr->u.p.ordered_upper));
2216               __kmp_str_free(&buff);
2217             }
2218 #endif
2219           }
2220         } else {
2221           *p_lb = 0;
2222           *p_ub = 0;
2223           if (p_st != NULL)
2224             *p_st = 0;
2225         }
2226       } // case
2227       break;
2228 
2229       case kmp_sch_trapezoidal: {
2230         UT index;
2231         T parm2 = pr->u.p.parm2;
2232         T parm3 = pr->u.p.parm3;
2233         T parm4 = pr->u.p.parm4;
2234         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2235                        gtid));
2236 
2237         index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2238 
2239         init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2240         trip = pr->u.p.tc - 1;
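        // Chunk sizes decrease linearly: chunk j has size (parm2 - j * parm4),
        // so the closed forms for init (above) and limit (below) are prefix
        // sums of those sizes. parm3 appears to hold the total number of
        // chunks (hence the 'index < parm3' status check); parm2 and parm4
        // are set up at init time (not shown here).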
2241 
2242         if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2243           *p_lb = 0;
2244           *p_ub = 0;
2245           if (p_st != NULL)
2246             *p_st = 0;
2247         } else {
2248           start = pr->u.p.lb;
2249           limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2250           incr = pr->u.p.st;
2251 
2252           if ((last = (limit >= trip)) != 0)
2253             limit = trip;
2254 
2255           if (p_st != NULL)
2256             *p_st = incr;
2257 
2258           if (incr == 1) {
2259             *p_lb = start + init;
2260             *p_ub = start + limit;
2261           } else {
2262             *p_lb = start + init * incr;
2263             *p_ub = start + limit * incr;
2264           }
2265 
2266           if (pr->ordered) {
2267             pr->u.p.ordered_lower = init;
2268             pr->u.p.ordered_upper = limit;
2269 #ifdef KMP_DEBUG
2270             {
2271               const char *buff;
2272               // create format specifiers before the debug output
2273               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2274                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2275                                       traits_t<UT>::spec, traits_t<UT>::spec);
2276               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2277                               pr->u.p.ordered_upper));
2278               __kmp_str_free(&buff);
2279             }
2280 #endif
2281           } // if
2282         } // if
2283       } // case
2284       break;
2285       default: {
2286         status = 0; // to avoid complaints on uninitialized variable use
2287         __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2288                     KMP_HNT(GetNewerLibrary), // Hint
2289                     __kmp_msg_null // Variadic argument list terminator
2290                     );
2291       } break;
2292       } // switch
2293     } // if tc == 0;
2294 
2295     if (status == 0) {
2296       UT num_done;
2297 
2298       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2299 #ifdef KMP_DEBUG
2300       {
2301         const char *buff;
2302         // create format specifiers before the debug output
2303         buff = __kmp_str_format(
2304             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2305             traits_t<UT>::spec);
2306         KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2307         __kmp_str_free(&buff);
2308       }
2309 #endif
2310 
2311       if ((ST)num_done == th->th.th_team_nproc - 1) {
2312 #if (KMP_STATIC_STEAL_ENABLED)
2313         if (pr->schedule == kmp_sch_static_steal &&
2314             traits_t<T>::type_size > 4) {
2315           int i;
2316           kmp_info_t **other_threads = team->t.t_threads;
2317           // loop complete, safe to destroy locks used for stealing
2318           for (i = 0; i < th->th.th_team_nproc; ++i) {
2319             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2320             KMP_ASSERT(lck != NULL);
2321             __kmp_destroy_lock(lck);
2322             __kmp_free(lck);
2323             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2324           }
2325         }
2326 #endif
2327         /* NOTE: release this buffer to be reused */
2328 
2329         KMP_MB(); /* Flush all pending memory write invalidates.  */
2330 
2331         sh->u.s.num_done = 0;
2332         sh->u.s.iteration = 0;
2333 
2334         /* TODO replace with general release procedure? */
2335         if (pr->ordered) {
2336           sh->u.s.ordered_iteration = 0;
2337         }
2338 
2339         KMP_MB(); /* Flush all pending memory write invalidates.  */
2340 
2341         sh->buffer_index += __kmp_dispatch_num_buffers;
2342         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2343                        gtid, sh->buffer_index));
2344 
2345         KMP_MB(); /* Flush all pending memory write invalidates.  */
2346 
2347       } // if
2348       if (__kmp_env_consistency_check) {
2349         if (pr->pushed_ws != ct_none) {
2350           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2351         }
2352       }
2353 
2354       th->th.th_dispatch->th_deo_fcn = NULL;
2355       th->th.th_dispatch->th_dxo_fcn = NULL;
2356       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2357       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2358     } // if (status == 0)
2359 #if KMP_OS_WINDOWS
2360     else if (last) {
2361       pr->u.p.last_upper = pr->u.p.ub;
2362     }
2363 #endif /* KMP_OS_WINDOWS */
2364     if (p_last != NULL && status != 0)
2365       *p_last = last;
2366   } // if
2367 
2368 #ifdef KMP_DEBUG
2369   {
2370     const char *buff;
2371     // create format specifiers before the debug output
2372     buff = __kmp_str_format(
2373         "__kmp_dispatch_next: T#%%d normal case: "
2374         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2375         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2376     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2377     __kmp_str_free(&buff);
2378   }
2379 #endif
2380 #if INCLUDE_SSC_MARKS
2381   SSC_MARK_DISPATCH_NEXT();
2382 #endif
2383   OMPT_LOOP_END;
2384   return status;
2385 }
2386 
2387 template <typename T>
2388 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2389                                   kmp_int32 *plastiter, T *plower, T *pupper,
2390                                   typename traits_t<T>::signed_t incr) {
2391   typedef typename traits_t<T>::unsigned_t UT;
2392   typedef typename traits_t<T>::signed_t ST;
2393   kmp_uint32 team_id;
2394   kmp_uint32 nteams;
2395   UT trip_count;
2396   kmp_team_t *team;
2397   kmp_info_t *th;
2398 
2399   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2400   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2401 #ifdef KMP_DEBUG
2402   {
2403     const char *buff;
2404     // create format specifiers before the debug output
2405     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2406                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2407                             traits_t<T>::spec, traits_t<T>::spec,
2408                             traits_t<ST>::spec, traits_t<T>::spec);
2409     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2410     __kmp_str_free(&buff);
2411   }
2412 #endif
2413 
2414   if (__kmp_env_consistency_check) {
2415     if (incr == 0) {
2416       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2417                             loc);
2418     }
2419     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2420       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2422       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2423       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2424       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2425       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2426       // Compiler does not check the following illegal loops:
2427       //   for(i=0;i<10;i+=incr) // where incr<0
2428       //   for(i=10;i>0;i-=incr) // where incr<0
2429       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2430     }
2431   }
2432   th = __kmp_threads[gtid];
2433   team = th->th.th_team;
2434 #if OMP_40_ENABLED
2435   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2436   nteams = th->th.th_teams_size.nteams;
2437 #endif
2438   team_id = team->t.t_master_tid;
2439   KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2440 
2441   // compute global trip count
2442   if (incr == 1) {
2443     trip_count = *pupper - *plower + 1;
2444   } else if (incr == -1) {
2445     trip_count = *plower - *pupper + 1;
2446   } else if (incr > 0) {
2447     // upper-lower can exceed the limit of signed type
2448     trip_count = (UT)(*pupper - *plower) / incr + 1;
2449   } else {
2450     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2451   }
2452 
2453   if (trip_count <= nteams) {
2454     KMP_DEBUG_ASSERT(
2455         __kmp_static == kmp_sch_static_greedy ||
2456         __kmp_static ==
2457             kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, others get nothing
2459     if (team_id < trip_count) {
2460       *pupper = *plower = *plower + team_id * incr;
2461     } else {
2462       *plower = *pupper + incr; // zero-trip loop
2463     }
2464     if (plastiter != NULL)
2465       *plastiter = (team_id == trip_count - 1);
2466   } else {
2467     if (__kmp_static == kmp_sch_static_balanced) {
2468       UT chunk = trip_count / nteams;
2469       UT extras = trip_count % nteams;
2470       *plower +=
2471           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2472       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
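      // e.g. (illustrative): trip_count == 10, nteams == 4 gives chunk == 2,
      // extras == 2, so teams 0 and 1 get 3 iterations each and teams 2 and 3
      // get 2 each.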
2473       if (plastiter != NULL)
2474         *plastiter = (team_id == nteams - 1);
2475     } else {
2476       T chunk_inc_count =
2477           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2478       T upper = *pupper;
2479       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2480       // Unknown static scheduling type.
2481       *plower += team_id * chunk_inc_count;
2482       *pupper = *plower + chunk_inc_count - incr;
2483       // Check/correct bounds if needed
2484       if (incr > 0) {
2485         if (*pupper < *plower)
2486           *pupper = traits_t<T>::max_value;
2487         if (plastiter != NULL)
2488           *plastiter = *plower <= upper && *pupper > upper - incr;
2489         if (*pupper > upper)
2490           *pupper = upper; // tracker C73258
2491       } else {
2492         if (*pupper > *plower)
2493           *pupper = traits_t<T>::min_value;
2494         if (plastiter != NULL)
2495           *plastiter = *plower >= upper && *pupper < upper - incr;
2496         if (*pupper < upper)
2497           *pupper = upper; // tracker C73258
2498       }
2499     }
2500   }
2501 }
2502 
2503 //-----------------------------------------------------------------------------
2504 // Dispatch routines
2505 //    Transfer call to template< type T >
2506 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2507 //                         T lb, T ub, ST st, ST chunk )
2508 extern "C" {
2509 
2510 /*!
2511 @ingroup WORK_SHARING
2512 @{
2513 @param loc Source location
2514 @param gtid Global thread id
2515 @param schedule Schedule type
2516 @param lb  Lower bound
2517 @param ub  Upper bound
2518 @param st  Step (or increment if you prefer)
2519 @param chunk The chunk size to block with
2520 
2521 This function prepares the runtime to start a dynamically scheduled for loop,
2522 saving the loop arguments.
2523 These functions are all identical apart from the types of the arguments.
2524 */
2525 
2526 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2527                             enum sched_type schedule, kmp_int32 lb,
2528                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2529   KMP_DEBUG_ASSERT(__kmp_init_serial);
2530   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2531 }
2532 /*!
2533 See @ref __kmpc_dispatch_init_4
2534 */
2535 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2536                              enum sched_type schedule, kmp_uint32 lb,
2537                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2538   KMP_DEBUG_ASSERT(__kmp_init_serial);
2539   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2540 }
2541 
2542 /*!
2543 See @ref __kmpc_dispatch_init_4
2544 */
2545 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2546                             enum sched_type schedule, kmp_int64 lb,
2547                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2548   KMP_DEBUG_ASSERT(__kmp_init_serial);
2549   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2550 }
2551 
2552 /*!
2553 See @ref __kmpc_dispatch_init_4
2554 */
2555 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2556                              enum sched_type schedule, kmp_uint64 lb,
2557                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2558   KMP_DEBUG_ASSERT(__kmp_init_serial);
2559   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2560 }
2561 
2562 /*!
2563 See @ref __kmpc_dispatch_init_4
2564 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite "distribute parallel for" construct. Thus, before
dispatching the regular iterations, we need to calculate the per-team
iteration space.
2568 
2569 These functions are all identical apart from the types of the arguments.
2570 */
2571 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2572                                  enum sched_type schedule, kmp_int32 *p_last,
2573                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2574                                  kmp_int32 chunk) {
2575   KMP_DEBUG_ASSERT(__kmp_init_serial);
2576   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2577   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2578 }
2579 
2580 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2581                                   enum sched_type schedule, kmp_int32 *p_last,
2582                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2583                                   kmp_int32 chunk) {
2584   KMP_DEBUG_ASSERT(__kmp_init_serial);
2585   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2586   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2587 }
2588 
2589 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2590                                  enum sched_type schedule, kmp_int32 *p_last,
2591                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2592                                  kmp_int64 chunk) {
2593   KMP_DEBUG_ASSERT(__kmp_init_serial);
2594   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2595   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2596 }
2597 
2598 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2599                                   enum sched_type schedule, kmp_int32 *p_last,
2600                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2601                                   kmp_int64 chunk) {
2602   KMP_DEBUG_ASSERT(__kmp_init_serial);
2603   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2604   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2605 }
2606 
2607 /*!
2608 @param loc Source code location
2609 @param gtid Global thread id
2610 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2611 otherwise
2612 @param p_lb   Pointer to the lower bound for the next chunk of work
2613 @param p_ub   Pointer to the upper bound for the next chunk of work
2614 @param p_st   Pointer to the stride for the next chunk of work
2615 @return one if there is work to be done, zero otherwise
2616 
2617 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2619 */
2620 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2621                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2622   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2623 }
2624 
2625 /*!
2626 See @ref __kmpc_dispatch_next_4
2627 */
2628 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2629                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2630                             kmp_int32 *p_st) {
2631   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2632 }
2633 
2634 /*!
2635 See @ref __kmpc_dispatch_next_4
2636 */
2637 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2638                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2639   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2640 }
2641 
2642 /*!
2643 See @ref __kmpc_dispatch_next_4
2644 */
2645 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2646                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2647                             kmp_int64 *p_st) {
2648   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2649 }
2650 
2651 /*!
2652 @param loc Source code location
2653 @param gtid Global thread id
2654 
2655 Mark the end of a dynamic loop.
2656 */
2657 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2658   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2659 }
2660 
2661 /*!
2662 See @ref __kmpc_dispatch_fini_4
2663 */
2664 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2665   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2666 }
2667 
2668 /*!
2669 See @ref __kmpc_dispatch_fini_4
2670 */
2671 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2672   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2673 }
2674 
2675 /*!
2676 See @ref __kmpc_dispatch_fini_4
2677 */
2678 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2679   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2680 }
2681 /*! @} */
2682 
2683 //-----------------------------------------------------------------------------
2684 // Non-template routines from kmp_dispatch.cpp used in other sources
2685 
2686 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2687   return value == checker;
2688 }
2689 
2690 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2691   return value != checker;
2692 }
2693 
2694 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2695   return value < checker;
2696 }
2697 
2698 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2699   return value >= checker;
2700 }
2701 
2702 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2703   return value <= checker;
2704 }
2705 
2706 kmp_uint32
2707 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2708                    kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2709                    void *obj // Higher-level synchronization object, or NULL.
2710                    ) {
2711   // note: we may not belong to a team at this point
2712   volatile kmp_uint32 *spin = spinner;
2713   kmp_uint32 check = checker;
2714   kmp_uint32 spins;
2715   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2716   kmp_uint32 r;
2717 
2718   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2719   KMP_INIT_YIELD(spins);
2720   // main wait spin loop
2721   while (!f(r = TCR_4(*spin), check)) {
2722     KMP_FSYNC_SPIN_PREPARE(obj);
2723     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2724        split. It causes problems with infinite recursion because of exit lock */
2725     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2726         __kmp_abort_thread(); */
2727 
2728     /* if we have waited a bit, or are oversubscribed, yield */
2729     /* pause is in the following code */
2730     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2731     KMP_YIELD_SPIN(spins);
2732   }
2733   KMP_FSYNC_SPIN_ACQUIRED(obj);
2734   return r;
2735 }
2736 
2737 void __kmp_wait_yield_4_ptr(
2738     void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2739     void *obj // Higher-level synchronization object, or NULL.
2740     ) {
2741   // note: we may not belong to a team at this point
2742   void *spin = spinner;
2743   kmp_uint32 check = checker;
2744   kmp_uint32 spins;
2745   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2746 
2747   KMP_FSYNC_SPIN_INIT(obj, spin);
2748   KMP_INIT_YIELD(spins);
2749   // main wait spin loop
2750   while (!f(spin, check)) {
2751     KMP_FSYNC_SPIN_PREPARE(obj);
2752     /* if we have waited a bit, or are oversubscribed, yield */
2753     /* pause is in the following code */
2754     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2755     KMP_YIELD_SPIN(spins);
2756   }
2757   KMP_FSYNC_SPIN_ACQUIRED(obj);
2758 }
2759 
2760 } // extern "C"
2761 
2762 #ifdef KMP_GOMP_COMPAT
2763 
2764 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2765                                enum sched_type schedule, kmp_int32 lb,
2766                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2767                                int push_ws) {
2768   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2769                                  push_ws);
2770 }
2771 
2772 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2773                                 enum sched_type schedule, kmp_uint32 lb,
2774                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2775                                 int push_ws) {
2776   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2777                                   push_ws);
2778 }
2779 
2780 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2781                                enum sched_type schedule, kmp_int64 lb,
2782                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2783                                int push_ws) {
2784   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2785                                  push_ws);
2786 }
2787 
2788 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2789                                 enum sched_type schedule, kmp_uint64 lb,
2790                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2791                                 int push_ws) {
2792   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2793                                   push_ws);
2794 }
2795 
2796 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2797   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2798 }
2799 
2800 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2801   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2802 }
2803 
2804 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2805   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2806 }
2807 
2808 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2809   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2810 }
2811 
2812 #endif /* KMP_GOMP_COMPAT */
2813 
2814 /* ------------------------------------------------------------------------ */
2815