1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /* Dynamic scheduling initialization and dispatch.
17  *
 * NOTE: __kmp_nth is a constant inside any dispatch loop; however, it may
 *       change values between parallel regions.  __kmp_max_nth is the
 *       largest value __kmp_nth may take, 1 is the smallest.
21  */
22 
23 // Need to raise Win version from XP to Vista here for support of
24 // InterlockedExchange64
25 #if defined(_WIN32_WINNT) && defined(_M_IX86)
26 #undef _WIN32_WINNT
27 #define _WIN32_WINNT 0x0502
28 #endif
29 
30 #include "kmp.h"
31 #include "kmp_error.h"
32 #include "kmp_i18n.h"
33 #include "kmp_itt.h"
34 #include "kmp_stats.h"
35 #include "kmp_str.h"
36 #if KMP_OS_WINDOWS && KMP_ARCH_X86
37 #include <float.h>
38 #endif
39 
40 #if OMPT_SUPPORT
41 #include "ompt-internal.h"
42 #include "ompt-specific.h"
43 #endif
44 
45 /* ------------------------------------------------------------------------ */
46 
47 #if KMP_STATIC_STEAL_ENABLED
48 
49 // replaces dispatch_private_info{32,64} structures and
50 // dispatch_private_info{32,64}_t types
51 template <typename T> struct dispatch_private_infoXX_template {
52   typedef typename traits_t<T>::unsigned_t UT;
53   typedef typename traits_t<T>::signed_t ST;
54   UT count; // unsigned
55   T ub;
56   /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
57   T lb;
58   ST st; // signed
59   UT tc; // unsigned
60   T static_steal_counter; // for static_steal only; maybe better to put after ub
61 
62   /* parm[1-4] are used in different ways by different scheduling algorithms */
63 
64   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
65   //    a) parm3 is properly aligned and
66   //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same line (not measured though).
69 
70   struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
71     T parm1;
72     T parm2;
73     T parm3;
74     T parm4;
75   };
76 
77   UT ordered_lower; // unsigned
78   UT ordered_upper; // unsigned
79 #if KMP_OS_WINDOWS
80   T last_upper;
81 #endif /* KMP_OS_WINDOWS */
82 };
83 
84 #else /* KMP_STATIC_STEAL_ENABLED */
85 
86 // replaces dispatch_private_info{32,64} structures and
87 // dispatch_private_info{32,64}_t types
88 template <typename T> struct dispatch_private_infoXX_template {
89   typedef typename traits_t<T>::unsigned_t UT;
90   typedef typename traits_t<T>::signed_t ST;
91   T lb;
92   T ub;
93   ST st; // signed
94   UT tc; // unsigned
95 
96   T parm1;
97   T parm2;
98   T parm3;
99   T parm4;
100 
101   UT count; // unsigned
102 
103   UT ordered_lower; // unsigned
104   UT ordered_upper; // unsigned
105 #if KMP_OS_WINDOWS
106   T last_upper;
107 #endif /* KMP_OS_WINDOWS */
108 };
109 
110 #endif /* KMP_STATIC_STEAL_ENABLED */
111 
112 // replaces dispatch_private_info structure and dispatch_private_info_t type
113 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise the size of the structure is not
  // correct in our compiler
116   union KMP_ALIGN_CACHE private_info_tmpl {
117     dispatch_private_infoXX_template<T> p;
118     dispatch_private_info64_t p64;
119   } u;
120   enum sched_type schedule; /* scheduling algorithm */
121   kmp_uint32 ordered; /* ordered clause specified */
122   kmp_uint32 ordered_bumped;
123   // To retain the structure size after making ordered_iteration scalar
124   kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
125   dispatch_private_info *next; /* stack of buffers for nest of serial regions */
126   kmp_uint32 nomerge; /* don't merge iters if serialized */
127   kmp_uint32 type_size;
128   enum cons_type pushed_ws;
129 };
130 
131 // replaces dispatch_shared_info{32,64} structures and
132 // dispatch_shared_info{32,64}_t types
133 template <typename UT> struct dispatch_shared_infoXX_template {
134   /* chunk index under dynamic, number of idle threads under static-steal;
135      iteration index otherwise */
136   volatile UT iteration;
137   volatile UT num_done;
138   volatile UT ordered_iteration;
  // to retain the structure size after making ordered_iteration scalar
140   UT ordered_dummy[KMP_MAX_ORDERED - 3];
141 };
142 
143 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
144 template <typename UT> struct dispatch_shared_info_template {
  // we need a union here to keep the structure size
146   union shared_info_tmpl {
147     dispatch_shared_infoXX_template<UT> s;
148     dispatch_shared_info64_t s64;
149   } u;
150   volatile kmp_uint32 buffer_index;
151 #if OMP_45_ENABLED
152   volatile kmp_int32 doacross_buf_idx; // teamwise index
153   kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
154   kmp_int32 doacross_num_done; // count finished threads
155 #endif
156 #if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
158   // machines (> 48 cores). Performance analysis showed that a cache thrash
159   // was occurring and this padding helps alleviate the problem.
160   char padding[64];
161 #endif
162 };
163 
164 /* ------------------------------------------------------------------------ */
165 
166 #undef USE_TEST_LOCKS
167 
168 // test_then_add template (general template should NOT be used)
169 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
170 
171 template <>
172 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
173                                                  kmp_int32 d) {
174   kmp_int32 r;
175   r = KMP_TEST_THEN_ADD32(p, d);
176   return r;
177 }
178 
179 template <>
180 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
181                                                  kmp_int64 d) {
182   kmp_int64 r;
183   r = KMP_TEST_THEN_ADD64(p, d);
184   return r;
185 }
186 
187 // test_then_inc_acq template (general template should NOT be used)
188 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
189 
190 template <>
191 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
192   kmp_int32 r;
193   r = KMP_TEST_THEN_INC_ACQ32(p);
194   return r;
195 }
196 
197 template <>
198 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
199   kmp_int64 r;
200   r = KMP_TEST_THEN_INC_ACQ64(p);
201   return r;
202 }
203 
204 // test_then_inc template (general template should NOT be used)
205 template <typename T> static __forceinline T test_then_inc(volatile T *p);
206 
207 template <>
208 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
209   kmp_int32 r;
210   r = KMP_TEST_THEN_INC32(p);
211   return r;
212 }
213 
214 template <>
215 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
216   kmp_int64 r;
217   r = KMP_TEST_THEN_INC64(p);
218   return r;
219 }
220 
221 // compare_and_swap template (general template should NOT be used)
222 template <typename T>
223 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
224 
225 template <>
226 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
227                                                     kmp_int32 c, kmp_int32 s) {
228   return KMP_COMPARE_AND_STORE_REL32(p, c, s);
229 }
230 
231 template <>
232 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
233                                                     kmp_int64 c, kmp_int64 s) {
234   return KMP_COMPARE_AND_STORE_REL64(p, c, s);
235 }
236 
237 /* Spin wait loop that first does pause, then yield.
238     Waits until function returns non-zero when called with *spinner and check.
239     Does NOT put threads to sleep.
240 #if USE_ITT_BUILD
241     Arguments:
        obj -- the higher-level synchronization object to report to ittnotify.
        It is used to report locks consistently. For example, if the lock is
        acquired immediately, its address is reported to ittnotify via
        KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
        immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
        should report the same address, not the address of the low-level
        spinner.
248 #endif // USE_ITT_BUILD
249 */
250 template <typename UT>
251 // ToDo: make inline function (move to header file for icl)
252 static UT // unsigned 4- or 8-byte type
253     __kmp_wait_yield(
254         volatile UT *spinner, UT checker,
255         kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
256             void *obj) // Higher-level synchronization object, or NULL.
257         ) {
258   // note: we may not belong to a team at this point
259   volatile UT *spin = spinner;
260   UT check = checker;
261   kmp_uint32 spins;
262   kmp_uint32 (*f)(UT, UT) = pred;
263   UT r;
264 
265   KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
266   KMP_INIT_YIELD(spins);
267   // main wait spin loop
268   while (!f(r = *spin, check)) {
269     KMP_FSYNC_SPIN_PREPARE(obj);
270     /* GEH - remove this since it was accidentally introduced when kmp_wait was
271        split. It causes problems with infinite recursion because of exit lock */
272     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
273         __kmp_abort_thread(); */
274 
    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. The pause is in the following code.
277     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
278     KMP_YIELD_SPIN(spins);
279   }
280   KMP_FSYNC_SPIN_ACQUIRED(obj);
281   return r;
282 }
283 
284 template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
285   return value == checker;
286 }
287 
288 template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
289   return value != checker;
290 }
291 
292 template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
293   return value < checker;
294 }
295 
296 template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
297   return value >= checker;
298 }
299 
300 template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
301   return value <= checker;
302 }
303 
304 /* ------------------------------------------------------------------------ */
305 
306 static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
307                                      ident_t *loc_ref) {
308   kmp_info_t *th;
309 
310   KMP_DEBUG_ASSERT(gtid_ref);
311 
312   if (__kmp_env_consistency_check) {
313     th = __kmp_threads[*gtid_ref];
314     if (th->th.th_root->r.r_active &&
315         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
316 #if KMP_USE_DYNAMIC_LOCK
317       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
318 #else
319       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
320 #endif
321     }
322   }
323 }
324 
325 template <typename UT>
326 static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
327   typedef typename traits_t<UT>::signed_t ST;
328   dispatch_private_info_template<UT> *pr;
329 
330   int gtid = *gtid_ref;
331   //    int  cid = *cid_ref;
332   kmp_info_t *th = __kmp_threads[gtid];
333   KMP_DEBUG_ASSERT(th->th.th_dispatch);
334 
335   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
336   if (__kmp_env_consistency_check) {
337     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
338         th->th.th_dispatch->th_dispatch_pr_current);
339     if (pr->pushed_ws != ct_none) {
340 #if KMP_USE_DYNAMIC_LOCK
341       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
342 #else
343       __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
344 #endif
345     }
346   }
347 
348   if (!th->th.th_team->t.t_serialized) {
349     dispatch_shared_info_template<UT> *sh =
350         reinterpret_cast<dispatch_shared_info_template<UT> *>(
351             th->th.th_dispatch->th_dispatch_sh_current);
352     UT lower;
353 
354     if (!__kmp_env_consistency_check) {
355       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
356           th->th.th_dispatch->th_dispatch_pr_current);
357     }
358     lower = pr->u.p.ordered_lower;
359 
360 #if !defined(KMP_GOMP_COMPAT)
361     if (__kmp_env_consistency_check) {
362       if (pr->ordered_bumped) {
363         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
364         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
365                                ct_ordered_in_pdo, loc_ref,
366                                &p->stack_data[p->w_top]);
367       }
368     }
369 #endif /* !defined(KMP_GOMP_COMPAT) */
370 
371     KMP_MB();
372 #ifdef KMP_DEBUG
373     {
374       const char *buff;
375       // create format specifiers before the debug output
376       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
377                               "ordered_iter:%%%s lower:%%%s\n",
378                               traits_t<UT>::spec, traits_t<UT>::spec);
379       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
380       __kmp_str_free(&buff);
381     }
382 #endif
383 
384     __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
385                          __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
386     KMP_MB(); /* is this necessary? */
387 #ifdef KMP_DEBUG
388     {
389       const char *buff;
390       // create format specifiers before the debug output
391       buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
392                               "ordered_iter:%%%s lower:%%%s\n",
393                               traits_t<UT>::spec, traits_t<UT>::spec);
394       KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
395       __kmp_str_free(&buff);
396     }
397 #endif
398   }
399   KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
400 }
401 
402 static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
403                                      ident_t *loc_ref) {
404   kmp_info_t *th;
405 
406   if (__kmp_env_consistency_check) {
407     th = __kmp_threads[*gtid_ref];
408     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
409       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
410     }
411   }
412 }
413 
414 template <typename UT>
415 static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
416   typedef typename traits_t<UT>::signed_t ST;
417   dispatch_private_info_template<UT> *pr;
418 
419   int gtid = *gtid_ref;
420   //    int  cid = *cid_ref;
421   kmp_info_t *th = __kmp_threads[gtid];
422   KMP_DEBUG_ASSERT(th->th.th_dispatch);
423 
424   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
425   if (__kmp_env_consistency_check) {
426     pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
427         th->th.th_dispatch->th_dispatch_pr_current);
428     if (pr->pushed_ws != ct_none) {
429       __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
430     }
431   }
432 
433   if (!th->th.th_team->t.t_serialized) {
434     dispatch_shared_info_template<UT> *sh =
435         reinterpret_cast<dispatch_shared_info_template<UT> *>(
436             th->th.th_dispatch->th_dispatch_sh_current);
437 
438     if (!__kmp_env_consistency_check) {
439       pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
440           th->th.th_dispatch->th_dispatch_pr_current);
441     }
442 
443     KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
444 #if !defined(KMP_GOMP_COMPAT)
445     if (__kmp_env_consistency_check) {
446       if (pr->ordered_bumped != 0) {
447         struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
448         /* How to test it? - OM */
449         __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
450                                ct_ordered_in_pdo, loc_ref,
451                                &p->stack_data[p->w_top]);
452       }
453     }
454 #endif /* !defined(KMP_GOMP_COMPAT) */
455 
456     KMP_MB(); /* Flush all pending memory write invalidates.  */
457 
458     pr->ordered_bumped += 1;
459 
460     KD_TRACE(1000,
461              ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
462               gtid, pr->ordered_bumped));
463 
464     KMP_MB(); /* Flush all pending memory write invalidates.  */
465 
466     /* TODO use general release procedure? */
467     test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
468 
469     KMP_MB(); /* Flush all pending memory write invalidates.  */
470   }
471   KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
472 }
473 
// Computes and returns x to the power of y, where y must be a non-negative
// integer.
475 template <typename UT>
476 static __forceinline long double __kmp_pow(long double x, UT y) {
477   long double s = 1.0L;
478 
479   KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
480   // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
481   while (y) {
482     if (y & 1)
483       s *= x;
484     x *= x;
485     y >>= 1;
486   }
487   return s;
488 }
489 
490 /* Computes and returns the number of unassigned iterations after idx chunks
491    have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken here:
   if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails). */
495 template <typename T>
496 static __inline typename traits_t<T>::unsigned_t
497 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
498                                 typename traits_t<T>::unsigned_t idx) {
499   /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
500      ICL 8.1, long double arithmetic may not really have long double precision,
501      even with /Qlong_double.  Currently, we workaround that in the caller code,
502      by manipulating the FPCW for Windows* OS on IA-32 architecture.  The lack
503      of precision is not expected to be a correctness issue, though. */
504   typedef typename traits_t<T>::unsigned_t UT;
505 
506   long double x = tc * __kmp_pow<UT>(base, idx);
507   UT r = (UT)x;
508   if (x == r)
509     return r;
510   return r + 1;
511 }
512 
// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution is flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.
// trip / nproc.
519 static int guided_int_param = 2;
520 static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
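
// Illustrative example (hypothetical values): with nproc = 4 and chunk = 7,
// __kmp_dispatch_init() below sets parm2 = 2 * 4 * (7 + 1) = 64 and
// parm3 = 0.5 / 4 = 0.125, so once fewer than 64 iterations remain the
// guided-iterative schedule falls back to plain dynamic chunking.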
521 
522 // UT - unsigned flavor of T, ST - signed flavor of T,
523 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
524 template <typename T>
525 static void
526 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
527                     T ub, typename traits_t<T>::signed_t st,
528                     typename traits_t<T>::signed_t chunk, int push_ws) {
529   typedef typename traits_t<T>::unsigned_t UT;
530   typedef typename traits_t<T>::signed_t ST;
531   typedef typename traits_t<T>::floating_t DBL;
532 
533   int active;
534   T tc;
535   kmp_info_t *th;
536   kmp_team_t *team;
537   kmp_uint32 my_buffer_index;
538   dispatch_private_info_template<T> *pr;
539   dispatch_shared_info_template<UT> volatile *sh;
540 
541   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
542                    sizeof(dispatch_private_info));
543   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
544                    sizeof(dispatch_shared_info));
545 
546   if (!TCR_4(__kmp_init_parallel))
547     __kmp_parallel_initialize();
548 
549 #if INCLUDE_SSC_MARKS
550   SSC_MARK_DISPATCH_INIT();
551 #endif
552 #ifdef KMP_DEBUG
553   {
554     const char *buff;
555     // create format specifiers before the debug output
556     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
557                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
558                             traits_t<ST>::spec, traits_t<T>::spec,
559                             traits_t<T>::spec, traits_t<ST>::spec);
560     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
561     __kmp_str_free(&buff);
562   }
563 #endif
564   /* setup data */
565   th = __kmp_threads[gtid];
566   team = th->th.th_team;
567   active = !team->t.t_serialized;
568   th->th.th_ident = loc;
569 
570 #if USE_ITT_BUILD
571   kmp_uint64 cur_chunk = chunk;
572   int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
573                                     __kmp_forkjoin_frames_mode == 3 &&
574                                     KMP_MASTER_GTID(gtid) &&
575 #if OMP_40_ENABLED
576                                     th->th.th_teams_microtask == NULL &&
577 #endif
578                                     team->t.t_active_level == 1;
579 #endif
580   if (!active) {
581     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
582         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
583   } else {
584     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
585                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
586 
587     my_buffer_index = th->th.th_dispatch->th_disp_index++;
588 
    /* What happens when the number of threads changes? Do we need to resize
       the buffer? */
590     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
591         &th->th.th_dispatch
592              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
593     sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
594         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
595   }
596 
597 #if (KMP_STATIC_STEAL_ENABLED)
598   if (SCHEDULE_HAS_NONMONOTONIC(schedule))
599     // AC: we now have only one implementation of stealing, so use it
600     schedule = kmp_sch_static_steal;
601   else
602 #endif
603     schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
604 
605   /* Pick up the nomerge/ordered bits from the scheduling type */
606   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
607     pr->nomerge = TRUE;
608     schedule =
609         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
610   } else {
611     pr->nomerge = FALSE;
612   }
613   pr->type_size = traits_t<T>::type_size; // remember the size of variables
614   if (kmp_ord_lower & schedule) {
615     pr->ordered = TRUE;
616     schedule =
617         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
618   } else {
619     pr->ordered = FALSE;
620   }
621 
622   if (schedule == kmp_sch_static) {
623     schedule = __kmp_static;
624   } else {
625     if (schedule == kmp_sch_runtime) {
626       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
627       // not specified)
628       schedule = team->t.t_sched.r_sched_type;
629       // Detail the schedule if needed (global controls are differentiated
630       // appropriately)
631       if (schedule == kmp_sch_guided_chunked) {
632         schedule = __kmp_guided;
633       } else if (schedule == kmp_sch_static) {
634         schedule = __kmp_static;
635       }
636       // Use the chunk size specified by OMP_SCHEDULE (or default if not
637       // specified)
638       chunk = team->t.t_sched.chunk;
639 #if USE_ITT_BUILD
640       cur_chunk = chunk;
641 #endif
642 #ifdef KMP_DEBUG
643       {
644         const char *buff;
645         // create format specifiers before the debug output
646         buff = __kmp_str_format(
647             "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
648             traits_t<ST>::spec);
649         KD_TRACE(10, (buff, gtid, schedule, chunk));
650         __kmp_str_free(&buff);
651       }
652 #endif
653     } else {
654       if (schedule == kmp_sch_guided_chunked) {
655         schedule = __kmp_guided;
656       }
657       if (chunk <= 0) {
658         chunk = KMP_DEFAULT_CHUNK;
659       }
660     }
661 
662     if (schedule == kmp_sch_auto) {
663       // mapping and differentiation: in the __kmp_do_serial_initialize()
664       schedule = __kmp_auto;
665 #ifdef KMP_DEBUG
666       {
667         const char *buff;
668         // create format specifiers before the debug output
669         buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
670                                 "schedule:%%d chunk:%%%s\n",
671                                 traits_t<ST>::spec);
672         KD_TRACE(10, (buff, gtid, schedule, chunk));
673         __kmp_str_free(&buff);
674       }
675 #endif
676     }
677 
    /* guided analytical is not safe for too many threads */
679     if (schedule == kmp_sch_guided_analytical_chunked &&
680         th->th.th_team_nproc > 1 << 20) {
681       schedule = kmp_sch_guided_iterative_chunked;
682       KMP_WARNING(DispatchManyThreads);
683     }
684     if (schedule == kmp_sch_runtime_simd) {
685       // compiler provides simd_width in the chunk parameter
686       schedule = team->t.t_sched.r_sched_type;
687       // Detail the schedule if needed (global controls are differentiated
688       // appropriately)
689       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
690           schedule == __kmp_static) {
691         schedule = kmp_sch_static_balanced_chunked;
692       } else {
693         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
694           schedule = kmp_sch_guided_simd;
695         }
696         chunk = team->t.t_sched.chunk * chunk;
697       }
698 #if USE_ITT_BUILD
699       cur_chunk = chunk;
700 #endif
701 #ifdef KMP_DEBUG
702       {
703         const char *buff;
704         // create format specifiers before the debug output
705         buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
706                                 " chunk:%%%s\n",
707                                 traits_t<ST>::spec);
708         KD_TRACE(10, (buff, gtid, schedule, chunk));
709         __kmp_str_free(&buff);
710       }
711 #endif
712     }
713     pr->u.p.parm1 = chunk;
714   }
715   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
716               "unknown scheduling type");
717 
718   pr->u.p.count = 0;
719 
720   if (__kmp_env_consistency_check) {
721     if (st == 0) {
722       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
723                             (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
724     }
725   }
726   // compute trip count
727   if (st == 1) { // most common case
728     if (ub >= lb) {
729       tc = ub - lb + 1;
730     } else { // ub < lb
731       tc = 0; // zero-trip
732     }
733   } else if (st < 0) {
734     if (lb >= ub) {
735       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
736       // where the division needs to be unsigned regardless of the result type
737       tc = (UT)(lb - ub) / (-st) + 1;
738     } else { // lb < ub
739       tc = 0; // zero-trip
740     }
741   } else { // st > 0
742     if (ub >= lb) {
743       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
744       // where the division needs to be unsigned regardless of the result type
745       tc = (UT)(ub - lb) / st + 1;
746     } else { // ub < lb
747       tc = 0; // zero-trip
748     }
749   }
750 
751   // Any half-decent optimizer will remove this test when the blocks are empty
752   // since the macros expand to nothing when statistics are disabled.
753   if (schedule == __kmp_static) {
754     KMP_COUNT_BLOCK(OMP_FOR_static);
755     KMP_COUNT_VALUE(FOR_static_iterations, tc);
756   } else {
757     KMP_COUNT_BLOCK(OMP_FOR_dynamic);
758     KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
759   }
760 
761   pr->u.p.lb = lb;
762   pr->u.p.ub = ub;
763   pr->u.p.st = st;
764   pr->u.p.tc = tc;
765 
766 #if KMP_OS_WINDOWS
767   pr->u.p.last_upper = ub + st;
768 #endif /* KMP_OS_WINDOWS */
769 
  /* NOTE: only the active parallel region(s) have active ordered sections */
771 
772   if (active) {
773     if (pr->ordered == 0) {
774       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
775       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
776     } else {
777       pr->ordered_bumped = 0;
778 
779       pr->u.p.ordered_lower = 1;
780       pr->u.p.ordered_upper = 0;
781 
782       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
783       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
784     }
785   }
786 
787   if (__kmp_env_consistency_check) {
788     enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
789     if (push_ws) {
790       __kmp_push_workshare(gtid, ws, loc);
791       pr->pushed_ws = ws;
792     } else {
793       __kmp_check_workshare(gtid, ws, loc);
794       pr->pushed_ws = ct_none;
795     }
796   }
797 
798   switch (schedule) {
799 #if (KMP_STATIC_STEAL_ENABLED)
800   case kmp_sch_static_steal: {
801     T nproc = th->th.th_team_nproc;
802     T ntc, init;
803 
804     KD_TRACE(100,
805              ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
806 
807     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
808     if (nproc > 1 && ntc >= nproc) {
809       KMP_COUNT_BLOCK(OMP_FOR_static_steal);
810       T id = __kmp_tid_from_gtid(gtid);
811       T small_chunk, extras;
812 
813       small_chunk = ntc / nproc;
814       extras = ntc % nproc;
815 
816       init = id * small_chunk + (id < extras ? id : extras);
817       pr->u.p.count = init;
818       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
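      // Illustrative worked example (hypothetical values): ntc = 10 chunks,
      // nproc = 4 -> small_chunk = 2, extras = 2; threads 0..3 start with
      // chunk index ranges [0,3), [3,6), [6,8), [8,10) respectively (the
      // extra chunks go to the lowest-numbered threads).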
819 
820       pr->u.p.parm2 = lb;
821       // pr->pfields.parm3 = 0; // it's not used in static_steal
822       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
823       pr->u.p.st = st;
824       if (traits_t<T>::type_size > 4) {
825         // AC: TODO: check if 16-byte CAS available and use it to
826         // improve performance (probably wait for explicit request
827         // before spending time on this).
828         // For now use dynamically allocated per-thread lock,
829         // free memory in __kmp_dispatch_next when status==0.
830         KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
831         th->th.th_dispatch->th_steal_lock =
832             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
833         __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
834       }
835       break;
836     } else {
837       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
838                      "kmp_sch_static_balanced\n",
839                      gtid));
840       schedule = kmp_sch_static_balanced;
841       /* too few iterations: fall-through to kmp_sch_static_balanced */
842     } // if
843     /* FALL-THROUGH to static balanced */
844   } // case
845 #endif
846   case kmp_sch_static_balanced: {
847     T nproc = th->th.th_team_nproc;
848     T init, limit;
849 
850     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
851                    gtid));
852 
853     if (nproc > 1) {
854       T id = __kmp_tid_from_gtid(gtid);
855 
856       if (tc < nproc) {
857         if (id < tc) {
858           init = id;
859           limit = id;
860           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
861         } else {
862           pr->u.p.count = 1; /* means no more chunks to execute */
863           pr->u.p.parm1 = FALSE;
864           break;
865         }
866       } else {
867         T small_chunk = tc / nproc;
868         T extras = tc % nproc;
869         init = id * small_chunk + (id < extras ? id : extras);
870         limit = init + small_chunk - (id < extras ? 0 : 1);
871         pr->u.p.parm1 = (id == nproc - 1);
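        // Illustrative worked example (hypothetical values): tc = 10,
        // nproc = 4 -> small_chunk = 2, extras = 2; threads 0..3 get
        // iteration index ranges [0,2], [3,5], [6,7], [8,9], and only
        // thread nproc-1 = 3 reports lastprivate via parm1.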
872       }
873     } else {
874       if (tc > 0) {
875         init = 0;
876         limit = tc - 1;
877         pr->u.p.parm1 = TRUE;
878       } else { // zero trip count
879         pr->u.p.count = 1; /* means no more chunks to execute */
880         pr->u.p.parm1 = FALSE;
881         break;
882       }
883     }
884 #if USE_ITT_BUILD
885     // Calculate chunk for metadata report
886     if (itt_need_metadata_reporting)
887       cur_chunk = limit - init + 1;
888 #endif
889     if (st == 1) {
890       pr->u.p.lb = lb + init;
891       pr->u.p.ub = lb + limit;
892     } else {
      // ub_tmp is the calculated upper bound; "ub" is the user-defined upper
      // bound
894       T ub_tmp = lb + limit * st;
895       pr->u.p.lb = lb + init * st;
896       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
897       // it exactly
898       if (st > 0) {
899         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
900       } else {
901         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
902       }
903     }
904     if (pr->ordered) {
905       pr->u.p.ordered_lower = init;
906       pr->u.p.ordered_upper = limit;
907     }
908     break;
909   } // case
910   case kmp_sch_static_balanced_chunked: {
911     // similar to balanced, but chunk adjusted to multiple of simd width
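    // Illustrative worked example (hypothetical values; assumes chunk is the
    // simd width and a power of two, which the & ~(chunk - 1) rounding below
    // relies on): tc = 1000, nth = 4, chunk = 8 -> ceil(1000 / 4) = 250,
    // rounded up to a multiple of 8 gives parm1 = 256 iterations per thread.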
912     T nth = th->th.th_team_nproc;
913     KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
914                    " -> falling-through to static_greedy\n",
915                    gtid));
916     schedule = kmp_sch_static_greedy;
917     if (nth > 1)
918       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
919     else
920       pr->u.p.parm1 = tc;
921     break;
922   } // case
923   case kmp_sch_guided_iterative_chunked:
924   case kmp_sch_guided_simd: {
925     T nproc = th->th.th_team_nproc;
926     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
927                    " case\n",
928                    gtid));
929 
930     if (nproc > 1) {
931       if ((2L * chunk + 1) * nproc >= tc) {
932         /* chunk size too large, switch to dynamic */
933         schedule = kmp_sch_dynamic_chunked;
934       } else {
        // when the remaining iterations become fewer than parm2, switch to
        // dynamic
936         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
937         *(double *)&pr->u.p.parm3 =
938             guided_flt_param / nproc; // may occupy parm3 and parm4
939       }
940     } else {
941       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
942                      "kmp_sch_static_greedy\n",
943                      gtid));
944       schedule = kmp_sch_static_greedy;
945       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
946       KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
947                      gtid));
948       pr->u.p.parm1 = tc;
949     } // if
950   } // case
951   break;
952   case kmp_sch_guided_analytical_chunked: {
953     T nproc = th->th.th_team_nproc;
954     KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
955                    " case\n",
956                    gtid));
957     if (nproc > 1) {
958       if ((2L * chunk + 1) * nproc >= tc) {
959         /* chunk size too large, switch to dynamic */
960         schedule = kmp_sch_dynamic_chunked;
961       } else {
962         /* commonly used term: (2 nproc - 1)/(2 nproc) */
963         DBL x;
964 
965 #if KMP_OS_WINDOWS && KMP_ARCH_X86
966         /* Linux* OS already has 64-bit computation by default for long double,
967            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
968            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
969            instead of the default 53-bit. Even though long double doesn't work
970            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
971            expected to impact the correctness of the algorithm, but this has not
972            been mathematically proven. */
973         // save original FPCW and set precision to 64-bit, as
974         // Windows* OS on IA-32 architecture defaults to 53-bit
975         unsigned int oldFpcw = _control87(0, 0);
976         _control87(_PC_64, _MCW_PC); // 0,0x30000
977 #endif
978         /* value used for comparison in solver for cross-over point */
979         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
980 
981         /* crossover point--chunk indexes equal to or greater than
982            this point switch to dynamic-style scheduling */
983         UT cross;
984 
985         /* commonly used term: (2 nproc - 1)/(2 nproc) */
986         x = (long double)1.0 - (long double)0.5 / nproc;
987 
988 #ifdef KMP_DEBUG
989         { // test natural alignment
990           struct _test_a {
991             char a;
992             union {
993               char b;
994               DBL d;
995             };
996           } t;
997           ptrdiff_t natural_alignment =
998               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
999           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
1000           // long)natural_alignment );
1001           KMP_DEBUG_ASSERT(
1002               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
1003         }
1004 #endif // KMP_DEBUG
1005 
1006         /* save the term in thread private dispatch structure */
1007         *(DBL *)&pr->u.p.parm3 = x;
1008 
1009         /* solve for the crossover point to the nearest integer i for which C_i
1010            <= chunk */
1011         {
1012           UT left, right, mid;
1013           long double p;
1014 
1015           /* estimate initial upper and lower bound */
1016 
          /* it doesn't matter what value right is as long as it is positive,
             but it affects the performance of the solver */
1019           right = 229;
1020           p = __kmp_pow<UT>(x, right);
1021           if (p > target) {
1022             do {
1023               p *= p;
1024               right <<= 1;
1025             } while (p > target && right < (1 << 27));
1026             /* lower bound is previous (failed) estimate of upper bound */
1027             left = right >> 1;
1028           } else {
1029             left = 0;
1030           }
1031 
1032           /* bisection root-finding method */
1033           while (left + 1 < right) {
1034             mid = (left + right) / 2;
1035             if (__kmp_pow<UT>(x, mid) > target) {
1036               left = mid;
1037             } else {
1038               right = mid;
1039             }
1040           } // while
1041           cross = right;
1042         }
1043         /* assert sanity of computed crossover point */
1044         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
1045                    __kmp_pow<UT>(x, cross) <= target);
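
        // Illustrative worked example (hypothetical values): chunk = 4,
        // nproc = 8, tc = 10000 gives x = 1 - 0.5/8 = 0.9375 and
        // target = (2*4 + 1) * 8 / 10000 = 0.0072; the bisection above finds
        // cross = 77, since 0.9375^76 > 0.0072 >= 0.9375^77.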
1046 
1047         /* save the crossover point in thread private dispatch structure */
1048         pr->u.p.parm2 = cross;
1049 
1050 // C75803
1051 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
1052 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
1053 #else
1054 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1055 #endif
1056         /* dynamic-style scheduling offset */
1057         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
1058                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
1059                         cross * chunk;
1060 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1061         // restore FPCW
1062         _control87(oldFpcw, _MCW_PC);
1063 #endif
1064       } // if
1065     } else {
1066       KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
1067                      "kmp_sch_static_greedy\n",
1068                      gtid));
1069       schedule = kmp_sch_static_greedy;
1070       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1071       pr->u.p.parm1 = tc;
1072     } // if
1073   } // case
1074   break;
1075   case kmp_sch_static_greedy:
1076     KD_TRACE(100,
1077              ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
1078     pr->u.p.parm1 = (th->th.th_team_nproc > 1)
1079                         ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
1080                         : tc;
1081     break;
1082   case kmp_sch_static_chunked:
1083   case kmp_sch_dynamic_chunked:
1084     if (pr->u.p.parm1 <= 0) {
1085       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1086     }
1087     KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
1088                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
1089                    gtid));
1090     break;
1091   case kmp_sch_trapezoidal: {
1092     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1093 
1094     T parm1, parm2, parm3, parm4;
1095     KD_TRACE(100,
1096              ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
1097 
1098     parm1 = chunk;
1099 
1100     /* F : size of the first cycle */
1101     parm2 = (tc / (2 * th->th.th_team_nproc));
1102 
1103     if (parm2 < 1) {
1104       parm2 = 1;
1105     }
1106 
1107     /* L : size of the last cycle.  Make sure the last cycle is not larger
1108        than the first cycle. */
1109     if (parm1 < 1) {
1110       parm1 = 1;
1111     } else if (parm1 > parm2) {
1112       parm1 = parm2;
1113     }
1114 
1115     /* N : number of cycles */
1116     parm3 = (parm2 + parm1);
1117     parm3 = (2 * tc + parm3 - 1) / parm3;
1118 
1119     if (parm3 < 2) {
1120       parm3 = 2;
1121     }
1122 
1123     /* sigma : decreasing incr of the trapezoid */
1124     parm4 = (parm3 - 1);
1125     parm4 = (parm2 - parm1) / parm4;
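
    // Illustrative worked example (hypothetical values): tc = 1000, nproc = 4,
    // chunk = 1 gives parm2 = 125 (first chunk), parm1 = 1 (minimum/last
    // chunk), parm3 = (2*1000 + 126 - 1) / 126 = 16 cycles, and
    // parm4 = 124 / 15 = 8, so chunk sizes decrease 125, 117, 109, ...
    // by 8 each cycle.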
1126 
1127     // pointless check, because parm4 >= 0 always
1128     // if ( parm4 < 0 ) {
1129     //    parm4 = 0;
1130     //}
1131 
1132     pr->u.p.parm1 = parm1;
1133     pr->u.p.parm2 = parm2;
1134     pr->u.p.parm3 = parm3;
1135     pr->u.p.parm4 = parm4;
1136   } // case
1137   break;
1138 
1139   default: {
1140     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1141                 KMP_HNT(GetNewerLibrary), // Hint
1142                 __kmp_msg_null // Variadic argument list terminator
1143                 );
1144   } break;
1145   } // switch
1146   pr->schedule = schedule;
1147   if (active) {
    /* The shared buffer becomes free to use once sh->buffer_index reaches
     * my_buffer_index (see the wait below). */
1150 
1151     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
1152                    "sh->buffer_index:%d\n",
1153                    gtid, my_buffer_index, sh->buffer_index));
1154     __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1155                                  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and
    // my_buffer_index are *always* 32-bit integers.
1158     KMP_MB(); /* is this necessary? */
1159     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1160                    "sh->buffer_index:%d\n",
1161                    gtid, my_buffer_index, sh->buffer_index));
1162 
1163     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1164     th->th.th_dispatch->th_dispatch_sh_current =
1165         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1166 #if USE_ITT_BUILD
1167     if (pr->ordered) {
1168       __kmp_itt_ordered_init(gtid);
1169     }; // if
1170     // Report loop metadata
1171     if (itt_need_metadata_reporting) {
1172       // Only report metadata by master of active team at level 1
1173       kmp_uint64 schedtype = 0;
1174       switch (schedule) {
1175       case kmp_sch_static_chunked:
1176       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1177         break;
1178       case kmp_sch_static_greedy:
1179         cur_chunk = pr->u.p.parm1;
1180         break;
1181       case kmp_sch_dynamic_chunked:
1182         schedtype = 1;
1183         break;
1184       case kmp_sch_guided_iterative_chunked:
1185       case kmp_sch_guided_analytical_chunked:
1186       case kmp_sch_guided_simd:
1187         schedtype = 2;
1188         break;
1189       default:
1190         // Should we put this case under "static"?
1191         // case kmp_sch_static_steal:
1192         schedtype = 3;
1193         break;
1194       }
1195       __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1196     }
1197 #endif /* USE_ITT_BUILD */
1198   }; // if
1199 
1200 #ifdef KMP_DEBUG
1201   {
1202     const char *buff;
1203     // create format specifiers before the debug output
1204     buff = __kmp_str_format(
1205         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1206         "lb:%%%s ub:%%%s"
1207         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1208         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1209         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1210         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1211         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1212         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1213     KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1214                   pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
1215                   pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
1216                   pr->u.p.parm3, pr->u.p.parm4));
1217     __kmp_str_free(&buff);
1218   }
1219 #endif
1220 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, a bad case could still exist, such as
  // using 0 and 1 rather than a program-lifetime increment. So a dedicated
  // variable is required; the 'static_steal_counter' is used.
1226   if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This flag indicates that other threads may steal from this thread
    // from now on.
1230     volatile T *p = &pr->u.p.static_steal_counter;
1231     *p = *p + 1;
1232   }
1233 #endif // ( KMP_STATIC_STEAL_ENABLED )
1234 
1235 #if OMPT_SUPPORT && OMPT_TRACE
1236   if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1237     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1238     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1239     ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1240         team_info->parallel_id, task_info->task_id, team_info->microtask);
1241   }
1242 #endif
1243 }
1244 
1245 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1246  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1247  * every chunk of iterations.  If the ordered section(s) were not executed
1248  * for this iteration (or every iteration in this chunk), we need to set the
1249  * ordered iteration counters so that the next thread can proceed. */
1250 template <typename UT>
1251 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1252   typedef typename traits_t<UT>::signed_t ST;
1253   kmp_info_t *th = __kmp_threads[gtid];
1254 
1255   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1256   if (!th->th.th_team->t.t_serialized) {
1257 
1258     dispatch_private_info_template<UT> *pr =
1259         reinterpret_cast<dispatch_private_info_template<UT> *>(
1260             th->th.th_dispatch->th_dispatch_pr_current);
1261     dispatch_shared_info_template<UT> volatile *sh =
1262         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1263             th->th.th_dispatch->th_dispatch_sh_current);
1264     KMP_DEBUG_ASSERT(pr);
1265     KMP_DEBUG_ASSERT(sh);
1266     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1267                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1268 
1269     if (pr->ordered_bumped) {
1270       KD_TRACE(
1271           1000,
1272           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1273            gtid));
1274       pr->ordered_bumped = 0;
1275     } else {
1276       UT lower = pr->u.p.ordered_lower;
1277 
1278 #ifdef KMP_DEBUG
1279       {
1280         const char *buff;
1281         // create format specifiers before the debug output
1282         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1283                                 "ordered_iteration:%%%s lower:%%%s\n",
1284                                 traits_t<UT>::spec, traits_t<UT>::spec);
1285         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1286         __kmp_str_free(&buff);
1287       }
1288 #endif
1289 
1290       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1291                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1292       KMP_MB(); /* is this necessary? */
1293 #ifdef KMP_DEBUG
1294       {
1295         const char *buff;
1296         // create format specifiers before the debug output
1297         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1298                                 "ordered_iteration:%%%s lower:%%%s\n",
1299                                 traits_t<UT>::spec, traits_t<UT>::spec);
1300         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1301         __kmp_str_free(&buff);
1302       }
1303 #endif
1304 
1305       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1306     } // if
1307   } // if
1308   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1309 }
1310 
1311 #ifdef KMP_GOMP_COMPAT
1312 
1313 template <typename UT>
1314 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1315   typedef typename traits_t<UT>::signed_t ST;
1316   kmp_info_t *th = __kmp_threads[gtid];
1317 
1318   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1319   if (!th->th.th_team->t.t_serialized) {
1320     //        int cid;
1321     dispatch_private_info_template<UT> *pr =
1322         reinterpret_cast<dispatch_private_info_template<UT> *>(
1323             th->th.th_dispatch->th_dispatch_pr_current);
1324     dispatch_shared_info_template<UT> volatile *sh =
1325         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1326             th->th.th_dispatch->th_dispatch_sh_current);
1327     KMP_DEBUG_ASSERT(pr);
1328     KMP_DEBUG_ASSERT(sh);
1329     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1330                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1331 
1332     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1333     UT lower = pr->u.p.ordered_lower;
1334     UT upper = pr->u.p.ordered_upper;
1335     UT inc = upper - lower + 1;
1336 
1337     if (pr->ordered_bumped == inc) {
1338       KD_TRACE(
1339           1000,
1340           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1341            gtid));
1342       pr->ordered_bumped = 0;
1343     } else {
1344       inc -= pr->ordered_bumped;
1345 
1346 #ifdef KMP_DEBUG
1347       {
1348         const char *buff;
1349         // create format specifiers before the debug output
1350         buff = __kmp_str_format(
1351             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1352             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1353             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1354         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1355         __kmp_str_free(&buff);
1356       }
1357 #endif
1358 
1359       __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1360                            __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1361 
1362       KMP_MB(); /* is this necessary? */
1363       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1364                       "ordered_bumped to zero\n",
1365                       gtid));
1366       pr->ordered_bumped = 0;
1367 //!!!!! TODO check if the inc should be unsigned, or signed???
1368 #ifdef KMP_DEBUG
1369       {
1370         const char *buff;
1371         // create format specifiers before the debug output
1372         buff = __kmp_str_format(
1373             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1374             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1375             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1376             traits_t<UT>::spec);
1377         KD_TRACE(1000,
1378                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1379         __kmp_str_free(&buff);
1380       }
1381 #endif
1382 
1383       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1384     }
1385     //        }
1386   }
1387   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1388 }
1389 
1390 #endif /* KMP_GOMP_COMPAT */
1391 
1392 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1393    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1394    is not called. */
1395 #if OMPT_SUPPORT && OMPT_TRACE
1396 #define OMPT_LOOP_END                                                          \
1397   if (status == 0) {                                                           \
1398     if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {   \
1399       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1400       ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                    \
1401       ompt_callbacks.ompt_callback(ompt_event_loop_end)(                       \
1402           team_info->parallel_id, task_info->task_id);                         \
1403     }                                                                          \
1404   }
1405 #else
1406 #define OMPT_LOOP_END // no-op
1407 #endif
1408 
1409 template <typename T>
1410 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1411                                T *p_lb, T *p_ub,
1412                                typename traits_t<T>::signed_t *p_st) {
1413 
1414   typedef typename traits_t<T>::unsigned_t UT;
1415   typedef typename traits_t<T>::signed_t ST;
1416   typedef typename traits_t<T>::floating_t DBL;
1417 
  // This is potentially slightly misleading, schedule(runtime) will appear here
  // even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile-time choice to use static scheduling would.)
1422   KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
1423 
1424   int status;
1425   dispatch_private_info_template<T> *pr;
1426   kmp_info_t *th = __kmp_threads[gtid];
1427   kmp_team_t *team = th->th.th_team;
1428 
1429   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1430 #ifdef KMP_DEBUG
1431   {
1432     const char *buff;
1433     // create format specifiers before the debug output
1434     buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1435                             "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1436                             traits_t<T>::spec, traits_t<T>::spec,
1437                             traits_t<ST>::spec);
1438     KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1439     __kmp_str_free(&buff);
1440   }
1441 #endif
1442 
1443   if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
1445     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1446         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1447     KMP_DEBUG_ASSERT(pr);
1448 
1449     if ((status = (pr->u.p.tc != 0)) == 0) {
1450       *p_lb = 0;
1451       *p_ub = 0;
1452       //            if ( p_last != NULL )
1453       //                *p_last = 0;
1454       if (p_st != NULL)
1455         *p_st = 0;
1456       if (__kmp_env_consistency_check) {
1457         if (pr->pushed_ws != ct_none) {
1458           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1459         }
1460       }
1461     } else if (pr->nomerge) {
1462       kmp_int32 last;
1463       T start;
1464       UT limit, trip, init;
1465       ST incr;
1466       T chunk = pr->u.p.parm1;
1467 
1468       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1469                      gtid));
1470 
1471       init = chunk * pr->u.p.count++;
1472       trip = pr->u.p.tc - 1;
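      // Each call hands out the next 'chunk' iterations locally: count is a
      // per-thread chunk counter, init the first iteration offset of this
      // chunk, and trip the last valid iteration offset.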
1473 
1474       if ((status = (init <= trip)) == 0) {
1475         *p_lb = 0;
1476         *p_ub = 0;
1477         //                if ( p_last != NULL )
1478         //                    *p_last = 0;
1479         if (p_st != NULL)
1480           *p_st = 0;
1481         if (__kmp_env_consistency_check) {
1482           if (pr->pushed_ws != ct_none) {
1483             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1484           }
1485         }
1486       } else {
1487         start = pr->u.p.lb;
1488         limit = chunk + init - 1;
1489         incr = pr->u.p.st;
1490 
1491         if ((last = (limit >= trip)) != 0) {
1492           limit = trip;
1493 #if KMP_OS_WINDOWS
1494           pr->u.p.last_upper = pr->u.p.ub;
1495 #endif /* KMP_OS_WINDOWS */
1496         }
1497         if (p_last != NULL)
1498           *p_last = last;
1499         if (p_st != NULL)
1500           *p_st = incr;
1501         if (incr == 1) {
1502           *p_lb = start + init;
1503           *p_ub = start + limit;
1504         } else {
1505           *p_lb = start + init * incr;
1506           *p_ub = start + limit * incr;
1507         }
1508 
1509         if (pr->ordered) {
1510           pr->u.p.ordered_lower = init;
1511           pr->u.p.ordered_upper = limit;
1512 #ifdef KMP_DEBUG
1513           {
1514             const char *buff;
1515             // create format specifiers before the debug output
1516             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1517                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1518                                     traits_t<UT>::spec, traits_t<UT>::spec);
1519             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1520                             pr->u.p.ordered_upper));
1521             __kmp_str_free(&buff);
1522           }
1523 #endif
1524         } // if
1525       } // if
1526     } else {
1527       pr->u.p.tc = 0;
1528       *p_lb = pr->u.p.lb;
1529       *p_ub = pr->u.p.ub;
1530 #if KMP_OS_WINDOWS
1531       pr->u.p.last_upper = *p_ub;
1532 #endif /* KMP_OS_WINDOWS */
1533       if (p_last != NULL)
1534         *p_last = TRUE;
1535       if (p_st != NULL)
1536         *p_st = pr->u.p.st;
1537     } // if
1538 #ifdef KMP_DEBUG
1539     {
1540       const char *buff;
1541       // create format specifiers before the debug output
1542       buff = __kmp_str_format(
1543           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1544           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
1545           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1546       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1547       __kmp_str_free(&buff);
1548     }
1549 #endif
1550 #if INCLUDE_SSC_MARKS
1551     SSC_MARK_DISPATCH_NEXT();
1552 #endif
1553     OMPT_LOOP_END;
1554     return status;
1555   } else {
1556     kmp_int32 last = 0;
1557     dispatch_shared_info_template<UT> *sh;
1558     T start;
1559     ST incr;
1560     UT limit, trip, init;
1561 
1562     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1563                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1564 
1565     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1566         th->th.th_dispatch->th_dispatch_pr_current);
1567     KMP_DEBUG_ASSERT(pr);
1568     sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1569         th->th.th_dispatch->th_dispatch_sh_current);
1570     KMP_DEBUG_ASSERT(sh);
1571 
1572     if (pr->u.p.tc == 0) {
1573       // zero trip count
1574       status = 0;
1575     } else {
1576       switch (pr->schedule) {
1577 #if (KMP_STATIC_STEAL_ENABLED)
1578       case kmp_sch_static_steal: {
1579         T chunk = pr->u.p.parm1;
1580         int nproc = th->th.th_team_nproc;
1581 
1582         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1583                        gtid));
1584 
1585         trip = pr->u.p.tc - 1;
1586 
1587         if (traits_t<T>::type_size > 4) {
1588           // use lock for 8-byte and CAS for 4-byte induction
1589           // variable. TODO (optional): check and use 16-byte CAS
1590           kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1591           KMP_DEBUG_ASSERT(lck != NULL);
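          // For 8-byte types the (count, ub) pair cannot be updated with a
          // single CAS, so a per-thread lock guards it: count is the next
          // chunk this thread claims from the low end of its range, ub is one
          // past its last owned chunk and is lowered by thieves.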
1592           if (pr->u.p.count < (UT)pr->u.p.ub) {
1593             __kmp_acquire_lock(lck, gtid);
1594             // try to get own chunk of iterations
1595             init = (pr->u.p.count)++;
1596             status = (init < (UT)pr->u.p.ub);
1597             __kmp_release_lock(lck, gtid);
1598           } else {
1599             status = 0; // no own chunks
1600           }
1601           if (!status) { // try to steal
1602             kmp_info_t **other_threads = team->t.t_threads;
1603             int while_limit = nproc; // nproc attempts to find a victim
1604             int while_index = 0;
            // TODO: the victim-search algorithm should be cleaned up and
            // measured
1607             while ((!status) && (while_limit != ++while_index)) {
1608               T remaining;
1609               T victimIdx = pr->u.p.parm4;
1610               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1611               dispatch_private_info_template<T> *victim =
1612                   reinterpret_cast<dispatch_private_info_template<T> *>(
1613                       other_threads[victimIdx]
1614                           ->th.th_dispatch->th_dispatch_pr_current);
1615               while ((victim == NULL || victim == pr ||
1616                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1617                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1618                      oldVictimIdx != victimIdx) {
1619                 victimIdx = (victimIdx + 1) % nproc;
1620                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1621                     other_threads[victimIdx]
1622                         ->th.th_dispatch->th_dispatch_pr_current);
1623               };
1624               if (!victim ||
1625                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1626                    *(volatile T *)&pr->u.p.static_steal_counter)) {
1627                 continue; // try once more (nproc attempts in total)
1628                 // no victim is ready yet to participate in stealing
                // because all victims are still in __kmp_dispatch_init
1630               }
1631               if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1632                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1633                 continue; // not enough chunks to steal, goto next victim
1634               }
1635 
1636               lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1637               KMP_ASSERT(lck != NULL);
1638               __kmp_acquire_lock(lck, gtid);
1639               limit = victim->u.p.ub; // keep initial ub
1640               if (victim->u.p.count >= limit ||
1641                   (remaining = limit - victim->u.p.count) < 2) {
1642                 __kmp_release_lock(lck, gtid);
1643                 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1644                 continue; // not enough chunks to steal
1645               }
              // stealing succeeded; reduce the victim's ub by 1/4 of the
              // undone chunks, or by 1
1648               if (remaining > 3) {
1649                 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1650                 init = (victim->u.p.ub -=
1651                         (remaining >> 2)); // steal 1/4 of remaining
1652               } else {
1653                 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1654                 init =
1655                     (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1656               }
1657               __kmp_release_lock(lck, gtid);
1658 
1659               KMP_DEBUG_ASSERT(init + 1 <= limit);
1660               pr->u.p.parm4 = victimIdx; // remember victim to steal from
1661               status = 1;
1662               while_index = 0;
              // now update own count and ub with the stolen range, excluding
              // the init chunk claimed here
1664               __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1665               pr->u.p.count = init + 1;
1666               pr->u.p.ub = limit;
1667               __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1668             } // while (search for victim)
1669           } // if (try to find victim and steal)
1670         } else {
1671           // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1672           typedef union {
1673             struct {
1674               UT count;
1675               T ub;
1676             } p;
1677             kmp_int64 b;
1678           } union_i4;
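          // Packing count (advanced from the low end by the owner) and ub
          // (lowered from the high end by thieves) into one 64-bit word lets a
          // single CAS update either field while detecting concurrent changes
          // to the other.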
          // All updates to 'count' and 'ub' must therefore be applied
          // atomically to the pair.
1681           {
1682             union_i4 vold, vnew;
1683             vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1684             vnew = vold;
1685             vnew.p.count++;
1686             while (!KMP_COMPARE_AND_STORE_ACQ64(
1687                 (volatile kmp_int64 *)&pr->u.p.count,
1688                 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1689                 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1690               KMP_CPU_PAUSE();
1691               vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1692               vnew = vold;
1693               vnew.p.count++;
1694             }
1695             vnew = vold;
1696             init = vnew.p.count;
1697             status = (init < (UT)vnew.p.ub);
1698           }
1699 
1700           if (!status) {
1701             kmp_info_t **other_threads = team->t.t_threads;
1702             int while_limit = nproc; // nproc attempts to find a victim
1703             int while_index = 0;
1704 
            // TODO: the victim-search algorithm should be cleaned up and
            // measured
1707             while ((!status) && (while_limit != ++while_index)) {
1708               union_i4 vold, vnew;
1709               kmp_int32 remaining;
1710               T victimIdx = pr->u.p.parm4;
1711               T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1712               dispatch_private_info_template<T> *victim =
1713                   reinterpret_cast<dispatch_private_info_template<T> *>(
1714                       other_threads[victimIdx]
1715                           ->th.th_dispatch->th_dispatch_pr_current);
1716               while ((victim == NULL || victim == pr ||
1717                       (*(volatile T *)&victim->u.p.static_steal_counter !=
1718                        *(volatile T *)&pr->u.p.static_steal_counter)) &&
1719                      oldVictimIdx != victimIdx) {
1720                 victimIdx = (victimIdx + 1) % nproc;
1721                 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1722                     other_threads[victimIdx]
1723                         ->th.th_dispatch->th_dispatch_pr_current);
1724               };
1725               if (!victim ||
1726                   (*(volatile T *)&victim->u.p.static_steal_counter !=
1727                    *(volatile T *)&pr->u.p.static_steal_counter)) {
1728                 continue; // try once more (nproc attempts in total)
1729                 // no victim is ready yet to participate in stealing
                // because all victims are still in __kmp_dispatch_init
1731               }
1732               pr->u.p.parm4 = victimIdx; // new victim found
1733               while (1) { // CAS loop if victim has enough chunks to steal
1734                 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1735                 vnew = vold;
1736 
1737                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1738                 if (vnew.p.count >= (UT)vnew.p.ub ||
1739                     (remaining = vnew.p.ub - vnew.p.count) < 2) {
1740                   pr->u.p.parm4 =
1741                       (victimIdx + 1) % nproc; // shift start victim id
1742                   break; // not enough chunks to steal, goto next victim
1743                 }
1744                 if (remaining > 3) {
1745                   vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1746                 } else {
1747                   vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1748                 }
1749                 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1750                 // TODO: Should this be acquire or release?
1751                 if (KMP_COMPARE_AND_STORE_ACQ64(
1752                         (volatile kmp_int64 *)&victim->u.p.count,
1753                         *VOLATILE_CAST(kmp_int64 *) & vold.b,
1754                         *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1755                   // stealing succeeded
1756                   KMP_COUNT_VALUE(FOR_static_steal_stolen,
1757                                   vold.p.ub - vnew.p.ub);
1758                   status = 1;
1759                   while_index = 0;
1760                   // now update own count and ub
1761                   init = vnew.p.ub;
1762                   vold.p.count = init + 1;
1763 #if KMP_ARCH_X86
1764                   KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1765                                    vold.b);
1766 #else
1767                   *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1768 #endif
1769                   break;
1770                 } // if (check CAS result)
1771                 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1772               } // while (try to steal from particular victim)
1773             } // while (search for victim)
1774           } // if (try to find victim and steal)
1775         } // if (4-byte induction variable)
1776         if (!status) {
1777           *p_lb = 0;
1778           *p_ub = 0;
1779           if (p_st != NULL)
1780             *p_st = 0;
1781         } else {
1782           start = pr->u.p.parm2;
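          // init arrives as a chunk index and is scaled to an iteration offset
          // below; parm2 holds the loop lower bound saved at init time (lb/ub
          // are repurposed for chunk bookkeeping in this schedule).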
1783           init *= chunk;
1784           limit = chunk + init - 1;
1785           incr = pr->u.p.st;
1786           KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1787 
1788           KMP_DEBUG_ASSERT(init <= trip);
1789           if ((last = (limit >= trip)) != 0)
1790             limit = trip;
1791           if (p_st != NULL)
1792             *p_st = incr;
1793 
1794           if (incr == 1) {
1795             *p_lb = start + init;
1796             *p_ub = start + limit;
1797           } else {
1798             *p_lb = start + init * incr;
1799             *p_ub = start + limit * incr;
1800           }
1801 
1802           if (pr->ordered) {
1803             pr->u.p.ordered_lower = init;
1804             pr->u.p.ordered_upper = limit;
1805 #ifdef KMP_DEBUG
1806             {
1807               const char *buff;
1808               // create format specifiers before the debug output
1809               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1810                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1811                                       traits_t<UT>::spec, traits_t<UT>::spec);
1812               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1813                               pr->u.p.ordered_upper));
1814               __kmp_str_free(&buff);
1815             }
1816 #endif
1817           } // if
1818         } // if
1819         break;
1820       } // case
1821 #endif // ( KMP_STATIC_STEAL_ENABLED )
1822       case kmp_sch_static_balanced: {
1823         KD_TRACE(
1824             100,
1825             ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1826         if ((status = !pr->u.p.count) !=
1827             0) { /* check if thread has any iteration to do */
1828           pr->u.p.count = 1;
1829           *p_lb = pr->u.p.lb;
1830           *p_ub = pr->u.p.ub;
1831           last = pr->u.p.parm1;
1832           if (p_st != NULL)
1833             *p_st = pr->u.p.st;
1834         } else { /* no iterations to do */
1835           pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1836         }
1837         if (pr->ordered) {
1838 #ifdef KMP_DEBUG
1839           {
1840             const char *buff;
1841             // create format specifiers before the debug output
1842             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1843                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1844                                     traits_t<UT>::spec, traits_t<UT>::spec);
1845             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1846                             pr->u.p.ordered_upper));
1847             __kmp_str_free(&buff);
1848           }
1849 #endif
1850         } // if
1851       } // case
1852       break;
1853       case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1854                                      merged here */
1855       case kmp_sch_static_chunked: {
1856         T parm1;
1857 
1858         KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1859                        "kmp_sch_static_[affinity|chunked] case\n",
1860                        gtid));
1861         parm1 = pr->u.p.parm1;
1862 
1863         trip = pr->u.p.tc - 1;
1864         init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
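        // parm1 is the chunk size; thread tid executes chunks tid, tid+nproc,
        // tid+2*nproc, ... (count is advanced by nproc below once a chunk is
        // claimed).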
1865 
1866         if ((status = (init <= trip)) != 0) {
1867           start = pr->u.p.lb;
1868           incr = pr->u.p.st;
1869           limit = parm1 + init - 1;
1870 
1871           if ((last = (limit >= trip)) != 0)
1872             limit = trip;
1873 
1874           if (p_st != NULL)
1875             *p_st = incr;
1876 
1877           pr->u.p.count += th->th.th_team_nproc;
1878 
1879           if (incr == 1) {
1880             *p_lb = start + init;
1881             *p_ub = start + limit;
1882           } else {
1883             *p_lb = start + init * incr;
1884             *p_ub = start + limit * incr;
1885           }
1886 
1887           if (pr->ordered) {
1888             pr->u.p.ordered_lower = init;
1889             pr->u.p.ordered_upper = limit;
1890 #ifdef KMP_DEBUG
1891             {
1892               const char *buff;
1893               // create format specifiers before the debug output
1894               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1895                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1896                                       traits_t<UT>::spec, traits_t<UT>::spec);
1897               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1898                               pr->u.p.ordered_upper));
1899               __kmp_str_free(&buff);
1900             }
1901 #endif
1902           } // if
1903         } // if
1904       } // case
1905       break;
1906 
1907       case kmp_sch_dynamic_chunked: {
1908         T chunk = pr->u.p.parm1;
1909 
1910         KD_TRACE(
1911             100,
1912             ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1913 
1914         init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1915         trip = pr->u.p.tc - 1;
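        // Chunks are claimed in order by atomically bumping the shared counter
        // sh->u.s.iteration; init is the first iteration offset of the claimed
        // chunk.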
1916 
1917         if ((status = (init <= trip)) == 0) {
1918           *p_lb = 0;
1919           *p_ub = 0;
1920           if (p_st != NULL)
1921             *p_st = 0;
1922         } else {
1923           start = pr->u.p.lb;
1924           limit = chunk + init - 1;
1925           incr = pr->u.p.st;
1926 
1927           if ((last = (limit >= trip)) != 0)
1928             limit = trip;
1929 
1930           if (p_st != NULL)
1931             *p_st = incr;
1932 
1933           if (incr == 1) {
1934             *p_lb = start + init;
1935             *p_ub = start + limit;
1936           } else {
1937             *p_lb = start + init * incr;
1938             *p_ub = start + limit * incr;
1939           }
1940 
1941           if (pr->ordered) {
1942             pr->u.p.ordered_lower = init;
1943             pr->u.p.ordered_upper = limit;
1944 #ifdef KMP_DEBUG
1945             {
1946               const char *buff;
1947               // create format specifiers before the debug output
1948               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1949                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
1950                                       traits_t<UT>::spec, traits_t<UT>::spec);
1951               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1952                               pr->u.p.ordered_upper));
1953               __kmp_str_free(&buff);
1954             }
1955 #endif
1956           } // if
1957         } // if
1958       } // case
1959       break;
1960 
1961       case kmp_sch_guided_iterative_chunked: {
1962         T chunkspec = pr->u.p.parm1;
1963         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1964                        "iterative case\n",
1965                        gtid));
1966         trip = pr->u.p.tc;
1967         // Start atomic part of calculations
1968         while (1) {
1969           ST remaining; // signed, because can be < 0
1970           init = sh->u.s.iteration; // shared value
1971           remaining = trip - init;
1972           if (remaining <= 0) { // AC: need to compare with 0 first
1973             // nothing to do, don't try atomic op
1974             status = 0;
1975             break;
1976           }
1977           if ((T)remaining <
1978               pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
            // use dynamic-style schedule
            // atomically increment iterations, get old value
1981             init = test_then_add<ST>(
1982                 RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunkspec);
1983             remaining = trip - init;
1984             if (remaining <= 0) {
1985               status = 0; // all iterations got by other threads
1986             } else { // got some iterations to work on
1987               status = 1;
1988               if ((T)remaining > chunkspec) {
1989                 limit = init + chunkspec - 1;
1990               } else {
1991                 last = 1; // the last chunk
1992                 limit = init + remaining - 1;
1993               } // if
1994             } // if
1995             break;
1996           } // if
1997           limit = init + (UT)(remaining *
1998                               *(double *)&pr->u.p.parm3); // divide by K*nproc
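          // parm3 holds (bit-reinterpreted as a double) the guided shrink
          // fraction, nominally 1/(K*nproc) with K=2 by default, so each grab
          // takes that fraction of the remaining iterations; limit stays
          // exclusive until the CAS below publishes it, then it is decremented
          // to an inclusive bound.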
1999           if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2000                                    (ST)init, (ST)limit)) {
2001             // CAS was successful, chunk obtained
2002             status = 1;
2003             --limit;
2004             break;
2005           } // if
2006         } // while
2007         if (status != 0) {
2008           start = pr->u.p.lb;
2009           incr = pr->u.p.st;
2010           if (p_st != NULL)
2011             *p_st = incr;
2012           *p_lb = start + init * incr;
2013           *p_ub = start + limit * incr;
2014           if (pr->ordered) {
2015             pr->u.p.ordered_lower = init;
2016             pr->u.p.ordered_upper = limit;
2017 #ifdef KMP_DEBUG
2018             {
2019               const char *buff;
2020               // create format specifiers before the debug output
2021               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2022                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2023                                       traits_t<UT>::spec, traits_t<UT>::spec);
2024               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2025                               pr->u.p.ordered_upper));
2026               __kmp_str_free(&buff);
2027             }
2028 #endif
2029           } // if
2030         } else {
2031           *p_lb = 0;
2032           *p_ub = 0;
2033           if (p_st != NULL)
2034             *p_st = 0;
2035         } // if
2036       } // case
2037       break;
2038 
2039       case kmp_sch_guided_simd: {
        // same as the iterative variant, but the current chunk is adjusted to
        // be a multiple of the given chunk
2042         T chunk = pr->u.p.parm1;
2043         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2044                        gtid));
2045         trip = pr->u.p.tc;
2046         // Start atomic part of calculations
2047         while (1) {
2048           ST remaining; // signed, because can be < 0
2049           init = sh->u.s.iteration; // shared value
2050           remaining = trip - init;
2051           if (remaining <= 0) { // AC: need to compare with 0 first
2052             status = 0; // nothing to do, don't try atomic op
2053             break;
2054           }
2055           KMP_DEBUG_ASSERT(init % chunk == 0);
2056           // compare with K*nproc*(chunk+1), K=2 by default
2057           if ((T)remaining < pr->u.p.parm2) {
            // use dynamic-style schedule
            // atomically increment iterations, get old value
2060             init = test_then_add<ST>(
2061                 RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunk);
2062             remaining = trip - init;
2063             if (remaining <= 0) {
2064               status = 0; // all iterations got by other threads
2065             } else {
2066               // got some iterations to work on
2067               status = 1;
2068               if ((T)remaining > chunk) {
2069                 limit = init + chunk - 1;
2070               } else {
2071                 last = 1; // the last chunk
2072                 limit = init + remaining - 1;
2073               } // if
2074             } // if
2075             break;
2076           } // if
2077           // divide by K*nproc
2078           UT span = remaining * (*(double *)&pr->u.p.parm3);
2079           UT rem = span % chunk;
2080           if (rem) // adjust so that span%chunk == 0
2081             span += chunk - rem;
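          // e.g. chunk=8, span=13: rem=5, span becomes 16, so every grab stays
          // a multiple of the SIMD chunk.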
2082           limit = init + span;
2083           if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
2084                                    (ST)init, (ST)limit)) {
2085             // CAS was successful, chunk obtained
2086             status = 1;
2087             --limit;
2088             break;
2089           } // if
2090         } // while
2091         if (status != 0) {
2092           start = pr->u.p.lb;
2093           incr = pr->u.p.st;
2094           if (p_st != NULL)
2095             *p_st = incr;
2096           *p_lb = start + init * incr;
2097           *p_ub = start + limit * incr;
2098           if (pr->ordered) {
2099             pr->u.p.ordered_lower = init;
2100             pr->u.p.ordered_upper = limit;
2101 #ifdef KMP_DEBUG
2102             {
2103               const char *buff;
2104               // create format specifiers before the debug output
2105               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2106                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2107                                       traits_t<UT>::spec, traits_t<UT>::spec);
2108               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2109                               pr->u.p.ordered_upper));
2110               __kmp_str_free(&buff);
2111             }
2112 #endif
2113           } // if
2114         } else {
2115           *p_lb = 0;
2116           *p_ub = 0;
2117           if (p_st != NULL)
2118             *p_st = 0;
2119         } // if
2120       } // case
2121       break;
2122 
2123       case kmp_sch_guided_analytical_chunked: {
2124         T chunkspec = pr->u.p.parm1;
2125         UT chunkIdx;
2126 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2127         /* for storing original FPCW value for Windows* OS on
2128            IA-32 architecture 8-byte version */
2129         unsigned int oldFpcw;
2130         unsigned int fpcwSet = 0;
2131 #endif
2132         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2133                        "analytical case\n",
2134                        gtid));
2135 
2136         trip = pr->u.p.tc;
2137 
2138         KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2139         KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2140                          trip);
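        // Analytical guided model: parm3 holds a base x (roughly
        // 1 - 1/(2*nproc), set at dispatch init) such that about trip * x^k
        // iterations remain before chunk k; __kmp_dispatch_guided_remaining
        // evaluates this in long double, hence the x87 precision fix-up on
        // 32-bit Windows below. Once chunkIdx reaches parm2 the model
        // degenerates to plain dynamic chunks.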
2141 
2142         while (1) { /* this while loop is a safeguard against unexpected zero
2143                        chunk sizes */
2144           chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2145           if (chunkIdx >= (UT)pr->u.p.parm2) {
2146             --trip;
2147             /* use dynamic-style scheduling */
2148             init = chunkIdx * chunkspec + pr->u.p.count;
2149             /* need to verify init > 0 in case of overflow in the above
2150              * calculation */
2151             if ((status = (init > 0 && init <= trip)) != 0) {
2152               limit = init + chunkspec - 1;
2153 
2154               if ((last = (limit >= trip)) != 0)
2155                 limit = trip;
2156             }
2157             break;
2158           } else {
2159 /* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which could otherwise have the effect that init != 0 for
   chunkIdx == 0. */
2164 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2165             /* If we haven't already done so, save original FPCW and set
2166                precision to 64-bit, as Windows* OS on IA-32 architecture
2167                defaults to 53-bit */
2168             if (!fpcwSet) {
2169               oldFpcw = _control87(0, 0);
2170               _control87(_PC_64, _MCW_PC);
2171               fpcwSet = 0x30000;
2172             }
2173 #endif
2174             if (chunkIdx) {
2175               init = __kmp_dispatch_guided_remaining<T>(
2176                   trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2177               KMP_DEBUG_ASSERT(init);
2178               init = trip - init;
2179             } else
2180               init = 0;
2181             limit = trip - __kmp_dispatch_guided_remaining<T>(
2182                                trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
2183             KMP_ASSERT(init <= limit);
2184             if (init < limit) {
2185               KMP_DEBUG_ASSERT(limit <= trip);
2186               --limit;
2187               status = 1;
2188               break;
2189             } // if
2190           } // if
2191         } // while (1)
2192 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2193         /* restore FPCW if necessary
2194            AC: check fpcwSet flag first because oldFpcw can be uninitialized
2195            here */
2196         if (fpcwSet && (oldFpcw & fpcwSet))
2197           _control87(oldFpcw, _MCW_PC);
2198 #endif
2199         if (status != 0) {
2200           start = pr->u.p.lb;
2201           incr = pr->u.p.st;
2202           if (p_st != NULL)
2203             *p_st = incr;
2204           *p_lb = start + init * incr;
2205           *p_ub = start + limit * incr;
2206           if (pr->ordered) {
2207             pr->u.p.ordered_lower = init;
2208             pr->u.p.ordered_upper = limit;
2209 #ifdef KMP_DEBUG
2210             {
2211               const char *buff;
2212               // create format specifiers before the debug output
2213               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2214                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2215                                       traits_t<UT>::spec, traits_t<UT>::spec);
2216               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2217                               pr->u.p.ordered_upper));
2218               __kmp_str_free(&buff);
2219             }
2220 #endif
2221           }
2222         } else {
2223           *p_lb = 0;
2224           *p_ub = 0;
2225           if (p_st != NULL)
2226             *p_st = 0;
2227         }
2228       } // case
2229       break;
2230 
2231       case kmp_sch_trapezoidal: {
2232         UT index;
2233         T parm2 = pr->u.p.parm2;
2234         T parm3 = pr->u.p.parm3;
2235         T parm4 = pr->u.p.parm4;
2236         KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2237                        gtid));
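        // Trapezoid self-scheduling: chunk sizes shrink linearly from parm2
        // (the first chunk) by parm4 per chunk, over parm3 chunks in total.
        // init below is the iteration offset at which chunk 'index' starts,
        // i.e. the arithmetic-series sum index*parm2 - parm4*index*(index-1)/2.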
2238 
2239         index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2240 
2241         init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2242         trip = pr->u.p.tc - 1;
2243 
2244         if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2245           *p_lb = 0;
2246           *p_ub = 0;
2247           if (p_st != NULL)
2248             *p_st = 0;
2249         } else {
2250           start = pr->u.p.lb;
2251           limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2252           incr = pr->u.p.st;
2253 
2254           if ((last = (limit >= trip)) != 0)
2255             limit = trip;
2256 
2257           if (p_st != NULL)
2258             *p_st = incr;
2259 
2260           if (incr == 1) {
2261             *p_lb = start + init;
2262             *p_ub = start + limit;
2263           } else {
2264             *p_lb = start + init * incr;
2265             *p_ub = start + limit * incr;
2266           }
2267 
2268           if (pr->ordered) {
2269             pr->u.p.ordered_lower = init;
2270             pr->u.p.ordered_upper = limit;
2271 #ifdef KMP_DEBUG
2272             {
2273               const char *buff;
2274               // create format specifiers before the debug output
2275               buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2276                                       "ordered_lower:%%%s ordered_upper:%%%s\n",
2277                                       traits_t<UT>::spec, traits_t<UT>::spec);
2278               KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2279                               pr->u.p.ordered_upper));
2280               __kmp_str_free(&buff);
2281             }
2282 #endif
2283           } // if
2284         } // if
2285       } // case
2286       break;
2287       default: {
2288         status = 0; // to avoid complaints on uninitialized variable use
2289         __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2290                     KMP_HNT(GetNewerLibrary), // Hint
2291                     __kmp_msg_null // Variadic argument list terminator
2292                     );
2293       } break;
2294       } // switch
2295     } // if tc == 0;
2296 
2297     if (status == 0) {
2298       UT num_done;
2299 
2300       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2301 #ifdef KMP_DEBUG
2302       {
2303         const char *buff;
2304         // create format specifiers before the debug output
2305         buff = __kmp_str_format(
2306             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2307             traits_t<UT>::spec);
2308         KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2309         __kmp_str_free(&buff);
2310       }
2311 #endif
2312 
2313       if ((ST)num_done == th->th.th_team_nproc - 1) {
2314 #if (KMP_STATIC_STEAL_ENABLED)
2315         if (pr->schedule == kmp_sch_static_steal &&
2316             traits_t<T>::type_size > 4) {
2317           int i;
2318           kmp_info_t **other_threads = team->t.t_threads;
2319           // loop complete, safe to destroy locks used for stealing
2320           for (i = 0; i < th->th.th_team_nproc; ++i) {
2321             kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2322             KMP_ASSERT(lck != NULL);
2323             __kmp_destroy_lock(lck);
2324             __kmp_free(lck);
2325             other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2326           }
2327         }
2328 #endif
2329         /* NOTE: release this buffer to be reused */
2330 
2331         KMP_MB(); /* Flush all pending memory write invalidates.  */
2332 
2333         sh->u.s.num_done = 0;
2334         sh->u.s.iteration = 0;
2335 
2336         /* TODO replace with general release procedure? */
2337         if (pr->ordered) {
2338           sh->u.s.ordered_iteration = 0;
2339         }
2340 
2341         KMP_MB(); /* Flush all pending memory write invalidates.  */
2342 
2343         sh->buffer_index += __kmp_dispatch_num_buffers;
2344         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2345                        gtid, sh->buffer_index));
2346 
2347         KMP_MB(); /* Flush all pending memory write invalidates.  */
2348 
2349       } // if
2350       if (__kmp_env_consistency_check) {
2351         if (pr->pushed_ws != ct_none) {
2352           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2353         }
2354       }
2355 
2356       th->th.th_dispatch->th_deo_fcn = NULL;
2357       th->th.th_dispatch->th_dxo_fcn = NULL;
2358       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2359       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2360     } // if (status == 0)
2361 #if KMP_OS_WINDOWS
2362     else if (last) {
2363       pr->u.p.last_upper = pr->u.p.ub;
2364     }
2365 #endif /* KMP_OS_WINDOWS */
2366     if (p_last != NULL && status != 0)
2367       *p_last = last;
2368   } // if
2369 
2370 #ifdef KMP_DEBUG
2371   {
2372     const char *buff;
2373     // create format specifiers before the debug output
2374     buff = __kmp_str_format(
2375         "__kmp_dispatch_next: T#%%d normal case: "
2376         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p  returning:%%d\n",
2377         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2378     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2379     __kmp_str_free(&buff);
2380   }
2381 #endif
2382 #if INCLUDE_SSC_MARKS
2383   SSC_MARK_DISPATCH_NEXT();
2384 #endif
2385   OMPT_LOOP_END;
2386   return status;
2387 }
2388 
2389 template <typename T>
2390 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2391                                   kmp_int32 *plastiter, T *plower, T *pupper,
2392                                   typename traits_t<T>::signed_t incr) {
2393   typedef typename traits_t<T>::unsigned_t UT;
2394   typedef typename traits_t<T>::signed_t ST;
2395   kmp_uint32 team_id;
2396   kmp_uint32 nteams;
2397   UT trip_count;
2398   kmp_team_t *team;
2399   kmp_info_t *th;
2400 
2401   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2402   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2403 #ifdef KMP_DEBUG
2404   {
2405     const char *buff;
2406     // create format specifiers before the debug output
2407     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2408                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2409                             traits_t<T>::spec, traits_t<T>::spec,
2410                             traits_t<ST>::spec, traits_t<T>::spec);
2411     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2412     __kmp_str_free(&buff);
2413   }
2414 #endif
2415 
2416   if (__kmp_env_consistency_check) {
2417     if (incr == 0) {
2418       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2419                             loc);
2420     }
2421     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2422       // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
2424       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2425       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2426       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2427       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2428       // Compiler does not check the following illegal loops:
2429       //   for(i=0;i<10;i+=incr) // where incr<0
2430       //   for(i=10;i>0;i-=incr) // where incr<0
2431       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2432     }
2433   }
2434   th = __kmp_threads[gtid];
2435   team = th->th.th_team;
2436 #if OMP_40_ENABLED
2437   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2438   nteams = th->th.th_teams_size.nteams;
2439 #endif
2440   team_id = team->t.t_master_tid;
2441   KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2442 
2443   // compute global trip count
2444   if (incr == 1) {
2445     trip_count = *pupper - *plower + 1;
2446   } else if (incr == -1) {
2447     trip_count = *plower - *pupper + 1;
2448   } else if (incr > 0) {
2449     // upper-lower can exceed the limit of signed type
2450     trip_count = (UT)(*pupper - *plower) / incr + 1;
2451   } else {
2452     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2453   }
2454 
2455   if (trip_count <= nteams) {
2456     KMP_DEBUG_ASSERT(
2457         __kmp_static == kmp_sch_static_greedy ||
2458         __kmp_static ==
2459             kmp_sch_static_balanced); // Unknown static scheduling type.
2460     // only some teams get single iteration, others get nothing
2461     if (team_id < trip_count) {
2462       *pupper = *plower = *plower + team_id * incr;
2463     } else {
2464       *plower = *pupper + incr; // zero-trip loop
2465     }
2466     if (plastiter != NULL)
2467       *plastiter = (team_id == trip_count - 1);
2468   } else {
2469     if (__kmp_static == kmp_sch_static_balanced) {
2470       UT chunk = trip_count / nteams;
2471       UT extras = trip_count % nteams;
2472       *plower +=
2473           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2474       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
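      // e.g. trip_count=10, nteams=4: chunk=2, extras=2, so teams 0 and 1 get
      // 3 iterations each and teams 2 and 3 get 2 each.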
2475       if (plastiter != NULL)
2476         *plastiter = (team_id == nteams - 1);
2477     } else {
2478       T chunk_inc_count =
2479           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2480       T upper = *pupper;
2481       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2482       // Unknown static scheduling type.
2483       *plower += team_id * chunk_inc_count;
2484       *pupper = *plower + chunk_inc_count - incr;
2485       // Check/correct bounds if needed
2486       if (incr > 0) {
2487         if (*pupper < *plower)
2488           *pupper = traits_t<T>::max_value;
2489         if (plastiter != NULL)
2490           *plastiter = *plower <= upper && *pupper > upper - incr;
2491         if (*pupper > upper)
2492           *pupper = upper; // tracker C73258
2493       } else {
2494         if (*pupper > *plower)
2495           *pupper = traits_t<T>::min_value;
2496         if (plastiter != NULL)
2497           *plastiter = *plower >= upper && *pupper < upper - incr;
2498         if (*pupper < upper)
2499           *pupper = upper; // tracker C73258
2500       }
2501     }
2502   }
2503 }
2504 
2505 //-----------------------------------------------------------------------------
2506 // Dispatch routines
2507 //    Transfer call to template< type T >
2508 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2509 //                         T lb, T ub, ST st, ST chunk )
2510 extern "C" {
2511 
2512 /*!
2513 @ingroup WORK_SHARING
2514 @{
2515 @param loc Source location
2516 @param gtid Global thread id
2517 @param schedule Schedule type
2518 @param lb  Lower bound
2519 @param ub  Upper bound
2520 @param st  Step (or increment if you prefer)
2521 @param chunk The chunk size to block with
2522 
2523 This function prepares the runtime to start a dynamically scheduled for loop,
2524 saving the loop arguments.
2525 These functions are all identical apart from the types of the arguments.
2526 */
2527 
2528 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2529                             enum sched_type schedule, kmp_int32 lb,
2530                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2531   KMP_DEBUG_ASSERT(__kmp_init_serial);
2532   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2533 }
2534 /*!
2535 See @ref __kmpc_dispatch_init_4
2536 */
2537 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2538                              enum sched_type schedule, kmp_uint32 lb,
2539                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2540   KMP_DEBUG_ASSERT(__kmp_init_serial);
2541   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2542 }
2543 
2544 /*!
2545 See @ref __kmpc_dispatch_init_4
2546 */
2547 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2548                             enum sched_type schedule, kmp_int64 lb,
2549                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2550   KMP_DEBUG_ASSERT(__kmp_init_serial);
2551   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2552 }
2553 
2554 /*!
2555 See @ref __kmpc_dispatch_init_4
2556 */
2557 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2558                              enum sched_type schedule, kmp_uint64 lb,
2559                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2560   KMP_DEBUG_ASSERT(__kmp_init_serial);
2561   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2562 }
2563 
2564 /*!
2565 See @ref __kmpc_dispatch_init_4
2566 
These functions differ from the __kmpc_dispatch_init set in that they are
called for the composite distribute parallel for construct. Thus, before
dispatching the regular iterations, the per-team iteration space must be
computed.
2570 
2571 These functions are all identical apart from the types of the arguments.
2572 */
2573 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2574                                  enum sched_type schedule, kmp_int32 *p_last,
2575                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2576                                  kmp_int32 chunk) {
2577   KMP_DEBUG_ASSERT(__kmp_init_serial);
2578   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2579   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2580 }
2581 
2582 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2583                                   enum sched_type schedule, kmp_int32 *p_last,
2584                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2585                                   kmp_int32 chunk) {
2586   KMP_DEBUG_ASSERT(__kmp_init_serial);
2587   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2588   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2589 }
2590 
2591 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2592                                  enum sched_type schedule, kmp_int32 *p_last,
2593                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2594                                  kmp_int64 chunk) {
2595   KMP_DEBUG_ASSERT(__kmp_init_serial);
2596   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2597   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2598 }
2599 
2600 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2601                                   enum sched_type schedule, kmp_int32 *p_last,
2602                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2603                                   kmp_int64 chunk) {
2604   KMP_DEBUG_ASSERT(__kmp_init_serial);
2605   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2606   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2607 }
2608 
2609 /*!
2610 @param loc Source code location
2611 @param gtid Global thread id
2612 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2613 otherwise
2614 @param p_lb   Pointer to the lower bound for the next chunk of work
2615 @param p_ub   Pointer to the upper bound for the next chunk of work
2616 @param p_st   Pointer to the stride for the next chunk of work
2617 @return one if there is work to be done, zero otherwise
2618 
2619 Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
2621 */
2622 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2623                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2624   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2625 }
2626 
2627 /*!
2628 See @ref __kmpc_dispatch_next_4
2629 */
2630 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2631                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2632                             kmp_int32 *p_st) {
2633   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
2634 }
2635 
2636 /*!
2637 See @ref __kmpc_dispatch_next_4
2638 */
2639 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2640                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2641   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2642 }
2643 
2644 /*!
2645 See @ref __kmpc_dispatch_next_4
2646 */
2647 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2648                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2649                             kmp_int64 *p_st) {
2650   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
2651 }
2652 
2653 /*!
2654 @param loc Source code location
2655 @param gtid Global thread id
2656 
2657 Mark the end of a dynamic loop.
2658 */
2659 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2660   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2661 }
2662 
2663 /*!
2664 See @ref __kmpc_dispatch_fini_4
2665 */
2666 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2667   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2668 }
2669 
2670 /*!
2671 See @ref __kmpc_dispatch_fini_4
2672 */
2673 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2674   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2675 }
2676 
2677 /*!
2678 See @ref __kmpc_dispatch_fini_4
2679 */
2680 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2681   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2682 }
2683 /*! @} */
2684 
2685 //-----------------------------------------------------------------------------
2686 // Non-template routines from kmp_dispatch.cpp used in other sources
2687 
2688 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2689   return value == checker;
2690 }
2691 
2692 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2693   return value != checker;
2694 }
2695 
2696 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2697   return value < checker;
2698 }
2699 
2700 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2701   return value >= checker;
2702 }
2703 
2704 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2705   return value <= checker;
2706 }
2707 
2708 kmp_uint32
2709 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2710                    kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2711                    void *obj // Higher-level synchronization object, or NULL.
2712                    ) {
2713   // note: we may not belong to a team at this point
2714   volatile kmp_uint32 *spin = spinner;
2715   kmp_uint32 check = checker;
2716   kmp_uint32 spins;
2717   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2718   kmp_uint32 r;
2719 
2720   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2721   KMP_INIT_YIELD(spins);
2722   // main wait spin loop
2723   while (!f(r = TCR_4(*spin), check)) {
2724     KMP_FSYNC_SPIN_PREPARE(obj);
2725     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2726        split. It causes problems with infinite recursion because of exit lock */
2727     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2728         __kmp_abort_thread(); */
2729 
2730     /* if we have waited a bit, or are oversubscribed, yield */
2731     /* pause is in the following code */
2732     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2733     KMP_YIELD_SPIN(spins);
2734   }
2735   KMP_FSYNC_SPIN_ACQUIRED(obj);
2736   return r;
2737 }
2738 
2739 void __kmp_wait_yield_4_ptr(
2740     void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2741     void *obj // Higher-level synchronization object, or NULL.
2742     ) {
2743   // note: we may not belong to a team at this point
2744   void *spin = spinner;
2745   kmp_uint32 check = checker;
2746   kmp_uint32 spins;
2747   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2748 
2749   KMP_FSYNC_SPIN_INIT(obj, spin);
2750   KMP_INIT_YIELD(spins);
2751   // main wait spin loop
2752   while (!f(spin, check)) {
2753     KMP_FSYNC_SPIN_PREPARE(obj);
2754     /* if we have waited a bit, or are oversubscribed, yield */
2755     /* pause is in the following code */
2756     KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2757     KMP_YIELD_SPIN(spins);
2758   }
2759   KMP_FSYNC_SPIN_ACQUIRED(obj);
2760 }
2761 
2762 } // extern "C"
2763 
2764 #ifdef KMP_GOMP_COMPAT
2765 
2766 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2767                                enum sched_type schedule, kmp_int32 lb,
2768                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2769                                int push_ws) {
2770   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2771                                  push_ws);
2772 }
2773 
2774 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2775                                 enum sched_type schedule, kmp_uint32 lb,
2776                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2777                                 int push_ws) {
2778   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2779                                   push_ws);
2780 }
2781 
2782 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2783                                enum sched_type schedule, kmp_int64 lb,
2784                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2785                                int push_ws) {
2786   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2787                                  push_ws);
2788 }
2789 
2790 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2791                                 enum sched_type schedule, kmp_uint64 lb,
2792                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2793                                 int push_ws) {
2794   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2795                                   push_ws);
2796 }
2797 
2798 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2799   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2800 }
2801 
2802 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2803   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2804 }
2805 
2806 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2807   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2808 }
2809 
2810 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2811   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2812 }
2813 
2814 #endif /* KMP_GOMP_COMPAT */
2815 
2816 /* ------------------------------------------------------------------------ */
2817